Example #1
    def get_graph_data(self):
        consumer = Consumer(self.conf)
        consumer.subscribe([self.topic])

        # query the low and high watermark offsets (the application does not work without this call)
        consumer.get_watermark_offsets(self.partition)

        # set local offset
        consumer.assign([self.partition])

        self.__update_que(consumer)

        # convert data to compatible format
        o = {key: list(value) for key, value in self.data.items()}
        return o
Example #2
    def kafka_GetOffset(self, p_szTopicName, p_szGroupID=''):
        if self.__kafka_servers__ is None:
            raise SQLCliException(
                "Missing Kafka server information. Please set the Kafka server first."
            )
        c = Consumer({
            'bootstrap.servers': self.__kafka_servers__,
            'group.id': p_szGroupID,
        })
        m_OffsetResults = []
        try:
            for pid in c.list_topics(topic=p_szTopicName
                                     ).topics[p_szTopicName].partitions.keys():
                tp = TopicPartition(p_szTopicName, pid)
                (low, high) = c.get_watermark_offsets(tp)
                m_OffsetResults.append([pid, low, high])
            if len(m_OffsetResults) == 0:
                raise SQLCliException("Topic [" + p_szTopicName +
                                      "] does not exist!")
            return m_OffsetResults
        except KafkaException as ke:
            if "SQLCLI_DEBUG" in os.environ:
                # traceback.print_exc() writes to stderr and returns None,
                # so format_exc() is used to obtain the traceback as a string.
                print('traceback.format_exc():\n%s' % traceback.format_exc())
            raise ke
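A minimal usage sketch for the method above, assuming `cli` is an instance of the surrounding class with a Kafka server already configured (the variable name and topic are placeholders); each returned entry is [partition_id, low, high], so the per-partition backlog is simply high - low:

# Hypothetical usage of kafka_GetOffset; `cli` and the topic name are placeholders.
offsets = cli.kafka_GetOffset("my_topic", p_szGroupID="offset-probe")
total = 0
for pid, low, high in offsets:
    print("partition %d: low=%d high=%d retained=%d" % (pid, low, high, high - low))
    total += high - low
print("total messages retained: %d" % total)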
Example #3
class KafkaConsumer(object):
    def __init__(self, group_id, topic):
        self.client = Consumer({
            'bootstrap.servers': KAFKA_SERVER_HOSTS,
            'group.id': group_id,
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            }
        })
        self.topic = topic

    def query_kafka(self, max_part):
        for p_id in range(0, max_part):
            tp = TopicPartition(self.topic, p_id)
            committed = self.client.committed([tp])
            watermark_offsets = self.client.get_watermark_offsets(tp)
            c_offset = committed[0].offset
            partition = committed[0].partition
            min_offset = watermark_offsets[0]
            max_offset = watermark_offsets[1]
            print("%d %d %d %d %d" % (partition, min_offset, c_offset,
                                      max_offset, max_offset - c_offset))

    def reset_kafka(self, tps):
        for tp in tps:
            self.client.assign([tp])
            print(tp)
            self.client.poll()

    def close(self):
        self.client.close()
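A brief, hedged usage sketch of the class above (it assumes KAFKA_SERVER_HOSTS is defined in the surrounding module; group id, topic and partition count are placeholders); query_kafka prints partition, low offset, committed offset, high offset and the resulting lag:

# Hypothetical usage of KafkaConsumer
kc = KafkaConsumer(group_id='lag-probe', topic='events')
kc.query_kafka(max_part=3)   # one line per partition: id, low, committed, high, lag
kc.close()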
Example #4
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert ex.match('Consumer closed')
Example #5
def get_latest_applied(client_options, topic_name, read_timeout=1.0):
    client_options.update({
        'auto.offset.reset': 'latest',
        'enable.auto.commit': False,
    })
    c = Consumer(client_options)

    partition = TopicPartition(topic_name, 0)
    low, high = c.get_watermark_offsets(partition)

    if low is not None and high is not None and high > 0:
        last_msg_offset = high - 1
    else:
        last_msg_offset = 0

    partition = TopicPartition(topic_name, 0, last_msg_offset)
    c.assign([partition])

    read = None

    msg = c.consume(num_messages=1, timeout=read_timeout)
    if msg:
        read = msg[0].value().decode('utf-8')
        # print('Read: {}'.format(read))

    c.close()
    return read
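A short, hedged example call of get_latest_applied; the broker address, group id and topic name are placeholders:

# Hypothetical call of get_latest_applied
opts = {'bootstrap.servers': 'localhost:9092', 'group.id': 'migration-check'}
latest = get_latest_applied(opts, 'schema-migrations', read_timeout=2.0)
print(latest if latest is not None else 'no messages yet')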
Example #6
def get_metrics_for_partition(consumer: Consumer, partition: TopicPartition) -> dict:
    timeout = 5
    watermarks = consumer.get_watermark_offsets(partition, timeout=timeout, cached=False)
    if watermarks is None:
        raise Exception(f'Getting watermarks for partition:{partition.partition} on topic: {partition.topic} has taken longer than timeout {timeout} seconds')

    (low, high) = watermarks
    # possible negative values for partition offset or high are defined by the following consts
    # confluent_kafka.OFFSET_BEGINNING == -2
    # confluent_kafka.OFFSET_END == -1
    # confluent_kafka.OFFSET_STORED == -1000
    # confluent_kafka.OFFSET_INVALID == -1001
    if high < 0:
        lag = 0  # Unlikely
    elif partition.offset < 0:
        # No committed offset, show total message count as lag.
        # The actual message count may be lower due to compaction
        # and record deletions.
        lag = high - low
    else:
        lag = high - partition.offset
    return {
            "topic_name": partition.topic,
            "partition_id": partition.partition,
            "high": high,
            "low": low,
            "lag": lag,
            "offset": partition.offset
    }
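A hedged sketch of how the helper above might be driven: fetch the committed offsets for every partition of a topic and print the lag report. The broker address, group id and topic name are placeholders.

from confluent_kafka import Consumer, TopicPartition

# Sketch only: report lag for every partition of `topic` under a given group id.
topic = 'events'
consumer = Consumer({'bootstrap.servers': 'localhost:9092', 'group.id': 'my-group'})
partitions = [TopicPartition(topic, p)
              for p in consumer.list_topics(topic).topics[topic].partitions]
# committed() fills in the .offset field of each TopicPartition
for tp in consumer.committed(partitions, timeout=5):
    print(get_metrics_for_partition(consumer, tp))
consumer.close()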
Example #7
    def __init__(self, topic, group, que_len=180):
        self.topic = topic

        self.conf = {
            'bootstrap.servers': 'localhost:9092',
            'group.id': group,
            'enable.auto.commit': True,
        }

        # the application needs a maximum of 180 data units
        self.data = {
            'time': deque(maxlen=que_len),
            'Latitude': deque(maxlen=que_len),
            'Longitude': deque(maxlen=que_len),
            'Altitude': deque(maxlen=que_len)
        }

        consumer = Consumer(self.conf)
        consumer.subscribe([self.topic])

        # download the first 180 messages
        self.partition = TopicPartition(topic=self.topic, partition=0)
        low_offset, high_offset = consumer.get_watermark_offsets(
            self.partition)

        # move the offset back by 180 messages
        if high_offset > que_len:
            self.partition.offset = high_offset - que_len
        else:
            self.partition.offset = low_offset

        # set the moved offset to consumer
        consumer.assign([self.partition])

        self.__update_que(consumer)
Example #8
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
Example #9
class KafkaQueryConsumer:
    """
    Wraps Kafka library consumer methods which query the
    broker for metadata and poll for single messages.
    It is a thin wrapper but allows a fake to be used
    in unit tests.
    """
    def __init__(self, broker: str):
        # Set "enable.auto.commit" to False, as we do not need to report to the
        # kafka broker where we got to (it usually does this in case of a
        # crash, but we simply restart the process and go and find the last
        # run_start message.
        #
        # Set "queued.min.messages" to 1 as we will consume backwards through
        # the partition one message at a time; we do not want to retrieve
        # multiple messages in the forward direction each time we step
        # backwards by 1 offset
        conf = {
            "bootstrap.servers": broker,
            "group.id": "consumer_group_name",
            "auto.offset.reset": "latest",
            "enable.auto.commit": False,
            "queued.min.messages": 1
        }
        # Consumer expects a configuration dict (the dotted keys cannot be keyword arguments)
        self._consumer = Consumer(conf)

    def get_topic_partitions(self, topic: str, offset: int = -1):
        metadata = self._consumer.list_topics(topic)
        return [
            TopicPartition(topic, partition[1].id, offset=offset)
            for partition in metadata.topics[topic].partitions.items()
        ]

    def seek(self, partition: TopicPartition):
        """
        Set offset in partition, the consumer will seek to that offset
        """
        self._consumer.seek(partition)

    def poll(self, timeout=2.):
        """
        Poll for a message from Kafka
        """
        return self._consumer.poll(timeout=timeout)

    def get_watermark_offsets(self,
                              partition: TopicPartition) -> Tuple[int, int]:
        """
        Get the offset of the first and last available
        message in the given partition
        """
        return self._consumer.get_watermark_offsets(partition, cached=False)

    def assign(self, partitions: List[TopicPartition]):
        self._consumer.assign(partitions)

    def offsets_for_times(self, partitions: List[TopicPartition]):
        return self._consumer.offsets_for_times(partitions)
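The comment in __init__ mentions stepping backwards through a partition one message at a time; as a minimal, hedged illustration, the wrapper can be used to read just the last available message of the first partition (broker and topic name are placeholders):

# Hypothetical sketch: read the last message of the first partition using the wrapper above.
consumer = KafkaQueryConsumer("localhost:9092")
partitions = consumer.get_topic_partitions("run_topic")
low, high = consumer.get_watermark_offsets(partitions[0])
if high > low:
    partitions[0].offset = high - 1     # start at the last available message
    consumer.assign(partitions)
    msg = consumer.poll()
    if msg is not None and not msg.error():
        print(msg.value())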
Example #10
def consume_everything(topic):
    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": uuid.uuid4()
    })
    topicpart = TopicPartition(topic, 0, 0)
    consumer.assign([topicpart])
    low, high = consumer.get_watermark_offsets(topicpart)

    return consumer.consume(high - 1)
Example #11
def poll_everything(topic):
    consumer = Consumer({
        'bootstrap.servers': 'localhost:9092',
        'group.id': uuid.uuid4()
    })
    topicpart = TopicPartition(topic, 0, 0)
    consumer.assign([topicpart])
    low, high = consumer.get_watermark_offsets(topicpart)

    return consumer.consume(high - 1)
Example #12
def count_messages(bootstrap_servers):
    c = Consumer({'bootstrap.servers': bootstrap_servers,
                  'group.id': 'group2',
                  'enable.auto.commit': False,
                  'auto.offset.reset': 'beginning'})

    metadata = c.list_topics()
    topics = metadata.topics
    for topic, topicMetadata in topics.items():
        for partition in topicMetadata.partitions:
            (low, high) = c.get_watermark_offsets(TopicPartition(topic, partition))
            print(f"{topic} {partition}: {high}")
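The loop above prints the high watermark per partition; if the number of messages currently retained is wanted instead, high - low is the relevant difference. A small, hedged variation using the same consumer `c`:

# Sketch: per-topic totals of retained messages (high minus low watermark per partition).
for topic, topicMetadata in c.list_topics().topics.items():
    total = 0
    for partition in topicMetadata.partitions:
        low, high = c.get_watermark_offsets(TopicPartition(topic, partition))
        total += high - low
    print(f"{topic}: {total} messages retained")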
Example #13
    def initialize_from_kafka(self, kafka_topic: str,
                              kafka_config: Dict[str, Any]) -> None:
        """
        kafka_topic should have type str

        TODO: this method does not fail if client can't connect to host.
        """
        if not kafka_topic:
            return

        print("Fetching state from kafka topic: {}".format(kafka_topic),
              file=sys.stderr)

        def fail_fast(err: Any, _msg: Any) -> None:
            if err:
                raise KafkaException(err)

        conf = kafka_config.copy()
        conf.update({
            "group.id": "dummy_init_group",  # should never be committed
            "enable.auto.commit": False,
            "auto.offset.reset": "earliest",
            "session.timeout.ms": 10000,
        })
        consumer = Consumer(conf)

        # this watermark fetch is mostly to ensure we are connected to broker and
        # fail fast if not, but we also confirm that we read to end below.
        hwm = consumer.get_watermark_offsets(TopicPartition(kafka_topic, 0),
                                             timeout=5.0,
                                             cached=False)
        if not hwm:
            raise Exception(
                "Kafka consumer timeout, or topic {} doesn't exist".format(
                    kafka_topic))

        consumer.assign([TopicPartition(kafka_topic, 0, 0)])
        c = 0
        while True:
            msg = consumer.poll(timeout=2.0)
            if not msg:
                break
            if msg.error():
                raise KafkaException(msg.error())
            # sys.stdout.write('.')
            self.update(msg.value().decode("utf-8"))
            c += 1
        consumer.close()

        # verify that we got at least to HWM
        assert c >= hwm[1]
        print("... got {} state update messages, done".format(c),
              file=sys.stderr)
Example #14
def get_kafka_old_offset(topic, kafka_broker, partition_count):
    '''Get the oldest Kafka offsets, to compare against the offsets read later by batch_loader'''
    kafka_old_offset = {}
    kafka_new_offset = {}

    try:
        # Alternative: fetch the offsets via the kafka-python library
        '''
        from kafka import SimpleClient
        from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy
        from kafka.common import OffsetRequestPayload
        client = SimpleClient(broker_list)
        partitions = client.topic_partitions[topic]
        offset_requests = [OffsetRequestPayload(topic, p, -1, 1) for p in partitions.keys()]
        offsets_responses = client.send_offset_request(offset_requests)
        for r in offsets_responses:
            #print("partition = %s, offset = %s"%(r.partition, r.offsets[0]))
            kafka_old_offset[r.partition] = r.offsets[0]
       '''
        from confluent_kafka import TopicPartition, Consumer, KafkaException
        from confluent_kafka.admin import AdminClient

        conf = {'bootstrap.servers': kafka_broker, 'session.timeout.ms': 6000}
        try:
            admin_client = AdminClient(conf)
            consumer_client = Consumer(conf)

            md = admin_client.list_topics(timeout=10)
            for t in iter(md.topics.values()):
                if str(t) == topic:
                    for p in iter(t.partitions.values()):
                        td = TopicPartition(str(t), p.id)
                        oldest_offset, newest_offset = consumer_client.get_watermark_offsets(
                            td)
                        kafka_old_offset[p.id] = oldest_offset
                        kafka_new_offset[p.id] = newest_offset
        except KafkaException as e:
            logger.error("请检查kafka是否存活:%s" % e)
    except ImportError:
        for partition_id in range(partition_count):
            command = 'kafka-run-class kafka.tools.GetOffsetShell --topic %s --broker-list %s --time -2 --partition %d' % (
                topic, kafka_broker, partition_id)
            args = shlex.split(command)
            process = subprocess.Popen(args,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            output = '{}'.format(
                process.stdout.read().decode(encoding='UTF-8'))
            offset = output.split(':')[2]
            kafka_old_offset[partition_id] = int(offset)

    return kafka_old_offset
Example #15
class KafkaConsumer:
    def __init__(self, conf, group_id='kafka-rest-service'):
        conf = dict(conf)
        conf['group.id'] = group_id
        self.consumer = Consumer(conf)

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_partition_count(self, topic_name):
        cmd = self.consumer.list_topics(topic_name)
        tmd = cmd.topics.get(topic_name, None)
        pcount = 0
        if tmd:
            pcount = len(tmd.partitions)
        return pcount

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_offsets(self, topic_name):
        pcount = self.get_topic_partition_count(topic_name)
        if pcount == 0:
            return dict(error=f"Requested topic {topic_name} not found",
                        status="ERROR",
                        report=None)

        part_status_map = {}
        for p in range(pcount):
            l, h = self.consumer.get_watermark_offsets(
                TopicPartition(topic_name, p))
            part_status_map[p] = [h, '1 month']

        def get_minute_report(minute, time_text):
            timestamp = (datetime.now() -
                         timedelta(minutes=minute)).timestamp()
            timestamp = int(timestamp) * 1000
            partitions = [
                TopicPartition(topic_name, p, timestamp) for p in range(pcount)
            ]
            partitions = self.consumer.offsets_for_times(partitions)
            for par in partitions:
                if par.offset > -1:
                    part_status_map[par.partition][-1] = time_text

        get_minute_report(60 * 24 * 7, '1 week')
        get_minute_report(60 * 24, '1 day')
        get_minute_report(60, '1 hour')
        get_minute_report(10, '10 minutes')
        get_minute_report(1, '1 minute')

        part_status_map = {k: list(v) for k, v in part_status_map.items()}
        return dict(error=None,
                    status="SUCCESS",
                    topic=topic_name,
                    offsets=part_status_map)
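A hedged usage sketch of the class above; the bootstrap servers and topic name are placeholders. Each entry of the returned offsets map is [high_watermark, freshness_bucket], where the bucket is the smallest time window that still contains data:

# Hypothetical usage of KafkaConsumer.get_topic_offsets
kc = KafkaConsumer({'bootstrap.servers': 'localhost:9092'})
report = kc.get_topic_offsets('events')
if report['status'] == 'SUCCESS':
    for partition, (high, bucket) in report['offsets'].items():
        print(f"partition {partition}: high watermark {high}, newest data within {bucket}")
else:
    print(report['error'])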
Example #16
def morning_notice():
    # Each stock gets its own topic with several partitions: partition 0 stores futu snapshots, partition 1 futu real-time quotes,
    # partition 2 real-time K-lines, partition 3 real-time time-share data, partition 4 real-time tick-by-tick trades,
    # partition 5 the real-time order book, partition 6 the real-time broker queue, and partitions 7-9 are currently unused.
    consumer = Consumer({
        'bootstrap.servers': 'kafka01',
        'group.id': 'test',
        'enable.auto.commit': False,
        'default.topic.config': {
            'auto.offset.reset': 'largest'
        }
    })

    (rise_ratio_list_smallest,
     rise_ratio_list_largest) = consumer.get_watermark_offsets(
         TopicPartition('test', 0))
    (volume_list_smallest,
     volume_list_largest) = consumer.get_watermark_offsets(
         TopicPartition('test', 1))
    try:
        consumer.assign(
            [TopicPartition('test', 0, rise_ratio_list_largest - 1)])
        consumer.seek(TopicPartition('test', 0, rise_ratio_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_rise_ratio = consumer.poll(3.0)
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))

        print(latest_rise_ratio)
        consumer.assign([TopicPartition('test', 1, volume_list_largest - 1)])
        consumer.seek(TopicPartition('test', 1, volume_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_volume = consumer.poll(3.0).value()
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        print(latest_volume)
    finally:
        consumer.close()
Example #17
def get_kafka_old_offset(topic, kafka_broker, partition_count):

    kafka_old_offset = {}
    #kafka_new_offset = {}

    try:
        #Get kafka offset through kafka module
        '''
        from kafka import SimpleClient
        from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy
        from kafka.common import OffsetRequestPayload
        client = SimpleClient(broker_list)
        partitions = client.topic_partitions[topic]
        offset_requests = [OffsetRequestPayload(topic, p, -1, 1) for p in partitions.keys()]
        offsets_responses = client.send_offset_request(offset_requests)
        for r in offsets_responses:
            #print("partition = %s, offset = %s"%(r.partition, r.offsets[0]))
            kafka_old_offset[r.partition] = r.offsets[0]
       '''
        # Get kafka offset through confluent_kafka module
        from confluent_kafka import TopicPartition, Consumer, KafkaException
        from confluent_kafka.admin import AdminClient

        conf = {'bootstrap.servers': kafka_broker, 'session.timeout.ms': 6000}
        admin_client = AdminClient(conf)
        consumer_client = Consumer(conf)

        md = admin_client.list_topics(timeout=10)
        for t in iter(md.topics.values()):
            if str(t) == topic:
                for p in iter(t.partitions.values()):
                    td = TopicPartition(str(t), p.id)
                    oldest_offset, newest_offset = consumer_client.get_watermark_offsets(
                        td)
                    kafka_old_offset[p.id] = oldest_offset
                    #kafka_new_offset[p.id] = newest_offset
    except ImportError:
        for partition_id in range(partition_count):
            command = 'kafka-run-class kafka.tools.GetOffsetShell --topic {} --broker-list {} --time -2 --partition {}'.format(
                topic, kafka_broker, partition_id)
            #args = shlex.split(command)
            #process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            #output = '{}'.format(process.stdout.read().decode(encoding='UTF-8'))
            output = utils.shell_wrapper.check_output(command)
            offset = output.split(':')[2]
            kafka_old_offset[partition_id] = int(offset)

    return kafka_old_offset
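A hedged example call of the function above; the topic name, broker address and partition count are placeholders:

# Hypothetical call of get_kafka_old_offset
old_offsets = get_kafka_old_offset('events', 'localhost:9092', partition_count=3)
for pid, offset in sorted(old_offsets.items()):
    print('partition {} starts at offset {}'.format(pid, offset))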
Example #18
def get_last_available_status_message(cons: Consumer, status_topic: str):
    """
    Retrieve the last message currently available on the status topic.

    :param cons: Consumer that already has an assignment on the status topic.
    :param status_topic: Name of the status topic.
    :return: The last status message.
    """
    partitions = cons.assignment()
    _, hi = cons.get_watermark_offsets(partitions[0],
                                       cached=False,
                                       timeout=2.0)
    last_msg_offset = hi - 1
    cons.assign(
        [TopicPartition(status_topic, partition=0, offset=last_msg_offset)])
    status_msg, _ = poll_for_valid_message(cons, expected_file_identifier=None)
    return status_msg
Example #19
def get_all_available_messages(consumer: Consumer):
    """
    Consumes all available messages topics subscribed to by the consumer
    :param consumer: The consumer object
    :return: list of messages, empty if none available
    """
    messages = []
    low_offset, high_offset = consumer.get_watermark_offsets(
        consumer.assignment()[0], cached=False)
    number_of_messages_available = high_offset - low_offset
    while len(messages) < number_of_messages_available:
        message = consumer.poll(timeout=2.0)
        if message is None or message.error():
            continue
        messages.append(message)
    return messages
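A minimal, hedged setup for the helper above: the consumer must already have an assignment before get_all_available_messages is called. The broker, group id and topic name are placeholders.

from confluent_kafka import Consumer, TopicPartition

# Hypothetical setup for get_all_available_messages
consumer = Consumer({'bootstrap.servers': 'localhost:9092',
                     'group.id': 'drain-test',
                     'auto.offset.reset': 'earliest'})
consumer.assign([TopicPartition('status_topic', 0, 0)])
messages = get_all_available_messages(consumer)
print(f"drained {len(messages)} messages")
consumer.close()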
Example #20
def most_recent_message(topic, kafka_config):
    """
    Tries to fetch the most recent message from a given topic.

    This only makes sense for single partition topics (it works with only the
    first partition), though could be extended with "last N" behavior.
    """

    print("Fetching most Kafka message from {}".format(topic))

    conf = kafka_config.copy()
    conf.update({
        'group.id': 'worker-init-last-msg',  # should never commit
        'delivery.report.only.error': True,
        'enable.auto.commit': False,
        'default.topic.config': {
            'request.required.acks': -1,
            'auto.offset.reset': 'latest',
        },
    })

    consumer = Consumer(conf)

    hwm = consumer.get_watermark_offsets(TopicPartition(topic, 0),
                                         timeout=5.0,
                                         cached=False)
    if not hwm:
        raise Exception(
            "Kafka consumer timeout, or topic {} doesn't exist".format(topic))
    print("High watermarks: {}".format(hwm))

    if hwm[1] == 0:
        print("topic is new; not 'most recent message'")
        return None

    consumer.assign([TopicPartition(topic, 0, hwm[1] - 1)])
    msg = consumer.poll(2.0)
    consumer.close()
    if not msg:
        raise Exception("Failed to fetch most recent kafka message")
    if msg.error():
        raise KafkaException(msg.error())
    return msg.value()
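A hedged example call; kafka_config would normally carry at least the bootstrap servers, and the topic name is a placeholder:

# Hypothetical call of most_recent_message
kafka_config = {'bootstrap.servers': 'localhost:9092'}
raw = most_recent_message('changelog-topic', kafka_config)
if raw is not None:
    print(raw.decode('utf-8'))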
Example #21
class KafkaClient(object):
    def __init__(self,
                 kafka_bootstrap_servers,
                 kafka_topic,
                 guid=None,
                 partition=None):
        self.kafka_bootstrap_servers = kafka_bootstrap_servers
        self.kafka_topic = kafka_topic
        if partition:
            raise NotImplementedError("multiple partitions not supported yet")
        self.guid = guid
        if not self.guid:
            self.guid = str(uuid4())

        self.p = None
        self.c = None

    def produce(self, key, val):
        try:
            if not self.p:
                self.p = Producer({
                    'bootstrap.servers': self.kafka_bootstrap_servers,
                    'api.version.request': True
                })
            if not isinstance(key, bytes):
                raise TypeError(
                    'producing to kafka requires key to be raw bytes')
            if not isinstance(val, bytes) and val is not None:
                raise TypeError(
                    'producing to kafka requires val to be raw bytes or None')
            self.p.produce(topic=self.kafka_topic, value=val, key=key)
        except BufferError:
            self.p.flush()
            self.p.produce(topic=self.kafka_topic, value=val, key=key)

    def flush_producer(self):
        if self.p:
            self.p.flush()

    def consume(self):
        if not self.c:
            self.c = Consumer({
                'bootstrap.servers': self.kafka_bootstrap_servers,
                'group.id': self.guid,
                'api.version.request': True,
                'log.connection.close': False,
                'socket.keepalive.enable': True,
                'session.timeout.ms': 6000,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest'
                }
            })
            self.c.subscribe([self.kafka_topic])

        # must perform an initial poll to get partition assignments
        first_message = True
        msg = self.c.poll(timeout=10.0)

        # grab watermarks from partition
        partitionobjs = self.c.assignment()
        partitions = {}
        for prt in partitionobjs:
            partition = prt.partition
            last_offset = self.c.get_watermark_offsets(prt)[1] - 1
            if last_offset < 0:  # if nothing in partition then this will be -1
                continue
            position = max(
                self.c.position([prt])[0].offset - 1, -1
            )  # position() returns OFFSET_INVALID (-1001) if the partition was never read
            if last_offset > position:
                partitions[partition] = last_offset

        # process partitions up to watermarks (but remember that we already consumed a message, so need to yield that)
        while first_message or len(partitions) > 0:
            if not first_message:
                msg = self.c.poll(timeout=10.0)
            else:
                first_message = False
            if msg is None or msg.error(
            ):  # NOTE:  "if not msg" checks if message len = 0, which is different from checking "if msg is None"
                continue  # ignore errors
            partition = msg.partition()
            if partition in partitions and msg.offset() >= partitions[
                    partition]:  # first check is because we might read past the watermark
                # for a partition that we're already done with... but that's ok
                del partitions[partition]
            yield msg.key(), msg.value(), msg.timestamp()[1]

    def __del__(self):
        self.flush_producer()
        if self.c:
            self.c.close()
Example #22
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their messages in time order.
    A message must be emitted at (arrival_system_time + latency_ms).
    In batch mode (until reaching the first EOP on each stream) the generator will not discard any messages.
    """
    def __init__(self,
                 broker,
                 groupid,
                 topics_infos: List[TopicInfo],
                 latency_ms,
                 commit_interval_sec=None,
                 group_by_time=False,
                 begin_timestamp=None,
                 begin_flag=None,
                 end_timestamp=None,
                 end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a message.
                            After this, messages with lower or equal timestamps will be discarded.
        :param commit_interval_sec: How many seconds to wait between commits. -1 does not commit with the given group id.
        :param group_by_time: Group messages with the same timestamp. This will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the generator will start.
        :param begin_flag: BEGINNING, CONTINUE, LIVE - CONTINUE will continue from the last committed offset.
                            If there is no committed offset, it will start from the end of the stream.
        :param end_timestamp: Timestamp where to end the reading.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeats. Otherwise a HeartBeat-typed message with the
                                        current timestamp is produced after every interval.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You cannot set both the begin timestamp and a begin flag at the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You cannot set both the end timestamp and an end flag at the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger than the end timestamp.')
        if begin_flag is not None and end_flag is not None and \
              begin_flag == BeginFlag.LIVE and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You cannot start live and also process until the end of the streams.'
            )
        if end_flag is not None and not (end_flag == EndFlag.END_OF_PARTITION
                                         or end_flag == EndFlag.NEVER):
            raise Exception(
                'Unknown end flag: {}. Please use the given enum to provide a proper end flag.'
                .format(end_flag))
        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            'enable.auto.commit': False,
            'auto.offset.reset': 'latest',
            'enable.partition.eof': True,
            'fetch.wait.max.ms': 50
        })
        self.tps = []
        self.queues = {}
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp
        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True
            }
            if begin_timestamp is not None:
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            elif begin_flag is not None:
                if begin_flag == BeginFlag.BEGINNING:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_BEGINNING))
                elif begin_flag == BeginFlag.CONTINUE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_STORED))
                elif begin_flag == BeginFlag.LIVE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_END))
                else:
                    raise Exception(
                        'Unknown begin flag. Please use the enum to provide proper begin flag.'
                    )
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=OFFSET_END))
            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, 0))[1] - 1
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)
        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    def stopGenerator(self):
        self.running = False

    def _serve_messages(self, message_to_serve):
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None and (
                current_time - self.last_commit) > self.commit_interval_sec:
            for k in self.messages_to_be_committed.keys():
                if not self.messages_to_be_committed[k]['committed']:
                    self.consumer.commit(
                        self.messages_to_be_committed[k]['last_msg'])
                    self.messages_to_be_committed[k]['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _can_serve(self):
        min_ets = min([
            q.queue[0].message.timestamp()[1]
            for q in self.queues.values() if len(q.queue) > 0
        ],
                      default=-1)
        if min_ets == -1:
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
              any([q.queue[0].ts < deadline for q in self.queues.values()
                if len(q.queue) > 0 and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        else:
            return None

    def getMessages(self):
        while self.running:
            if all([v.stopped for v in self.queues.values()]):
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 and message_to_serve[
                            0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                        if not self.heartbeat_interval_ms == -1:
                            yield from self._serve_heartbeat(ts)
                        yield from self._serve_messages(serve_it)
                logging.debug('Exiting from generator.')
                break
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        if msg.topic() in self.queues:
                            self.queues[msg.topic()].first_eop_reached = True
                            self.queues[msg.topic()].end_of_partition = True
                    else:
                        logging.error('Unhandled error: {}'.format(msg.error()))
                        break
                else:
                    self.queues[msg.topic()].end_of_partition = False
                    if self.end_ts is not None and msg.timestamp(
                    )[1] > self.end_ts:
                        self.queues[msg.topic()].stop_topic()
                    else:
                        self.queues[msg.topic()].add_message(msg)
            while self.running:
                event_ts_to_serve = self._can_serve()
                if event_ts_to_serve is None:
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                      and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = getSystemTimestamp(
                            ) - self.latency_ms
                        yield from self._serve_heartbeat(getSystemTimestamp() -
                                                         self.latency_ms)
                    break
                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False
        self.consumer.close()
Example #23
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id':'test', 'socket.timeout.ms':'100',
                   'session.timeout.ms': 1000, # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    partitions = list(map(lambda p: TopicPartition("test", p), range(0,100,3)))
    kc.assign(partitions)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE), str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)  # 'async' is a reserved keyword in modern Python; the parameter is now 'asynchronous'

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT


    kc.close()
Example #24
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(
        map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0],
                                          timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
Example #25
class KafkaHandler(BaseHandler[KafkaHandlerConfig]):
    config_cls = KafkaHandlerConfig
    _eof_reached: Dict[int, bool]
    OFFSET_AT_FIRST_MESSAGE = OFFSET_BEGINNING
    OFFSET_AFTER_LAST_MESSAGE = OFFSET_END

    # hopefully this number won't get assigned any semantics by the Kafka Devs any time soon
    OFFSET_AT_LAST_MESSAGE = -101

    def __init__(self, config: KafkaHandlerConfig):
        super().__init__(config)
        self._assignment_created = False
        self._seek = OFFSET_BEGINNING
        self._high_watermarks: Dict[int, int] = {}
        self._consumer: Optional[Consumer] = None
        self._producer: Optional[Producer] = None
        self._errors: List[KafkaError] = []

    def _get_producer(self) -> Producer:
        if self._producer is not None:
            return self._producer

        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            self._producer = Producer(
                config_instance.create_confluent_config(
                    include_schema_registry=False))
        return self._producer

    def _get_consumer(self) -> Consumer:
        if self._consumer is not None:
            return self._consumer

        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            group_id = self.config.consumer_group_id
            self._consumer = Consumer({
                "group.id": group_id,
                "enable.partition.eof": True,
                "enable.auto.commit": False,
                **config_instance.create_confluent_config(include_schema_registry=False),
            })

        topic_metadata: TopicMetadata = self._consumer.list_topics(
            self.config.topic_name).topics[self.config.topic_name]
        if topic_metadata.error is not None:
            raise EsqueIOHandlerReadException(
                f"Topic {self.config.topic_name!r} not found.")

        self._eof_reached = {
            partition_id: False
            for partition_id in topic_metadata.partitions.keys()
        }
        for partition_id in topic_metadata.partitions.keys():
            self._high_watermarks[
                partition_id] = self._consumer.get_watermark_offsets(
                    TopicPartition(topic=self.config.topic_name,
                                   partition=partition_id))[1]

        return self._consumer

    def get_serializer_configs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        raise EsqueIOSerializerConfigNotSupported

    def put_serializer_configs(
            self, config: Tuple[Dict[str, Any], Dict[str, Any]]) -> None:
        raise EsqueIOSerializerConfigNotSupported

    def write_message(
            self, binary_message: Union[BinaryMessage, StreamEvent]) -> None:
        self._produce_single_message(binary_message=binary_message)
        self._flush()

    def write_many_messages(
            self, message_stream: Iterable[Union[BinaryMessage,
                                                 StreamEvent]]) -> None:
        for binary_message in message_stream:
            self._produce_single_message(binary_message=binary_message)
        self._flush()

    def _produce_single_message(self, binary_message: BinaryMessage) -> None:
        if isinstance(binary_message, StreamEvent):
            return
        partition_arg = {}
        partition = self._io_to_confluent_partition(binary_message.partition)
        if partition is not None:
            partition_arg["partition"] = partition
        self._get_producer().produce(
            topic=self.config.topic_name,
            value=binary_message.value,
            key=binary_message.key,
            headers=self._io_to_confluent_headers(binary_message.headers),
            timestamp=self._io_to_confluent_timestamp(
                binary_message.timestamp),
            on_delivery=self._delivery_callback,
            **partition_arg,
        )

    def _delivery_callback(self, err: Optional[KafkaError], msg: str):
        if err is None:
            return
        self._errors.append(err)

    def _flush(self):
        self._get_producer().flush()
        if self._errors:
            exception = EsqueIOHandlerWriteException(
                "The following exception(s) occurred while writing to Kafka:\n  "
                + "\n  ".join(map(str, self._errors)))
            self._errors.clear()
            raise exception

    @staticmethod
    def _io_to_confluent_partition(partition: int) -> Optional[int]:
        # TODO: introduce something like the config.send_timestamp flag to make it possible to always return None here.
        #  This would allow for moving messages between topics with different amounts of partitions without making them
        #  unbalanced.
        if partition < 0:
            return None
        return partition

    def _io_to_confluent_timestamp(self, message_ts: datetime.datetime):
        return int(message_ts.timestamp() *
                   1000) if self.config.send_timestamp else 0

    @staticmethod
    def _io_to_confluent_headers(
        headers: List[MessageHeader]
    ) -> Optional[List[Tuple[str, Optional[bytes]]]]:
        if not headers:
            return None
        confluent_headers: List[Tuple[str, Optional[bytes]]] = []
        for header in headers:
            key = header.key
            if header.value is not None:
                value = header.value.encode("utf-8")
            else:
                value = None
            confluent_headers.append((key, value))
        return confluent_headers

    def read_message(self) -> Union[BinaryMessage, StreamEvent]:
        if not self._assignment_created:
            self._assign()

        consumed_message: Optional[Message] = None
        while consumed_message is None:
            consumed_message = self._get_consumer().poll(timeout=0.1)
            if consumed_message is None and all(self._eof_reached.values()):
                return TemporaryEndOfPartition(
                    "Reached end of all partitions",
                    partition=EndOfStream.ALL_PARTITIONS)
        # TODO: process other error cases (connection issues etc.)
        if consumed_message.error() is not None and consumed_message.error(
        ).code() == KafkaError._PARTITION_EOF:
            self._eof_reached[consumed_message.partition()] = True
            return TemporaryEndOfPartition(
                "Reached end of partition",
                partition=consumed_message.partition())
        else:
            self._eof_reached[consumed_message.partition()] = False

            binary_message = self._confluent_to_binary_message(
                consumed_message)

            return binary_message

    def _confluent_to_binary_message(
            self, consumed_message: Message) -> BinaryMessage:
        binary_message = BinaryMessage(
            key=consumed_message.key(),
            value=consumed_message.value(),
            partition=consumed_message.partition(),
            offset=consumed_message.offset(),
            timestamp=self._confluent_to_io_timestamp(consumed_message),
            headers=self._confluent_to_io_headers(consumed_message.headers()),
        )
        return binary_message

    @staticmethod
    def _confluent_to_io_timestamp(
            consumed_message: Message) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(
            consumed_message.timestamp()[1] / 1000, tz=datetime.timezone.utc)

    @staticmethod
    def _confluent_to_io_headers(
        confluent_headers: Optional[List[Tuple[str, Optional[bytes]]]]
    ) -> List[MessageHeader]:
        io_headers: List[MessageHeader] = []

        if confluent_headers is None:
            return io_headers

        for confluent_header in confluent_headers:
            key, value = confluent_header
            if value is not None:
                value = value.decode("utf-8")
            io_headers.append(MessageHeader(key, value))

        return io_headers

    def message_stream(self) -> Iterable[Union[BinaryMessage, StreamEvent]]:
        while True:
            yield self.read_message()

    def seek(self, position: int) -> None:
        self._seek = position

    def _assign(self) -> None:
        self._assignment_created = True
        if self._seek == self.OFFSET_AT_LAST_MESSAGE:
            self._get_consumer().assign([
                TopicPartition(topic=self.config.topic_name,
                               partition=partition_id,
                               offset=high_watermark - 1) for partition_id,
                high_watermark in self._high_watermarks.items()
            ])
        else:
            self._get_consumer().assign([
                TopicPartition(topic=self.config.topic_name,
                               partition=partition_id,
                               offset=self._seek)
                for partition_id in self._eof_reached.keys()
            ])

    def close(self) -> None:
        if self._consumer is not None:
            self._consumer.close()
            self._consumer = None
        if self._producer is not None:
            self._producer.flush()
            self._producer = None
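
For orientation, a minimal consumption loop over the reader above might look like the sketch below. Here reader stands for an already-constructed instance of the surrounding class (its constructor and configuration are not shown in this listing), drain is a hypothetical helper name, and the sketch assumes TemporaryEndOfPartition exposes the partition it was constructed with.

def drain(reader) -> None:
    # Read until every assigned partition has been exhausted; read_message()
    # yields TemporaryEndOfPartition instead of blocking forever at the end.
    for message in reader.message_stream():
        if isinstance(message, TemporaryEndOfPartition):
            if message.partition == EndOfStream.ALL_PARTITIONS:
                break  # all partitions ran dry
            continue  # a single partition ran dry; keep reading the others
        print(message.partition, message.offset, message.value)
    reader.close()
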
Exemplo n.º 26
0
    def get_last_n_messages(
            self, n: int) -> Optional[List[Tuple[datetime.datetime, Dict]]]:
        '''
        Returns the last n published timestamps and messages, or None if no message has been published yet.
        If the configured topic has more than one partition, you may receive more messages than requested
        (at most partitions * n). You may also receive fewer messages than requested if the broker has
        already cleared some of them.

        :return: List of tuples with timestamp and message, or None if no message has been published yet
        '''

        consumer = Consumer({
            'bootstrap.servers': self.__kafka_bootstrap,
            'group.id': self.__import_id
        })
        partitions = consumer.list_topics(topic=self.__kafka_topic).topics[
            self.__kafka_topic].partitions.keys()
        self.__logger.debug("Found " + str(len(partitions)) +
                            " partition(s) of topic " + self.__kafka_topic)
        num_messages = 0
        topic_partitions = []
        for partition in partitions:
            high_low_offset = consumer.get_watermark_offsets(
                cimpl.TopicPartition(self.__kafka_topic, partition=partition))
            high_offset = high_low_offset[1]
            low_offset = high_low_offset[0]
            available_messages = high_offset - low_offset
            self.__logger.debug("Low/High offset of partition " +
                                str(partition) + " is " + str(low_offset) +
                                "/" + str(high_offset))
            if high_offset > 0:  # Ignore partitions without data
                if available_messages >= n:
                    offset = high_offset - n
                    num_messages += n
                else:
                    offset = low_offset
                    num_messages += available_messages
                partition = cimpl.TopicPartition(self.__kafka_topic,
                                                 partition=partition,
                                                 offset=offset)
                topic_partitions.append(partition)
                self.__logger.debug("Setting offset of partition " +
                                    str(partition))

        if len(topic_partitions) == 0:  # No partition has any data
            return None

        consumer.assign(topic_partitions)
        consumer.commit(offsets=topic_partitions)
        tuples = []
        consumed_messages = 0
        batch_size = 10000
        self.__logger.debug("Consuming last " + str(num_messages) +
                            " message(s)")

        while consumed_messages < num_messages:
            if consumed_messages + batch_size <= num_messages:
                to_consume = batch_size
            else:
                to_consume = num_messages - consumed_messages

            consumed_messages += to_consume
            self.__logger.debug("Consuming batch of " + str(to_consume) +
                                " messages")
            msgs = consumer.consume(num_messages=to_consume, timeout=30)

            for msg in msgs:
                value = json.loads(msg.value())
                if 'time' not in value:
                    self.__logger.warning(
                        "time field missing in message, is someone else using this topic? Ignoring "
                        "message")
                    continue
                if 'value' not in value or not isinstance(value['value'], dict):
                    self.__logger.warning(
                        "value field missing or malformed in message, is someone else using this topic? "
                        "Ignoring message")
                    continue

                try:
                    date_time = datetime.datetime.strptime(
                        value["time"], "%Y-%m-%dT%H:%M:%SZ")
                except ValueError:
                    self.__logger.warning(
                        "time field not in rfc3339 format, is someone else using this topic? Ignoring "
                        "message")
                    continue
                tuples.append((date_time, value["value"]))

        consumer.close()
        return tuples
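
The offset arithmetic above (position each partition n messages before its high watermark, clamped to the low watermark) can be isolated into a small helper. A minimal sketch; the function name and the idea of returning TopicPartition objects ready for assign() are illustrative only:

from confluent_kafka import Consumer, TopicPartition

def last_n_offsets(consumer: Consumer, topic: str, n: int) -> list:
    # Build TopicPartition objects positioned n messages before each
    # partition's high watermark (clamped to the low watermark); empty
    # partitions are skipped.
    assignments = []
    partitions = consumer.list_topics(topic=topic).topics[topic].partitions.keys()
    for partition_id in partitions:
        low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition_id))
        if high > 0:
            assignments.append(
                TopicPartition(topic, partition_id, offset=max(low, high - n)))
    return assignments
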
Exemplo n.º 27
0
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
Exemplo n.º 28
0
consumer = Consumer({
    # assumed values: the broker address and group id at the head of this
    # configuration are reconstructed for illustration
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'test-group',
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
})

# Consume the whole topic
# subscribe and assign cannot be used at the same time: subscribe subscribes to the topic and starts consuming from the offset Kafka has recorded for the group, while assign starts consuming from an explicitly specified offset.
consumer.subscribe(['test'])

# Consume one or more specific partitions of the topic
consumer.assign([TopicPartition('test', 4)])

# Reset the offset
consumer.assign([TopicPartition('test', 4, 2)])

# Get the low and high offsets of a partition
consumer.get_watermark_offsets(TopicPartition('test', 4))
# (0, 19)

# With a brand-new group.id you must consume at least one message first for a later offset reset to take effect; otherwise the offsets reported before and after the reset are both -1001
# Get the current offset position
consumer.position([TopicPartition('test', 3)])

# Reset the offset to an arbitrary position. committed determines the offset used on the next connection (per consumer group) and has no effect on the current one; the current connection's offset is determined by position.
# After resetting the committed offset you must close and reconnect for it to take effect. position determines the offset for the current connection and is changed with seek().
consumer.seek(TopicPartition('test', 3, 1))
consumer.commit(offsets=[TopicPartition('test', 3, 7)])

# Check the reset position
msg = consumer.committed([TopicPartition('test', 3)])
print(msg)
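
To make the position/committed distinction above concrete, a small check (reusing the consumer from this snippet) can print both values: the committed offset only takes effect for the group on the next connection, while position reflects the current one.

tp = TopicPartition('test', 3)
print("position :", consumer.position([tp])[0].offset)   # offset used by this connection
print("committed:", consumer.committed([tp])[0].offset)  # offset the group resumes from after reconnect
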
Exemplo n.º 29
0
    if len(argv) > 1 and argv[1] == "global":
        pfile_name = argv[0]
        option = argv[1]

        p_config = producer_global  # default option for producer
        c_config = consumer_global  # default option for consumer
        topic = global_topic

    # Kafka Producer
    p = Producer(p_config)

    # Kafka Consumer
    c = Consumer(c_config)
    c.subscribe([topic])

    low, high = c.get_watermark_offsets(TopicPartition(topic, partition=0))
    print("low offset: ", low)
    print("high offset: ", high)

    c.assign([TopicPartition(topic, partition=0, offset=high)])

    while True:
        msg = c.poll(1.0)

        if msg is None:
            continue
        if msg.error():
            print("Consumer error: {}".format(msg.error()))
            continue

        if msg.key() is None:
            continue  # assumed handling: skip records without a key
Exemplo n.º 30
0
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true")) is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__codec = codec

        self.__offsets: MutableMapping[Partition, int] = {}
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and the
        committed offset are the same for each partition) on each subscription
        change. Similarly, the ``on_revoke`` callback (if provided) is called
        with a sequence of partitions that are being removed from this
        consumer's assignment. (This callback does not include the offsets,
        as the working offset and committed offset may differ, in some cases
        by a substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to avoid
                # carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            partitions = [
                Partition(Topic(i.topic), i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation to
                    # prevent later committing offsets for partitions that are
                    # no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self,
             timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies which partition's end has been reached. (Since this
        consumer is multiplexing a set of partitions, this exception does not
        mean that *all* of the partitions that the consumer is subscribed to
        do not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError(
                "cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``Partition`` objects returned by ``commit``. These
            # are an implementation detail of the Kafka Consumer, so we don't
            # expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic),
                              value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises an ``InvalidState`` if the consumer cannot be closed
        before the timeout is reached.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        return self.__state is KafkaConsumerState.CLOSED
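
A hedged usage sketch of the wrapper above, following the docstring's advice to commit in the revocation callback. Everything outside the methods shown above (the identity codec, the Topic constructor argument, broker address and group id) is assumed for illustration only.

class IdentityCodec:
    # Hypothetical stand-in for Codec[KafkaPayload, KafkaPayload]; only
    # decode() is exercised by poll() above.
    def decode(self, payload):
        return payload

    def encode(self, payload):
        return payload


consumer = KafkaConsumer(
    {
        "bootstrap.servers": "localhost:9092",   # assumed
        "group.id": "example-group",             # assumed
        "enable.auto.commit": "false",
        "enable.auto.offset.store": "false",
    },
    IdentityCodec(),
)


def on_revoke(partitions):
    # Commit whatever has been staged before ownership of the partitions is lost.
    consumer.commit_offsets()


consumer.subscribe([Topic("events")], on_revoke=on_revoke)

while not consumer.closed:
    message = consumer.poll(timeout=1.0)
    if message is None:
        continue
    # Stage the next offset to read; commit happens in on_revoke (or wherever
    # commit_offsets() is called explicitly).
    consumer.stage_offsets({message.partition: message.get_next_offset()})
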
Exemplo n.º 31
0
class ConsoleConsumer:
    def __init__(self, brokers, topic, offset, key_decoder, value_decoder,
                 registry_url, additional_properties):
        config = {
            'bootstrap.servers': brokers,
            'enable.partition.eof': 'true',
            'group.id': 'not-used',
            'auto.offset.reset': 'earliest',
            'enable.auto.commit': 'false'
        }
        self.consumer = Consumer({**additional_properties, **config})
        self.topic = topic
        self.offset = offset.lower()
        self.key_decoder = key_decoder.lower()
        self.value_decoder = value_decoder.lower()
        self.avro_serializer = None
        if registry_url:
            client = CachedSchemaRegistryClient(registry_url)
            self.avro_serializer = MessageSerializer(client)

    def run(self):
        try:
            partition_ends = 0
            total_parts, partitions = self._partitions()
            self.consumer.assign(partitions)
            while True:
                msg = self.consumer.poll(timeout=0.5)
                if msg is None:
                    continue

                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        eprint(
                            f'{msg.topic()} reached end of partition [{msg.partition()}] at offset {msg.offset()}'
                        )
                        partition_ends += 1
                        if partition_ends == total_parts:
                            break
                    elif msg.error():
                        raise KafkaException(msg.error())
                else:
                    record = {
                        'key': self._decode(self.key_decoder, msg.key()),
                        'payload': self._decode(self.value_decoder,
                                                msg.value()),
                        'topic': msg.topic(),
                        'partition': msg.partition(),
                        'offset': msg.offset(),
                        'timestamp': msg.timestamp()[1]
                    }
                    print(json.dumps(record))
        finally:
            self.consumer.close()

    def _partitions(self):
        parts = []
        topic_data = self.consumer.list_topics(topic=self.topic)
        total_parts = len(topic_data.topics[self.topic].partitions)
        for i in range(0, total_parts):
            partition = TopicPartition(self.topic, i, offset=OFFSET_BEGINNING)
            if self.offset == 'earliest':
                parts.append(partition)
            else:
                try:
                    start, end = self.consumer.get_watermark_offsets(
                        partition, timeout=0.5)
                    real_offset = int(self.offset)
                    ass_offset = (end + real_offset) if (
                        real_offset < 0) else (start + real_offset)
                    parts.append(
                        TopicPartition(self.topic, i, offset=ass_offset))
                except ValueError:
                    eprint(f"Could not parse offset: {self.offset}")
                    exit(1)
        return total_parts, parts

    def _decode(self, data_type, payload):
        if data_type == "avro":
            return self.avro_serializer.decode_message(payload)
        payload_str = payload.decode('utf-8')
        try:
            return json.loads(payload_str)
        except (JSONDecodeError, TypeError):
            return payload_str
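
For completeness, a minimal way to run the class above; the broker address and topic are placeholders, and the "-10" offset relies on the negative-offset handling in _partitions() (relative to each partition's high watermark).

ConsoleConsumer(
    brokers="localhost:9092",      # placeholder broker
    topic="events",                # placeholder topic
    offset="-10",                  # last 10 records of each partition
    key_decoder="string",
    value_decoder="json",
    registry_url=None,             # no Schema Registry, so "avro" decoding is unavailable
    additional_properties={},
).run()
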
Exemplo n.º 32
0
File: kafka.py Project: Appva/snuba
class KafkaConsumer(Consumer[TopicPartition, int, bytes]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(self, configuration: Mapping[str, Any]) -> None:
        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__offsets: MutableMapping[TopicPartition, int] = {}

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[str],
        on_assign: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
    ) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[TopicPartition, int] = {
                    TopicPartition(i.topic, i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(list(offsets.keys()))
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            streams = [
                TopicPartition(i.topic, i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(streams)
            finally:
                for stream in streams:
                    try:
                        self.__offsets.pop(stream)
                    except KeyError:
                        # If there was an error during assignment, this stream
                        # may have never been added to the offsets mapping.
                        logger.warning(
                            "failed to delete offset for unknown stream: %r",
                            stream)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def unsubscribe(self) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[KafkaMessage]:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfStream(
                    TopicPartition(message.topic(), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        result = KafkaMessage(
            TopicPartition(message.topic(), message.partition()),
            message.offset(),
            message.value(),
        )

        self.__offsets[result.stream] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(stream.topic, stream.partition, offset)
                for stream, offset in offsets.items()
            ])
        else:
            for stream, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(stream.topic, stream.partition,
                                            offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned streams")

        self.__seek(offsets)

    def commit(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]] = None

        retries_remaining = 3
        while result is None:
            try:
                result = self.__consumer.commit(asynchronous=False)
                assert result is not None
            except KafkaException as e:
                if not e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR_FOR_GROUP,
                        KafkaError._WAIT_COORD,
                ):
                    raise

                if not retries_remaining:
                    raise

                logger.warning(
                    "Commit failed: %s (%d retries remaining)",
                    str(e),
                    retries_remaining,
                )
                retries_remaining -= 1
                time.sleep(1)

        offsets: MutableMapping[TopicPartition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``TopicPartition`` objects returned by ``commit``.
            # These are an implementation detail of the Kafka Consumer, so we
            # don't expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[TopicPartition(value.topic,
                                   value.partition)] = value.offset

        return offsets

    def close(self, timeout: Optional[float] = None) -> None:
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED