Example #1
    def get_consumer_for_topic(self, topic_name, group_id, partition, offset=None):
        """
        Method to instantiate the kafka consumer for the given topic, consumer group and partition
        :param topic_name: topic name
        :param group_id: consumer group id
        :param partition: partition id
        :return: consumer instance
        """
        try:
            log.info("Fetching consumer for topic: " + topic_name)
            if topic_name + "_" + str(partition) in self.consumer_dict:
                return self.consumer_dict[topic_name + "_" + str(partition)]
            conf = {'bootstrap.servers': self.bootstrap_servers,
                    'group.id': group_id,
                    # 'session.timeout.ms': 1000,
                    'default.topic.config': {
                        'auto.offset.reset': 'earliest'
                    }
                    }
            consumer = confluent_kafka.Consumer(**conf)

            if offset is None:
                tp = confluent_kafka.TopicPartition(topic_name, partition)
            else:
                tp = confluent_kafka.TopicPartition(topic_name, partition, offset)
            consumer.assign([tp])
            self.consumer_dict[topic_name + "_" + str(partition)] = consumer
        except Exception:
            log.exception("Error while setting up the consumer for topic: " + topic_name)
            raise
        return consumer
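For reference, the core pattern above (manual partition assignment with an optional explicit offset) reduces to a few lines of plain confluent_kafka. A minimal sketch; the broker address, topic, group id, and offset are placeholders:

import confluent_kafka

# placeholder configuration; mirrors the conf dict built in Example #1
conf = {'bootstrap.servers': 'localhost:9092',
        'group.id': 'example-group',
        'default.topic.config': {'auto.offset.reset': 'earliest'}}
consumer = confluent_kafka.Consumer(**conf)

# assign partition 0 of a topic; passing an offset starts reading there,
# while omitting it falls back to the committed position or auto.offset.reset
tp = confluent_kafka.TopicPartition('my-topic', 0, 42)
consumer.assign([tp])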
Example #2
    def poll_kafka(self):
        import confluent_kafka as ck

        def commit(_part):
            topic, part_no, _, _, offset = _part[1:]
            _tp = ck.TopicPartition(topic, part_no, offset + 1)
            self.consumer.commit(offsets=[_tp], asynchronous=True)

        @gen.coroutine
        def checkpoint_emit(_part):
            ref = RefCounter(cb=lambda: commit(_part))
            yield self._emit(_part, metadata=[{'ref': ref}])

        tps = []
        for partition in range(self.npartitions):
            tps.append(ck.TopicPartition(self.topic, partition))

        while True:
            try:
                committed = self.consumer.committed(tps, timeout=1)
            except ck.KafkaException:
                pass
            else:
                for tp in committed:
                    self.positions[tp.partition] = tp.offset
                break

        try:
            while not self.stopped:
                out = []
                for partition in range(self.npartitions):
                    tp = ck.TopicPartition(self.topic, partition, 0)
                    try:
                        low, high = self.consumer.get_watermark_offsets(
                            tp, timeout=0.1)
                    except (RuntimeError, ck.KafkaException):
                        continue
                    if self.consumer_params.get('auto.offset.reset') == 'latest':
                        self.positions[partition] = high
                    current_position = self.positions[partition]
                    lowest = max(current_position, low)
                    if high > lowest + self.max_batch_size:
                        high = lowest + self.max_batch_size
                    if high > lowest:
                        out.append((self.consumer_params, self.topic,
                                    partition, self.keys, lowest, high - 1))
                        self.positions[partition] = high
                self.consumer_params['auto.offset.reset'] = 'earliest'

                for part in out:
                    # schedule the emit; add_callback returns None, so it
                    # must not be yielded from the coroutine
                    self.loop.add_callback(checkpoint_emit, part)

                yield gen.sleep(self.poll_interval)
        finally:
            self.consumer.unsubscribe()
            self.consumer.close()
Example #3
    def cb(self):
        while True:
            started = self.get_source_consumer()
            if started:
                delete_from_db = []
                for topic_partition in self.db.keys():
                    topic, partition = str_topic_partition(topic_partition)
                    if len(self.db[topic_partition]):
                        while True:
                            try:
                                low_offset, high_offset = self.consumer.get_watermark_offsets(
                                    ck.TopicPartition(topic, partition))
                                current_offset = self.consumer.committed(
                                    [ck.TopicPartition(topic,
                                                       partition)])[0].offset
                                break
                            except Exception as e:
                                logger.warning(e)

                        for offset in sorted(self.db[topic_partition].keys()):
                            if self.db[topic_partition][offset]:
                                if offset < current_offset:
                                    logger.warning(
                                        f'topic partition: {topic_partition}, '
                                        f'offset: {offset}, current offset: '
                                        f'{current_offset}; current offset is '
                                        'higher than the message offset'
                                    )
                                    delete_from_db.append(
                                        ck.TopicPartition(
                                            topic, partition, offset + 1))
                                else:
                                    self.partitions.append(
                                        ck.TopicPartition(
                                            topic, partition, offset + 1))
                            else:
                                break

                if self.commit():
                    L = []
                    for p in self.partitions:
                        self.db[topic_partition_str(
                            p.topic, p.partition)].pop(p.offset - 1)
                        L.append({
                            'topic': p.topic,
                            'partition': p.partition,
                            'offset': p.offset - 1
                        })
                    self.last = self._emit(L, emit_id=self.last_emit_id)
                    self.partitions = []
                    self.last_emit_id = None

                for p in delete_from_db:
                    self.db[topic_partition_str(p.topic,
                                                p.partition)].pop(p.offset - 1)

            yield self.last
            yield gen.sleep(self.interval)
Example #4
    def inner(topic: str, options=None):
        topic_name = get_topic_name(topic)
        topics = [topic_name]
        options = processing_config(options)
        # look for the servers (it is the only config we are interested in)
        servers = [
            elm["value"]
            for elm in options["processing"]["kafka_config"]
            if elm["name"] == "bootstrap.servers"
        ]
        if len(servers) < 1:
            raise ValueError(
                "Bad kafka_config, could not find 'bootstrap.servers'.\n"
                "The configuration should have an entry of the format \n"
                "{name:'bootstrap.servers', value:'127.0.0.1'} at path 'processing.kafka_config'"
            )

        servers = servers[0]

        settings = {
            "bootstrap.servers": servers,
            "group.id": "test-consumer-%s" % uuid.uuid4().hex,
            "enable.auto.commit": True,
            "auto.offset.reset": "earliest",
        }

        consumer = kafka.Consumer(settings)
        consumer.assign([kafka.TopicPartition(t, 0) for t in topics])

        def die():
            consumer.close()

        request.addfinalizer(die)
        return consumer, options, topic_name
Example #5
    def committed(self, partitions, timeout=10000):

        """
        Retrieves the last successfully committed Kafka offset of the
        underlying KafkaDatasource connection.

        Parameters
        ----------
        partitions : list,
            Topic/Partition instances that specify the TOPPAR
            instances the offsets should be retrieved for
        timeout : int, default 10000,
            Max time to wait on the response from
            the Kafka broker in milliseconds

        Returns
        -------
        tuple
            Tuple of ck.TopicPartition objects
        """

        toppars = [
            ck.TopicPartition(
                part.topic,
                part.partition,
                self.kafka_meta_client.get_committed_offset(
                    part.topic.encode(), part.partition
                ),
            )
            for part in partitions
        ]

        return toppars
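The same query can be made with a plain confluent_kafka Consumer, whose committed() takes its timeout in seconds rather than milliseconds. A minimal sketch, assuming a reachable broker; the address, group id, and topic are placeholders:

import confluent_kafka as ck

consumer = ck.Consumer({'bootstrap.servers': 'localhost:9092',
                        'group.id': 'offset-inspector'})
parts = [ck.TopicPartition('my-topic', p) for p in range(3)]
for tp in consumer.committed(parts, timeout=10):
    # offset is OFFSET_INVALID (-1001) when nothing has been committed yet
    print(f'{tp.topic}[{tp.partition}] committed at {tp.offset}')
consumer.close()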
Example #6
def verify_consumer_seek(c, seek_to_msg):
    """ Seek to message and verify the next consumed message matches.
        Must only be performed on an actively consuming consumer. """

    tp = confluent_kafka.TopicPartition(seek_to_msg.topic(),
                                        seek_to_msg.partition(),
                                        seek_to_msg.offset())
    print('seek: Seeking to %s' % tp)
    c.seek(tp)

    while True:
        msg = c.poll()
        assert msg is not None
        if msg.error():
            print('seek: Ignoring non-message: %s' % msg.error())
            continue

        if (msg.topic() != seek_to_msg.topic()
                or msg.partition() != seek_to_msg.partition()):
            continue

        print('seek: message at offset %d' % msg.offset())
        assert msg.offset() == seek_to_msg.offset(), \
            'expected message at offset %d, not %d' % (seek_to_msg.offset(), msg.offset())
        break
Example #7
    def subscribe(self, topic, timeout=10.0):
        """Subscribes to a topic for consuming. This method doesn't use Kafka's
        Consumer Groups; it assigns all partitions manually to this
        process.

        Parameters
        ----------
        topic : `str`
            The name of the topic to subscribe to.
        timeout : `float`
            How long, in seconds, to block when fetching topic metadata
        """
        logger.debug(f"subscribing to topic {topic}")
        topic_meta = self.describe_topic(topic, timeout)
        assignment = []
        for partition_id in topic_meta.partitions.keys():
            logger.debug(
                f"adding subscription to topic partition={partition_id}")
            tp = confluent_kafka.TopicPartition(
                topic=topic,
                partition=partition_id,
                offset=confluent_kafka.OFFSET_BEGINNING,
            )
            assignment.append(tp)

        logger.debug("registering topic assignment")
        self.consumer.assign(assignment)
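The describe_topic helper is not shown here; the same all-partitions assignment can be derived directly from list_topics() metadata. A minimal sketch, assuming a running broker; the address and topic are placeholders:

import confluent_kafka

consumer = confluent_kafka.Consumer({'bootstrap.servers': 'localhost:9092',
                                     'group.id': 'manual-assign'})
meta = consumer.list_topics('my-topic', timeout=10.0)
assignment = [
    confluent_kafka.TopicPartition(topic='my-topic', partition=pid,
                                   offset=confluent_kafka.OFFSET_BEGINNING)
    for pid in meta.topics['my-topic'].partitions
]
consumer.assign(assignment)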
Example #8
    def poll_kafka(self):
        import confluent_kafka as ck

        try:
            while not self.stopped:
                out = []
                for partition in range(self.npartitions):
                    tp = ck.TopicPartition(self.topic, partition, 0)
                    try:
                        low, high = self.consumer.get_watermark_offsets(
                            tp, timeout=0.1)
                    except (RuntimeError, ck.KafkaException):
                        continue
                    current_position = self.positions[partition]
                    lowest = max(current_position, low)
                    if high > lowest:
                        out.append((self.consumer_params, self.topic, partition,
                                    lowest, high - 1))
                        self.positions[partition] = high

                for part in out:
                    yield self._emit(part)

                yield gen.sleep(self.poll_interval)
        finally:
            self.consumer.unsubscribe()
            self.consumer.close()
Example #9
    def subscribe(self,
                  topic: str,
                  timeout: timedelta = timedelta(seconds=10)):
        """Subscribes to a topic for consuming. This method doesn't use Kafka's
        Consumer Groups; it assigns all partitions manually to this
        process.

        The topic must already exist for the subscription to succeed.
        """
        self.logger.debug(f"subscribing to topic {topic}")

        try:
            topic_meta = self.describe_topic(topic, timeout)
        except KeyError:
            raise ValueError(f"topic {topic} does not exist on the broker, so can't subscribe")

        assignment = []
        for partition_id in topic_meta.partitions.keys():
            self.logger.debug(f"adding subscription to topic partition={partition_id}")
            tp = confluent_kafka.TopicPartition(
                topic=topic,
                partition=partition_id,
            )
            assignment.append(tp)

        self.logger.debug("registering topic assignment")
        self._consumer.assign(assignment)
Example #10
    def get_consumer_offsets(
            self, topics: List[str],
            ignore_group_regex: str = IGNORE_GROUP_REGEX,
            no_of_threads: int = 1) -> List[Offset]:
        broker_topics = self.client.list_topics().topics
        partitions = []
        for topic_name in topics:
            partitions.extend([ck.TopicPartition(topic_name, k)
                               for k in broker_topics[topic_name].partitions])
        consumer_groups = []
        logger.info('Fetch consumer groups from broker')
        for consumer_group in self.get_consumer_groups():
            if re.findall(ignore_group_regex, consumer_group):
                logger.debug(f'Ignoring consumer group: {consumer_group}')
                continue
            consumer_groups.append(consumer_group)
        logger.info(f'Fetch consumer offsets for {len(consumer_groups)} '
                    'consumer groups')
        if no_of_threads == 1:
            offsets: List[Offset] = []
            for cg in consumer_groups:
                _offsets = ConfluentAdminClient._get_offsets(
                    cg, partitions, self.consumer_config)
                offsets.extend(_offsets)
            return offsets
        return self._threaded_get_offsets(partitions, consumer_groups,
                                          no_of_threads)
Example #11
File: tap.py Project: hkwi/osara
            def on_assign(consumer, partitions):
                ready = True
                current = {a.topic for a in self._consumer.assignment()}
                current.update({p.topic for p in partitions})
                if ensure_topics and set(ensure_topics) - current:
                    ready = False

                seeks = []
                for p in partitions:
                    if p.topic in seek_topics:
                        seeks.append(
                            confluent_kafka.TopicPartition(
                                p.topic, p.partition, timestamp))

                if seeks:
                    consumer.assign(consumer.offsets_for_times(seeks))
                    seek_topics.difference_update({p.topic for p in seeks})

                if seek_topics:
                    ready = False

                if ready:
                    ret.set()
                else:
                    raise NotReady()
Example #12
def get_message_batch(kafka_params, topic, partition, keys, low, high, timeout=None):
    """Fetch a batch of kafka messages (keys & values) in given topic/partition

    This will block until messages are available, or timeout is reached.
    """
    import confluent_kafka as ck
    t0 = time.time()
    consumer = ck.Consumer(kafka_params)
    tp = ck.TopicPartition(topic, partition, low)
    consumer.assign([tp])
    out = []
    try:
        while True:
            msg = consumer.poll(0)
            if msg and msg.value() and msg.error() is None:
                if high >= msg.offset():
                    if keys:
                        out.append({'key': msg.key(), 'value': msg.value()})
                    else:
                        out.append(msg.value())
                if high <= msg.offset():
                    break
            else:
                time.sleep(0.1)
                if timeout is not None and time.time() - t0 > timeout:
                    break
    finally:
        consumer.close()
    return out
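A hypothetical call to the batch fetcher above; the connection parameters, topic, and offset range are placeholders:

params = {'bootstrap.servers': 'localhost:9092', 'group.id': 'batch-reader'}
# fetch offsets 0 through 99 of partition 0, keeping keys alongside values
batch = get_message_batch(params, 'my-topic', partition=0, keys=True,
                          low=0, high=99, timeout=30)
for item in batch:
    print(item['key'], item['value'])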
Example #13
    def start(self):
        if self.stopped:
            self.stopped = False
            self.consumer = ck.Consumer(self.cpars)
            self.consumer.subscribe(self.topics)
            tp = ck.TopicPartition(self.topics[0], 0, 0)

            # blocks for consumer thread to come up
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #14
    def start(self):
        import confluent_kafka as ck

        if self.stopped:
            self.stopped = False
            self.consumer = ck.Consumer(self.cpars)
            tp = ck.TopicPartition(self.topics[0], 0, 0)

            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #15
    def start(self):
        import confluent_kafka as ck
        if self.stopped:
            self.consumer = ck.Consumer(self.consumer_params)
            self.stopped = False
            tp = ck.TopicPartition(self.topic, 0, 0)

            # blocks for consumer thread to come up
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #16
    def set_consumer_offsets(self, offsets: List[Offset]):
        grouped_offsets = ConfluentAdminClient._group_offsets(offsets)
        for consumer_group, _offsets in grouped_offsets.items():
            consumer = ck.Consumer({**self.consumer_config,
                                    'group.id': consumer_group})
            tps = [ck.TopicPartition(o.topic, o.partition, o.value)
                   for o in _offsets]
            logger.info(f'Set {len(tps)} offsets for consumer '
                        f'group: {consumer_group}')
            consumer.commit(offsets=tps, asynchronous=False)
Example #17
def test_kafka_offset(kafka_client, topic, commit_offset):
    offsets = [ck.TopicPartition(topic, 0, commit_offset)]
    kafka_client.commit(offsets=offsets)

    # Get the offsets that were just committed to Kafka
    retrieved_offsets = kafka_client.committed(offsets)

    for off in retrieved_offsets:
        assert_eq(off.topic, offsets[0].topic)
        assert_eq(off.partition, offsets[0].partition)
        assert_eq(off.offset, offsets[0].offset)
Example #18
def fetch_udf(params):
    """
    PySpark UDF to fetch available messages from the Kafka cluster,
    starting at the most recent commit point according to Kafka's own
    internal log.

    Joins a Kafka group and lets Kafka handle partition assignments.

    Does NOT commit offsets to Kafka.

    NOTE: Because PySpark doesn't deal well with the concept of modules,
    your Spark application will need to wrap this function itself.
    The process to follow is:
    1. Start up a SparkSession
    2. `spark.sparkContext.addPyFile(etl_lib.__file__)`
    3. ```
        fetch_udf = pandas_udf(etl_lib.fetch_udf,
                               "partition_id long, offset long, value string",
                               PandasUDFType.GROUPED_MAP)
       ```

    Arguments:
        params: One-line dataframe containing information on how to connect
                to Kafka. Columns of this dataframe:
            partition_id: Which Kafka partition to fetch from
            bootstrap_servers: String to pass for the eponymous argument
                when connecting to Kafka
            topic_name: Name of Kafka topic to subscribe to

    Returns a Pandas dataframe with the schema (partition_id, offset, value)
    """
    partition_ix = params["partition_id"][0]
    bootstrap_servers = params["bootstrap_servers"][0]
    topic_name = params["topic_name"][0]

    max_messages = 100000

    # NOTE: Timeouts MUST be several seconds, or Kafka won't reliably return
    # any data, even when running locally.
    timeout_sec = 10.0

    c = _make_consumer(bootstrap_servers)

    c.assign([kafka.TopicPartition(topic_name, partition_ix)])
    msgs = c.consume(num_messages=max_messages, timeout=timeout_sec)
    c.close()

    # Convert list of surrogate objects to a list of tuples and then to a dataframe.
    msg_tuples = [(partition_ix, m.offset(), m.value()) for m in msgs]

    # Convert buffered data to a dataframe.
    return pd.DataFrame.from_records(
        msg_tuples, columns=["partition_id", "offset", "value"])
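Following the wrapping steps from the docstring, a driver-side sketch; the SparkSession setup, server address, topic, and partition count are placeholders, and etl_lib is the module name the docstring itself assumes:

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

import etl_lib  # module assumed by the docstring above

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.addPyFile(etl_lib.__file__)

fetch = pandas_udf(etl_lib.fetch_udf,
                   "partition_id long, offset long, value string",
                   PandasUDFType.GROUPED_MAP)

# one row per partition, matching the `params` argument of fetch_udf
params_df = spark.createDataFrame(
    [(p, "localhost:9092", "my-topic") for p in range(3)],
    ["partition_id", "bootstrap_servers", "topic_name"])

messages = params_df.groupBy("partition_id").apply(fetch)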
Example #19
    def start(self):
        import confluent_kafka as ck
        if self.stopped:
            self.stopped = False
            self.consumer = ck.Consumer(self.cpars)
            self.consumer.subscribe(self.topics)
            weakref.finalize(
                self, lambda consumer=self.consumer: _close_consumer(consumer))
            tp = ck.TopicPartition(self.topics[0], 0, 0)

            # blocks for consumer thread to come up
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #20
def _read_partition(part_metadata, topic, conf, batch_size):
    """
    Read one batch of messages from a single topic partition, reusing a
    per-worker consumer, and commit the offset of the last message read.
    """
    part_no, low, high = part_metadata

    tp = ck.TopicPartition(topic, part_no, low)

    # create the consumer only the first time this runs on a worker,
    # then store it in the worker state
    worker_state = get_worker()
    if getattr(worker_state, 'consumer', None) is None:
        worker_state.consumer = ck.Consumer(conf)

    # use the consumer stored on the worker
    c = worker_state.consumer

    last_offset = low
    c.assign([tp])

    # get a batch of messages
    messages = c.consume(min(batch_size, high - last_offset))
    values = []
    for m in messages:
        last_offset = m.offset()
        values.append(json.loads(m.value().decode('utf-8')))

    # commit the next offset to read (last consumed + 1)
    _tp = ck.TopicPartition(topic, part_no, last_offset + 1)
    c.commit(offsets=[_tp], asynchronous=True)

    return values
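The per-worker consumer reuse above leans on Dask's worker object; the same idiom, condensed into a standalone helper (the function name is illustrative):

from distributed import get_worker

import confluent_kafka as ck


def cached_consumer(conf):
    # reuse one Consumer per Dask worker process instead of
    # reconnecting on every task invocation
    worker = get_worker()
    if getattr(worker, 'consumer', None) is None:
        worker.consumer = ck.Consumer(conf)
    return worker.consumer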
Example #21
    def start(self):
        import confluent_kafka as ck
        if self.stopped:
            self.stopped = False
            self.consumer = ck.Consumer(self.cpars)
            self.consumer.subscribe(self.topics)
            weakref.finalize(
                self, lambda consumer=self.consumer: _close_consumer(consumer)
            )
            tp = ck.TopicPartition(self.topics[0], 0, 0)

            # blocks for consumer thread to come up and invoke poll to
            # establish connection with broker to fetch oauth token for kafka
            self.consumer.poll(timeout=1)
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #22
def commit_offsets(partition_offset_pairs: List[Tuple[int, int]],
                   kafka_bootstrap_servers: str, kafka_topic: str):
    """
    Tell Kafka that we have consumed all messages up to a set of offsets.

    Args:
        partition_offset_pairs: List of commit offsets by partition.
            Must have exactly one offset per partition.
        kafka_bootstrap_servers: Kafka connection string
        kafka_topic: Name of the topic the offsets apply to
    """
    c = _make_consumer(kafka_bootstrap_servers)
    c.commit(offsets=[
        kafka.TopicPartition(kafka_topic, partition, offset)
        for partition, offset in partition_offset_pairs
    ])
    c.close()
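A hypothetical invocation of the helper above; the offsets, server string, and topic are placeholders. Note the Kafka commit convention: the committed offset is the next offset to read, i.e. last consumed + 1:

commit_offsets([(0, 101), (1, 201)],
               kafka_bootstrap_servers='localhost:9092',
               kafka_topic='my-topic')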
Example #23
    def start(self):
        import confluent_kafka as ck
        if self.engine == "cudf":  # pragma: no cover
            from custreamz import kafka

        if self.stopped:
            if self.engine == "cudf":  # pragma: no cover
                self.consumer = kafka.Consumer(self.consumer_params)
            else:
                self.consumer = ck.Consumer(self.consumer_params)
            self.stopped = False
            tp = ck.TopicPartition(self.topic, 0, 0)

            # blocks for consumer thread to come up
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #24
    def read_batch(self):
        """
        Build the list of partitions that have uncommitted messages and
        dispatch one read task per partition to the Dask workers.
        """

        # get partitions metadata
        parts_metadata = self.master_consumer \
            .list_topics(self.topic) \
            .topics[self.topic].partitions

        # create a list of partitions to be processed
        partitions_list = []
        for part_num in range(len(parts_metadata)):
            # create a TopicPartition object for the current partition
            tp = ck.TopicPartition(self.topic, part_num)

            # get first and last offsets for the current partition
            low, high = self.master_consumer.get_watermark_offsets(tp)

            # get the last committed position (take the first element)
            committed_offset = self.master_consumer.committed([tp], timeout=1)[0].offset
            if committed_offset == ck.OFFSET_INVALID:  # -1001: nothing committed yet
                committed_offset = 0

            # skip the partition if there are no new messages
            if low + committed_offset == high:
                continue
            partitions_list.append((part_num, low + committed_offset, high))

        if not partitions_list:
            # all partitions have been processed
            return []

        # send the partitions to the workers
        clients_fut = self.client.scatter(partitions_list, broadcast=True)

        # read the partition in each worker
        partitions_fut = [
            self.client.submit(
                _read_partition, fut, self.topic, self.conf, self.batch_size
            ) for fut in clients_fut
        ]

        # return the list of futures
        return partitions_fut
Example #25
def get_message_batch(kafka_params,
                      topic,
                      partition,
                      low,
                      high,
                      done=set(),
                      timeout=None):
    import confluent_kafka as ck

    t0 = time.time()
    consumer = ck.Consumer(kafka_params)
    tp = ck.TopicPartition(topic, partition, low)
    consumer.assign([tp])
    out = []
    try:
        while True:
            msg = consumer.poll(0)
            if msg and msg.value() and msg.error() is None:
                partition = msg.partition()

                if partition in done:
                    continue

                offset = msg.offset()
                topic = msg.topic()
                val = msg.value()

                id_val = {
                    'partition': partition,
                    'offset': offset,
                    'topic': topic,
                }
                if high >= msg.offset():
                    out.append((id_val, val))
                if high <= msg.offset():
                    break
            else:
                time.sleep(0.1)
                if timeout is not None and time.time() - t0 > timeout:
                    break
    finally:
        consumer.close()
    return out
Example #26
    def start(self):
        import confluent_kafka as ck
        if self.engine == "cudf":  # pragma: no cover
            from custreamz import kafka

        if self.stopped:
            if self.engine == "cudf":  # pragma: no cover
                self.consumer = kafka.Consumer(self.consumer_params)
            else:
                self.consumer = ck.Consumer(self.consumer_params)
            weakref.finalize(self, lambda consumer=self.consumer: _close_consumer(consumer))
            self.stopped = False
            tp = ck.TopicPartition(self.topic, 0, 0)

            # blocks for consumer thread to come up and invoke poll to establish
            # connection with broker to fetch oauth token for kafka
            self.consumer.poll(timeout=1)
            self.consumer.get_watermark_offsets(tp)
            self.loop.add_callback(self.poll_kafka)
Example #27
    def __init__(self,
                 topic_prefix,
                 channel,
                 consumer_group,
                 brokers,
                 partition=None,
                 reset_offsets=False,
                 commit_offsets=True):
        if sys.version_info[0] == 2:
            self.channel = channel
        else:
            self.channel = bytes(channel, 'ascii')
        # connect to kafka
        self.topic_name = ".".join([topic_prefix, channel])
        self.consumer_group = ".".join([consumer_group, self.topic_name])
        self.partition = partition
        conf = {
            'bootstrap.servers': brokers,
            'group.id': self.consumer_group,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            },
            'heartbeat.interval.ms': 60000,
            'api.version.request': True,
            'enable.auto.commit': commit_offsets,
        }
        self.kc = confluent_kafka.Consumer(conf)

        if self.partition:
            topic_list = [
                confluent_kafka.TopicPartition(self.topic_name, self.partition)
            ]
            self.kc.assign(topic_list)
        else:
            self.kc.subscribe([self.topic_name])

        if reset_offsets:
            logging.info("Resetting commited offsets")
            raise NotImplementedError
Example #28
def read_messages(topic, timeout=10):
    availableTopics = adminClient.list_topics().topics

    if topic not in availableTopics:
        raise Exception("Topic {} not found".format(topic))

    topicPartitions = []

    for partition in availableTopics[topic].partitions.keys():
        topicPartitions.append(
            ck.TopicPartition(topic, partition, ck.OFFSET_BEGINNING))

    consumer.assign(topicPartitions)

    messages = []

    while True:
        msg = consumer.poll(timeout=timeout)
        if not msg or msg.error():
            break
        messages.append(msg.value())

    return messages
Example #29
def verify_batch_consumer():
    """ Verify basic batch Consumer functionality """

    # Consumer config
    conf = {
        'bootstrap.servers': bootstrap_servers,
        'group.id': 'test.py',
        'session.timeout.ms': 6000,
        'enable.auto.commit': False,
        'api.version.request': api_version_request,
        'on_commit': print_commit_result,
        'error_cb': error_cb,
        'default.topic.config': {
            'auto.offset.reset': 'earliest'
        }
    }

    # Create consumer
    c = confluent_kafka.Consumer(**conf)

    # Subscribe to a list of topics
    c.subscribe([topic])

    max_msgcnt = 1000
    batch_cnt = 100
    msgcnt = 0

    while msgcnt < max_msgcnt:
        # Consume until we hit max_msgcnt

        # Consume messages (error()==0) or event (error()!=0)
        msglist = c.consume(batch_cnt, 10.0)
        assert len(msglist) == batch_cnt, 'expected %d messages, not %d' % (
            batch_cnt, len(msglist))

        for msg in msglist:
            if msg.error():
                print('Consumer error: %s: ignoring' % msg.error())
                continue

            tstype, timestamp = msg.timestamp()
            print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
                  (msg.topic(), msg.partition(), msg.offset(), msg.key(),
                   msg.value(), tstype, timestamp))

            if (msg.offset() % 5) == 0:
                # Async commit
                c.commit(msg, asynchronous=True)
            elif (msg.offset() % 4) == 0:
                offsets = c.commit(msg, asynchronous=False)
                assert len(
                    offsets) == 1, 'expected 1 offset, not %s' % (offsets)
                assert offsets[0].offset == msg.offset()+1, \
                    'expected offset %d to be committed, not %s' % \
                    (msg.offset(), offsets)
                print('Sync committed offset: %s' % offsets)

            msgcnt += 1

    print('max_msgcnt %d reached' % msgcnt)

    # Get current assignment
    assignment = c.assignment()

    # Get cached watermark offsets
    # Since we're not making use of statistics, the low offset is not known, so ignore it.
    lo, hi = c.get_watermark_offsets(assignment[0], cached=True)
    print('Cached offsets for %s: %d - %d' % (assignment[0], lo, hi))

    # Query broker for offsets
    lo, hi = c.get_watermark_offsets(assignment[0], timeout=1.0)
    print('Queried offsets for %s: %d - %d' % (assignment[0], lo, hi))

    # Close consumer
    c.close()

    # Start a new client and get the committed offsets
    c = confluent_kafka.Consumer(**conf)
    offsets = c.committed(
        list(
            map(lambda p: confluent_kafka.TopicPartition(topic, p),
                range(0, 3))))
    for tp in offsets:
        print(tp)

    c.close()
Example #30
def verify_consumer():
    """ Verify basic Consumer functionality """

    # Consumer config
    conf = {
        'bootstrap.servers': bootstrap_servers,
        'group.id': 'test.py',
        'session.timeout.ms': 6000,
        'enable.auto.commit': False,
        'api.version.request': api_version_request,
        'on_commit': print_commit_result,
        'error_cb': error_cb,
        'default.topic.config': {
            'auto.offset.reset': 'earliest'
        }
    }

    # Create consumer
    c = confluent_kafka.Consumer(**conf)

    def print_wmark(consumer, parts):
        # Verify #294: get_watermark_offsets() should not fail on the first call
        #              This is really a librdkafka issue.
        for p in parts:
            wmarks = consumer.get_watermark_offsets(p)
            print('Watermarks for %s: %s' % (p, wmarks))

    # Subscribe to a list of topics
    c.subscribe([topic], on_assign=print_wmark)

    max_msgcnt = 100
    msgcnt = 0

    while True:
        # Consume until EOF or error

        # Consume message (error()==0) or event (error()!=0)
        msg = c.poll()
        if msg is None:
            raise Exception(
                'Got timeout from poll() without a timeout set: %s' % msg)

        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                print('Reached end of %s [%d] at offset %d' %
                      (msg.topic(), msg.partition(), msg.offset()))
                break
            else:
                print('Consumer error: %s: ignoring' % msg.error())
                break

        tstype, timestamp = msg.timestamp()
        print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
              (msg.topic(), msg.partition(), msg.offset(), msg.key(),
               msg.value(), tstype, timestamp))

        if (msgcnt == 11):
            parts = c.assignment()
            print('Pausing partitions briefly')
            c.pause(parts)
            exp_None = c.poll(timeout=2.0)
            assert exp_None is None, "expected no messages during pause, got %s" % exp_None
            print('Resuming partitions')
            c.resume(parts)

        if (msg.offset() % 5) == 0:
            # Async commit
            c.commit(msg, asynchronous=True)
        elif (msg.offset() % 4) == 0:
            offsets = c.commit(msg, asynchronous=False)
            assert len(offsets) == 1, 'expected 1 offset, not %s' % (offsets)
            assert offsets[0].offset == msg.offset()+1, \
                'expected offset %d to be committed, not %s' % \
                (msg.offset(), offsets)
            print('Sync committed offset: %s' % offsets)

        msgcnt += 1
        if msgcnt >= max_msgcnt:
            print('max_msgcnt %d reached' % msgcnt)
            break

    # Get current assignment
    assignment = c.assignment()

    # Get cached watermark offsets
    # Since we're not making use of statistics, the low offset is not known, so ignore it.
    lo, hi = c.get_watermark_offsets(assignment[0], cached=True)
    print('Cached offsets for %s: %d - %d' % (assignment[0], lo, hi))

    # Query broker for offsets
    lo, hi = c.get_watermark_offsets(assignment[0], timeout=1.0)
    print('Queried offsets for %s: %d - %d' % (assignment[0], lo, hi))

    # Query offsets for timestamps by setting the topic partition offset
    # to a timestamp (123456789000 + 1)
    topic_partitions_to_search = list(
        map(lambda p: confluent_kafka.TopicPartition(topic, p, 123456789001),
            range(0, 3)))
    print("Searching for offsets with %s" % topic_partitions_to_search)

    offsets = c.offsets_for_times(topic_partitions_to_search, timeout=1.0)
    print("offsets_for_times results: %s" % offsets)

    # Close consumer
    c.close()

    # Start a new client and get the committed offsets
    c = confluent_kafka.Consumer(**conf)
    offsets = c.committed(
        list(
            map(lambda p: confluent_kafka.TopicPartition(topic, p),
                range(0, 3))))
    for tp in offsets:
        print(tp)

    c.close()