Example #1
0
class NativeKafkaConsumer(BackgroundTask):
    """Background worker that consumes a fixed set of topic partitions
    from the beginning, managing partition offsets itself (no commits).

    NOTE(review): relies on BackgroundTask, TopicsResultSet, KafkaConsumer
    and OffsetOutOfRangeError defined/imported elsewhere in this file.
    """

    def __init__(self,
                 brokers,
                 topic_partitions,
                 num_records=1,
                 batch_size=4092):
        # brokers: bootstrap servers handed to KafkaConsumer in _init_consumer().
        # topic_partitions: partitions this worker is assigned to read.
        # num_records: stop once at least this many records were consumed.
        # batch_size: max records returned by a single poll() call.
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._brokers = brokers
        self._batch_size = batch_size
        # Consecutive empty polls tolerated before _run() gives up.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _init_consumer(self):
        """Create a consumer assigned (not subscribed) to the configured
        partitions, positioned at the start of each one."""
        # Setting 'auto_offset_reset' to something other than "earliest" or
        # "latest" (here the deliberately-invalid "crash") makes the client
        # throw "OffsetOutOfRangeError" in the case there is a gap in the
        # log or a read of an offset too new / old; _run() catches that
        # error and re-seeks explicitly instead of auto-resetting.
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 auto_offset_reset="crash")
        consumer.assign(self._topic_partitions)
        for tps in self._topic_partitions:
            consumer.seek_to_beginning(tps)
        return consumer

    def _run(self):
        """Poll until stopped, until num_records were read, or until
        _max_attempts consecutive polls came back empty."""
        def stop_consume(empty_attempts):
            # Exit when cancelled, when enough records arrived, or when
            # we have waited through too many empty polls in a row.
            read_all = self.results.num_records() >= self._num_records
            waited_enough = empty_attempts >= self._max_attempts
            return self.is_finished() or read_all or waited_enough

        consumer = self._init_consumer()
        empty_reads = 0
        while not stop_consume(empty_reads):
            try:
                results = consumer.poll(timeout_ms=1000,
                                        max_records=self._batch_size)
                if results is None or len(results) == 0:
                    empty_reads += 1
                    time.sleep(1)
                else:
                    empty_reads = 0
                    self.results.append(results)
            except OffsetOutOfRangeError:
                # Ensure that the element at this offset is read, otherwise
                # there will be gaps in the result set. In other words this
                # class does manage its own offset to its subscriptions.
                # Re-seek each partition to one past the last record stored
                # for it. NOTE(review): partitions with no stored records
                # yet are not re-seeked here — confirm that is intended.
                time.sleep(1)
                empty_reads += 1
                for tp, values in self.results.rset.items():
                    offset = values[-1].offset + 1
                    # print(f"Offset OOR tp: {tp} - offset {offset}")
                    consumer.seek(tp, offset)
Example #2
0
 def __init__(self,
              brokers,
              topic_partitions,
              num_records=1,
              batch_size=4092):
     """Record worker configuration; no Kafka connection is made here.

     brokers: bootstrap servers for the Kafka client.
     topic_partitions: partitions this worker will read.
     num_records: minimum number of records to consume before stopping.
     batch_size: maximum records fetched per poll.
     """
     super(NativeKafkaConsumer, self).__init__()
     self._brokers = brokers
     self._topic_partitions = topic_partitions
     self._num_records = num_records
     self._batch_size = batch_size
     # Give up after this many consecutive empty polls.
     self._max_attempts = 20
     self.results = TopicsResultSet()
Example #3
0
 def __init__(self,
              brokers,
              topic_partitions,
              num_records=1,
              batch_size=4092):
     """Record configuration and eagerly build the KafkaConsumer.

     brokers: bootstrap servers for the Kafka client.
     topic_partitions: partitions assigned later by the caller/run loop.
     num_records: minimum number of records to consume before stopping.
     batch_size: maximum records fetched per poll.
     """
     super(NativeKafkaConsumer, self).__init__()
     self._num_records = num_records
     self._topic_partitions = topic_partitions
     self._batch_size = batch_size
     # The consumer is constructed up front rather than lazily; offsets
     # reset to "earliest" automatically when out of range.
     self._consumer = KafkaConsumer(
         client_id=self.task_name(),
         bootstrap_servers=brokers,
         request_timeout_ms=1000,
         enable_auto_commit=False,
         auto_offset_reset="earliest")
     self.results = TopicsResultSet()
Example #4
0
class NativeKafkaConsumer(BackgroundTask):
    """Background worker that reads a fixed set of topic partitions from
    the start, stopping once enough records arrive or polling comes up
    empty too many times in a row."""

    def __init__(self,
                 brokers,
                 topic_partitions,
                 num_records=1,
                 batch_size=4092):
        """Record configuration; the consumer itself is built in _run()."""
        super(NativeKafkaConsumer, self).__init__()
        self._brokers = brokers
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._batch_size = batch_size
        # Consecutive empty polls tolerated before giving up.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _init_consumer(self):
        """Build a KafkaConsumer pinned to our partitions and rewound to
        the beginning of each one."""
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 auto_offset_reset="latest")
        consumer.assign(self._topic_partitions)
        # Explicit rewind: read every partition from its start regardless
        # of the auto_offset_reset policy above.
        for partition in self._topic_partitions:
            consumer.seek_to_beginning(partition)
        return consumer

    def _run(self):
        """Poll in a loop until cancelled, until num_records were read,
        or until _max_attempts consecutive polls returned nothing."""
        consumer = self._init_consumer()
        idle_polls = 0
        while not (self.is_finished()
                   or self.results.num_records() >= self._num_records
                   or idle_polls >= self._max_attempts):
            batch = consumer.poll(timeout_ms=1000,
                                  max_records=self._batch_size)
            if not batch:
                idle_polls += 1
                time.sleep(1)
            else:
                idle_polls = 0
                self.results.append(batch)
Example #5
0
class NativeKafkaConsumer(BackgroundTask):
    """Background worker with an eagerly-built KafkaConsumer: polls until
    it has read num_records or polling is empty 10 times in a row."""

    def __init__(self,
                 brokers,
                 topic_partitions,
                 num_records=1,
                 batch_size=4092):
        """Record configuration and create the underlying KafkaConsumer.

        brokers: bootstrap servers for the Kafka client.
        topic_partitions: partitions assigned to the consumer in _run().
        num_records: stop once at least this many records were read.
        batch_size: maximum records returned by a single poll().
        """
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._batch_size = batch_size
        self._consumer = KafkaConsumer(client_id=self.task_name(),
                                       bootstrap_servers=brokers,
                                       request_timeout_ms=1000,
                                       enable_auto_commit=False,
                                       auto_offset_reset="earliest")
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _run(self):
        """Poll the pre-built consumer until cancelled, until enough
        records arrived, or after 10 consecutive empty polls."""
        def stop_consume(empty_iterations):
            read_all = self.results.num_records() >= self._num_records
            # The counter counts *down*; hitting zero means we waited
            # long enough for new data.
            waited_enough = empty_iterations <= 0
            return self.is_finished() or read_all or waited_enough

        self._consumer.assign(self._topic_partitions)
        empty_iterations = 10
        while not stop_consume(empty_iterations):
            r = self._consumer.poll(timeout_ms=100,
                                    max_records=self._batch_size)
            if len(r) == 0:
                empty_iterations -= 1
                time.sleep(1)
            else:
                # Fix: removed the dead `total` accumulator — it was
                # computed via reduce() on every batch but never read.
                empty_iterations = 10
                self.results.append(r)
Example #6
0
class NativeKafkaConsumer(BackgroundTask):
    """Background worker with per-topic lower bounds: polls until every
    topic has produced at least its expected record count, then drains a
    few extra empty polls before stopping.

    NOTE(review): relies on BackgroundTask, TopicsResultSet and
    KafkaConsumer defined/imported elsewhere in this file.
    """

    def __init__(self,
                 brokers,
                 topic_partitions,
                 max_records_per_topic,
                 batch_size=4092):
        # brokers: bootstrap servers handed to KafkaConsumer.
        # topic_partitions: partitions this worker is assigned to read.
        # max_records_per_topic: mapping of topic -> expected record count
        #   (presumably; verify against callers of total_expected_records).
        # batch_size: max records returned by a single poll() call.
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._max_records_per_topic = max_records_per_topic
        self._brokers = brokers
        self._batch_size = batch_size
        # Consecutive empty polls tolerated before the bounds are met.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def total_expected_records(self):
        """Sum of the per-topic expected record counts."""
        return sum(self._max_records_per_topic.values())

    def _init_consumer(self):
        """Create a consumer assigned to the configured partitions and
        rewound to the beginning of each one."""
        # NOTE(review): auto_offset_reset="throw" is not one of the
        # standard values ("earliest"/"latest"); presumably chosen so an
        # out-of-range read raises instead of silently resetting, but
        # unlike the sibling variant there is no OffsetOutOfRangeError
        # handler in _run() — confirm this is intended.
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 metadata_max_age_ms=5000,
                                 reconnect_backoff_max_ms=0,
                                 reconnect_backoff_ms=1000,
                                 auto_offset_reset="throw")
        consumer.assign(self._topic_partitions)
        for tps in self._topic_partitions:
            consumer.seek_to_beginning(tps)
        return consumer

    def _finished_consume(self):
        """Return True once every topic has reached its expected count."""
        for topic, throughput in self._max_records_per_topic.items():
            if self.results.num_records_for_topic(topic) < throughput:
                return False
        return True

    def _run(self):
        """Poll until cancelled; once all per-topic bounds are met, stop
        after 3 further loop passes with no new data. Before the bounds
        are met, give up only after _max_attempts consecutive empty polls.
        """
        consumer = self._init_consumer()
        empty_reads = 0
        empty_reads_post_complete = 0
        while True:
            if self.is_finished():
                break  # User stopped background task
            if self._finished_consume():
                # The idea is to not stop consuming even if the bounds
                # have been reached, and stop when there really is not more data
                empty_reads_post_complete += 1
                if empty_reads_post_complete >= 3:
                    break
            if empty_reads >= self._max_attempts:
                # However if a lower bound hasn't been reached, wait longer
                # possibly to avert situations where log hasn't been yet populated
                break

            results = consumer.poll(timeout_ms=1000,
                                    max_records=self._batch_size)
            if results is None or len(results) == 0:
                empty_reads += 1
                time.sleep(1)
            else:
                # Any successful read resets both idle counters.
                empty_reads = 0
                empty_reads_post_complete = 0
                self.results.append(results)