import datetime
import logging
import threading

import kafka.common
from kazoo.client import KazooClient
from kazoo.recipe.partitioner import SetPartitioner

log = logging.getLogger(__name__)


def _partition(self):
    """Consume messages from kafka

    Consume messages from kafka using the Kazoo SetPartitioner to allow
    multiple consumer processes to negotiate access to the kafka
    partitions.
    """

    # KazooClient and SetPartitioner objects need to be instantiated after
    # the consumer process has forked.  Instantiating prior to forking
    # gives the appearance that things are working, but after forking the
    # connection to zookeeper is lost and no state changes are visible.
    if not self._kazoo_client:
        self._kazoo_client = KazooClient(hosts=self._zookeeper_url)
        self._kazoo_client.start()

        # Stored on self so that later calls (when the KazooClient
        # already exists) can still wait on partitioner state changes.
        self._state_change_event = threading.Event()

        self._set_partitioner = SetPartitioner(
            self._kazoo_client,
            path=self._zookeeper_path,
            set=self._consumer.fetch_offsets.keys(),
            state_change_event=self._state_change_event,
            identifier=str(datetime.datetime.now()))

    try:
        while True:
            if self._set_partitioner.failed:
                raise Exception("Failed to acquire partition")

            elif self._set_partitioner.release:
                log.info("Releasing locks on partition set {} "
                         "for topic {}".format(self._partitions,
                                               self._kafka_topic))
                self._set_partitioner.release_set()
                self._partitions = []

            elif self._set_partitioner.acquired:
                if not self._partitions:
                    self._partitions = [p for p in self._set_partitioner]

                    if not self._partitions:
                        log.info("Not assigned any partitions on topic {},"
                                 " waiting for a Partitioner state change"
                                 .format(self._kafka_topic))
                        self._state_change_event.wait()
                        self._state_change_event.clear()
                        continue

                    log.info("Acquired locks on partition set {} "
                             "for topic {}".format(self._partitions,
                                                   self._kafka_topic))

                    # Reconstruct the kafka consumer object because the
                    # consumer has no API that allows the set of
                    # partitions to be updated outside of construction.
                    self._consumer.stop()
                    self._consumer = self._create_kafka_consumer(
                        self._partitions)
                return

            elif self._set_partitioner.allocating:
                log.info("Waiting to acquire locks on partition set")
                self._set_partitioner.wait_for_acquire()

    except Exception:
        log.exception('KafkaConsumer encountered fatal exception '
                      'processing messages.')
        raise
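
# --- Hedged sketch (not in the original excerpt) ----------------------------
# _partition() calls self._create_kafka_consumer(), which this excerpt does
# not show.  Below is a minimal sketch assuming the legacy kafka-python
# (<1.0) SimpleConsumer API, which matches the fetch_offsets,
# fetch_last_known_offsets(), seek(0, 0) and stop() calls used in these
# methods.  `self._kafka_url` and `self._kafka_group` are assumed attribute
# names, not confirmed by the source.
def _create_kafka_consumer(self, partitions=None):
    from kafka import KafkaClient, SimpleConsumer  # legacy kafka-python API

    kafka_client = KafkaClient(self._kafka_url)    # assumed attribute
    return SimpleConsumer(kafka_client,
                          self._kafka_group,       # assumed attribute
                          self._kafka_topic,
                          partitions=partitions,   # None -> all partitions
                          auto_commit=False)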
def __iter__(self):
    """Consume messages from kafka using the Kazoo SetPartitioner to
    allow multiple consumer processes to negotiate access to the kafka
    partitions.
    """

    # KazooClient and SetPartitioner objects need to be instantiated after
    # the consumer process has forked.  Instantiating prior to forking
    # gives the appearance that things are working, but after forking the
    # connection to zookeeper is lost and no state changes are visible.
    kazoo_client = KazooClient(hosts=self._zookeeper_url)
    kazoo_client.start()

    set_partitioner = SetPartitioner(
        kazoo_client,
        path=self._zookeeper_path,
        set=self._consumer.fetch_offsets.keys())

    consumed_from_kafka = self._statsd.get_counter(
        name='consumed_from_kafka')

    try:
        partitions = []

        while True:
            if set_partitioner.failed:
                raise Exception("Failed to acquire partition")

            elif set_partitioner.release:
                log.info("Releasing locks on partition set {} "
                         "for topic {}".format(partitions,
                                               self._kafka_topic))
                set_partitioner.release_set()
                partitions = []

            elif set_partitioner.acquired:
                if not partitions:
                    partitions = [p for p in set_partitioner]
                    log.info("Acquired locks on partition set {} "
                             "for topic {}".format(partitions,
                                                   self._kafka_topic))

                    # Refresh the last known offsets again to make sure
                    # that they are the latest after having acquired the
                    # lock.  Updates self._consumer.fetch_offsets.
                    self._consumer.fetch_last_known_offsets()

                    # Modify self._consumer.fetch_offsets to hold only
                    # the offsets for the set of Kafka partitions
                    # acquired by this instance of the persister.
                    partitioned_fetch_offsets = {}
                    for p in partitions:
                        partitioned_fetch_offsets[p] = (
                            self._consumer.fetch_offsets[p])
                    self._consumer.fetch_offsets = partitioned_fetch_offsets

                # When Kafka resizes the partitions it's possible that it
                # will remove data at our current offset.  When this
                # happens the next attempt to read from Kafka will
                # generate an OffsetOutOfRangeError.  We trap this error
                # and seek to the head of the current Kafka data.
                # Because this error only happens when Kafka removes data
                # we're currently pointing at, we're guaranteed that we
                # won't read any duplicate data; however, we will lose
                # any information between our current offset and the new
                # Kafka head.
                try:
                    for message in self._consumer:
                        if not set_partitioner.acquired:
                            break
                        consumed_from_kafka += 1
                        log.debug("Consuming message from kafka, "
                                  "partition {}, offset {}".format(
                                      message[0], message[1].offset))
                        yield message
                except kafka.common.OffsetOutOfRangeError:
                    log.error("Kafka OffsetOutOfRange.  Jumping to head.")
                    self._consumer.seek(0, 0)

            elif set_partitioner.allocating:
                log.info("Waiting to acquire locks on partition set")
                set_partitioner.wait_for_acquire()

    except Exception:
        log.exception('KafkaConsumer encountered fatal exception '
                      'processing messages.')
        raise
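
# --- Hedged usage sketch (not in the original excerpt) ----------------------
# One way a caller might drain the __iter__ generator above.  The
# (partition, OffsetAndMessage) tuple shape follows the message[0] /
# message[1].offset access in the debug log; `consumer` is assumed to be an
# instance of the class defining __iter__, and `handle` is an assumed
# application callback, not part of the project's API.
def drain(consumer, handle):
    for partition, raw in consumer:
        # raw.message.value holds the payload bytes in legacy kafka-python.
        handle(partition, raw.offset, raw.message.value)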