Example #1
 def consume(self, topic):
     #consumer=KafkaConsumer(topic,group_id=self.configs["group_id"],bootstrap_servers=self.configs["zookeeper"].split(","), auto_commit_enable=False)
     client=KafkaClient(self.configs["broker_list"].split(","))
     consumer=SimpleConsumer(topic=topic,group=self.configs["group_id"],client=client, auto_commit=False)
     while True:
         # get_messages() returns OffsetAndMessage tuples: index 0 is the message
         # offset and index 1 is the Message, which carries the key and value.
         for message in consumer.get_messages(10):
             #print("[%s.consumer] %s-part-%d: value=%s" % (self.configs["group_id"],topic,message[0],message[1]))
             print("[%s.consumer] %s-offset-%d: key=%s value=%s" % (self.configs["group_id"], topic, message[0], message[1].key, message[1].value))
Example #2
 def assert_kafka(self, expected_file_name):
     #print("reading server "+config.KAFKA_SERVER+" on topic:"+config.KAFKA_TOPIC)
     kafka_client = KafkaClient(config.KAFKA_SERVER)
     #SimpleConsumer takes its iter_timeout in seconds; 1 second lets all messages appear without hanging for too long
     consumer = SimpleConsumer(kafka_client, b"my_group", config.KAFKA_TOPIC.encode("utf8"),
                                   iter_timeout=1)
     #seek(1, 0) means start from the beginning (whence 0) but skip 1 message from that position (the first msg)
     #we bypass the first message since it is only used to auto-start the topic
     consumer.seek(1, 0)
     actual = ""
     for msg in consumer:
         #the linefeed at the end is not really needed but it makes for more readable error reports
         actual += msg.message.value.decode('utf8')+"\n"
     expected = pkg_resources.resource_string(__name__, expected_file_name).decode('utf8')
     t_assert.equal(actual, expected)
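The seek(1, 0) call above uses the legacy SimpleConsumer.seek(offset, whence) convention: whence 0 positions relative to the beginning of the partition, 1 relative to the current position, and 2 relative to the end. A short hedged sketch follows; the broker, group, and topic names are placeholders.

# Hedged sketch of SimpleConsumer.seek(offset, whence) semantics; setup mirrors
# the example above with placeholder names.
from kafka import KafkaClient, SimpleConsumer

client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, b"my_group", b"my_topic", iter_timeout=1)

consumer.seek(0, 0)   # rewind to the very beginning of the partition(s)
consumer.seek(1, 0)   # start at the beginning but skip the first message
consumer.seek(5, 1)   # move 5 messages forward from the current position
consumer.seek(0, 2)   # jump to the end; only newly produced messages are seen
# (each seek overrides the previous one; the calls above only illustrate whence)

for msg in consumer:  # iteration stops after iter_timeout seconds of silence
    print(msg.message.value)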
Example #3
 def set_consumer_partition(self, consumerPartitions):
     if not consumerPartitions:
         logger.warning('consumer partitions can not be empty')
         return
         
     if self.consumer:
         self.consumer.commit()
         self.consumer.stop()
         self.consumer = None
     self.consumerPartitions = consumerPartitions
     try:
         self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                        self.kafkaTopic, partitions=self.consumerPartitions)
     except KafkaError as e:
         logger.warning('Exception {}'.format(e))
         logger.debug(traceback.format_exc())
         self.reconnect()
     except Exception as e:
         logger.warning('Exception {}'.format(e))
         logger.debug(traceback.format_exc())
Example #4
File: kafka.py  Project: disqus/pgshovel
def test_writer():
    topic = '%s-mutations' % (uuid.uuid1().hex,)

    client = KafkaClient('kafka')
    producer = SimpleProducer(client)
    writer = KafkaWriter(producer, topic)

    inputs = list(transaction)
    writer.push(inputs)

    consumer = SimpleConsumer(client, 'test', topic, auto_offset_reset='smallest')

    # Decode the raw message values back into the objects that were pushed.
    outputs = list(map(
        writer.codec.decode,
        map(
            operator.attrgetter('message.value'),
            consumer.get_messages(count=3),
        ),
    ))

    assert outputs == inputs
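KafkaWriter and its codec above are pgshovel-specific helpers; the underlying round trip uses only kafka-python. A minimal sketch of the same produce-then-consume test pattern with plain byte strings, assuming a reachable broker host named kafka as in the test:

# Hedged sketch: the round-trip test pattern without the pgshovel-specific
# KafkaWriter/codec, using raw byte strings. Broker and topic names are placeholders.
import uuid
from kafka import KafkaClient, SimpleProducer, SimpleConsumer

topic = '%s-roundtrip' % (uuid.uuid1().hex,)
client = KafkaClient('kafka')

producer = SimpleProducer(client)
inputs = [b'one', b'two', b'three']
producer.send_messages(topic, *inputs)

# auto_offset_reset='smallest' makes a fresh group start from the earliest offset.
consumer = SimpleConsumer(client, 'test', topic, auto_offset_reset='smallest')
outputs = [m.message.value for m in consumer.get_messages(count=len(inputs))]

assert outputs == inputs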
Example #5
 def __init__(self, kafkaHost=None, kafkaGroup=None, kafkaTopic=None, 
              consumerType=NON_CONSUMER, consumerPartitions=[],
              producerType=NON_PRODUCER, producerPartitions=[]):
     self.kafkaHost = kafkaHost
     self.kafkaGroup = kafkaGroup
     self.kafkaTopic = kafkaTopic
     self.consumerPartitions = consumerPartitions
     self.producerPartitions = producerPartitions
     self.connect(kafkaHost)
     try:
         if producerType == self.SIMPLE_PRODUCER:
             self.producer = SimpleProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
         elif producerType == self.FIXED_PRODUCER:
             self.producer = FixedProducer(self.kafkaClient, producerPartitions[0], async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
         elif producerType == self.USER_PRODUCER:
             self.producer = UserProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
         elif producerType == self.NON_PRODUCER:
             self.producer = None
         else:
             raise Exception("wrong producer type {}".format(producerType))
         
         if consumerType == self.SIMPLE_CONSUMER:
             if not consumerPartitions:
                 self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, self.kafkaTopic)
             else:
                 self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, 
                                                self.kafkaTopic, partitions=self.consumerPartitions)
             logger.debug('consumer is listening on {}@{}'.format(self.kafkaTopic, self.consumerPartitions))
         elif consumerType == self.NON_CONSUMER:
             self.consumer = None
         else:
             raise Exception("wrong consumer type {}".format(consumerType))
             
     except Exception as e:
         logger.warning('Exception {}'.format(e))
         logger.debug(traceback.format_exc())
         self.consumer = None
         self.producer = None
         self.kafkaClient = None
Example #6
File: k.py  Project: Balhau/pyutils
    def run(self):
        client = KafkaClient("localhost:9092")

        consumer = SimpleConsumer(client, "test-group", "topic.test.min.v1",
            max_buffer_size = None,
        )

        self.valid = 0
        self.invalid = 0

        m_len=len("Hello master wayne" * 10)

        consumer.seek(0, 0)  # rewind to the beginning of the topic

        for message in consumer:
            try:
                if len(message.message.value) == m_len:
                    self.valid += 1
                else:
                    self.invalid += 1
            except Exception:
                print("Reset Offset")
                consumer.seek(0, 0)
Example #7
def test_handler():
    topic = '%s-mutations' % (uuid.uuid1().hex,)

    codec = BinaryCodec(Message)

    client = KafkaClient('kafka')
    producer = SimpleProducer(client)
    writer = KafkaWriter(producer, topic, codec)

    inputs = list(transaction)
    writer.push(inputs)

    consumer = SimpleConsumer(client, 'test', topic, auto_offset_reset='smallest')

    # Decode the raw message values back into the objects that were pushed.
    outputs = list(map(
        codec.decode,
        map(
            operator.attrgetter('message.value'),
            consumer.get_messages(count=3),
        ),
    ))

    assert outputs == inputs
Example #8
class KafkaBroker(object):
    USER_PRODUCER = 0
    FIXED_PRODUCER = 1
    SIMPLE_PRODUCER = 2
    NON_PRODUCER = 3
    SIMPLE_CONSUMER = 0
    NON_CONSUMER = 1
    SOCKET_TIMEOUT = 60  # seconds
    
    def __init__(self, kafkaHost=None, kafkaGroup=None, kafkaTopic=None, 
                 consumerType=NON_CONSUMER, consumerPartitions=[],
                 producerType=NON_PRODUCER, producerPartitions=[]):
        self.kafkaHost = kafkaHost
        self.kafkaGroup = kafkaGroup
        self.kafkaTopic = kafkaTopic
        self.consumerPartitions = consumerPartitions
        self.producerPartitions = producerPartitions
        self.connect(kafkaHost)
        try:
            if producerType == self.SIMPLE_PRODUCER:
                self.producer = SimpleProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.FIXED_PRODUCER:
                self.producer = FixedProducer(self.kafkaClient, producerPartitions[0], async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.USER_PRODUCER:
                self.producer = UserProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.NON_PRODUCER:
                self.producer = None
            else:
                raise Exception("wrong producer type {}".format(producerType))
            
            if consumerType == self.SIMPLE_CONSUMER:
                if not consumerPartitions:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, self.kafkaTopic)
                else:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, 
                                                   self.kafkaTopic, partitions=self.consumerPartitions)
                logger.debug('consumer is listening on {}@{}'.format(self.kafkaTopic, self.consumerPartitions))
            elif consumerType == self.NON_CONSUMER:
                self.consumer = None
            else:
                raise Exception("wrong consumer type {}".format(consumerType))
                
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.consumer = None
            self.producer = None
            self.kafkaClient = None
            
    def close(self):
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        if self.producer:
            self.producer.stop()
            self.producer = None
        if self.kafkaClient:
            self.kafkaClient.close()
            self.kafkaClient = None
        logger.info('Kafka connection closed')
    
    def connect(self, kafkaHost, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('could not connect to kafka server after {} attempts'.format(COUNT_DOWN))
            return

        try:
            self.kafkaClient = KafkaClient(kafkaHost, timeout=self.SOCKET_TIMEOUT)
        except Exception:
            logger.warning('trying to connect to kafka server again, {} attempts left'.format(countdown))
            self.connect(kafkaHost, countdown - 1)
            return

        logger.info('Kafka client connected {}'.format(self.kafkaClient))
        
    def reconnect(self, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('could not reconnect to kafka server after {} attempts'.format(COUNT_DOWN))
            return

        try:
            self.kafkaClient.reinit()
        except Exception:
            self.reconnect(countdown - 1)
        
    def produce(self, op, name, **kwargs):
        # TODO: when name is None, the operation is propagated to all partitions 
        if not op or not name:
            logger.warning('op or name must not be empty')
            return
        try:
            dictMessage = dict(kwargs)
            dictMessage['op'] = op
            dictMessage['name'] = name
            encodedMessage = simplejson.dumps(dictMessage)
            self.producer.send(self.kafkaTopic, name, encodedMessage)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())

    def echo(self, message=''):
        self.produce('Echo', 'testing', message=message)
        
    def set_consumer_partition(self, consumerPartitions):
        if not consumerPartitions:
            logger.warning('consumer partitions can not be empty')
            return
            
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        self.consumerPartitions = consumerPartitions
        try:
            self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                           self.kafkaTopic, partitions=self.consumerPartitions)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
    
    def is_consumer_ready(self):
        if not self.consumer:
            logger.warning('Consumer is not ready yet')
            return False
        return True
        
    def seek(self, skip):
        if self.is_consumer_ready():
            if skip == -1:
                self.consumer.seek(0, 2)     # whence=2: jump to the tail of the log
            else:
                self.consumer.seek(skip, 1)  # whence=1: move relative to the current offset
            
    def commit(self):
        if self.is_consumer_ready():
            self.consumer.commit()
            
    def consume_one(self):
        if not self.is_consumer_ready():
            return None
            
        try:
            message = self.consumer.get_message()
            if not message:
                return None
            logger.debug('received message {}'.format(message.message.value))
            return message.message.value
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        return None
        
    def consume(self, count=10):
        if not self.is_consumer_ready():
            return []
            
        try:
            messages = self.consumer.get_messages(count=count)
            return [message.message.value for message in messages]
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        return []
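A hedged usage sketch for the KafkaBroker class above; the host, group, and topic are placeholders, and only the consumer path is shown since FixedProducer and UserProducer are project-specific classes not included here.

# Hedged usage sketch for KafkaBroker (placeholder host/group/topic).
broker = KafkaBroker(kafkaHost='localhost:9092',
                     kafkaGroup='workers',
                     kafkaTopic='jobs',
                     consumerType=KafkaBroker.SIMPLE_CONSUMER)

broker.seek(-1)                    # -1 seeks to the tail: only new messages are seen
single = broker.consume_one()      # one raw message value, or None
batch = broker.consume(count=10)   # up to 10 raw message values

broker.commit()                    # persist the consumer offsets
broker.close()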
Example #9
File: kafka.py  Project: disqus/pgshovel
    def consume(self, state):
        """
        Starts consuming from the configured Kafka topic given a possible
        existing ``pgshovel.interfaces.replication_pb2:State``.

        If the provided ``state`` does not contain a
        ``stream_state.consumer_state`` value, the ``KafkaStream`` attempts to
        start reading from the Kafka topic after first "priming" the stream.
        Priming involves consuming messages from the topic and looking for a
        ``BeginOperation``. Any message that is not a ``BeginOperation`` is
        dropped until a ``BeginOperation`` is seen or the ``prime_threshold``
        is reached; the latter raises a
        ``pgshovel.streams.utilities:UnableToPrimeError`` error.

        In general, it makes sense to set the ``prime_threshold`` to a value
        high enough to exceed the maximum transaction size you expect to see
        in your data. A ``prime_threshold`` could in principle be infinite
        (you could construct the stream with ``float('inf')``), but the lack
        of a ``BeginOperation`` in the stream would then cause the stream to
        hang, possibly forever, so the ``prime_threshold`` parameter is
        provided to raise an exception if this unexpected behavior occurs.
        """
        consumer = SimpleConsumer(KafkaClient(self.hosts), None, self.topic)

        # You can only update one offset at a time with kafka-python, and
        # reconstituting global order from a partitioned stream is hard, so we
        # don't really need to deal with it right now.
        assert len(consumer.offsets) == 1

        decoded = imap(
            lambda (offset, msg): (offset, self.codec.decode(msg.value)),
            consumer
        )

        if state.stream_state.HasField('consumer_state'):
            # Seeking to a direct offset was not in the PyPI release of
            # kafka-python when this was implemented:
            # https://github.com/mumrah/kafka-python/pull/412
            current = consumer.offsets[0]
            offset = state.stream_state.consumer_state.offset + 1
            delta = offset - current
            logger.debug('Moving to previous replication log offset: %s (current position: %s)...', offset, current)
            consumer.seek(delta, 1)
            assert consumer.offsets[0] == offset
        else:
            logger.info('No consumer state provided, will attempt to prime the stream to a BeginOperation')
            # The call to ``prime_for_batch_start`` "primes" the stream by
            # dropping messages until it sees a message that is an instance of
            # one of the types in
            # ``pgshovel.replication.validation.TRANSACTION_START_EVENT_TYPES``
            decoded = prime_for_batch_start(
                max_messages=self.prime_threshold,
                stream=decoded
            )

        for offset, message in decoded:
            state = validate_state(state, offset, message)
            # XXX: This is necessary because of a bug in protocol buffer oneof.
            state = type(state).FromString(state.SerializeToString())
            yield state, offset, message
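prime_for_batch_start is a pgshovel helper that is not shown here. The docstring's "priming" idea (drop messages until a transaction-start event appears, bounded by prime_threshold) can be sketched roughly as follows; this is an illustrative approximation with stand-in names, not the project's actual implementation.

# Rough sketch of the "priming" idea described in the docstring: drop
# (offset, message) pairs until the predicate recognises a transaction-start
# event, giving up after max_messages. is_begin_operation stands in for the
# check against TRANSACTION_START_EVENT_TYPES; RuntimeError stands in for
# pgshovel's UnableToPrimeError.
def prime_for_batch_start_sketch(stream, max_messages, is_begin_operation):
    iterator = iter(stream)
    for dropped, (offset, message) in enumerate(iterator):
        if is_begin_operation(message):
            # Re-yield the first transaction-start event, then pass the rest through.
            yield offset, message
            for item in iterator:
                yield item
            return
        if dropped + 1 >= max_messages:
            raise RuntimeError(
                'no transaction-start event found within %d messages' % max_messages)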
Example #10
def _mp_consume(client, group, topic, message_queue, size, events, **consumer_options):
    """
    A child process worker which consumes messages based on the
    notifications given by the controller process

    NOTE: Ideally, this should have been a method inside the Consumer
    class. However, the multiprocessing module has issues on Windows. The
    functionality breaks unless this function is kept outside of a class.
    """

    # Initial interval for retries in seconds.
    interval = 1
    while not events.exit.is_set():
        try:
            # Make the child processes open separate socket connections
            client.reinit()

            # We will start consumers without auto-commit. Auto-commit will be
            # done by the master controller process.
            consumer = SimpleConsumer(client, group, topic,
                                      auto_commit=False,
                                      auto_commit_every_n=None,
                                      auto_commit_every_t=None,
                                      **consumer_options)

            # Ensure that the consumer provides the partition information
            consumer.provide_partition_info()

            while True:
                # Wait till the controller signals us to start consumption
                events.start.wait()

                # If we are asked to quit, do so
                if events.exit.is_set():
                    break

                # Consume messages and add them to the queue. If the controller
                # indicates a specific number of messages, follow that advice
                count = 0

                message = consumer.get_message()
                if message:
                    while True:
                        try:
                            message_queue.put(message, timeout=FULL_QUEUE_WAIT_TIME_SECONDS)
                            break
                        except queue.Full:
                            if events.exit.is_set(): break

                    count += 1

                    # We have reached the required size. The controller might have
                    # more than it needs. Wait for a while.
                    # Without this logic, it is possible that we run into a big
                    # loop consuming all available messages before the controller
                    # can reset the 'start' event
                    if count == size.value:
                        events.pause.wait()

                else:
                    # In case we did not receive any message, give up the CPU for
                    # a while before we try again
                    time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS)

            consumer.stop()

        except KafkaError as e:
            # Retry with exponential backoff
            log.exception("Problem communicating with Kafka, retrying in %d seconds...", interval)
            time.sleep(interval)
            interval = interval*2 if interval*2 < MAX_BACKOFF_SECONDS else MAX_BACKOFF_SECONDS
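The worker above is driven by a controller process that owns the shared queue, the events namespace, and the size counter it reads. A hedged controller-side sketch is shown below; the Events container and the orchestration flow are assumptions modeled on what the worker expects, not kafka-python's actual MultiProcessConsumer internals.

# Hedged controller-side sketch: builds the shared primitives _mp_consume reads
# (events.start/pause/exit, the message queue, the size counter) and drives a
# single worker process. The Events namespace and the flow are assumptions.
import multiprocessing as mp
from collections import namedtuple

Events = namedtuple('Events', ['start', 'pause', 'exit'])

def run_controller(client, group, topic, wanted=10):
    message_queue = mp.Queue()
    size = mp.Value('i', wanted)          # how many messages we want this round
    events = Events(start=mp.Event(), pause=mp.Event(), exit=mp.Event())

    worker = mp.Process(target=_mp_consume,
                        args=(client.copy(), group, topic,
                              message_queue, size, events))
    worker.start()

    events.start.set()                    # let the worker start consuming
    messages = [message_queue.get() for _ in range(wanted)]
    events.start.clear()                  # stop handing out new rounds
    events.pause.set()                    # release a worker parked on pause.wait()

    events.exit.set()                     # ask the worker to shut down
    events.start.set()                    # wake it if it is blocked on start.wait()
    worker.join()
    return messages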
Example #11
def _mp_consume(client, group, topic, message_queue, size, events,
                **consumer_options):
    """
    A child process worker which consumes messages based on the
    notifications given by the controller process

    NOTE: Ideally, this should have been a method inside the Consumer
    class. However, the multiprocessing module has issues on Windows. The
    functionality breaks unless this function is kept outside of a class.
    """

    # Initial interval for retries in seconds.
    interval = 1
    while not events.exit.is_set():
        try:
            # Make the child processes open separate socket connections
            client.reinit()

            # We will start consumers without auto-commit. Auto-commit will be
            # done by the master controller process.
            consumer = SimpleConsumer(client,
                                      group,
                                      topic,
                                      auto_commit=False,
                                      auto_commit_every_n=None,
                                      auto_commit_every_t=None,
                                      **consumer_options)

            # Ensure that the consumer provides the partition information
            consumer.provide_partition_info()

            while True:
                # Wait till the controller signals us to start consumption
                events.start.wait()

                # If we are asked to quit, do so
                if events.exit.is_set():
                    break

                # Consume messages and add them to the queue. If the controller
                # indicates a specific number of messages, follow that advice
                count = 0

                message = consumer.get_message()
                if message:
                    while True:
                        try:
                            message_queue.put(
                                message, timeout=FULL_QUEUE_WAIT_TIME_SECONDS)
                            break
                        except queue.Full:
                            if events.exit.is_set(): break

                    count += 1

                    # We have reached the required size. The controller might have
                    # more than it needs. Wait for a while.
                    # Without this logic, it is possible that we run into a big
                    # loop consuming all available messages before the controller
                    # can reset the 'start' event
                    if count == size.value:
                        events.pause.wait()

                else:
                    # In case we did not receive any message, give up the CPU for
                    # a while before we try again
                    time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS)

            consumer.stop()

        except KafkaError as e:
            # Retry with exponential backoff
            log.error(
                "Problem communicating with Kafka (%s), retrying in %d seconds..."
                % (e, interval))
            time.sleep(interval)
            interval = interval * 2 if interval * 2 < MAX_BACKOFF_SECONDS else MAX_BACKOFF_SECONDS
Example #12
 def consumeLatest(self, topic):
     consumer = SimpleConsumer(topic=topic, group=self.group_id, client=self.client, auto_commit=True)
     # Fetch (and auto-commit) at most one message, waiting up to 1 second.
     consumer.get_message(timeout=1)
     consumer.stop()
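As written, consumeLatest fetches the next message at the group's committed offset rather than anything tied to the tail of the log. If the intent is to only see messages produced after the call, a hedged variant would seek to the end first (whence 2); the attribute names below mirror the example above.

# Hedged variant: position the consumer at the tail first so only messages
# produced after this point are returned. self.group_id / self.client mirror
# the example above.
def consume_latest(self, topic):
    consumer = SimpleConsumer(topic=topic, group=self.group_id,
                              client=self.client, auto_commit=True)
    consumer.seek(0, 2)                        # whence=2: jump to the end of the log
    message = consumer.get_message(timeout=1)  # newest message, or None if nothing arrives
    consumer.stop()
    return message.message.value if message else None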