Example #1
 def assert_kafka(self, expected_file_name):
     #print("reading server "+config.KAFKA_SERVER+" on topic:"+config.KAFKA_TOPIC)
     kafka_client = KafkaClient(config.KAFKA_SERVER)
     # SimpleConsumer takes its timeout in seconds, hence 1: long enough for all messages to appear without hanging too long
     consumer = SimpleConsumer(kafka_client, b"my_group", config.KAFKA_TOPIC.encode("utf8"),
                                   iter_timeout=1)
     # seek(1, 0) means start from the beginning (the 0) but skip 1 message from that index (the first msg)
     # we bypass the first message since it is only used to auto-start the topic
     consumer.seek(1, 0)
     actual = ""
     for msg in consumer:
         #the linefeed at the end is not really needed but it makes for more readable error reports
         actual += msg.message.value.decode('utf8')+"\n"
     expected = pkg_resources.resource_string(__name__, expected_file_name).decode('utf8')
     t_assert.equal(actual, expected)
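A usage sketch for the helper above (hypothetical, since config, t_assert and the surrounding test class are project-specific): a test first produces messages through the system under test, then calls assert_kafka to compare everything published on the topic against a packaged fixture file.

# Hypothetical harness, only illustrating how assert_kafka is meant to be called;
# BaseKafkaTest and run_pipeline are assumed names, not part of the original project.
class PipelineOutputTest(BaseKafkaTest):
    def test_pipeline_output(self):
        # produce messages to config.KAFKA_TOPIC via the system under test
        self.run_pipeline("input_fixture.json")
        # compare the topic contents with a fixture shipped inside the test package
        self.assert_kafka("expected_output.txt")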
Example #2
    def run(self):
        client = KafkaClient("localhost:9092")

        consumer = SimpleConsumer(client, "test-group", "topic.test.min.v1",
            max_buffer_size=None,
        )

        self.valid = 0
        self.invalid = 0

        # expected payload length of each valid test message
        m_len = len("Hello master wayne" * 10)

        consumer.seek(0, 0)  # start reading from the beginning of the topic

        for message in consumer:
            try:
                if len(message.message.value) == m_len:
                    self.valid += 1
                else:
                    self.invalid += 1
            except Exception:
                print("Reset Offset")
                consumer.seek(0, 0)
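The consumer above only counts messages whose payload length matches "Hello master wayne" repeated ten times. For reference, a producer along these lines could generate that traffic; the topic name and payload come from the example, but the helper itself is an assumption rather than part of the original test.

# Hypothetical producer counterpart for the consumer test above.
from kafka import KafkaClient, SimpleProducer

def produce_test_messages(n=1000):
    client = KafkaClient("localhost:9092")
    producer = SimpleProducer(client)
    payload = "Hello master wayne" * 10  # same fixed-size payload the consumer checks for
    for _ in range(n):
        producer.send_messages("topic.test.min.v1", payload)
    client.close()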
Example #3
class KafkaBroker(object):
    USER_PRODUCER = 0
    FIXED_PRODUCER = 1
    SIMPLE_PRODUCER = 2
    NON_PRODUCER = 3
    SIMPLE_CONSUMER = 0
    NON_CONSUMER = 1
    SOCKET_TIMEOUT = 60  # seconds
    
    def __init__(self, kafkaHost=None, kafkaGroup=None, kafkaTopic=None, 
                 consumerType=NON_CONSUMER, consumerPartitions=[],
                 producerType=NON_PRODUCER, producerPartitions=[]):
        self.kafkaHost = kafkaHost
        self.kafkaGroup = kafkaGroup
        self.kafkaTopic = kafkaTopic
        self.consumerPartitions = consumerPartitions
        self.producerPartitions = producerPartitions
        self.connect(kafkaHost)
        try:
            if producerType == self.SIMPLE_PRODUCER:
                self.producer = SimpleProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.FIXED_PRODUCER:
                self.producer = FixedProducer(self.kafkaClient, producerPartitions[0], async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.USER_PRODUCER:
                self.producer = UserProducer(self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.NON_PRODUCER:
                self.producer = None
            else:
                raise Exception("wrong producer type {}".format(producerType))
            
            if consumerType == self.SIMPLE_CONSUMER:
                if not consumerPartitions:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, self.kafkaTopic)
                else:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, 
                                                   self.kafkaTopic, partitions=self.consumerPartitions)
                logger.debug('consumer is listening on {}@{}'.format(self.kafkaTopic, self.consumerPartitions))
            elif consumerType == self.NON_CONSUMER:
                self.consumer = None
            else:
                raise Exception("wrong consumer type {}".format(consumerType))
                
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.consumer = None
            self.producer = None
            self.kafkaClient = None
            
    def close(self):
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        if self.producer:
            self.producer.stop()
            self.producer = None
        if self.kafkaClient:
            self.kafkaClient.close()
            self.kafkaClient = None
        logger.info('Kafka connection closed')
    
    def connect(self, kafkaHost, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('could not connect to Kafka server within {} attempts'.format(COUNT_DOWN))
            return

        try:
            self.kafkaClient = KafkaClient(kafkaHost, timeout=self.SOCKET_TIMEOUT)
        except Exception:
            logger.warning('retrying Kafka connection, {} attempts left'.format(countdown))
            self.connect(kafkaHost, countdown - 1)
            return

        logger.info('Kafka client connected {}'.format(self.kafkaClient))
        
    def reconnect(self, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('could not reconnect to Kafka server within {} attempts'.format(COUNT_DOWN))
            return

        try:
            self.kafkaClient.reinit()
        except Exception:
            self.reconnect(countdown - 1)
        
    def produce(self, op, name, **kwargs):
        # TODO: when name is None, the operation is propagated to all partitions 
        if not op or not name:
            logger.warning('op or name must not be empty')
            return
        try:
            dictMessage = dict(kwargs)
            dictMessage['op'] = op
            dictMessage['name'] = name
            encodedMessage = simplejson.dumps(dictMessage)
            self.producer.send(self.kafkaTopic, name, encodedMessage)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())

    def echo(self, message=''):
        self.produce('Echo', 'testing', message=message)
        
    def set_consumer_partition(self, consumerPartitions):
        if not consumerPartitions:
            logger.warning('consumer partitions can not be empty')
            return
            
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        self.consumerPartitions = consumerPartitions
        try:
            self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                           self.kafkaTopic, partitions=self.consumerPartitions)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
    
    def is_consumer_ready(self):
        if not self.consumer:
            logger.warning('Consumer is not ready yet')
            return False
        return True
        
    def seek(self, skip):
        if self.is_consumer_ready():
            if skip == -1:
                # seek(0, 2): jump to the tail of the log
                self.consumer.seek(0, 2)
            else:
                # seek(skip, 1): advance `skip` messages from the current offset
                self.consumer.seek(skip, 1)
            
    def commit(self):
        if self.is_consumer_ready():
            self.consumer.commit()
            
    def consume_one(self):
        if not self.is_consumer_ready():
            return None
            
        try:
            message = self.consumer.get_message()
            if not message:
                return None
            logger.debug('received message {}'.format(message.message.value))
            return message.message.value
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        return None
        
    def consume(self, count=10):
        if not self.is_consumer_ready():
            return []
            
        try:
            messages = self.consumer.get_messages(count=count)
            return [message.message.value for message in messages]
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        return []
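A minimal usage sketch for the class above; the host, group and topic values are made up, everything else relies only on the methods KafkaBroker itself defines.

# Hypothetical values; replace with a real broker address, group and topic.
broker = KafkaBroker(kafkaHost='localhost:9092',
                     kafkaGroup='my-group',
                     kafkaTopic='my-topic',
                     consumerType=KafkaBroker.SIMPLE_CONSUMER,
                     producerType=KafkaBroker.SIMPLE_PRODUCER)
broker.echo('hello')            # publishes {"op": "Echo", "name": "testing", "message": "hello"}
print(broker.consume(count=5))  # returns up to 5 raw message values
broker.commit()
broker.close()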
Example #4
    def consume(self, state):
        """
        Starts consuming from the configured Kafka topic given a possible
        existing ``pgshovel.interfaces.replication_pb2:State``.

        If the provided ``state`` does not contain a
        ``stream_state.consumer_state`` value, the ``KafkaStream`` attempts to
        start reading from the Kafka topic after first "priming" the stream.
        Priming involves consuming messages from the topic looking for a
        ``BeginOperation``. Any message that is not a ``BeginOperation`` is
        dropped until a ``BeginOperation`` is seen or the ``prime_threshold``
        is reached, in which case a
        ``pgshovel.streams.utilities:UnableToPrimeError`` is raised.
        (A simplified sketch of this priming loop follows the example.)

        In general, it makes sense to set the ``prime_threshold`` to a value
        high enough to exceed the maximum transaction size you expect to see in
        your data. Generally speaking, a ``prime_threshold`` can effectively be
        infinite (you could construct the stream with ``float('inf')``), but
        the lack of a ``BeginOperation`` in the stream would then cause the
        stream to hang, possibly forever, so the ``prime_threshold`` parameter
        is provided to raise an exception if this unexpected behavior occurs.
        """
        consumer = SimpleConsumer(KafkaClient(self.hosts), None, self.topic)

        # You can only update one offset at a time with kafka-python, plus
        # dealing with reconstituting global order from a partitioned stream is
        # hard and we don't really need to deal with it right now.
        assert len(consumer.offsets) == 1

        decoded = imap(
            lambda (offset, msg): (offset, self.codec.decode(msg.value)),
            consumer
        )

        if state.stream_state.HasField('consumer_state'):
            # Seeking to a direct offset was not in the PyPI release of
            # kafka-python when this was implemented:
            # https://github.com/mumrah/kafka-python/pull/412
            current = consumer.offsets[0]
            offset = state.stream_state.consumer_state.offset + 1
            delta = offset - current
            logger.debug('Moving to previous replication log offset: %s (current position: %s)...', offset, current)
            consumer.seek(delta, 1)
            assert consumer.offsets[0] == offset
        else:
            logger.info('No consumer state provided, will attempt to prime the stream to the first BeginOperation')
            # The call to ``prime_for_batch_start`` "primes" the stream by
            # dropping messages until it sees a message that is an instance of
            # one of the types in
            # ``pgshovel.replication.validation.TRANSACTION_START_EVENT_TYPES``
            decoded = prime_for_batch_start(
                max_messages=self.prime_threshold,
                stream=decoded
            )

        for offset, message in decoded:
            state = validate_state(state, offset, message)
            # XXX: This is necessary because of a bug in protocol buffer oneof.
            state = type(state).FromString(state.SerializeToString())
            yield state, offset, message
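The priming behaviour described in the docstring, dropping messages until a BeginOperation arrives and giving up after prime_threshold messages, is handled by prime_for_batch_start. A simplified stand-in that captures the same idea might look as follows; the predicate argument and the locally defined exception are assumptions for illustration, not pgshovel's actual code.

# Simplified sketch of the priming loop, based only on the docstring above.
class UnableToPrimeError(Exception):
    pass

def prime_sketch(stream, max_messages, is_begin_operation):
    """Drop (offset, message) pairs until one satisfies ``is_begin_operation``,
    then pass the rest of the stream through; give up after ``max_messages``."""
    for seen, (offset, message) in enumerate(stream, start=1):
        if is_begin_operation(message):
            yield offset, message
            break
        if seen >= max_messages:
            raise UnableToPrimeError(
                'no BeginOperation seen within {} messages'.format(max_messages))
    for item in stream:
        yield item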