Example #1
    def assert_message_count(self,
                             topic,
                             check_count,
                             timeout=10,
                             partitions=None,
                             at_least=False):
        hosts = ','.join(
            ['%s:%d' % (broker.host, broker.port) for broker in self.brokers])

        client = KafkaClient(hosts)
        consumer = SimpleConsumer(client,
                                  None,
                                  topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = consumer.pending(partitions)

        # Keep checking if it isn't immediately correct, subject to timeout
        while pending < check_count and (time.time() - started_at < timeout):
            pending = consumer.pending(partitions)
            time.sleep(0.5)

        consumer.stop()
        client.close()

        if pending < check_count:
            self.fail('Too few pending messages: found %d, expected %d' %
                      (pending, check_count))
        elif pending > check_count and not at_least:
            self.fail('Too many pending messages: found %d, expected %d' %
                      (pending, check_count))
        return True
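
For orientation, here is a hedged sketch of how a test might drive this helper. The producer setup, the fixtures (self.client, self.topic) and the message payloads are illustrative assumptions, not taken from the original test suite.

    # Hedged usage sketch: the client/topic fixtures and payloads below are
    # illustrative assumptions, not part of the original project.
    def test_produced_messages_are_counted(self):
        producer = SimpleProducer(self.client)   # assumes a client fixture exists
        for i in range(100):
            producer.send_messages(self.topic, ('message %d' % i).encode('utf-8'))
        producer.stop()
        self.assert_message_count(self.topic, 100, timeout=15, at_least=True)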
Example #2
    def test_simple_consumer_commit_does_not_raise(self):
        client = MagicMock()
        client.get_partition_ids_for_topic.return_value = [0, 1]

        def mock_offset_fetch_request(group, payloads, **kwargs):
            return [
                OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0)
                for p in payloads
            ]

        client.send_offset_fetch_request.side_effect = mock_offset_fetch_request

        def mock_offset_commit_request(group, payloads, **kwargs):
            raise FailedPayloadsError(payloads[0])

        client.send_offset_commit_request.side_effect = mock_offset_commit_request

        consumer = SimpleConsumer(client,
                                  group='foobar',
                                  topic='topic',
                                  partitions=[0, 1],
                                  auto_commit=False)

        # Mock internal commit check
        consumer.count_since_commit = 10

        # This should not raise an exception
        self.assertFalse(consumer.commit(partitions=[0, 1]))
Example #3
 def __init__(self, name, host='web14', port=51092, **kwargs):
     QueueBase.QueueBase.__init__(self, name, host, port)
     self.__queue = []
     self.__kafka = KafkaClient('%s:%d' % (host, port))
     self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
     self.__producer.client.ensure_topic_exists(self.name)
     self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer', self.name, auto_commit_every_n=1)
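
To suggest how this wrapper might be used, the sketch below adds hypothetical put/get methods on the same class; the method names and return behaviour are assumptions, since the QueueBase interface is not shown here.

 # Hedged sketch of companion methods for the wrapper above; names and return
 # behaviour are assumptions (the QueueBase contract is not shown).
 def put(self, message):
     self.__producer.send_messages(self.name, message)

 def get(self):
     msg = self.__consumer.get_message(block=False)
     return msg.message.value if msg is not None else None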
Example #4
    def assert_message_count(self,
                             topic,
                             check_count,
                             timeout=10,
                             partitions=None):
        hosts = ','.join(
            ['%s:%d' % (broker.host, broker.port) for broker in self.brokers])

        client = KafkaClient(hosts)
        group = random_string(10)
        consumer = SimpleConsumer(client,
                                  group,
                                  topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = consumer.pending(partitions)

        # Keep checking if it isn't immediately correct, subject to timeout
        while pending != check_count and (time.time() - started_at < timeout):
            pending = consumer.pending(partitions)

        consumer.stop()
        client.close()

        self.assertEqual(pending, check_count)
Example #5
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  #did we log at least one entry?
    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  #get 1000 messages at a time, non blocking
        if not messages:
            break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            #print(message.message.value)
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  #file size > 10MB
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
Example #6
 def test_switch_leader_simple_consumer(self):
     producer = Producer(self.client, async=False)
     consumer = SimpleConsumer(self.client, None, self.topic, partitions=None, auto_commit=False, iter_timeout=10)
     self._send_random_messages(producer, self.topic, 0, 2)
     consumer.get_messages()
     self._kill_leader(self.topic, 0)
     consumer.get_messages()
Example #7
 def createConsumer(self):
     self.consumer = SimpleConsumer(self.client,
         topic=self.config["topic"],
         group=self.config["consumerGroup"],
         auto_commit= True,
         max_buffer_size=3000000,
         iter_timeout=5)
Example #8
    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load(
            "localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(
            self.redis_monitor.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                       "demo_test.outbound_firehose")
Example #9
    def assert_message_count(self, topic, check_count, timeout=10,
                             partitions=None, at_least=False):
        hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                          for broker in self.brokers])

        client = SimpleClient(hosts, timeout=2)
        consumer = SimpleConsumer(client, None, topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = -1
        while pending < check_count and (time.time() - started_at < timeout):
            try:
                pending = consumer.pending(partitions)
            except FailedPayloadsError:
                pass
            time.sleep(0.5)

        consumer.stop()
        client.close()

        if pending < check_count:
            self.fail('Too few pending messages: found %d, expected %d' %
                      (pending, check_count))
        elif pending > check_count and not at_least:
            self.fail('Too many pending messages: found %d, expected %d' %
                      (pending, check_count))
        return True
Example #10
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  #get 1000 messages at a time, non blocking
        if not messages:
            os.system("sleep 30s")
            continue
            #break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            print message
        kafka_consumer.commit()  #save position in the kafka queue
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
Example #11
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer Loading topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    #log_has_at_least_one = False #did we log at least one entry?
    while True:
        # get 100 messages at a time, non blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            #print "no messages to read"
            continue  # If no messages are received, wait until there are more
        for message in messages:
            #log_has_at_least_one = True
            print(message.message.value)
            #tempfile.write(message.message.value + "\n")    # lose the '\n'?
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 12000:  # file size > 12 KB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
Example #12
    def setup_kafka(self):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name
            self.topic ='general-starturls'

        _server=self.settings.get("KAFKA_LOCATION", 'localhost:9092')
        _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
        _group = self.settings.get("GROUP","scrapy-crawler")
        _conn = KafkaClient(_server)
        self.topic1 = self.settings.get('TOPIC', 'frontier-todo')
        mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
        mongo_port = int(self.settings.get("MONGODB_PORT", 27017))  # 27017 is MongoDB's default port
        self.mng_client = MongoClient(mongo_server, mongo_port)
        
        self.consumer = SimpleConsumer(_conn,_group,self.topic1, partitions=[_partition_id], buffer_size=131072, max_buffer_size=1048576) 
        self.producer = KafkaProducer(bootstrap_servers=[_server])
        self.MONGODB_DB = self.settings.get("MONGODB_DB")
        self.MONGODB_COLLECTION = "shop"
        self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
        self.JOB_NAME = self.settings.get("JOB_NAME")
        self.LOCALE = self.settings.get("LOCALE",'us')
        self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
        self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
        self.JOB_INPUT_COLLECTION =  self.settings.get("JOB_INPUT_COLLECTION", "job_input3")
        self.ITEM_INPUT_COLLECTION =  self.settings.get("ITEM_INPUT_COLLECTION" ,'scrap_input4')
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #13
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
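
To show how these pieces could fit together at runtime, here is a hedged sketch of one consume-and-score pass; the method name, the decoder call and the stats key are assumptions rather than the original worker's code.

    # Hedged sketch of a single consume-and-score pass; method name, decoder
    # usage and stats bookkeeping are assumptions, not the original worker code.
    def work(self):
        consumed = 0
        for m in self._in_consumer.get_messages(count=self.consumer_batch_size,
                                                block=True, timeout=1.0):
            request = self._decoder.decode(m.message.value)  # assumed Decoder API
            # ...score `request` with self.strategy and publish the result to
            # self.outgoing_topic via self._producer here...
            consumed += 1
        self.stats['consumed'] = self.stats.get('consumed', 0) + consumed
        return consumed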
Example #14
    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume potential results
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       "demo-id",
                                       "demo_test.crawled_firehose",
                                       buffer_size=1024 * 100,
                                       fetch_size_bytes=1024 * 100,
                                       max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)
Example #15
 def __init__(self, topic, kafka_broker, consumer_group):
     self.kafka = KafkaClient(kafka_broker)
     self.consumer = SimpleConsumer(self.kafka,
                                    consumer_group,
                                    topic,
                                    fetch_size_bytes=self.__max_buffer_size,
                                    buffer_size=self.__max_buffer_size,
                                    max_buffer_size=self.__max_buffer_size)
Example #16
def connection_time():
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        cnt += 1
        if cnt > 0:
            return time.time() - start_time
Example #17
        def consume_topic(callback_url, consumer_group, topic):
            consumer = None
            try:
                consumer = SimpleConsumer(self.kafka,
                                          consumer_group,
                                          topic,
                                          auto_commit=False)
                messages_read = 0

                # we can't read messages infinitely here as we have
                # a lot of topics/subscribers (much more than threadpool size)
                while messages_read < self.max_read_messages_per_cycle:

                    # get one message and monitor the time
                    start = monitoring.start_time_measure()
                    message = consumer.get_message(block=False)
                    ms_elapsed = monitoring.stop_time_measure(start)
                    self.metrics['kafka_read'].add({'topic': topic},
                                                   ms_elapsed)

                    # if we don't have messages for this topic/subscriber - quit and give chance to others
                    if message is None:
                        logging.info(
                            'No messages for topic: %s and callback: %s, quitting the thread',
                            topic, callback_url)
                        break

                    try:
                        event = json.loads(
                            message.message.value.decode('utf-8'))
                        response_status = self.forward_event(
                            callback_url, event, topic)

                        # if status is success - mark message as consumed by this subscriber
                        if 200 <= response_status < 300:
                            consumer.commit()
                        else:
                            logging.info(
                                'Received error response from consumer: %s',
                                response_status)
                    except:
                        logging.error(
                            "Exception while sending event to consumer")
                        logging.error(traceback.format_exc())
                    finally:
                        messages_read += 1
                return messages_read

            except UnknownTopicOrPartitionError:
                logging.error('Adding %s to skip list', topic)
            except:
                logging.exception('failed to create kafka client')
            finally:
                if consumer is not None:
                    consumer.stop()
Example #18
    def consume_topic(self, topic, group, temp_dir):
        '''
        This function receives messages from the Friendsquare topic, saves them to a
        temporary file under temp_dir, then transfers the file to HDFS.
        '''

        # Create a kafka receiver to grab messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        # Create a temp file to store messages
        self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)

        # Create a hdfs directory to store output files
        os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

        while self.count < self.max_count:

            # Get 1000 messages each time
            messages = kafka_receiver.get_messages(count=1000, block=False)

            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # Set each file size at 20 M
            if temp_file.tell() > 20000000:
                temp_file.close()

                # Put the file to hdfs
                hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
Example #19
    def consume_topic(self, topic, group, temp_dir):
        '''
        This function receives messages from Kafka, saves them to a temporary
        file first, then transfers the file to HDFS.
        '''
        # Create a kafka receiver to grab messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        self.timestamp = self.getTimestamp()
        # Create a temp file to store messages
        self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp,
                                                str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        while self.count < self.max_count:
            # Get 100 messages each time
            messages = kafka_receiver.get_messages(count=100, block=False)
            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # For structured streaming, files need to be small at this point, set the size at 2 M
            if temp_file.tell() > 2000000:
                temp_file.close()

                # Copy the file to hdfs
                output_dir = "%s/%s" % (self.hdfs_dir, topic)
                os.system("hdfs dfs -mkdir %s" % output_dir)
                hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp,
                                              self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.timestamp = self.getTimestamp()
                self.temp_file_path = "%s/%s_%s.txt" % (
                    temp_dir, self.timestamp, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
Example #20
def consumer(seconds):
    time.sleep(5)
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        if time.time() >= start_time + seconds:
            return cnt
        cnt += 1

    return cnt
Example #21
def get_max_offsets(consumer_group, topic_name):
    print("conmeiiiiiiiii")
    kfk_ins = kafka_ins()
    kafka = KafkaClient(kfk_ins.BROKERS_LIST, kfk_ins.CLIENT_ID,
                        kfk_ins.TIME_OUT)
    topic_offset = {}
    consumer = SimpleConsumer(kafka,
                              consumer_group,
                              topic_name,
                              api_version='0.8.2')
    topic_offset[topic_name] = consumer.offsets
    return topic_offset
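
An illustrative call of the function above; the group and topic names are assumptions, and the returned mapping reflects SimpleConsumer's internal offsets dict (partition id to fetch offset).

# Illustrative call; the group and topic names are assumptions.
if __name__ == '__main__':
    offsets = get_max_offsets('my-group', 'my-topic')
    # expected shape: {'my-topic': {partition_id: offset, ...}}
    print(offsets)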
Example #22
    def test_simple_consumer_failed_payloads(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        def failed_payloads(payload):
            return FailedPayloadsError(payload)

        client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads)

        # This should not raise an exception
        consumer.get_messages(5)
Example #23
    def __init__(self, kafka_addr, topic, vin, web_url):
        threading.Thread.__init__(self)

        self.kafka = KafkaClient(kafka_addr) #kafka_addr
        self.cons = SimpleConsumer(self.kafka, None, topic)
        self.cons.seek(0,2)

        self.vin = vin
        self.web_url = web_url 
        self.flag = True
        self.count = 0
        self.sleep_count = 0
        self.headers = {'Content-Type' : 'application/json'}
Example #24
File: consume_base.py  Project: vnisor/cotl
	def run(self):
		consumer = SimpleConsumer(self.kafka, self.group_name, self.topic_name,iter_timeout=self.timeout, buffer_size=4096*8,max_buffer_size=None)
		print "setting max_buffer_size=None"
		try:
			for message in consumer:
				parsed_msg = json.loads(message.message.value)
				self.handle_msg(parsed_msg)
				# print parsed_msg
				self.logger.info('OK%s'%str(consumer.offsets))
		except Exception as e:
			self.logger.warning( "Exception %s offset, %s" % (self.group_name, consumer.offsets))
			self.logger.warning( str(e) )
			raise
Example #25
def kafka_stream():
    topic = request.args.get('topic')
    print topic
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "python", topic)
    topic = None

    def gen():
        for message in consumer:
            yield 'data: %s\n\n' % str(message.message.value)

    print "DEBUG: Kafka Stream Connected"
    return Response(gen(), mimetype="text/event-stream")
Example #26
    def test_simple_consumer_reset_partition_offset(self):
        client = MagicMock()

        def mock_offset_request(payloads, **kwargs):
            raise FailedPayloadsError(payloads[0])

        client.send_offset_request.side_effect = mock_offset_request

        consumer = SimpleConsumer(client, group='foobar',
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        # This should not raise an exception
        self.assertEqual(consumer.reset_partition_offset(0), None)
Example #27
 def _connect_consumer(self):
     if self._cons is None:
         try:
             self._cons = SimpleConsumer(self._conn,
                                         self._group,
                                         self._topic,
                                         partitions=self._partition_ids,
                                         buffer_size=1048576,
                                         max_buffer_size=10485760)
         except BrokerResponseError:
             self._cons = None
             logger.warning("Could not connect consumer to Kafka server")
             return False
     return True
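
A hedged sketch of a caller for the lazy connect above; the method name and batch size are assumptions, not part of the original consumer class.

 # Hedged sketch of a caller; the method name and batch size are assumptions.
 def _poll(self, max_count=256):
     if not self._connect_consumer():
         return []                       # broker unavailable; retry on next poll
     return self._cons.get_messages(count=max_count, block=False)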
Example #28
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KeyedProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=CODEC_SNAPPY)

        self._in_consumer = SimpleConsumer(self._kafka,
                                           settings.get('FRONTIER_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760)
        if not no_scoring:
            self._scoring_consumer = SimpleConsumer(
                self._kafka,
                settings.get('FRONTIER_GROUP'),
                settings.get('SCORING_TOPIC'),
                buffer_size=262144,
                max_buffer_size=1048576)

        self._offset_fetcher = Fetcher(self._kafka,
                                       settings.get('OUTGOING_TOPIC'),
                                       settings.get('FRONTIER_GROUP'))

        self._manager = FrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring, no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
Example #29
def kafka_consumer(kafka_hosts,
                   schema_host,
                   schema_port,
                   topic,
                   consumer_group="python"):
    """
    消费kafka对应topic的记录, 非实时消费
    :param kafka_hosts:
    :param schema_host:
    :param schema_port:
    :param topic:
    :param consumer_group:
    :return:
    """
    # Fetch the latest schema for the topic
    topic_schema, topic_schema_id, schema_version = get_latest_schema_info(
        schema_host, schema_port, topic)
    # Consume Kafka records
    client = KafkaClient(hosts=kafka_hosts)
    simple_consumer = SimpleConsumer(client,
                                     consumer_group,
                                     topic,
                                     auto_offset_reset="smallest")
    collect_logs = []  # holds each record's partition, offset and value
    msg_exist = True
    while msg_exist:
        msg = simple_consumer.get_message(get_partition_info=True)
        # print "kafka log:", msg
        # If this fetch returned None, stop consuming
        if msg is None:
            msg_exist = False
        else:
            msg_partition = msg[0]
            msg_offset = msg[1].offset
            msg_value = msg[1].message.value
            # Decode the single record
            bytes_msg = io.BytesIO(msg_value[5:])
            decode_msg = avro.io.BinaryDecoder(bytes_msg)
            recode_msg = avro.io.DatumReader(
                avro.schema.parse(topic_schema)).read(decode_msg)
            # Collect this record's partition, offset and value
            msg_collect = [msg_partition, msg_offset, recode_msg]
            collect_logs.append(msg_collect)
    collect_logs.sort(key=lambda x: x[0])  # sort by partition id
    print "+++++++Topic: %s+++++++" % topic
    for index, log in enumerate(collect_logs):
        print index, log
    print "Successfully received."
    return collect_logs
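
An illustrative invocation of the function above; the broker list, schema-registry host/port and topic name are assumptions.

# Illustrative invocation; broker list, schema registry host/port and topic are assumptions.
if __name__ == "__main__":
    logs = kafka_consumer("kafka01:9092,kafka02:9092", "schema-registry.local",
                          8081, "user-events", consumer_group="python")
    # each entry is [partition, offset, decoded avro record]
    print "total records:", len(logs)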
Example #30
File: smoke_tests.py  Project: kharus/samza
def validate_samza_job():
  """
  Validates that negate-number negated all messages, and sent the output to 
  samza-test-topic-output.
  """
  logger.info('Running validate_samza_job')
  kafka = _get_kafka_client()
  kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
  consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
  messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=60)
  message_count = len(messages)
  assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
  for message in map(lambda m: m.message.value, messages):
    assert int(message) < 0 , 'Expected negative integer but received {0}'.format(message)
  kafka.close()