Example #1
    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load(
            "localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(
            self.redis_monitor.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                       "demo_test.outbound_firehose")
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer Loading topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    #log_has_at_least_one = False #did we log at least one entry?
    while True:
        # get up to 100 messages at a time, non blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            #print "no messages to read"
            continue  # If no messages are received, wait until there are more
        for message in messages:
            #log_has_at_least_one = True
            print(message.message.value)
            #tempfile.write(message.message.value + "\n")    # lose the '\n'?
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 12000:  # file size > 12 KB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
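The flush_to_hdfs helper called above is not shown in this snippet. A minimal sketch of what it might do, modeled on the hdfs dfs -put -f pattern used in the consume_topic examples further down this page (the HDFS path layout, the batch_counter global, and the re-opening of the temp file are assumptions, not the original implementation):

import os

def flush_to_hdfs(output_dir, topic):
    # Hypothetical sketch: close the current temp file, push it to HDFS,
    # then open a fresh temp file for the next batch of messages.
    global batch_counter, tempfile, tempfile_path
    tempfile.close()
    os.system("hdfs dfs -mkdir -p %s" % output_dir)
    hdfs_path = "%s/%s_%s_%s.txt" % (output_dir, topic, timestamp, batch_counter)
    os.system("hdfs dfs -put -f %s %s" % (tempfile_path, hdfs_path))
    os.remove(tempfile_path)
    batch_counter += 1
    tempfile_path = "/tmp/kafka_%s_%s_%s.txt" % (topic, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")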
    def test_switch_leader_simple_consumer(self):
        producer = Producer(self.client, async=False)
        consumer = SimpleConsumer(self.client, None, self.topic,
                                  partitions=None, auto_commit=False,
                                  iter_timeout=10)
        self._send_random_messages(producer, self.topic, 0, 2)
        consumer.get_messages()
        self._kill_leader(self.topic, 0)
        consumer.get_messages()
    def assert_message_count(self, topic, check_count, timeout=10,
                             partitions=None, at_least=False):
        hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                          for broker in self.brokers])

        client = SimpleClient(hosts, timeout=2)
        consumer = SimpleConsumer(client, None, topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = -1
        while pending < check_count and (time.time() - started_at < timeout):
            try:
                pending = consumer.pending(partitions)
            except FailedPayloadsError:
                pass
            time.sleep(0.5)

        consumer.stop()
        client.close()

        if pending < check_count:
            self.fail('Too few pending messages: found %d, expected %d' %
                      (pending, check_count))
        elif pending > check_count and not at_least:
            self.fail('Too many pending messages: found %d, expected %d' %
                      (pending, check_count))
        return True
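A minimal call-site sketch for the assert_message_count helper above (the test class name is hypothetical; it only assumes a fixture that provides self.brokers and self.topic). With at_least=True the check passes as long as 100 or more messages are pending for a fresh consumer group:

class MyKafkaIntegrationTest(KafkaIntegrationTestCase):  # hypothetical base fixture
    def test_all_messages_arrived(self):
        # Fail the test unless at least 100 messages can be seen end-to-end.
        self.assert_message_count(self.topic, 100, timeout=30, at_least=True)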
Example #5
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer Loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    #log_has_at_least_one = False #did we log at least one entry?
    while True:
        # get up to 100 messages at a time, non blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            #print "no messages to read"
            continue   # If no messages are received, wait until there are more
        for message in messages:
            #log_has_at_least_one = True
            #print(message.message.value)
            #tempfile.write(message.message.value + "\n")    # lose the '\n'?
            tempfile.write(message.message.value)
        if tempfile.tell() > 120000000:  # file size > 120MB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
    def assert_message_count(self, topic, check_count, timeout=10,
                             partitions=None, at_least=False):
        hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                          for broker in self.brokers])

        client = KafkaClient(hosts)
        consumer = SimpleConsumer(client, None, topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = consumer.pending(partitions)

        # Keep checking if it isn't immediately correct, subject to timeout
        while pending < check_count and (time.time() - started_at < timeout):
            pending = consumer.pending(partitions)
            time.sleep(0.5)

        consumer.stop()
        client.close()

        if pending < check_count:
            self.fail('Too few pending messages: found %d, expected %d' %
                      (pending, check_count))
        elif pending > check_count and not at_least:
            self.fail('Too many pending messages: found %d, expected %d' %
                      (pending, check_count))
        return True
    def createConsumer(self):
        self.consumer = SimpleConsumer(self.client,
                                       topic=self.config["topic"],
                                       group=self.config["consumerGroup"],
                                       auto_commit=True,
                                       max_buffer_size=3000000,
                                       iter_timeout=5)
Example #8
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
Example #9
    def test_ts(self):

        kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))

        # consumer = SimpleConsumer(kafka, "my-group112", "test")
        consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                                  fetch_size_bytes=3000000, buffer_size=2000000000, max_buffer_size=2000000000)

        while True:
            print("HELLO")
            # Prepare data for insert and copy to S3
            # data_str = StringIO()
            count = 0
            # last_offset = 2

            consumer.seek(2, 0)

            for message in consumer.get_messages(count=100, block=False, timeout=0.1):
                count += 1

                print(message.message.value)

            #     # Write tweets to StringIO
            #     self.write_to_data_str(message, data_str)

            # # Store batch tweets to S3
            # self.write_to_s3(data_str, last_offset)

            if count != 100:
                break
    def setup_kafka(self):
        """Set up the Kafka consumer/producer, MongoDB connection and idle signal.

        This should be called after the spider has set its crawler object;
        all configuration is read from ``self.settings``.
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name
            self.topic = 'general-starturls'

        _server = self.settings.get("KAFKA_LOCATION", 'localhost:9092')
        _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
        _group = self.settings.get("GROUP", "scrapy-crawler")
        _conn = KafkaClient(_server)
        self.topic1 = self.settings.get('TOPIC', 'frontier-todo')
        mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
        mongo_port = self.settings.get("MONGODB_PORT", 27017)
        self.mng_client = MongoClient(mongo_server, mongo_port)

        self.consumer = SimpleConsumer(_conn, _group, self.topic1,
                                       partitions=[_partition_id],
                                       buffer_size=131072,
                                       max_buffer_size=1048576)
        self.producer = KafkaProducer(bootstrap_servers=[_server])
        self.MONGODB_DB = self.settings.get("MONGODB_DB")
        self.MONGODB_COLLECTION = "shop"
        self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
        self.JOB_NAME = self.settings.get("JOB_NAME")
        self.LOCALE = self.settings.get("LOCALE",'us')
        self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
        self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
        self.JOB_INPUT_COLLECTION =  self.settings.get("JOB_INPUT_COLLECTION", "job_input3")
        self.ITEM_INPUT_COLLECTION =  self.settings.get("ITEM_INPUT_COLLECTION" ,'scrap_input4')
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
    def assert_message_count(self, topic, check_count, timeout=10,
                             partitions=None, at_least=False):
        hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                          for broker in self.brokers])

        client = SimpleClient(hosts, timeout=2)
        consumer = SimpleConsumer(client, None, topic,
                                  partitions=partitions,
                                  auto_commit=False,
                                  iter_timeout=timeout)

        started_at = time.time()
        pending = -1
        while pending < check_count and (time.time() - started_at < timeout):
            try:
                pending = consumer.pending(partitions)
            except FailedPayloadsError:
                pass
            time.sleep(0.5)

        consumer.stop()
        client.close()

        if pending < check_count:
            self.fail('Too few pending messages: found %d, expected %d' %
                      (pending, check_count))
        elif pending > check_count and not at_least:
            self.fail('Too many pending messages: found %d, expected %d' %
                      (pending, check_count))
        return True
Example #12
    def test_simple_consumer_commit_does_not_raise(self):
        client = MagicMock()
        client.get_partition_ids_for_topic.return_value = [0, 1]

        def mock_offset_fetch_request(group, payloads, **kwargs):
            return [
                OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0)
                for p in payloads
            ]

        client.send_offset_fetch_request.side_effect = mock_offset_fetch_request

        def mock_offset_commit_request(group, payloads, **kwargs):
            raise FailedPayloadsError(payloads[0])

        client.send_offset_commit_request.side_effect = mock_offset_commit_request

        consumer = SimpleConsumer(client,
                                  group='foobar',
                                  topic='topic',
                                  partitions=[0, 1],
                                  auto_commit=False)

        # Mock internal commit check
        consumer.count_since_commit = 10

        # This should not raise an exception
        self.assertFalse(consumer.commit(partitions=[0, 1]))
Example #13
    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume potential results
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       "demo-id",
                                       "demo_test.crawled_firehose",
                                       buffer_size=1024 * 100,
                                       fetch_size_bytes=1024 * 100,
                                       max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)
Example #14
    def __init__(self, name, host='web14', port=51092, **kwargs):
        QueueBase.QueueBase.__init__(self, name, host, port)
        self.__queue = []
        self.__kafka = KafkaClient('%s:%d' % (host, port))
        self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
        self.__producer.client.ensure_topic_exists(self.name)
        self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer', self.name, auto_commit_every_n=1)
Example #15
def consume(kafka_host):
    kafka = KafkaClient(kafka_host)
    consumer = SimpleConsumer(kafka, 'fetcher', cfg['kafka']['pages'])
    producer = SimpleProducer(kafka)
    consumer.max_buffer_size=20*1024*1024
    for msg in consumer:
        page = json.loads(msg.message.value)
        process(page, producer)
    kafka.close()
def consume_save(group,topic):
#	tmp_save=open(tmp_file_path,"w")
	while True:
		kafka_consumer=SimpleConsumer(kafka,group,topic)
		messages= kafka_consumer.get_messages(count=1000, block=False)
		if not messages:
			print "Consumer didn't read any messages"
		for message in messages:
	#		tmp_save.write( message.message.value+"\n")
			print message.message.value+"\n"
Example #17
    def consume_topic(self, topic, group, temp_dir):
        '''
        Receive messages from the Friendsquare topic, save them to a temporary
        file under temp_dir, then transfer the file to hdfs.
        '''

        # Create a kafka receiver to grab messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        # Create a temp file to store messages
        self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)

        # Create a hdfs directory to store output files
        os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

        while self.count < self.max_count:

            # Get 1000 messages each time
            messages = kafka_receiver.get_messages(count=1000, block=False)

            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # Set each file size at 20 M
            if temp_file.tell() > 20000000:
                temp_file.close()

                # Put the file to hdfs
                hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
Example #18
class HBaseServer(threading.Thread):
    """
    HBase thread that will continuously read from Kafka queue
    """
    def __init__(self, kafka_url, kafka_topic, hbase_url, hbase_thrift_port,
                 hbase_table):
        threading.Thread.__init__(self)

        self.kafka = KafkaClient(kafka_url)
        self.cons = SimpleConsumer(self.kafka, None, kafka_topic)
        self.cons.seek(0, 2)

        self.hbase_connect = happybase.Connection(hbase_url, hbase_thrift_port)
        self.car_table = self.hbase_connect.table(hbase_table)

        self.server_on_flag = True
        self.m = None
        self.payload = None
        self.vin = None
        self.time = None
        self.data = None
        self.row_key = None
        self.count = 0

    def run(self):
        while self.server_on_flag:

            self.m = self.cons.get_message(block=False)

            if (self.m is not None):
                self.payload = json.loads(self.m.message.value)
                self.vin = str(self.payload['vin'])
                self.time = str(self.payload['timestamp'])
                self.data = str(self.payload['data'])

                self.row_key = self.vin + self.time
                try:
                    self.car_table.put(self.vin,
                                       {'user:mostrecent': self.time})
                    self.car_table.put(self.row_key, {'car:data': self.data})
                    self.count = self.count + 1
                    logger.info(
                        'HBase Server: key: %s, table: %s, car{data: %s}. Message number: %s',
                        self.row_key, 'rvi', self.data, str(self.count))

                except Exception as e:
                    logger.info('Data push into HBase unsuccessful: %s', e)

            else:
                sleep(0.2)  # 1/5 of a second; avoids integer division yielding 0

    def shutdown(self):
        self.server_on_flag = False
        logger.info('HBase Server shutting down...')
    def consume_topic(self, topic, group, temp_dir):
        '''
        Receive messages from Kafka, save them to a temporary file first,
        then transfer the file to hdfs.
        '''
        # Create a kafka receiver to grap messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        self.timestamp = self.getTimestamp()
        # Create a temp file to store messages
        self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp,
                                                str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        while self.count < self.max_count:
            # Get 100 messages each time
            messages = kafka_receiver.get_messages(count=100, block=False)
            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # For structured streaming, files need to be small at this point, set the size at 2 M
            if temp_file.tell() > 2000000:
                temp_file.close()

                # Copy the file to hdfs
                output_dir = "%s/%s" % (self.hdfs_dir, topic)
                os.system("hdfs dfs -mkdir %s" % output_dir)
                hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp,
                                              self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.timestamp = self.getTimestamp()
                self.temp_file_path = "%s/%s_%s.txt" % (
                    temp_dir, self.timestamp, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
Example #20
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic then returns a function that can
    consume messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
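Because this generator yields the consumer and only closes the Kafka client afterwards, it is presumably meant to be wrapped as a context manager. A hedged usage sketch; the contextmanager wrapping and the topic name are assumptions, not part of the snippet above:

from contextlib import contextmanager

# Assumption: in its own module the function above is exposed via @contextmanager.
capture_new_messages = contextmanager(setup_capture_new_messages_consumer)

with capture_new_messages("my.test.topic") as consumer:
    # Only messages published after the seek-to-tail above will show up here.
    for offset_and_message in consumer.get_messages(count=10, block=True, timeout=5):
        print(offset_and_message.message.value)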
Example #21
def consume_save(group,topic):
	tmp_save=open(tmp_file_path,"w")
	kafka_consumer=SimpleConsumer(kafka,group,topic)
	messages= kafka_consumer.get_messages(count=1000, block=False)
	if not messages:
		print "Consumer didn't read any messages"
	for message in messages:
		tmp_save.write( message.message.value+"\n")
#		print message.message.value+"\n"
	kafka_consumer.commit() # inform zookeeper of position in the kafka queue
	print ".... ... .. .."
	print "Message from topic \"%s\" consumed \n" % topic
Example #22
    def run(self, options=None):

        # try:

        # Create table if it doesn't exist in the database
        if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
            self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

        kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))

        consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC, fetch_size_bytes=3000000,
                                  buffer_size=2000000000, max_buffer_size=2000000000)

        while True:

            # Prepare data for insert and copy to S3
            data_str = StringIO()
            csv_str = StringIO()
            count = 0

            # Get Offset from previous read
            s3_last_offset = self.get_s3_offset()

            (last_offset) = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
            last_offset = last_offset if last_offset else 0

            # Resolve difference in offset (s3 offset does not carry over from day to day)
            if s3_last_offset > last_offset:
                last_offset = s3_last_offset
                self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

            print(last_offset)

            # Read from Offset
            consumer.seek(last_offset, 0)

            for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):

                # Write tweets to StringIO
                self.write_to_data_str(message, data_str, csv_str)

                count += 1
                last_offset += 1

            # Store batch tweets to S3
            self.write_to_s3(data_str, csv_str, last_offset)

            # Track Kafka Offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

            if count != self.BATCH_SIZE:
                break
Example #23
    def test_simple_consumer_failed_payloads(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        def failed_payloads(payload):
            return FailedPayloadsError(payload)

        client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads)

        # This should not raise an exception
        consumer.get_messages(5)
Example #24
    def __init__(self, kafka_addr, topic, vin, web_url):
        threading.Thread.__init__(self)

        self.kafka = KafkaClient(kafka_addr) #kafka_addr
        self.cons = SimpleConsumer(self.kafka, None, topic)
        self.cons.seek(0,2)

        self.vin = vin
        self.web_url = web_url 
        self.flag = True
        self.count = 0
        self.sleep_count = 0
        self.headers = {'Content-Type' : 'application/json'}
Example #25
class HBaseServer(threading.Thread):
    """
    HBase thread that will continuously read from Kafka queue
    """

    def __init__(self, kafka_url, kafka_topic, hbase_url, hbase_thrift_port, hbase_table):
        threading.Thread.__init__(self)
        
        self.kafka = KafkaClient(kafka_url)
        self.cons = SimpleConsumer(self.kafka, None, kafka_topic)
        self.cons.seek(0,2)
        
        self.hbase_connect = happybase.Connection(hbase_url,hbase_thrift_port)
        self.car_table = self.hbase_connect.table(hbase_table)
        
        self.server_on_flag = True        
        self.m = None
        self.payload = None
        self.vin = None
        self.time = None
        self.data = None
        self.row_key = None
        self.count = 0

    def run(self):
        while self.server_on_flag:

            self.m = self.cons.get_message(block=False)
           
            if (self.m is not None):
                self.payload = json.loads(self.m.message.value)
                self.vin = str(self.payload['vin'])
                self.time = str(self.payload['timestamp'])
                self.data = str(self.payload['data'])
                
                self.row_key = self.vin+self.time
                try:
                    self.car_table.put(self.vin,{'user:mostrecent':self.time})
                    self.car_table.put(self.row_key,{'car:data':self.data})
                    self.count = self.count + 1
                    logger.info('HBase Server: key: %s, table: %s, car{data: %s}. Message number: %s', self.row_key, 'rvi', self.data, str(self.count))     
           
                except Exception as e:
                    logger.info('Data push into HBase unsuccessful: %s', e)

            else:
                sleep(0.2)  # 1/5 of a second; avoids integer division yielding 0

    def shutdown(self):
        self.server_on_flag = False
        logger.info('HBase Server shutting down...')
Example #26
    def test_simple_consumer_reset_partition_offset(self):
        client = MagicMock()

        def mock_offset_request(payloads, **kwargs):
            raise FailedPayloadsError(payloads[0])

        client.send_offset_request.side_effect = mock_offset_request

        consumer = SimpleConsumer(client, group='foobar',
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        # This should not raise an exception
        self.assertEqual(consumer.reset_partition_offset(0), None)
Example #27
    def _connect_consumer(self):
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(self._conn,
                                            self._group,
                                            self._topic,
                                            partitions=self._partition_ids,
                                            buffer_size=1048576,
                                            max_buffer_size=10485760)
            except BrokerResponseError:
                self._cons = None
                logger.warning("Could not connect consumer to Kafka server")
                return False
        return True
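Since _connect_consumer returns False instead of raising when the broker does not answer, the caller is presumably expected to retry. A minimal retry-loop sketch; the worker object and the five-second back-off are assumptions:

import time

# Hypothetical call site: keep trying until the consumer attaches to Kafka.
while not worker._connect_consumer():
    time.sleep(5.0)  # broker not reachable yet; back off briefly before retrying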
Example #28
def kafka_consumer(kafka_hosts,
                   schema_host,
                   schema_port,
                   topic,
                   consumer_group="python"):
    """
    Consume the records of the given Kafka topic (batch consumption, not real time).
    :param kafka_hosts:
    :param schema_host:
    :param schema_port:
    :param topic:
    :param consumer_group:
    :return:
    """
    # Fetch the latest schema for the topic
    topic_schema, topic_schema_id, schema_version = get_latest_schema_info(
        schema_host, schema_port, topic)
    # Consume the Kafka records
    client = KafkaClient(hosts=kafka_hosts)
    simple_consumer = SimpleConsumer(client,
                                     consumer_group,
                                     topic,
                                     auto_offset_reset="smallest")
    collect_logs = []  # holds the (partition, offset, value) of each message
    msg_exist = True
    while msg_exist:
        msg = simple_consumer.get_message(get_partition_info=True)
        # print "kafka log:", msg
        # Stop consuming once get_message() returns None
        if msg is None:
            msg_exist = False
        else:
            msg_partition = msg[0]
            msg_offset = msg[1].offset
            msg_value = msg[1].message.value
            # Decode a single record
            bytes_msg = io.BytesIO(msg_value[5:])
            decode_msg = avro.io.BinaryDecoder(bytes_msg)
            recode_msg = avro.io.DatumReader(
                avro.schema.parse(topic_schema)).read(decode_msg)
            # Collect the partition, offset and value of this record
            msg_collect = [msg_partition, msg_offset, recode_msg]
            collect_logs.append(msg_collect)
    collect_logs.sort(key=lambda x: x[0])  # sort by partition id
    print "+++++++Topic: %s+++++++" % topic
    for index, log in enumerate(collect_logs):
        print index, log
    print "Successfully received."
    return collect_logs
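The msg_value[5:] slice above skips a 5-byte header before Avro decoding. With a Confluent schema registry that header is normally one magic byte followed by a 4-byte big-endian schema id, so the id can be recovered instead of discarded; a small sketch assuming that wire format:

import struct

def split_schema_registry_frame(raw_bytes):
    # Assumes the Confluent wire format: magic byte (0) + 4-byte big-endian
    # schema id, followed by the Avro-encoded payload.
    magic, schema_id = struct.unpack(">bI", raw_bytes[:5])
    if magic != 0:
        raise ValueError("unexpected magic byte: %r" % magic)
    return schema_id, raw_bytes[5:]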
Example #29
class RVIConsumer(threading.Thread):

    def __init__(self, kafka_addr, topic, vin, web_url):
        threading.Thread.__init__(self)

        self.kafka = KafkaClient(kafka_addr) #kafka_addr
        self.cons = SimpleConsumer(self.kafka, None, topic)
        self.cons.seek(0,2)

        self.vin = vin
        self.web_url = web_url 
        self.flag = True
        self.count = 0
        self.sleep_count = 0
        self.headers = {'Content-Type' : 'application/json'}

    def is_running(self):
        return self.flag
        
    def run(self):
        while self.flag:
            
            #cons = SimpleConsumer(kafka, None, 'rvi')
            m = self.cons.get_message(block=False)
            if (m is not None):
                payload = json.loads(m.message.value)

                if(payload['vin'] == self.vin):
                    self.sleep_count = 0 
                    payloadtoweb = json.dumps(m.message.value)
                    r = requests.post(self.web_url, data=payloadtoweb, headers=self.headers) 
                    if r.status_code == 200:
                        print m.message.value + " sent successfully\n"        
                    else: 
                        print "%s is not available, status code:%d...shutting down now..."%(self.web_url,r.status_code)
                        self.shutdown()       

            else:
                if (self.sleep_count > 100000):
                    print "No new data for %s... Timing out" % self.vin
                    self.shutdown()

                time.sleep(0.2)  # 1/5 of a second; avoids integer division yielding 0
                self.sleep_count = self.sleep_count + 1

    def shutdown(self):
        self.flag = False    
        requests.post(self.web_url, data=json.dumps({'vin':self.vin, 'data':'EOM'}), headers=self.headers) 
        print "%s consumer thread shutting down" % self.vin 
Example #30
def read_kafka():
    """
    read socialSignal, keep if none are zero, save to mongo social/socialSignal
    :return:
    """
    msg_buffer = dict()
    ids = set()
    in_kafka = KafkaClient(settings.IN_SOCIAL_SIGNAL_KAFKA)
    consumer = SimpleConsumer(in_kafka, 'comment.pages1', 'comment.pages', max_buffer_size=20 * 1024 * 1024,
                              fetch_size_bytes=2 * 1024 * 1024, buffer_size=2 * 1024 * 1024)
    consumer.seek(0, 0)

    for msg in consumer:
        if "001WxC6D" in msg.message.value:
            print msg.message.value
Example #31
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output to
    samza-test-topic-output.
    """
    logger.info("Running validate_samza_job")
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, "samza-test-group", TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, "Expected {0} lines, but found {1}".format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, "Expected negative integer but received {0}".format(message)
    kafka.close()
Example #32
def consume_save(group,topic):
	i=0
	tmp_save=open(tmp_file_path,"w")
	while True:
		kafka_consumer=SimpleConsumer(kafka,group,topic)
		messages= kafka_consumer.get_messages(count=1000, block=False)
#		if not messages:
#			print "Consumer didn't read any messages"
		for message in messages:
			tmp_save.write( message.message.value+"\n")
			print message.message.value+"\n"
		# file size > 20MB
		if tmp_save.tell() > 20000000:
			push_to_hdfs(tmp_file_path)
		kafka_consumer.commit() # inform zookeeper of position in the kafka queue
Example #33
File: smoke_tests.py, Project: kharus/samza
def validate_samza_job():
  """
  Validates that negate-number negated all messages, and sent the output to 
  samza-test-topic-output.
  """
  logger.info('Running validate_samza_job')
  kafka = _get_kafka_client()
  kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
  consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
  messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=60)
  message_count = len(messages)
  assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
  for message in map(lambda m: m.message.value, messages):
    assert int(message) < 0 , 'Expected negative integer but received {0}'.format(message)
  kafka.close()
Example #34
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic then returns a function that can
    consume messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka,
                              group,
                              topic,
                              max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
Example #35
class Consumer(Thread):
    def __init__(self, args=()):
        super(Consumer, self).__init__()
        self.host = args[0]
        self.port = args[1]
        self.topic = args[2]
        print '[KafkaConsumer] host: {0}, port: {1}, topic: {2}'.format(self.host, self.port, self.topic)
        self.consumer = None
        self.consumer_keep_run = True
        self.consumer_paused = False
        self.consumer_subscribers = []

    def run(self):
        client = kafka_client(self.host, self.port)
        self.consumer = SimpleConsumer(client, None, self.topic)
        self.consumer.seek(0, 1)

        while self.consumer_keep_run:
            print '[KafkaConsumer] looping..'
            if not self.consumer_paused:
                for message in self.consumer.get_messages(block=False):
                    offset = message.offset
                    value = message.message.value
                    j_encoded = json.dumps({'offset': offset, 'message': value})
                    print '[KafkaConsumer] {}'.format(j_encoded)

                    for subscriber in self.consumer_subscribers:
                        IOLoop.instance().add_callback(partial(subscriber.send_message, j_encoded))
            time.sleep(1)

    def pause_consumer(self, paused):
        self.consumer_paused = paused

    def stop_consumer(self):
        self.consumer_keep_run = False

    def add_subscriber(self, subscriber):
        self.consumer_subscribers.append(subscriber)

    def remove_subscriber(self, subscriber):
        self.consumer_subscribers.remove(subscriber)

    def get_subscribers_length(self):
        length = len(self.consumer_subscribers)
        return length

    def get_subscribers(self):
        return self.consumer_subscribers
Example #36
def serve_user(user):
    consumer = SimpleConsumer(CLIENT, 'testing', 'user{}_sess{}'.format(user,user))
    msg = None
    msg = consumer.get_message()
    RECEIVE_TIME = time.time()
    color='yellow'

    S_R_LAG = RECEIVE_TIME-SEND_TIME if SEND_TIME else None
    
    if msg:
        print("received message: {} delay: {}".format(msg.message.value.decode(), S_R_LAG))
        if msg.message.value.decode() =='True':
            color='green'
        else:
            color='red'
    return render_template('keylog.html', bgcolor=color)
Example #37
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlingStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0


    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError), e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds))
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], links))
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue

                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
Example #38
    def init_get_stream(self, get_message_stream, queue_name_spec, starting_marker, echo_requested, include_claimed):
        self.logger.info("KafkaDriver prepare_to_get_messages got: queue_name=%s, echo_requested=%s, include_claimed=%s, starting_marker=%s" % (queue_name_spec,str(echo_requested),str(include_claimed),starting_marker))
        self.logger.info("warning: KafkaDriver ignores echo_requested and include_claimed in GET requests")
        self.consume_group = "cg1" # default consume group
        if len(starting_marker) > 0:
            self.consume_group = starting_marker
        self.logger.info("consume group="+self.consume_group)

        # if the queue name ends with "/<n>", we interpret <n> as the partition to read from
        if "/" in queue_name_spec:
            queue_name, partition_part = queue_name_spec.split("/", 1)
            partition = int(partition_part)
            self.logger.info("limiting topic %s to partition %d" % (queue_name, partition))
        else:
            queue_name = queue_name_spec
            partition = None

        self.get_message_stream = get_message_stream
        self.queue_name = str(queue_name)
        self.consumer = SimpleConsumer(
            client=self.kafka,
            group=self.consume_group,
            topic=self.queue_name,
            partitions=[partition],
            auto_commit=False,  # it seems we cannot do any kind of commit when using kafka-python 0.9.1 with Kafka versions before 0.8.1, because kafka-python will send an OffsetFetchRequest (request type 9) or OffsetCommitRequest (request type 8), which is not supported
            fetch_size_bytes=self.MAX_KAFKA_REQ_BATCH_MSGS * 4096,  # in Marconi, messages can be up to 4k
            iter_timeout=None,
        )
        self.logger.debug("KafkaDriver: seeking to head of %s" % (self.queue_name))
        self.consumer.seek(0,0) # seek to head of topic; TODO: should get starting position from starting_marker param

        self.periodically_check_for_new_messages()  # kick off periodic retrieval of new messages (space permitting)
Example #39
    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(self.redis_monitor.settings[
                                      'KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.outbound_firehose"
        )
class KafkaConsumer(object):
    def __init__(self, conf):
        self.log = logging.getLogger(__name__)
        self.client = KafkaClient(conf["kafka_server"])
        self.total_inserts = 0
        self.inserts = 0
        self.listenstore = None


    def start_listens(self, listenstore):
        self.listenstore = listenstore
        return self.start(b"listen-group", b"listens")


    def start(self, group_name, topic_name):
        self.group_name = group_name
        self.topic_name = topic_name
        self.log.info("KafkaConsumer subscribed to %s -> %s" % (group_name, topic_name))
        self.consumer = SimpleConsumer(self.client, self.group_name, self.topic_name)

        t0 = 0
        last_offset = -1
        while True:
            listens = []
            if t0 == 0:
                t0 = time()

            messages = self.consumer.get_messages(count=CASSANDRA_BATCH_SIZE, block=True, timeout=KAFKA_READ_TIMEOUT)
            for message in messages:
                try:
                    data = ujson.loads(message.message.value)
                    listens.append(Listen.from_json(data))
                except ValueError as e:
                    self.log.error("Cannot parse JSON: %s\n'%s'" % (str(e), message.message.value))
                    continue

                last_offset = message.offset

            if listens:
                broken = True
                while broken:
                    try:
                        self.listenstore.insert_batch(listens)
                        broken = False
                    except ValueError as e:
                        self.log.error("Cannot insert listens: %s" % unicode(e))
                        broken = False
                    except NoHostAvailable as e:
                        self.log.error("Cannot insert listens: %s. Sleeping, trying again." % unicode(e))
                        sleep(5)


            self.inserts += len(messages)
            if self.inserts >= REPORT_FREQUENCY:
                t1 = time()
                self.total_inserts += self.inserts
                self.log.info("Inserted %d rows in %.1fs (%.2f listens/sec). Total %d rows. last offset: %d" % \
                    (self.inserts, t1 - t0, self.inserts / (t1 - t0), self.total_inserts, last_offset))
                self.inserts = 0
                t0 = 0
Example #41
    def __init__(self, kafka_hostport, topic, group=None, **kwargs):
        if not group:
            group = str(uuid.uuid4())

        self.kafka = get_client(kafka_hostport)
        self.consumer = SimpleConsumer(self.kafka, group, topic,
            max_buffer_size=1048576 * 32, **kwargs)
Example #42
    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume potential results
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.crawled_firehose",
            buffer_size=1024*100,
            fetch_size_bytes=1024*100,
            max_buffer_size=None
        )
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)
def kafka_stream():
    # global visualization_topic 
    # topic = visualization_topic 
    # print "DEBUG stream topic: " + topic
    topic = "web"
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "python", topic)
    consumer.seek(offset=0, whence=2)
    # topic = None

    def gen():
        for message in consumer:
            yield 'data: %s\n\n' %str(message.message.value)

    print "DEBUG: Kafka Stream Connected"
    return Response(gen(), mimetype="text/event-stream")
Example #44
    def test_simple_consumer_unknown_topic_partition(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        # Mock so that only the first request gets a valid response
        def unknown_topic_partition(request):
            return FetchResponsePayload(request.topic, request.partition,
                                 UnknownTopicOrPartitionError.errno, -1, ())

        client.send_fetch_request.side_effect = self.fail_requests_factory(unknown_topic_partition)

        # This should not raise an exception
        with self.assertRaises(UnknownTopicOrPartitionError):
            consumer.get_messages(20)
Example #45
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner, codec=CODEC_SNAPPY)

        self._in_consumer = SimpleConsumer(self._kafka,
                                       settings.get('FRONTIER_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760)
        if not no_scoring:
            self._scoring_consumer = SimpleConsumer(self._kafka,
                                           settings.get('FRONTIER_GROUP'),
                                           settings.get('SCORING_TOPIC'),
                                           buffer_size=262144,
                                           max_buffer_size=1048576)

        self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'), settings.get('FRONTIER_GROUP'))

        self._manager = FrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
def kafka_stream():
    # global visualization_topic
    # topic = visualization_topic
    # print "DEBUG stream topic: " + topic
    topic = "web"
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "python", topic)
    consumer.seek(offset=0, whence=2)

    # topic = None

    def gen():
        for message in consumer:
            yield 'data: %s\n\n' % str(message.message.value)

    print "DEBUG: Kafka Stream Connected"
    return Response(gen(), mimetype="text/event-stream")
Example #47
File: kafka_test.py, Project: razhong/randy
def read_kafka(docid):
    """
    read socialSignal, keep if none are zero, save to mongo social/socialSignal
    :return:
    """
    msg_buffer = dict()
    ids = set()
    in_kafka = KafkaClient(settings.IN_SOCIAL_SIGNAL_KAFKA)
    consumer = SimpleConsumer(in_kafka, 'test0', TOPIC, max_buffer_size=20 * 1024 * 1024,
                              fetch_size_bytes=2 * 1024 * 1024, buffer_size=2 * 1024 * 1024)
    consumer.seek(6000000, 0)

    for msg in consumer:
        if msg.offset % 100000 == 0:
            print 'working on ', msg.offset
        if docid in msg.message.value:
            print msg.message.value
    def __init__(self, topic, kafka_broker, consumer_group):
        self.kafka = KafkaClient(kafka_broker)
        self.consumer = SimpleConsumer(self.kafka,
                                       consumer_group,
                                       topic,
                                       fetch_size_bytes=self.__max_buffer_size,
                                       buffer_size=self.__max_buffer_size,
                                       max_buffer_size=self.__max_buffer_size)
Example #49
def main():
    """
    Usage:
        dump_to_mongodb dump <topic> --host=<host> [--consumer=<consumer>]
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    print "=> Connecting to {0}...".format(host)
    logger.info("=> Connecting to {0}...".format(host))
    kafka = KafkaClient(host)
    print "=> Connected."
    logger.info("=> Connected.")
    if args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "dump_to_mongodb"
        consumer = SimpleConsumer(
            kafka,
            consumer_id,
            topic,
            buffer_size=1024 * 200,  # 200kb
            fetch_size_bytes=1024 * 200,  # 200kb
            max_buffer_size=None  # eliminate big message errors
        )
        consumer.seek(0, 1)
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                logger.info("message.message.value== %s " % val)
                print('val==', val)
                try:
                    item = json.loads(val)
                except:
                    continue
                if 'meta' in item and 'collection_name' in item['meta']:
                    _insert_item_to_monggodb(item)
            except:
                traceback.print_exc()
                break
        kafka.close()
        return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  #get 1000 messages at a time, non blocking
        if not messages:
            os.system("sleep 30s")
            continue
            #break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            print message
        kafka_consumer.commit()  #save position in the kafka queue
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
Example #51
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  #did we log at least one entry?
    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  #get 1000 messages at a time, non blocking
        if not messages:
            break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            #print(message.message.value)
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  #file size > 10MB
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
Example #52
def connection_time():
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        cnt += 1
        if cnt > 0:
            return time.time() - start_time
Example #53
class KafkaSpout(Spout):

	def initialize(self, stormconf, context):
		# self.words = itertools.cycle(['dog', 'cat',
		# 								'zebra', 'elephant'])
		self.kafka = KafkaClient("cloud.soumet.com:9092")
		self.consumer = SimpleConsumer(self.kafka, "storm", "realtime", max_buffer_size=1310720000)
		



	def next_tuple(self):
		for message in self.consumer.get_messages(count=500, block=False):#, timeout=1):
			#transaction_data = TransactionFull()
			#transaction_data.ParseFromString(base64.b64decode(message.message.value))
			#self.emit([transaction_data])
			self.emit([message.message.value])
		self.consumer.commit()
Example #54
class QueueKafka(QueueBase.QueueBase):
    @QueueBase.catch
    def __init__(self, name, host='web14', port=51092, **kwargs):
        QueueBase.QueueBase.__init__(self, name, host, port)
        self.__queue = []
        self.__kafka = KafkaClient('%s:%d' % (host, port))
        self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
        self.__producer.client.ensure_topic_exists(self.name)
        self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer', self.name, auto_commit_every_n=1)

    def __del__(self):
        if self.__kafka:
            [self.put(x.message.value) for x in self.__queue]
            self.__kafka.close()

    @QueueBase.catch
    def put(self, value, *args, **kwargs):
        if isinstance(value, dict) or isinstance(value, list):
            self.__producer.send_messages(self.name, json.dumps(value))
        else:
            self.__producer.send_messages(self.name, value.encode('utf-8') if isinstance(value, unicode) else value)

    @QueueBase.catch
    def get(self, *args, **kwargs):
        if not self.__queue:
            self.__consumer._fetch()
            kq = self.__consumer.queue
            while not kq.empty():
                partition, result = kq.get_nowait()
                self.__queue.append(result)
                self.__consumer.offsets[partition] += 1
                self.__consumer.count_since_commit += 1
            self.__consumer.queue = Queue()
            self.__consumer.commit()
        return self.__queue.pop().message.value if self.__queue else None

    @QueueBase.catch
    def size(self, *args, **kwargs):
        count = 0
        for k, v in self.__consumer.offsets.items():
            reqs = [common.OffsetRequest(self.name, k, -1, 1)]
            (resp, ) = self.__consumer.client.send_offset_request(reqs)
            count += (resp.offsets[0] - v)
        return count + len(self.__queue)
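A short usage sketch for the QueueKafka wrapper above (the queue name, host and payload are made up); dict and list payloads are JSON-encoded by put(), get() returns one raw message value or None, and size() reports the approximate backlog across partitions:

q = QueueKafka('demo_queue', host='web14', port=51092)
q.put({'url': 'http://example.com'})  # stored as a JSON string
print(q.size())                       # approximate number of unread messages
print(q.get())                        # '{"url": "http://example.com"}' or None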