Example #1
 def __init__(self, factory, destination):
     self.factory = factory
     self.destination = destination
     self.consumer = SimpleConsumer(self.factory, "test-group",
                                    self.destination)
     self.rate = PerfRate()
     threading.Thread.__init__(self)
Example #2
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka,
                                       consumer_group,
                                       self.topic,
                                       auto_commit=True,
                                       iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #3
 def __init__(self, host, port, schema_path, topic, nbmsg, consumer_timeout):
     self.topic = topic
     self.nbmsg = nbmsg
     self.sent_msg = 0
     self.host = host
     self.port = port
     self.sent = [-100] * self.nbmsg
     self.rcv = [-100] * self.nbmsg
     self.runtag = str(random.randint(10, 100000))
     try:
         self.broker = KafkaClient("%s:%d" % (self.host, self.port))
     except:
         raise ValueError(
             "KafkaClient (%s:%d) - init failed" % (self.host, self.port))
     try:
         self.producer = SimpleProducer(self.broker)
     except:
         raise ValueError(
             "SimpleProducer (%s:%d) - init failed" % (self.host, self.port))
     try:
         self.consumer = SimpleConsumer(
             self.broker, "testbot", topic, iter_timeout=consumer_timeout)
     except:
         raise ValueError(
             "SimpleConsumer (%s:%d) - init failed" % (self.host, self.port))
     try:
         self.schema = avro.schema.parse(open(schema_path).read())
     except:
         raise ValueError(
             "Prod2Cons load schema (%s) - init failed" % (schema_path))
Example #4
def get_offsets(offsets_after_time_millis,
                conn_params=config.DEFAULT_CONN_PARAMS):

    curr_time = long(time.time() * 1000)

    for host in config.bagheera_nodes:
        for topic in config.topics:
            for partition in config.partitions:
                consumer = SimpleConsumer(host, conn_params['port'],
                                          conn_params['nrecs'],
                                          conn_params['bufsize'])

                offset = long(
                    consumer.getOffsetsBefore(topic, partition,
                                              offsets_after_time_millis, 1)[0])

                consumer.close()

                System.out.println(
                    json.dumps({
                        'time_millis': curr_time,
                        'hostname': host,
                        'topic': topic,
                        'partition': partition,
                        'offset': offset
                    }))
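
This example calls getOffsetsBefore and System.out.println, i.e. the Java SimpleConsumer API (presumably driven from Jython) rather than the Python client used elsewhere on this page. For comparison only, a rough kafka-python sketch that reports the current tail offset of every partition; it does not cover the time-based lookup, and the broker address, topics, and group name are placeholders:

import json
import time
from kafka import KafkaClient, SimpleConsumer

def dump_tail_offsets(broker="localhost:9092", topics=("metrics",)):
    curr_time = int(time.time() * 1000)
    client = KafkaClient(broker)
    for topic in topics:
        consumer = SimpleConsumer(client, "offset-checker", topic)
        consumer.seek(0, 2)  # whence=2: jump to the end of every partition
        for partition, offset in consumer.offsets.items():
            print(json.dumps({'time_millis': curr_time, 'topic': topic,
                              'partition': partition, 'offset': offset}))
        consumer.stop()
    client.close()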
Example #5
def kafka_pull(message_queue):
    global g_conf
    global g_master_logger
    ret = True
    while True:
        try:
            if is_quit():
                g_master_logger.info("thread quit: [%d]" % os.getpid())
                return True

            random_v = random.randint(0, len(g_conf["broker_list"]) - 1)
            broker = g_conf["broker_list"][random_v]
            g_master_logger.info("use broker is [%s]" % broker)
            partition_set = set([0])

            # client
            client = KafkaClient(broker)
            consumer = SimpleConsumer(
                client,
                g_conf["msg_group_name"],
                g_conf["msg_topic_name"],
                partitions=partition_set,
                auto_commit_every_n=g_conf["auto_commit_every_n"],
                auto_commit_every_t=g_conf["auto_commit_every_t"],
                fetch_size_bytes=g_conf["fetch_size_bytes"],
                buffer_size=g_conf["buffer_size"],
                max_buffer_size=g_conf["max_buffer_size"])

            cnt = 0
            for message in consumer:
                cnt += 1
                if cnt % 10000 == 0:
                    g_master_logger.info("msg consumer cnt is [%d] queue:%u" %
                                         (cnt, message_queue.qsize()))
                if is_quit():
                    consumer.stop()
                    g_master_logger.info("thread fetch msg quit: [%d]" %
                                         os.getpid())
                    break

                value = message.message.value
                if value is None:
                    g_master_logger.warning("value is none, msg is [%s]" %
                                            str(message))
                    continue
                if len(value) == 0:
                    g_master_logger.warning("value len is 0, msg is [%s]" %
                                            str(message))
                    continue
                if not check_pkg(value):
                    continue
                message_queue.put(message)

        except Exception as e:
            g_master_logger.error(
                "work error, exception is [%s], traceback is [%s]" %
                (e, traceback.format_exc()))
            time.sleep(5)
            continue
Example #6
 def __init__(self, addr, group, topic):
     self.client = KafkaClient(addr)
     self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
     self.temp_file_path = None
     self.temp_file = None
     self.topic = topic
     self.group = group
     self.block_cnt = 0
Example #7
def dataConsumer(topic, group='default', count=1, dateStr=''):
    kafka_consumer = SimpleConsumer(KafkaClient(MasterPublicIP + ":9092"), \
                                    group, topic, max_buffer_size=MAX_BUFFER_SIZE)
    messages = kafka_consumer.get_messages(count=count)
    dataList = []
    for message in messages:
        dataList.append(message.message.value)
    if len(dataList) > 0:
        flush2HDFS(dataList, dateStr)
Example #8
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    num = 0
    for message in consumer:
        print "redis publish:", num
        num+=1
        try:
            data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        except Exception:
            continue
        # print data_depickled
        # {  
        #    'text':'@_LulaMoore me hamas perra',
        #    'created_at':datetime.datetime(2015, 10, 9, 23, 36, 49),
        #    'source':u'Twitter Web Client',
        #    'lang:':u'es',
        #    'place':{  
        #       'country_code':u'AR',
        #       'coordinates':[  
        #          [  
        #             -68.176283,
        #             -38.984724
        #          ],
        #          [  
        #             -68.176283,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.984724
        #          ]
        #       ]
        #    },
        #    'user':{  
        #       'statuses_count':15067,
        #       'name':u'Dama negra *\uffe6*',
        #       'friends_count':390,
        #       'created_at':datetime.datetime(2014, 3, 15,2,37, 10),
        #       'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg',
        #       'followers_count':384,
        #       'id':2390242428
        #    },
        #    'id':652628813935980544
        # }

        ### process data here ###
        # text = data_depickled['text']
        filtered_data = data_filter(data_depickled)
        data_pickled = pickle.dumps(filtered_data)
        redis.publish('tweets_processed', data_pickled)
Example #9
    def run(self):
        client = KafkaClient(self.bootstrap_server, client_id='commandline')
        consumer = SimpleConsumer(client, self.group, self.topic, auto_commit_every_n=1, buffer_size=160,
                                  auto_commit=True)

        for message in consumer:
            now = datetime.now()
            print("%s: %s" % (now, message))
            consumer.commit()
Example #10
 def __init__(self, conn_pool, topic, group):
     self.conn_pool = conn_pool
     self.topic = topic
     self.group = group
     self.kafka = KafkaClient(self.conn_pool)
     self.consumer = SimpleConsumer(self.kafka,
                                    self.group,
                                    self.topic,
                                    max_buffer_size=None)
     self.consumer.seek(0, 2)  # move to the tail of the queue
Example #11
 def __init__(self, addr, group, topic):
     self.client = KafkaClient(addr)
     self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000,auto_commit=False)
     self.temp_file_path = None
     self.temp_file = None
     self.hadoop_path = "/user/AdReport/%s/history" %(topic)
     self.cached_path = "/user/AdReport/%s/cached" %(topic)
     self.topic = topic
     self.group = group
     self.block_cnt = 0
Example #12
    def run(self):
        client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
        consumer = SimpleConsumer(client, "test-group", "jiketest",auto_commit=False,partitions=self.part)

        consumer.seek(0,0)

        while True:
            message = consumer.get_message(True,60)
            self.__offset = message.offset
            print message.message.value
Example #13
    def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0,2)

        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
Example #14
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    for message in consumer:
        # data_deserialized = str.decode(message.message.value)
        data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        # print str(data_depickled).decode('string_escape')
        print data_depickled
Example #15
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs
        os.remove(self.temp_file_path) # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path, "w")
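
A minimal usage sketch for the class above (broker address and output directory are placeholders; the topic name comes from the docstring):

if __name__ == "__main__":
    consumer = Consumer("localhost:9092", "test-group", "post_geo_activity")
    consumer.consume_topic("/home/ubuntu/kafka_data")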
Example #16
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise


    def next_tuple(self):
        """
        input message:
            dict(
                 id = input['id'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 raw_html =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
Example #17
 def __init__(self, addr, group, topic):
     """Initialize Consumer with kafka broker IP, group, and topic."""
     self.client = KafkaClient(addr)
     self.consumer = SimpleConsumer(self.client,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)
     self.temp_file_path = None
     self.temp_file = None
     self.hadoop_path = "/insight/artsy/geo"
     self.topic = topic
     self.group = group
     self.block_cnt = 0
Example #18
    def spiderIdle(self, spider):
        consumer = SimpleConsumer(self.kafka_conn, "test", "commands")
        for msg in consumer.get_messages():
            print msg.message.value
            if msg.message.value == spider.name + '_stop':
                print 'stop'
                spider.spider_pause()
                #spider.close(spider,'ok')
                #self.scrapy.engine.close_spider(spider, 'closespider_itemcount')

            if msg.message.value == spider.name + '_start':
                #self.scrapy.engine.scraper.open_spider(spider)
                spider.spider_resume()
Example #19
    def spiderIdle(self, spider):
        consumer = SimpleConsumer(self.kafka_conn, "test", "commands")
        for msg in consumer.get_messages():
            print msg.message.value
            if msg.message.value == spider.name + "_stop":
                print "stop"
                spider.spider_pause()
                # spider.close(spider,'ok')
                # self.scrapy.engine.close_spider(spider, 'closespider_itemcount')

            if msg.message.value == spider.name + "_start":
                # self.scrapy.engine.scraper.open_spider(spider)
                spider.spider_resume()
Example #20
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([
                    url, '', '', '', html, timestamp, context['source'],
                    context
                ])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example #21
    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise
Example #22
    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)
Example #23
    def run(self):
        client = KafkaClient(
            "10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
        consumer = SimpleConsumer(client, "test-group", "guantest")

        for message in consumer:
            print(message.message.value)
Example #24
def main():
    kafka = KafkaClient("localhost:9092")
    print("Consumer established connection to kafka")
    consumer = SimpleConsumer(kafka, "my-group", "test")
    for message in consumer:
        # This will wait and print messages as they become available
        print(message)
Example #25
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.group = "kafque"
        self.topic = "{}_{}".format(self.group, topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.consumer = SimpleConsumer(
            self.client, str(self.group), str(self.topic), auto_commit=False)
        self.consumer.provide_partition_info()
        self.consumer.fetch_last_known_offsets()
        self.logger = setup_logger(__name__, level=log_level)

        self.failed_queue = None
        if self.topic != "{}_failed".format(self.group):
            self.failed_queue = FailedQueue(
                hosts=hosts, log_level=logging.ERROR)
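
Because provide_partition_info() is enabled above, get_messages() (and iteration) returns (partition, OffsetAndMessage) pairs rather than bare messages. A possible drain method, with names of my own choosing, could look like:

    def dequeue(self, count=10):
        # Illustrative sketch: yield raw payloads while logging partition/offset.
        for partition, offmsg in self.consumer.get_messages(count=count):
            self.logger.debug("partition=%s offset=%s", partition, offmsg.offset)
            yield offmsg.message.value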
Example #26
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ListeningKafkaSpider,
                       cls).from_crawler(crawler, *args, **kwargs)

        if not hasattr(spider, 'topic') or not spider.topic:
            spider.topic = '%s-starturls' % spider.name

        hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = crawler.settings.get(
            'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        spider.consumer = SimpleConsumer(_kafka,
                                         consumer_group,
                                         spider.topic,
                                         auto_commit=True,
                                         iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped,
                                signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % spider.kafka_topic)

        return spider
Example #27
    def run(self):
        client = KafkaClient("172.17.8.101:9092")
        consumer = SimpleConsumer(client, "test-group", "topic")

        batch_size = 300
        global_counter = 0
        counter = 0
        batch = BatchStatement()

        for message in consumer:
            if counter >= batch_size:
                session.execute(batch)
                batch = BatchStatement()
                counter = 0

            temp = yaml.load(message[1][3])
            #            print temp
            global_counter += 1
            print global_counter
            prepared = session.prepare("""
                    INSERT INTO testkeyspace.meter_data (timestamp, id, P_1, P_2, P_3, Q_1, Q_2, Q_3)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """)
            batch.add(prepared, (temp["timestamp"], uuid.UUID(
                temp["id"]), temp["P_1"], temp["P_2"], temp["P_3"],
                                 temp["Q_1"], temp["Q_2"], temp["Q_3"]))
            counter += 1
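
The INSERT statement above is re-prepared for every message; the Cassandra driver allows preparing it once and binding per row. A sketch of that variant (same session, keyspace, and loop as above):

        prepared = session.prepare(
            "INSERT INTO testkeyspace.meter_data "
            "(timestamp, id, P_1, P_2, P_3, Q_1, Q_2, Q_3) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)")

        for message in consumer:
            temp = yaml.load(message[1][3])
            batch.add(prepared, (temp["timestamp"], uuid.UUID(temp["id"]),
                                 temp["P_1"], temp["P_2"], temp["P_3"],
                                 temp["Q_1"], temp["Q_2"], temp["Q_3"]))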
Example #28
class Consumer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.topic = "steps_data_part4"
        self.consumer_group = 's3_consumer' 
        self.consumer = SimpleConsumer(self.client, self.consumer_group, self.topic)

    def consume_message(self):
        while True:
            timestamp = time.strftime('%Y%m%d%H%M%S')
            temp_file_name = "%s_%s_%s.dat" %(self.topic, self.consumer_group, timestamp)
            temp_file = open("/home/ubuntu/rankMyStep/kafka/"+temp_file_name,"w")
            messages = self.consumer.get_messages(count=1000, block=False)
            for msg in messages:
                print msg.message.value + "\n"
                temp_file.write(msg.message.value + "\n")
            self.save_to_s3(temp_file_name)

    def save_to_s3(self, file_name):
        mybucket = "anurag-raw-data-store"
        aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
        aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
        s3_client = boto3.client('s3')
        s3_client.upload_file("/home/ubuntu/rankMyStep/kafka/"+file_name, 
                              mybucket,"rankmysteps/"+file_name)
        os.remove("/home/ubuntu/rankMyStep/kafka/"+file_name)
Example #29
    def __init__(self, info):
        self.host = info['attributes']['host']
        self.group = info['attributes']['group']
        self.topic = info['attributes']['topic']

        self.client = KafkaClient(self.host)
        self.consumer = SimpleConsumer(self.client, self.group, self.topic)
Example #30
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self,conn_pool,topic,group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
        self.consumer.seek(0,2) # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
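
A usage sketch for this wrapper (the broker list and topic are placeholders):

consumer = KafkaConsumer("kafka01:9092,kafka02:9092", "my-topic", KafkaConsumer.group)
while True:
    print(consumer.next())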
Example #31
def get_message() :
    try :
        kconn = KafkaClient(kafka_producer.hosts , timeout = 10)
        getter = SimpleConsumer(kconn , 'test_group', kafka_producer.topic)
        #getter.seek(0, 0)
        while True:
            try:
                messages = getter.get_messages(200,timeout=3)
                if messages:
                    logging.info('get message from kafka done'+str(decode(messages)))
                import time
                time.sleep(0.1)
            except BaseException as e:
                logging.error(str(e))
    except BaseException as e:
        logging.error(str(e) + 'get message from kafka failed')
Example #32
    def run(self):
        client = KafkaClient("vsu-01:9092")
        consumer = SimpleConsumer(client, "test-group", "my.price")

        for message in consumer:

            print(message)
Example #33
 def __init__(self,conn_pool,topic,group):
     self.conn_pool = conn_pool
     self.topic = topic
     self.group = group
     self.kafka = KafkaClient(self.conn_pool)
     self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
     self.consumer.seek(0,2) # move to the tail of the queue
Example #34
        def run(self):
            client = None
            consumer = None
            try:
                prev = None
                # print("Starting Kafka Client")
                # print("Kafka topic: {}").format(self.topic)
                print get_kafka_hosts()
                client = KafkaClient(hosts=get_kafka_hosts())
                consumer = SimpleConsumer(client=client,
                                          group=self.groupName.encode(
                                              'ascii', 'ignore'),
                                          topic=self.topic,
                                          iter_timeout=5)
                consumer.seek(0, 1)
                print '[Kafka Consumer] START'
                print 'Topic: {}'.format(self.topic)
                print 'Listening incoming message...'
                print '========================================================='
                # print("Listening kafka message...")

                while self.stopCpu is False:
                    for message in consumer.get_messages(count=5, block=False):
                        if self.stopCpu is True:
                            # print("Kafka Consumer Listening Stopped")
                            break

                        if message:
                            offset = message.offset
                            value = message.message.value
                            print 'msg: {0}, offset: {1}'.format(value, offset)

                            if len(value) > 0:
                                # chartdata = []
                                # j_val = json.loads(value)
                                # j_val['offset'] = offset
                                # chartdata.append(j_val)
                                # print("destination => ws"+str(self.pid))
                                # self.parentOj.emit("ws"+str(self.type), chartdata)
                                # self.parentOj.emit(self.topic, value)
                                self.parentOj.emit("ws" + str(self.pid), value)

                print '[Kafka Consumer] STOP'
                print 'Topic: {}'.format(self.topic)
                print 'Stop listening...'
                print '========================================================'
                # print("Listening kafka Stopped")
                consumer.stop()
                client.close()
            except Exception:
                # consumer/client may still be None if setup failed before they were created
                if consumer is not None:
                    consumer.stop()
                if client is not None:
                    client.close()
Example #35
 def listen(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     # print client.topic_partitions()
     consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
     for message in consumer:
         value = message.message.value
         print value
Example #36
 def register_consumer(self, callback, parse_json, topic_group, topic_name):
     consumer = SimpleConsumer(self.client,
                               topic_group,
                               topic_name,
                               max_buffer_size=None)
     consumer_thread = ConsumerThread(consumer, callback, parse_json)
     print "Starting new subscriber for topic " + topic_name + ' with group ' + topic_group
     consumer_thread.start()
Example #37
    def __init__(self, cache):

        threading.Thread.__init__(self)

        self.kafka = KafkaClient(self.kafkaHost)
        self.consumer = SimpleConsumer(self.kafka, "test-group", "collector")

        self.cache = cache
Example #38
 def _hidden_setup():
     try:
         self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
         self.kafka_conn.ensure_topic_exists(
             self.settings['KAFKA_INCOMING_TOPIC'])
         self.consumer = SimpleConsumer(
             self.kafka_conn,
             self.settings['KAFKA_GROUP'],
             self.settings['KAFKA_INCOMING_TOPIC'],
             auto_commit=True,
             iter_timeout=1.0)
     except KafkaUnavailableError as ex:
         message = "An exception '{0}' occured. Arguments:\n{1!r}" \
             .format(type(ex).__name__, ex.args)
         self.logger.error(message)
         sys.exit(1)
     return True
Example #39
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic,
                                                  timestamp)
        print "Block " + str(
            self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #40
    def run(self):
        #client = KafkaClient("localhost:9092")
        client = KafkaClient("kafka_host:9092")
#        consumer = SimpleConsumer(client, "test-group", "my-topic")
        consumer = SimpleConsumer(client, "python-group", "test")


        for message in consumer:
            print(message)
Example #41
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example #42
 def __init__(self, addr, group, topic):
     self.client = KafkaClient(addr)
     self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
     self.temp_file_path = None
     self.temp_file = None
     self.hadoop_path = "/user/AdReport/%s/history" %(topic)
     self.cached_path = "/user/AdReport/%s/cached" % (topic)
     self.topic = topic
     self.group = group
     self.block_cnt = 0
Example #43
class CrawlerSpout(Spout):

    group = 'datawake-crawler-in-consumer'.encode()


    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise

    def next_tuple(self):
        """
        input message:
             json.dumps(dict(
                    id = 'abcdefg', #TODO generate UUID,
                    appid = self.appid,
                    url = url,
                    priority = 50,
                    depth = 0,
                    attrs  = dict(
                        userId = context['userId'],
                        org =  context['org'],
                        domain = context['domain']
                    )
                ))
        :return:
        """
        try:
            for message in self.consumer:
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(),level='error')
Example #44
 def __init__(self, addr, group, topic):
     """Initialize Consumer with kafka broker IP, group, and topic."""
     self.client = KafkaClient(addr)
     self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
     self.temp_file_path = None
     self.temp_file = None
     self.hadoop_path = "/user/parking_data/history"
     self.topic = topic
     self.group = group
     self.block_cnt = 0
Example #45
 def initialize(self, stormconf, context):
     try:
         settings = all_settings.get_settings(stormconf['topology.deployment'])
         self.topic = settings['crawler-out-topic'].encode()
         self.conn_pool = settings['conn_pool'].encode()
         self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
         self.kafka = KafkaClient(self.conn_pool)
         self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
         self.consumer.seek(0, 2)  # move to the tail of the queue
     except:
         self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
         self.log(traceback.format_exc(), level='error')
         raise
Example #46
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #47
    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)
Example #48
 def _hidden_setup():
     try:
         self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
         self.kafka_conn.ensure_topic_exists(
                 self.settings['KAFKA_INCOMING_TOPIC'])
         self.consumer = SimpleConsumer(self.kafka_conn,
                                        self.settings['KAFKA_GROUP'],
                                        self.settings['KAFKA_INCOMING_TOPIC'],
                                        auto_commit=True,
                                        iter_timeout=1.0)
     except KafkaUnavailableError as ex:
         message = "An exception '{0}' occured. Arguments:\n{1!r}" \
             .format(type(ex).__name__, ex.args)
         self.logger.error(message)
         sys.exit(1)
     return True
Example #49
 def _init(self, topics):
     ret = False
     while not ret:
         ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' + self.consumer_group)
         if not ret:
             sleep(1)
     ret = False
     while not ret:
         ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' + self.consumer_group +
                 '/ids')
         if not ret:
             sleep(1)
    
     self.register()
     self.get_consumer_list()
     self.populate_broker_info()
     temptopics = [x.strip() for x in topics]
     self.topics = []
     for t in temptopics:
         if t != '' and t not in self.topics:
             self.topics.append(t)
     if not self.topics:
         raise ValueError('no topics passed')
     ret = False
     broker_ports = [] 
     with self.lock:
         for brid in self.broker_details:
             broker_port = self.broker_details[brid]
             broker_ports.append('{}:{}'.format(broker_port['host'],broker_port['port']))
     
     self.kafka_client = nsclient(broker_ports)
     self.topic_part_ids = {} 
     for topic in topics:
         pids = self.kafka_client.get_partition_ids_for_topic(topic)
         self.topic_part_ids[topic] = pids
     self.consumed = {} 
     self.rebalance_consumers()
     
     try:
         topic_partitions = {t : None for t in self.topics}
         self.kconsumer = SimpleConsumer(self.kafka_client, self.consumer_group, None,
                 topic_partitions=self.consumed.copy())
     except Exception as e:
         logging.exception(e)
         sys.exit(1)
Example #50
    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)
Example #51
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #52
class PerfConsumerSync ( threading.Thread ):

    running = True

    def __init__(self, factory, destination):
        self.factory = factory
        self.destination = destination
        self.consumer = SimpleConsumer(self.factory, "test-group", self.destination)
        self.rate = PerfRate()
        threading.Thread.__init__ ( self )

    def run (self):
        while (self.running):
            textMessage = self.consumer.get_messages(block=True, timeout=1000000)
            if (textMessage != None):
                self.rate.increment()

    def stop(self):
        self.running = False

    def start(self):
        threading.Thread.start(self)
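
Usage sketch only (PerfRate comes from the original project; the broker and topic names are placeholders): start the thread, let it consume for a while, then stop it.

import time

client = KafkaClient("localhost:9092")
worker = PerfConsumerSync(client, "perf-test")
worker.start()
time.sleep(60)   # measure for a minute
worker.stop()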
Example #53
class KafkaMonitor:
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Example #54
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
            self,
            zk_hosts,
            group,
            topic,
            nodes,
            zk_handler=None,
            logger=None,
            identifier=None,
            **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group())
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            # self.consumer is known to be non-None here; only compare partition sets
            if sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
            self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)
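        # A hedged usage sketch (the variable name is hypothetical); each call
        # simply delegates to the wrapped SimpleConsumer.seek():
        #
        #     zk_consumer.seek(0, 0)    # rewind to the earliest available offset
        #     zk_consumer.seek(0, 2)    # jump to the tail and skip the backlog
        #     zk_consumer.seek(-5, 2)   # step back five messages from the tail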

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages are fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
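A hedged consumption-loop sketch for the class above (ZooKeeper hosts, group and topic names are placeholders; iter_timeout is forwarded to the wrapped SimpleConsumer via **consumer_kwargs):

consumer = ZKConsumer(
    zk_hosts='zk1:2181,zk2:2181,zk3:2181',
    group='my-group',
    topic='my-topic',
    nodes=[],            # empty => dynamic rebalancing via ZKPartitioner
    iter_timeout=1.0,    # iteration stops if no message arrives within 1s
)
try:
    # init_consumer() calls provide_partition_info(), so each yielded item
    # should be a (partition, OffsetAndMessage) pair
    for partition, offset_and_message in consumer:
        print partition, offset_and_message.message.value
finally:
    consumer.commit()
    consumer.stop()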
Example #55
0
    def check(self, instance):
        consumer_groups = self.read_config(instance, 'consumer_groups',
                                           cast=self._validate_consumer_groups)
        kafka_host_ports = self.read_config(instance, 'kafka_connect_str')
        full_output = self.read_config(instance, 'full_output', cast=bool)
        dimensions = self.read_config(instance, 'dimensions', cast=dict, optional=True)
        new_dimensions = {'component': 'kafka', 'service': 'kafka'}
        if dimensions is not None:
            new_dimensions.update(dimensions.copy())

        try:
            # Connect to Kafka
            kafka_conn = KafkaClient(kafka_host_ports)

            # Query Kafka for consumer offsets
            consumer_offsets = {}
            topics = defaultdict(set)
            for consumer_group, topic_partitions in consumer_groups.iteritems():
                for topic, partitions in topic_partitions.iteritems():
                    consumer = SimpleConsumer(kafka_conn, consumer_group, topic)
                    # Remember the topic partitions that we've seen so that we can
                    # look up their broker offsets later
                    topics[topic].update(set(partitions))
                    for partition in partitions:
                        consumer_offsets[(consumer_group, topic, partition)] = consumer.offsets[partition]
                    consumer.stop()

            # Query Kafka for the broker offsets, done in a separate loop so only one query is done
            # per topic even if multiple consumer groups watch the same topic
            broker_offsets = {}
            for topic, partitions in topics.items():
                offset_responses = kafka_conn.send_offset_request([
                    OffsetRequest(topic, p, -1, 1) for p in partitions])

                for resp in offset_responses:
                    broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
        finally:
            try:
                kafka_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Kafka connection')

        # Report the broker data
        if full_output:
            for (topic, partition), broker_offset in broker_offsets.items():
                # dict.update() returns None, so the dimensions must be built
                # up front rather than passed as an in-line update() expression
                broker_dimensions = new_dimensions.copy()
                broker_dimensions.update({'topic': topic, 'partition': partition})
                self.gauge('kafka.broker_offset',
                           broker_offset,
                           dimensions=broker_dimensions)

        # Report the consumer data
        for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():
            # Get the broker offset
            broker_offset = broker_offsets.get((topic, partition))
            # Report the consumer offset and lag
            consumer_dimensions = new_dimensions.copy()
            consumer_dimensions['topic'] = topic
            consumer_dimensions['partition'] = partition
            consumer_dimensions['consumer_group'] = consumer_group
            # consumer_dimensions already contains topic, partition and
            # consumer_group; passing an in-line dict.update() expression would
            # send dimensions=None because update() returns None
            if full_output:
                self.gauge('kafka.consumer_offset',
                           consumer_offset,
                           dimensions=consumer_dimensions)
            self.gauge('kafka.consumer_lag',
                       broker_offset - consumer_offset,
                       dimensions=consumer_dimensions)
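A hedged sketch of the instance configuration this check consumes (the connection string, group, topic and partition numbers are placeholders; the keys mirror the read_config calls above):

instance = {
    'kafka_connect_str': 'localhost:9092',
    'full_output': True,
    'dimensions': {'env': 'dev'},       # optional extra dimensions
    'consumer_groups': {
        'my-group': {
            'my-topic': [0, 1, 2],      # partitions to inspect
        },
    },
}
# For every (group, topic, partition) the check emits
#   kafka.consumer_lag = broker_offset - consumer_offset
# plus kafka.broker_offset / kafka.consumer_offset when full_output is set.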
Example #56
0
class KafkaSpiderMixin(object):
    """
    Mixin class to implement reading urls from a kafka queue.
    :type kafka_topic: str
    """
    kafka_topic = None

    def process_kafka_message(self, message):
        """"
        Tell this spider how to extract urls from a kafka message
        :param message: A Kafka message object
        :type message: kafka.common.OffsetAndMessage
        :rtype: str or None
        """
        if not message:
            return None

        return message.message.value

    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)

    def next_request(self):
        """
        Returns a request to be scheduled.
        :rtype: str or None
        """
        message = self.consumer.get_message(True)
        url = self.process_kafka_message(message)
        if not url:
            return None
        return self.make_requests_from_url(url)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to  idle before scheduling the next request"""
        self.schedule_next_request()
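A minimal spider sketch built on this mixin (a hedged example; the import path assumes Scrapy >= 1.0 and the spider name is a placeholder):

from scrapy import Spider


class DemoSpider(KafkaSpiderMixin, Spider):
    name = 'demo'   # default topic becomes 'demo-starturls'

    def parse(self, response):
        self.log("Crawled %s" % response.url)

# setup_kafka() must run once the spider has its crawler object, e.g. from
# a from_crawler() classmethod or a spider_opened handler:
#     spider.setup_kafka(crawler.settings)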