import os
import time

import boto3
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer


class Consumer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.topic = "steps_data_part4"
        self.consumer_group = 's3_consumer'
        self.consumer = SimpleConsumer(self.client, self.consumer_group, self.topic)

    def consume_message(self):
        while True:
            timestamp = time.strftime('%Y%m%d%H%M%S')
            temp_file_name = "%s_%s_%s.dat" % (self.topic, self.consumer_group, timestamp)
            temp_file = open("/home/ubuntu/rankMyStep/kafka/" + temp_file_name, "w")
            # get up to 1000 messages at a time, non-blocking
            messages = self.consumer.get_messages(count=1000, block=False)
            for msg in messages:
                print msg.message.value + "\n"
                temp_file.write(msg.message.value + "\n")
            temp_file.close()  # flush to disk before uploading
            self.save_to_s3(temp_file_name)

    def save_to_s3(self, file_name):
        mybucket = "anurag-raw-data-store"
        aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
        aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
        s3_client = boto3.client('s3')
        s3_client.upload_file("/home/ubuntu/rankMyStep/kafka/" + file_name,
                              mybucket, "rankmysteps/" + file_name)
        os.remove("/home/ubuntu/rankMyStep/kafka/" + file_name)
class Consumer(object): def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/insight/artsy/geo" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "post_geo_activity" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git """ timestamp = time.strftime('%Y%m%d%H%M%S') # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path,"w") while True: try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS.""" self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp) print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath) self.block_cnt += 1 os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs os.remove(self.temp_file_path) # remove temp local file timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = ('experiment_id,job_id,results_file,package_id,package_name,'
                  'worker_id,config_id,replicate_no,setup_time,run_time,collect_time,'
                  'hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,'
                  'hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,'
                  'sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,'
                  'sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time')
        # terminate the CSV header so the first record starts on its own line
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                # move to the tail of the topic if the stored offset is unknown
                self.consumer.seek(0, 2)
                self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic +
                     ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                id = input['id'],
                appid = input['appid'],
                url = url,
                status_code = response.getcode(),
                status_msg = 'Success',
                timestamp = response.info()['date'],
                links_found = links,
                raw_html = html,
                attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'],
                   crawled['timestamp'], context['source'], context])
def spiderIdle(self, spider):
    consumer = SimpleConsumer(self.kafka_conn, "test", "commands")
    for msg in consumer.get_messages():
        print msg.message.value
        if msg.message.value == spider.name + "_stop":
            print "stop"
            spider.spider_pause()
            # spider.close(spider,'ok')
            # self.scrapy.engine.close_spider(spider, 'closespider_itemcount')
        if msg.message.value == spider.name + "_start":
            # self.scrapy.engine.scraper.open_spider(spider)
            spider.spider_resume()
class KafkaConsumer:
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                       max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
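# Hedged usage sketch (not from the original source): one way the thin
# KafkaConsumer wrapper above might be driven. The broker address, topic,
# and group below are illustrative placeholders only.
if __name__ == '__main__':
    consumer = KafkaConsumer('localhost:9092',          # placeholder broker
                             'crawled-pages',            # placeholder topic
                             'python-lookahead-consumer')
    while True:
        # next() blocks until a message arrives, because the underlying
        # SimpleConsumer.get_messages() call uses timeout=None
        print consumer.next()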
def get_message():
    try:
        kconn = KafkaClient(kafka_producer.hosts, timeout=10)
        getter = SimpleConsumer(kconn, 'test_group', kafka_producer.topic)
        # getter.seek(0, 0)
        import time
        while True:
            try:
                messages = getter.get_messages(200, timeout=3)
                if messages:
                    logging.info('get message from kafka done' + str(decode(messages)))
                time.sleep(0.1)
            except BaseException as e:
                logging.error(str(e))
    except BaseException as e:
        logging.error(str(e) + 'get message from kafka failed')
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic +
                     ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp, org, domain, user_id, url, html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        message = message.decode('utf-8')
        message = message.split('\0')
        (timestamp, org, domain, userId, url, html) = message
        context = {
            'source': 'datawake-visited',
            'userId': userId,
            'org': org,
            'domain': domain,
            'url': url
        }
        self.emit([url, '', '', '', html, timestamp, context['source'], context])
class PerfConsumerSync(threading.Thread):
    running = True

    def __init__(self, factory, destination):
        self.factory = factory
        self.destination = destination
        self.consumer = SimpleConsumer(self.factory, "test-group", self.destination)
        self.rate = PerfRate()
        threading.Thread.__init__(self)

    def run(self):
        while self.running:
            # get_messages() returns a (possibly empty) list, so test for content
            textMessage = self.consumer.get_messages(block=True, timeout=1000000)
            if textMessage:
                self.rate.increment()

    def stop(self):
        self.running = False

    def start(self):
        threading.Thread.start(self)
class KafkaMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d + 1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None print("self.settings['PLUGIN_DIR'] + instance.schema====", self.settings['PLUGIN_DIR'] + instance.schema) with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} mini['instance'] = instance mini['schema'] = the_schema self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict( sorted(self.plugins_dict.items(), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, 
key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False) def _hidden_setup(): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists( self.settings['KAFKA_INCOMING_TOPIC']) self.consumer = SimpleConsumer( self.kafka_conn, self.settings['KAFKA_GROUP'], self.settings['KAFKA_INCOMING_TOPIC'], auto_commit=True, iter_timeout=1.0) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) self.logger.error(message) sys.exit(1) return True ret_val = _hidden_setup() if ret_val: self.logger.debug("Successfully connected to Kafka") else: self.logger.error("Failed to set up Kafka Connection within" " timeout") # this is essential to running the kafka monitor sys.exit(1) def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in properties.iteritems(): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' self.logger.debug("Processing messages") old_time = 0 while True: self._process_messages() if self.settings['STATS_DUMP'] != 0: new_time = int(time.time() / self.settings['STATS_DUMP']) # only log every X seconds if new_time != old_time: self._dump_stats() old_time = new_time time.sleep(.01) def _process_messages(self): try: for message in self.consumer.get_messages(): if message is None: self.logger.debug("no message") break try: self._increment_total_stat(message.message.value) the_dict = json.loads(message.message.value) print('the_dict', the_dict) found_plugin = False print('self.plugins_dict', self.plugins_dict) for key in self.plugins_dict: obj = self.plugins_dict[key] instance = obj['instance'] print('instance==', instance) schema = obj['schema'] print( 'schema********************************************', schema) try: print('before v = self.validator(schema)') v = self.validator(schema) print('after v = self.validator(schema)') print('the_dict-------', the_dict) v.validate(the_dict) found_plugin = True print('found_plugin====', found_plugin) self._increment_plugin_stat( instance.__class__.__name__, the_dict) print('instance.handle(the_dict)', the_dict) ret = instance.handle(the_dict) # break if nothing is returned if ret is None: break except ValidationError: print(' except ValidationError:======') pass if not found_plugin: extras = {} extras['parsed'] = True extras['valid'] = False extras['data'] = the_dict self.logger.warn( "Did not find schema to validate " "request", extra=extras) self._increment_fail_stat(the_dict) except ValueError: extras = {} extras['parsed'] = False extras['valid'] = False extras['data'] = message.message.value self.logger.warning('Unparseable JSON Received', extra=extras) self._increment_fail_stat(message.message.value) except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek(0, 2) self.logger.error("Kafka offset out of range error") def _increment_total_stat(self, string): ''' Increments the total stat counters @param string: the loaded message object for the counter ''' string = string + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(string) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the loaded message object for HLL counter ''' if isinstance(item, dict): item['ts'] = time.time() elif isinstance(item, str): 
item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param dict: the loaded message object for HLL counter ''' item['ts'] = time.time() if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][ key].value() if not self.logger.json: self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Kafka Monitor Stats Dump', extra=extras) def run(self): ''' Set up and run ''' self._setup_kafka() self._load_plugins() self._setup_stats() self._main_loop() def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param json_item: The loaded json object ''' @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) topic = self.settings['KAFKA_INCOMING_TOPIC'] producer = SimpleProducer(self.kafka_conn) except KafkaUnavailableError: self.logger.error("Unable to connect to Kafka") return False if not self.logger.json: self.logger.info('Feeding JSON into {0}\n{1}'.format( topic, json.dumps(json_item, indent=4))) else: self.logger.info('Feeding JSON into {0}\n'.format(topic), extra={'value': json_item}) self.kafka_conn.ensure_topic_exists(topic) producer.send_messages(topic, json.dumps(json_item)) return True result = _feed(json_item) if result: self.logger.info("Successfully fed item to Kafka") else: self.logger.error("Failed to feed item into Kafka")
class KafkaMonitor:

    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request
        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

        print 'Added crawl to Redis'

    def handle_action_request(self, dict):
        '''
        Processes a valid action request
        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(action=dict['action'],
                                                   spiderid=dict['spiderid'],
                                                   appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

        print 'Added action to Redis'

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"
                    except ValueError:
                        print "bad json received"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)

        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic
        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
class ZKConsumer(object): zk_timeout = 30 jitter_seconds = 30 broker_prefix = '/brokers/ids' def __init__(self, zk_hosts, group, topic, nodes, zk_handler=None, logger=None, identifier=None, **consumer_kwargs): """Creates a Consumer that tracks state in ZooKeeper, rebalancing partition ownership as registered consumers change. NOTE: this class is intended for version 0.8.1 of Kafka, where offsets are managed by Kafka but there is no rebalancing in the protocol. """ if logger is None: logger = logging.getLogger('kafka.consumer.ZKConsumer') self.logger = logger self.identifier = identifier if KafkaClient is None: raise RuntimeError( "Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']" ) self.zk_handler = zk_handler self.zk_hosts = zk_hosts self.broker_hosts = [] self.group = group self.topic = topic self.zk = None self.nodes = nodes self.client = None self.consumer = None self.consumer_kwargs = consumer_kwargs # This will kick off a cascading sequence to initialize ourselves: # 1. Connect to ZK and pull list of Kafka brokers # 2. Register ourselves as a consumer in ZK # 3. Rebalance partitions across all connected consumers self.init_zk() def zk_session_watch(self, state): self.logger.debug('ZK transitioned to: %s', state) if state == KazooState.SUSPENDED: if self.consumer is not None: self.logger.info('Stopping Kafka consumer') self.consumer.stop() self.consumer = None # Lost connection to ZK; we can't call any methods that would # try to contact it (i.e., we can't do self.zkp.finish() ) self.zkp = None elif state == KazooState.CONNECTED: self.logger.info('Restarting ZK partitioner') self.zk.handler.spawn(self.init_zkp) def _zkp_wait(self): handler = self.zk.handler while 1: if self.zkp.failed: self.logger.warning("Lost or unable to acquire partition") self.stop() elif self.zkp.release: self.zkp.release_set() elif self.zkp.acquired: def group_change_proxy(event): self.logger.warn('Connected consumers changed') if self.zkp is None: self.logger.info('Restarting ZK partitioner') handler.spawn(self.init_zkp) elif self.zkp is not None and self.zkp.failed: self.logger.warning( "Lost or unable to acquire partition") self.stop() else: self.logger.info( 'Scheduling ZK partitioner set release') rel_greenlet = handler.spawn(self.zkp.release_set) self.logger.info('Scheduling group re-join') rel_greenlet.link_value( lambda greenlet: self.zkp.join_group) if not self.nodes: self.logger.info( 'Partitioner aquired; setting child watch') result = self.zk.get_children_async(self.zkp._group_path) result.rawlink(group_change_proxy) # Break out of while loop to begin consuming events break elif self.zkp.allocating: self.zkp.wait_for_acquire() def init_zkp(self): if not hasattr(self, 'zkp') or self.zkp is None: if self.nodes: self.zkp = StaticZKPartitioner( self.zk, self.group, self.topic, self.nodes, partitions_changed_cb=self.init_consumer, logger=self.logger, identifier=self.identifier) else: self.zkp = ZKPartitioner( self.zk, self.group, self.topic, time_boundary=self.jitter_seconds, partitions_changed_cb=self.init_consumer, logger=self.logger, identifier=self.identifier) self._zkp_wait() def init_zk(self): # TODO: switch to async # 1. 
implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler) self.zk.start() self.zk.add_listener(self.zk_session_watch) @self.zk.ChildrenWatch(self.broker_prefix) def broker_change_proxy(broker_ids): self.onBrokerChange(broker_ids) self.init_zkp() def onBrokerChange(self, broker_ids): self.broker_hosts = [] for b_id in broker_ids: b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id])) b_data = json.loads(b_json) self.broker_hosts.append('{}:{}'.format(b_data['host'], b_data['port'])) my_partitions = [] if self.consumer is not None: self.logger.warn('Brokers changed, stopping Kafka consumer.') my_partitions = self.consumer.offsets.keys() self.consumer.stop() self.consumer = None if self.client is not None: self.logger.warn('Brokers changed, stopping Kafka client.') self.client.close() self.client = None if my_partitions: msg = 'Brokers changed, queuing restart of Kafka client / consumer.' self.logger.warn(msg) self.zk.handler.spawn(self.init_consumer, my_partitions) def init_consumer(self, my_partitions): if self.consumer is None: self.logger.warn('Starting Kafka client') self.client = KafkaClient(self.broker_hosts, client_id=self.zkp._identifier) else: if self.consumer is None or \ sorted(my_partitions) != sorted(self.consumer.offsets.keys()): self.logger.warn( 'Partitions changed, restarting Kafka consumer.') self.consumer.stop() else: self.logger.info( 'Partitions unchanged, not restarting Kafka consumer.') return self.consumer = SimpleConsumer(self.client, self.group, self.topic, partitions=my_partitions, **self.consumer_kwargs) self.consumer.provide_partition_info() self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets) def stop(self): if self.consumer is not None: self.logger.info('Stopping Kafka consumer') self.consumer.stop() self.consumer = None if self.client is not None: self.logger.info('Stopping Kafka client') self.client.close() self.client = None if self.zk is not None: self.logger.info('Stopping ZooKeeper client') if self.zkp is not None and not self.zkp.failed: self.zkp.finish() self.zk.stop() self.zkp = None self.zk = None def commit(self, partitions=None): """ Commit offsets for this consumer partitions: list of partitions to commit, default is to commit all of them """ if self.consumer is None: return self.logger.debug('Begin committing offsets for partitions: %s', partitions if partitions else 'All') self.consumer.commit(partitions) self.logger.debug('End committing offsets for partitions: %s', partitions if partitions else 'All') def pending(self, partitions=None): """ Gets the pending message count partitions: list of partitions to check for, default is to check all """ return self.consumer.pending(partitions) def provide_partition_info(self): """ Indicates that partition info must be returned by the consumer """ self.consumer.provide_partition_info() def seek(self, offset, whence): """ Alter the current offset in the consumer, similar to fseek offset: how much to modify the offset whence: where to modify it from 0 is relative to the earliest available offset (head) 1 is relative to the current offset 2 is relative to the latest known offset (tail) """ self.consumer.seek(offset, whence) def get_messages(self, count=1, block=True, timeout=0.1): """ Fetch the specified number of messages count: Indicates the maximum number of messages to be fetched block: If True, the API will block till some messages are fetched. 
timeout: If block is True, the function will block for the specified time (in seconds) until count messages is fetched. If None, it will block forever. """ if self.consumer is None: return [] else: try: messages = self.consumer.get_messages(count, block, timeout) if not messages and self.zkp.failed: raise FailedPayloadsError return messages except FailedPayloadsError as err: msg = 'Failed to retrieve payload, restarting consumer' self.logger.exception(msg) raise err def get_message(self, block=True, timeout=0.1, get_partition_info=None): return self.consumer.get_message(block, timeout, get_partition_info) def _get_message(self, block=True, timeout=0.1, get_partition_info=None, update_offset=True): return self.consumer._get_message(block, timeout, get_partition_info, update_offset) def __iter__(self): for msg in self.consumer: yield msg
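# Hedged usage sketch (assumption, not part of the original source): polling
# the ZKConsumer defined above. The ZooKeeper connection string, group, topic,
# and node list are placeholder values.
consumer = ZKConsumer(zk_hosts='zk1:2181,zk2:2181,zk3:2181',
                      group='example-group',
                      topic='example-topic',
                      nodes=None)
try:
    while True:
        # returns an empty list until the partitioner has acquired partitions
        # and the underlying SimpleConsumer has been created
        for msg in consumer.get_messages(count=100, block=True, timeout=1.0):
            print msg
        consumer.commit()
finally:
    consumer.stop()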
class Consumer(object): """Kafka consumer class with functions to consume messages to HDFS. Messages are blocked into 20MB files and transferred to HDFS Attributes: client: string representing IP:port of the kafka broker consumer: Consumer object specifying the client group, and topic temp_file_path: location of the 20MB file to be appended to before transfer to HDFS temp_file: File object opened from temp_file_path topic: String representing the topic on Kafka group: String representing the Kafka consumer group to be associated with block_cnt: integer representing the block count for print statements """ def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/parking_data/history" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "messages" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ timestamp = time.strftime("%Y%m%d%H%M%S") # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w") # while True: for ii in range(0, 2): try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) # OffsetAndMessage(offset=43, message=Message(magic=0, # attributes=0, key=None, value='some message')) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS. Code template from https://github.com/ajmssc/bitcoin-inspector.git Flushes the file into HDFS folders Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ self.temp_file.close() timestamp = time.strftime("%Y%m%d%H%M%S") hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp) print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath) self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, # cached_fullpath)) os.remove(self.temp_file_path) timestamp = time.strftime("%Y%m%d%H%M%S") self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
# kafka broker host(s) passed on the command line
kafka_hosts = arguments.hosts
# getting the path to the file where we will dump the content of topic
path_to_sink = arguments.path
# getting the required number of messages to fetch from kafka
message_count = arguments.count

# instantiating a Kafka client
kafka_client = SimpleClient(hosts=kafka_hosts)

# instantiating a Kafka consumer
kafka_consumer = SimpleConsumer(client=kafka_client,
                                topic=topic_to_collect_from,
                                group='simple_consumer_group')

# fetching the messages
messages_data = kafka_consumer.get_messages(count=message_count)

# creating a dictionary
message_dictionary = {}

# running through the messages
for index, message in enumerate(messages_data):
    message_dictionary[index] = {
        'value': message.message.value.decode('utf-8'),
        'offset': message.offset,
    }

# dumping messages
with open(path_to_sink, 'w') as sink_file:
    json.dump(message_dictionary, sink_file)

# printing the content of the json file in the terminal
pprint.pprint(message_dictionary)
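# Hedged sketch (assumption): the script above expects an `arguments` namespace
# and a `topic_to_collect_from` name that are not shown here. A minimal
# argparse front-end compatible with those references might look like this;
# the flag names are illustrative, not taken from the original source.
import argparse

parser = argparse.ArgumentParser(description='Dump a Kafka topic to a JSON file')
parser.add_argument('--hosts', required=True, help='Kafka broker host:port list')
parser.add_argument('--topic', required=True, help='topic to read from')
parser.add_argument('--path', required=True, help='output JSON file path')
parser.add_argument('--count', type=int, default=100, help='number of messages to fetch')
arguments = parser.parse_args()
topic_to_collect_from = arguments.topic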
class KafkaMonitor:

    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request
        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request
        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"
                    except ValueError:
                        print "bad json received"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)

        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic
        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    one_entry = True
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

            if one_entry:
                self.save_to_hdfs(output_dir)
                self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/user/solivero/playerpoints/history/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        cached_path = "/user/solivero/playerpoints/cached/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, cached_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/fantasyfootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic,
                                                         self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print (self.temp_file)

        #one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)
                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print (message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

            #if one_entry:
            #    print ("sending to hdfs")
            #    self.save_to_hdfs(output_dir, self.topic)
            #    self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print ("Saving file to hdfs")
        self.temp_file.close()
        print ("Closed open file")

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic,
                                                         self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print(self.temp_file)

        #one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)
                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print(message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

            #if one_entry:
            #    print ("sending to hdfs")
            #    self.save_to_hdfs(output_dir, self.topic)
            #    self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print("Saving file to hdfs")
        self.temp_file.close()
        print("Closed open file")

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
# stdlib
from collections import defaultdict

# 3p
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer

kafka_conn = KafkaClient("kafka:9092")
consumer = SimpleConsumer(kafka_conn, "sample_check", "test-topic",
                          auto_commit=True)
for message in consumer.get_messages(count=10):
    print message.offset
consumer.commit()
class KafkaHelper(object):

    def __init__(self):
        self.client = None
        self.producer = None
        self.consumer = None
        self.consumer_fetch_timeout = None
        self.consumer_fetch_size = None

    def __enter__(self):
        self.get_client()
        return self

    def __exit__(self, exctype, excvalue, traceback):
        self.close_client()

    @retry(BrokerResponseError, tries=5, delay=3, backoff=2)
    def get_client(self):
        if not self.client:
            self.client = KafkaClient(settings.KAFKA['host'])
        return self.client

    def get_producer(self):
        """ :return: SimpleProducer """
        if not self.producer:
            self.get_client()
            self.producer = SimpleProducer(self.client)
        return self.producer

    def get_multiprocess_consumer(
            self, consumer_group, topic,
            fetch_size=settings.KAFKA['message_fetch_batch'],
            fetch_timeout=settings.KAFKA['message_fetch_timeout'],
            auto_commit_every_n=settings.KAFKA['auto_commit_msg_count'],
            **kw):
        """
        Return MultiProcessConsumer which consumes partitions for a topic in
        parallel using multiple processes

        Arguments:
            consumer_group: a name for this consumer, used for offset storage
                and must be unique
            topic: the topic to consume

        Keyword Arguments:
            fetch_size: Indicates the maximum number of messages to be fetched
            fetch_timeout: The function will block for the specified time
                (in seconds) until count messages is fetched
            auto_commit_every_n: How many messages to consume before a commit
        """
        if not self.consumer:
            self.consumer_fetch_size = fetch_size
            self.consumer_fetch_timeout = fetch_timeout
            self.get_client()
            partitions = len(self.get_partitions(topic))
            self.consumer = MultiProcessConsumer(
                self.client, consumer_group, topic,
                num_procs=partitions, partitions_per_proc=1,
                auto_commit_every_n=auto_commit_every_n, **kw)
        return self.consumer

    def get_consumer(
            self, consumer_group, topic,
            fetch_size=settings.KAFKA['message_fetch_batch'],
            fetch_timeout=settings.KAFKA['message_fetch_timeout'],
            auto_commit_every_n=settings.KAFKA['auto_commit_msg_count'],
            **kw):
        if not self.consumer:
            self.consumer_fetch_size = fetch_size
            self.consumer_fetch_timeout = fetch_timeout
            self.get_client()
            self.consumer = SimpleConsumer(
                self.client, consumer_group, topic,
                auto_commit_every_n=auto_commit_every_n,
                auto_offset_reset='smallest', **kw)
        return self.consumer

    def close_client(self):
        if self.client:
            self.client.close()

    def send_message(self, topic, msgs, logger=None):
        content = [(json.dumps(msg) if type(msg) is dict else msg) for msg in msgs]
        try:
            resp = self.producer.send_messages(topic, *content)
            return resp
        except Exception as e:
            if logger:
                logger.error(
                    'An error has occured in KafkaHelper.send_message(), '
                    'please check errors: %s', traceback.format_exc())
            raise e

    def receive_messages(self):
        messages = self.consumer.get_messages(
            count=self.consumer_fetch_size,
            timeout=self.consumer_fetch_timeout)
        return messages

    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets.offsets[0]

    def consumer_offset(self, consumer_name, topic, partition):
        offsets, = self.client.send_offset_fetch_request(
            consumer_name,
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets[2]

    def get_total_lags(self, consumer_name, topic):
        lags = []
        lag = 0
        partitions = self.get_partitions(topic)
        for p in partitions:
            offset1 = self.consumer_offset(consumer_name, topic, p)
            offset2 = self.current_offset(topic, p)
            lag = (offset2 - offset1)
            lags.append(lag)
            #print offset1,offset2,lag
        return sum(lags)

    def get_partitions(self, topic):
        return self.client.get_partition_ids_for_topic(topic)
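# Hedged usage sketch (not from the original source): exercising the
# KafkaHelper above as a context manager. Topic and consumer-group names are
# placeholders.
with KafkaHelper() as helper:
    helper.get_producer()
    helper.send_message('example_topic', [{'event': 'ping'}])

    helper.get_consumer('example_group', 'example_topic')
    for msg in helper.receive_messages():
        print msg.message.value

    # total lag = sum over partitions of (latest broker offset - committed offset)
    print helper.get_total_lags('example_group', 'example_topic')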
class KafkaMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d+1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} mini['instance'] = instance mini['schema'] = the_schema self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set 
up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False) def _hidden_setup(): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists( self.settings['KAFKA_INCOMING_TOPIC']) self.consumer = SimpleConsumer(self.kafka_conn, self.settings['KAFKA_GROUP'], self.settings['KAFKA_INCOMING_TOPIC'], auto_commit=True, iter_timeout=1.0) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
                Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)

                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__,
                                the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the total stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the total stat counters

        @param name: The formal name of the plugin
        @param dict: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"
                              .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
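# The default-filling behaviour of extend_with_default() above follows the
# recipe from the jsonschema documentation but is easy to misread, so here is
# a minimal, self-contained sketch of the same pattern. The schema, request
# payload, and DefaultingValidator name are invented for illustration only.
from jsonschema import Draft4Validator, validators


def extend_with_default(validator_class):
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        for error in validate_properties(validator, properties, instance,
                                         schema):
            yield error
        for prop, subschema in properties.iteritems():
            if "default" in subschema:
                instance.setdefault(prop, subschema["default"])

    return validators.extend(validator_class, {"properties": set_defaults})


DefaultingValidator = extend_with_default(Draft4Validator)

schema = {
    "type": "object",
    "properties": {
        "url": {"type": "string"},
        "priority": {"type": "integer", "default": 1},
    },
    "required": ["url"],
}

request = {"url": "http://example.com"}
DefaultingValidator(schema).validate(request)
# validation succeeded and the missing default was filled in:
print request   # {'url': 'http://example.com', 'priority': 1}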
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
        self,
        zk_hosts,
        group,
        topic,
        nodes,
        zk_handler=None,
        logger=None,
        identifier=None,
        **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.

        NOTE: this class is intended for version 0.8.1 of Kafka, where
        offsets are managed by Kafka but there is no rebalancing in the
        protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError(
                "Kafka support requires cs.eyrie to be installed with the "
                "Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group)
                if not self.nodes:
                    self.logger.info('Partitioner aquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s",
                         self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
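# A hypothetical driver for the ZKConsumer class above, shown only as a
# sketch: the ZooKeeper hosts, group, topic, and process() handler are all
# invented. Because init_consumer() calls provide_partition_info(), each item
# returned by get_messages() is a (partition, OffsetAndMessage) pair.
def process(partition, value):
    # placeholder for real message handling
    print partition, value


consumer = ZKConsumer(
    zk_hosts='zk1:2181,zk2:2181,zk3:2181',
    group='event-archiver',
    topic='events',
    nodes=None,                # falsy => dynamic rebalancing via ZKPartitioner
    auto_commit=False)         # forwarded to SimpleConsumer via consumer_kwargs

try:
    while True:
        for partition, msg in consumer.get_messages(count=500, block=True,
                                                    timeout=1.0):
            process(partition, msg.message.value)
        consumer.commit()      # commit offsets only after a batch is handled
except KeyboardInterrupt:
    consumer.stop()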
from kafka.client import KafkaClient
from kafka.common import OffsetRequest
from kafka.util import kafka_bytestring
from kafka.consumer import MultiProcessConsumer
from kafka.producer import SimpleProducer
from kafka.consumer import SimpleConsumer
from kafka.common import BrokerResponseError

cli = KafkaClient("localhost:9092")
consumer = SimpleConsumer(cli, 'test', 'tp_test1', auto_commit_every_n=10)

try:
    no_msg_times = 0
    while 1:
        is_over = False
        messages = consumer.get_messages(count=5, timeout=3)
        if messages:
            for m in messages:
                #print m
                msg_value = m.message.value
                print msg_value
                if msg_value == 'over':
                    is_over = True
        else:
            print "no msg!"
            no_msg_times += 1
        if is_over:
            print "The show is over! bye..."
            break
        if no_msg_times >= 5:
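# The loop above exits when it sees the literal string 'over', so a matching
# producer has to send that sentinel as its last message. This is only a
# sketch with invented payloads; the topic 'tp_test1' mirrors the consumer.
from kafka.client import KafkaClient
from kafka.producer import SimpleProducer

cli = KafkaClient("localhost:9092")
producer = SimpleProducer(cli)

for i in range(20):
    producer.send_messages('tp_test1', 'message %d' % i)

producer.send_messages('tp_test1', 'over')   # sentinel: tells the consumer to stop
cli.close()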
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.

    Messages are blocked into 20MB files and transferred to HDFS

    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
                        transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
               with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git

        Args:
            output_dir: string representing the directory to store the 20MB
                        before transferring to HDFS

        Returns:
            None
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders

        Args:
            output_dir: string representing the directory to store the 20MB
                        before transferring to HDFS

        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        print("Block {}: Flushing 20MB file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path,
                                                hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        #                                                 cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
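# A hypothetical entry point for the class above. The broker address, group,
# topic, and local staging directory are placeholders, not values from the
# original deployment.
if __name__ == '__main__':
    consumer = Consumer("localhost:9092", "hdfs_archiver", "parking_events")
    consumer.consume_topic("/home/ubuntu/parking_tmp")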
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body = html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] +
                     " url: " + safeurl)
            context = {
                'source': 'datawake-lookahead',
                'domain': crawled['attrs']['domain']
            }
            self.emit([
                crawled['url'],
                crawled['status_code'],
                '',
                '',
                crawled['body'],
                crawled['timestamp'],
                context['source'],
                context
            ])
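# next_tuple() assumes every payload on the crawler-out topic is a JSON object
# with the fields listed in its docstring. The sketch below publishes one such
# message so the spout has something to consume; the broker address, topic
# name, and all field values are invented for illustration.
import json
from kafka import KafkaClient, SimpleProducer

kafka = KafkaClient('localhost:9092')
producer = SimpleProducer(kafka)

crawled = dict(
    crawlid='crawl-001',
    appid='datawake-demo',
    url='http://example.com/page',
    status_code=200,
    status_msg='Success',
    timestamp='Tue, 03 Mar 2015 19:12:02 GMT',
    links_found=['http://example.com/a'],
    body='<html></html>',
    attrs={'domain': 'example.com'},
)
producer.send_messages('crawler-out-topic-demo', json.dumps(crawled))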
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0
        os.system("hdfs dfs -mkdir /data2")

    def consume_topic(self, output_dir):
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 40MB
                if self.temp_file.tell() > 40000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        print "Block {}: Flushing 40MB file to HDFS => /data2".format(
            str(self.block_cnt))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -copyFromLocal %s %s" % (self.temp_file_path,
                                                     "/data2"))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
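# The constructor above shells out to "hdfs dfs -mkdir /data2" and ignores the
# result, which fails when the directory already exists. A slightly more
# defensive variant is sketched below; it is not part of the original class,
# and the /data2 path is kept only to match the code above.
import subprocess

# -p makes the mkdir idempotent; the return code is checked instead of ignored
ret = subprocess.call(["hdfs", "dfs", "-mkdir", "-p", "/data2"])
if ret != 0:
    raise RuntimeError("could not create /data2 in HDFS (exit code %d)" % ret)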