def forwarder(self):
    """Replay the tab-separated log file into the Kafka topic as JSON.

    The file at ``self.csvfile`` is read 99 times over.  Its first line
    supplies the field names; every later line is zipped against them, the
    ``lower_bound``, ``upper_bound`` and ``temperature`` values are cast to
    float and ``no`` to int.  Each record is printed and then sent after a
    random pause of between 0 and 15 seconds.
    """
    kafka = KafkaClient(hosts(self.server_list, self.kafka_port))
    kafka.ensure_topic_exists(self.topic_name)
    producer = SimpleProducer(kafka, batch_send=False)
    print(producer)

    float_columns = ('lower_bound', 'upper_bound', 'temperature')
    for _ in xrange(1, 100):
        with open(self.csvfile, 'r') as handle:
            # The header row names the columns of every following row.
            fields = next(handle).strip().split('\t')
            print(fields)
            for line in handle:
                record = dict(zip(fields, line.strip().split('\t')))
                for column in float_columns:
                    record[column] = float(record[column])
                record['no'] = int(record['no'])
                print(json.dumps(record, sort_keys=True, indent=4))

                # Throttle the replay with a random delay before sending.
                time.sleep(random.uniform(0, 3) * 5)
                producer.send_messages(self.topic_name, json.dumps(record))
def configure_internal_queues(self):
    """Create one InternalQueue per tier and append it to ``self.queues``.

    For every ``i`` in ``range(self.number_of_queues)`` the backing topic is
    ``SCHEDULER_QUEUE_FORMAT.format(2**i)`` and the queue duration is
    ``2**i``.  Each tier gets its own Kafka client, a consumer on its own
    topic (consumer group named after the topic), an indexed consumer on the
    input topic, and a producer.
    """
    for tier in range(self.number_of_queues):
        duration = 2 ** tier
        topic = SCHEDULER_QUEUE_FORMAT.format(duration)

        kafka = KafkaClient(hosts=self.kafka_hosts)
        kafka.ensure_topic_exists(topic)

        input_index = IndexedConsumer(self.input_topic, self.kafka_hosts)
        tier_consumer = KafkaConsumer(
            topic,
            bootstrap_servers=self.kafka_hosts,
            group_id=topic,
            consumer_timeout_ms=2000,
            auto_commit_enable=False,
        )
        tier_producer = SimpleProducer(kafka)

        internal_queue = InternalQueue(
            tier_consumer,
            input_index,
            tier_producer,
            self.number_of_queues,
            duration,
        )
        self.queues.append(internal_queue)
def forwarder(self):
    """Replay the tab-separated log file into the Kafka topic, one row a second.

    The file at ``self.csvfile`` is read 9999 times over.  The header row
    names the columns.  For each data row ``lower_bound``, ``upper_bound``
    and ``spindle`` are cast to float, ``tool_no`` to int, and ``no`` is
    overwritten with a counter that starts at 1 and keeps increasing across
    replays.  Each record is printed, then sent after a one-second pause.
    """
    kafka = KafkaClient(hosts(self.server_list, self.kafka_port))
    kafka.ensure_topic_exists(self.topic_name)
    producer = SimpleProducer(kafka, batch_send=False)
    print(producer)

    sequence = 1
    for _ in xrange(1, 10000):
        with open(self.csvfile, 'r') as handle:
            header = next(handle)
            print(header)
            fields = header.strip().split('\t')
            print(fields)
            for line in handle:
                print(line)
                record = dict(zip(fields, line.strip().split('\t')))
                for column in ('lower_bound', 'upper_bound', 'spindle'):
                    record[column] = float(record[column])
                # The running counter replaces whatever 'no' the file carried.
                record['no'] = sequence
                record['tool_no'] = int(record['tool_no'])
                print(json.dumps(record, sort_keys=True, indent=4))

                time.sleep(1)
                producer.send_messages(self.topic_name, json.dumps(record))
                sequence += 1
class KafkaConnector(object):
    """Thin wrapper bundling a Kafka client with a producer and consumers."""

    def __init__(self, host_name, host_port):
        """Connect to ``host_name:host_port`` (both strings) and build a producer."""
        address = host_name + ":" + host_port
        self.client = KafkaClient(address)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        """Ensure ``topic_name`` exists unless the client already has its metadata."""
        if self.client.has_metadata_for_topic(topic_name):
            return
        self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        """Publish a single ``message`` on ``topic_name``."""
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        """Start a ConsumerThread that feeds the topic's messages to ``callback``."""
        worker = ConsumerThread(
            SimpleConsumer(self.client, topic_group, topic_name),
            callback,
            parse_json,
        )
        worker.start()

    def blocking_consumer(self, message_consume_function, parse_json,
                          topic_group, topic_name):
        """Consume the topic on the calling thread, starting from its tail.

        Every message is passed through ``parse_json`` and the result is
        handed to ``message_consume_function``.
        """
        print("starting blocking consumer with topic group %s and topic name %s"
              % (topic_group, topic_name))
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        # whence=2: position relative to the end, i.e. skip the backlog.
        consumer.seek(0, 2)
        for raw in consumer:
            parsed = parse_json(raw)
            print("=============" + str(parsed) + "============")
            message_consume_function(parsed)
            print("called message consume function")
def listen(self):
    """Print the payload of every message arriving on ``self.topic_name``."""
    kafka = KafkaClient(hosts(self.server_list, self.kafka_port))
    kafka.ensure_topic_exists(self.topic_name)
    subscription = SimpleConsumer(kafka, self.consumer_name, self.topic_name)
    for envelope in subscription:
        # The consumer yields envelopes; the payload is the inner message's value.
        print(envelope.message.value)
def _feed(settings_file, json_item):
    """Send ``json_item`` as JSON to the incoming topic named in the settings.

    :param settings_file: settings module file name; its last three
        characters (the ``.py`` suffix) are dropped before importing it.
    :param json_item: JSON-serialisable object to publish.
    """
    module_name = settings_file[:-3]
    settings = importlib.import_module(module_name)

    kafka_conn = KafkaClient(settings.KAFKA_HOSTS)
    topic = settings.KAFKA_INCOMING_TOPIC
    producer = SimpleProducer(kafka_conn)

    print("=> feeding JSON request into {0}...".format(topic))
    print(json.dumps(json_item, indent=4))

    kafka_conn.ensure_topic_exists(topic)
    payload = json.dumps(json_item)
    producer.send_messages(topic, payload)
    print("=> done feeding request.")
def listen(self):
    """Consume ``self.topic_name`` and mail every record whose ``no`` is a multiple of 10.

    Each payload is decoded as JSON.  Matching records are printed and
    e-mailed to ``self.email_address``; the subject carries the raw payload
    and the body the current local time.
    """
    kafka = KafkaClient(hosts(self.server_list, self.kafka_port))
    kafka.ensure_topic_exists(self.topic_name)
    subscription = SimpleConsumer(kafka, self.consumer_name, self.topic_name)
    for envelope in subscription:
        record = json.loads(envelope.message.value)
        if record['no'] % 10 != 0:
            continue
        print(record)
        subject = "test mail => " + envelope.message.value
        body = "Good day! Now is " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        send_mail(self.email_address, subject, body)
class KafkaDatawakeLookaheadSpout(Spout):
    """Spout that turns crawler-output Kafka messages into lookahead tuples."""

    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        """Load the deployment settings and attach a consumer at the topic tail.

        Any failure is logged with its traceback and then re-raised.
        """
        try:
            deployment = stormconf['topology.deployment']
            self.settings = all_settings.get_settings(deployment)
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic ='
                     + self.topic + ' conn_pool=' + self.conn_pool)

            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            # whence=2 positions at the end: only new messages are consumed.
            self.consumer.seek(0, 2)
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """Fetch one crawled page and emit it if it belongs to this app.

        The incoming message is a JSON object with the keys::

            crawlid, appid, url, status_code, status_msg, timestamp,
            links_found, body, attrs

        Messages whose ``appid`` differs from the configured one are dropped.

        :return: emits (url, status, headers, flags, body, timestamp,
            source, context)
        """
        first = self.consumer.get_messages(timeout=None)[0]
        crawled = json.loads(first.message.value)
        if crawled['appid'] != self.settings["appid"]:
            return

        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['crawlid']
                 + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'domain': crawled['attrs']['domain'],
        }
        # Headers and flags are not available here, hence the empty strings.
        self.emit([
            crawled['url'],
            crawled['status_code'],
            '',
            '',
            crawled['body'],
            crawled['timestamp'],
            context['source'],
            context,
        ])
class Producer():
    """Synchronous Kafka producer bound to one topic."""

    def __init__(self, server_list, kafka_port, topic_name):
        """Remember the connection details and open the client and producer."""
        self.topic_name = topic_name
        self.kafka_port = kafka_port
        self.server_list = server_list

        broker_hosts = hosts(self.server_list, self.kafka_port)
        self.client = KafkaClient(broker_hosts)
        self.producer = SimpleProducer(self.client, batch_send=False)

    def ensure_topic_exists(self):
        """Ask the client to make sure the bound topic exists."""
        topic = self.topic_name
        self.client.ensure_topic_exists(topic)

    def forwarder(self, message):
        """Publish ``message`` on the bound topic."""
        topic = self.topic_name
        self.producer.send_messages(topic, message)
class KafkaDatawakeVisitedSpout(Spout):
    """Spout that turns visited-page Kafka messages into Storm tuples."""

    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        """Load the deployment settings and attach a consumer at the topic tail.

        Any failure is logged with its traceback and then re-raised.
        """
        try:
            settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except Exception:
            self.log("KafkaDatawakeVisitedSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """Emit one tuple per visited-page message read from Kafka.

        input: NUL-separated (timestamp, org, domain, user_id, url, html)

        :return: emits (url, status, headers, flags, body, timestamp,
            source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                # The consumer yields (offset, message) envelopes; the
                # payload string lives in the inner message's value.
                payload = message.message.value
                self.log(payload)
                # Split at most five times so a NUL byte inside the html
                # stays in the html instead of breaking the unpacking.
                fields = payload.split('\0', 5)
                (timestamp, org, domain, userId, url, html) = fields
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([
                    url, '', '', '', html, timestamp, context['source'],
                    context
                ])
        except Exception:
            # Best effort: log the failure and return; iteration resumes on
            # the next call to next_tuple.
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
def configure_input_queue(self):
    """Create the InputQueue on ``self.input_topic`` and append it to ``self.queues``.

    This is the queue other services use to schedule an event for delivery.
    The topic is ensured to exist first; the producer gets a Kafka client of
    its own, separate from the one used for the existence check.
    """
    hosts = self.kafka_hosts
    topic = self.input_topic

    KafkaClient(hosts=hosts).ensure_topic_exists(topic)

    input_index = IndexedConsumer(topic, hosts)
    input_consumer = KafkaConsumer(
        topic,
        bootstrap_servers=hosts,
        group_id=CONSUMER_GROUP,
    )
    input_producer = SimpleProducer(KafkaClient(hosts=hosts))

    input_queue = InputQueue(
        input_consumer,
        input_index,
        input_producer,
        self.number_of_queues,
    )
    self.queues.append(input_queue)
class KafkaConsumer:
    """Blocking single-message reader positioned at the tail of a topic."""

    # Class-level default; every instance overrides it in __init__.
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        """Connect to ``conn_pool``, ensure ``topic`` exists and seek to its end."""
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group

        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(
            self.kafka,
            self.group,
            self.topic,
            max_buffer_size=None,
        )
        # whence=2 positions at the end: only new messages are read.
        self.consumer.seek(0, 2)

    def next(self):
        """Return the payload of the first message of the next fetched batch."""
        batch = self.consumer.get_messages(timeout=None)
        return batch[0].message.value
class KafkaDatawakeVisitedSpout(Spout):
    """Spout that turns visited-page Kafka messages into Storm tuples."""

    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        """Load the deployment settings and attach a consumer at the topic tail.

        Any failure is logged with its traceback and then re-raised.
        """
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except Exception:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """Emit one tuple per visited-page message read from Kafka.

        input: NUL-separated (timestamp, org, domain, user_id, url, html)

        :return: emits (url, status, headers, flags, body, timestamp,
            source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                # The consumer yields (offset, message) envelopes; the
                # payload string lives in the inner message's value.
                payload = message.message.value
                self.log(payload)
                # Split at most five times so a NUL byte inside the html
                # stays in the html instead of breaking the unpacking.
                fields = payload.split('\0', 5)
                (timestamp, org, domain, userId, url, html) = fields
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except Exception:
            # Best effort: log the failure and return; iteration resumes on
            # the next call to next_tuple.
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
class KafkaProducer:
    """Asynchronous Kafka producer bound to a single topic."""

    def __init__(self, conn_pool, topic):
        """Connect to ``conn_pool``, ensure ``topic`` exists, build the producer.

        :param conn_pool: broker connection string handed to KafkaClient.
        :param topic: topic every message from this instance is sent to.
        """
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        # ``async`` is a reserved word from Python 3.7 on, so writing it as a
        # literal keyword argument stops the module from compiling there.
        # Dict expansion passes the very same keyword on every version.
        self.producer = SimpleProducer(self.kafka, **{'async': True})

    def send(self, message):
        """Queue a single ``message`` for the bound topic."""
        self.producer.send_messages(self.topic, message)

    def sendBulk(self, messages):
        """Queue every item of the iterable ``messages`` for the bound topic."""
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        """Stop the producer and close the client; safe to call repeatedly.

        The client is closed even when stopping the producer raises, and both
        attributes are reset to ``None`` in every case.
        """
        producer, kafka = self.producer, self.kafka
        self.producer = None
        self.kafka = None
        try:
            if producer is not None:
                producer.stop()
        finally:
            if kafka is not None:
                kafka.close()
class CrawlerSpout(Spout):
    """Spout that turns crawl requests read from Kafka into Storm tuples."""

    group = 'datawake-crawler-in-consumer'.encode()

    def initialize(self, stormconf, context):
        """Load the deployment settings and attach a consumer at the topic tail.

        Any failure is logged with its traceback and then re-raised.
        """
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None,
                                           fetch_size_bytes=2000000)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except Exception:
            self.log("CrawlerSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """Emit one single-field tuple per crawl request read from Kafka.

        input message:
             json.dumps(dict(
                    id = 'abcdefg', #TODO generate UUID,
                    appid = self.appid,
                    url = url,
                    priority = 50,
                    depth = 0,
                    attrs  = dict(
                        userId = context['userId'],
                        org =  context['org'],
                        domain = context['domain']
                    )
                ))
        :return: emits [to_crawl], the decoded request
        """
        try:
            for message in self.consumer:
                # The consumer yields (offset, message) envelopes; the JSON
                # text lives in the inner message's value.
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except Exception:
            # Best effort: log the failure and return; iteration resumes on
            # the next call to next_tuple.
            self.log(traceback.format_exc(), level='error')
class KafkaProducer:
    """Asynchronous Kafka producer bound to a single topic."""

    def __init__(self, conn_pool, topic):
        """Connect to ``conn_pool``, ensure ``topic`` exists, build the producer.

        :param conn_pool: broker connection string handed to KafkaClient.
        :param topic: topic every message from this instance is sent to.
        """
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        # ``async`` is a reserved word from Python 3.7 on, so writing it as a
        # literal keyword argument stops the module from compiling there.
        # Dict expansion passes the very same keyword on every version.
        self.producer = SimpleProducer(self.kafka, **{'async': True})

    def send(self, message):
        """Queue a single ``message`` for the bound topic."""
        self.producer.send_messages(self.topic, message)

    def sendBulk(self, messages):
        """Queue every item of the iterable ``messages`` for the bound topic."""
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        """Stop the producer and close the client; safe to call repeatedly.

        The client is closed even when stopping the producer raises, and both
        attributes are reset to ``None`` in every case.
        """
        producer, kafka = self.producer, self.kafka
        self.producer = None
        self.kafka = None
        try:
            if producer is not None:
                producer.stop()
        finally:
            if kafka is not None:
                kafka.close()
class KafkaConsumer:
    """Blocking single-message reader positioned at the tail of a topic."""

    # Class-level default; every instance overrides it in __init__.
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        """Connect to ``conn_pool``, ensure ``topic`` exists and seek to its end."""
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group

        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(
            self.kafka,
            self.group,
            self.topic,
            max_buffer_size=None,
        )
        # whence=2 positions at the end: only new messages are read.
        self.consumer.seek(0, 2)

    def next(self):
        """Return the payload of the first message of the next fetched batch."""
        batch = self.consumer.get_messages(timeout=None)
        return batch[0].message.value
def forwarder(self):
    """Replay the tab-separated log file into Kafka with a simulated verdict.

    The file at ``self.csvfile`` is read 99 times over.  Every data row is
    prefixed with two generated columns: ``ARRIVAL_TIMESTAMP`` (the current
    local time) and ``DEFECT`` (an HTML-styled FAIL when ``randint(0, 10)``
    exceeds 8, otherwise PASS).  Each record is sent as JSON after a random
    pause of up to 10 seconds.
    """
    fail_markup = '<span style="background-color:#bd362f; color:white">FAIL</span>'

    kafka = KafkaClient(hosts(self.server_list, self.kafka_port))
    kafka.ensure_topic_exists(self.topic_name)
    producer = SimpleProducer(kafka, batch_send=False)

    for _ in xrange(1, 100):
        with open(self.csvfile, 'r') as handle:
            # Two generated columns precede the ones named by the header row.
            fields = ["ARRIVAL_TIMESTAMP", "DEFECT"] + next(handle).strip().split('\t')
            for line in handle:
                arrival = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
                if randint(0, 10) > 10 * 0.8:
                    verdict = fail_markup
                else:
                    verdict = 'PASS'
                values = [arrival, verdict] + line.strip().split('\t')
                record = dict(zip(fields, values))

                time.sleep(random.uniform(0, 10))
                producer.send_messages(self.topic_name, json.dumps(record))
class KafkaConnector(object):
    """Thin wrapper bundling a Kafka client with a producer and consumers."""

    def __init__(self, host_name, host_port):
        """Connect to ``host_name:host_port`` (both strings) and build a producer."""
        address = host_name + ":" + host_port
        self.client = KafkaClient(address)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        """Ensure ``topic_name`` exists unless the client already has its metadata."""
        if self.client.has_metadata_for_topic(topic_name):
            return
        self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        """Publish a single ``message`` on ``topic_name``."""
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        """Start a ConsumerThread that feeds the topic's messages to ``callback``."""
        subscription = SimpleConsumer(
            self.client,
            topic_group,
            topic_name,
            max_buffer_size=None,
        )
        worker = ConsumerThread(subscription, callback, parse_json)
        print("Starting new subscriber for topic " + topic_name
              + ' with group ' + topic_group)
        worker.start()
class ImageConvertProcess(MultiDownloadProcess):
    """Download process that converts images and forwards the item to Kafka."""

    name = "image_convert_process"
    topic_name = "jay.crawled_firehose_images"

    def __init__(self, settings):
        """Set up the Kafka producer and the image converter."""
        super(ImageConvertProcess, self).__init__(settings)
        self.kafka_client = KafkaClient(self.settings.get("KAFKA_HOSTS"))
        self.kafka_client.ensure_topic_exists(self.topic_name)
        self.producer = SimpleProducer(self.kafka_client)
        self.IC = ImageConvert(settings)
        self.IC.set_logger(self.logger)

    def decode(self, item):
        """Return (url, filename, path) for each entry of the item's "images" list."""
        def as_triple(image):
            return (image.get('url'), image.get('filename'), image.get('path'))

        return map(as_triple, json.loads(item)["images"])

    def callback(self, item, flag):
        """Handle a finished download.

        When ``flag`` is falsy the failure is only logged.  Otherwise the JSON
        ``item`` is decoded, images of spiders listed in ``DOMAINS`` are run
        through the converter (result stored under ``pan_result``), and the
        item is published to Kafka.  Any exception is logged, never raised.
        """
        try:
            if not flag:
                self.logger.error("download failed")
                return

            item = json.loads(item)
            meta = item.get("meta", {})
            spider = meta.get("spiderid")
            if spider not in DOMAINS:
                self.logger.info("ignore %s images. " % spider)
            else:
                self.logger.debug("process in pan. spider:%s" % (spider))
                item["pan_result"] = self.IC.process_image(
                    meta.get("collection_name"), item)
                self.logger.debug(
                    "finish process in pan, spider:%s result:%s"
                    % (spider, item["pan_result"]))

            self.producer.send_messages(self.topic_name, json.dumps(item))
            self.logger.debug("send item to kafka. ")
        except Exception:
            self.logger.error(traceback.format_exc())
class FeedProducer:
    """
    Feed Producer class
    use send() to send to any topic
    """

    # Upper bound on re-sends after LeaderNotAvailableError.  Retrying without
    # a bound ends in a recursion-depth error that hides the real failure.
    _MAX_LEADER_RETRIES = 5

    def __init__(self, broker):
        """Connect to ``broker`` and create the underlying producer.

        :param broker: broker connection string handed to KafkaClient.
        :raises KafkaUnavailableError: when the cluster cannot be reached; the
            failure is logged before being re-raised.
        """
        try:
            self.client = KafkaClient(broker)
            self.prod = SimpleProducer(self.client)
        except KafkaUnavailableError:
            log.critical("\nCluster Unavailable %s : Check broker string\n", broker)
            raise

    def send(self, topic, *msgs):
        """Send ``msgs`` to ``topic``, making sure the topic exists if needed.

        When the send fails because the topic has no leader yet, the topic is
        ensured to exist and the send is retried, at most
        ``_MAX_LEADER_RETRIES`` times.

        :param topic: topic to publish to.
        :param msgs: one or more message payloads.
        :raises LeaderNotAvailableError: when the leader is still unavailable
            after the last retry.
        """
        retries_left = self._MAX_LEADER_RETRIES
        while True:
            try:
                self.prod.send_messages(topic, *msgs)
            except LeaderNotAvailableError:
                if retries_left <= 0:
                    raise
                retries_left -= 1
                self.client.ensure_topic_exists(topic)
            else:
                return
class FeedProducer():
    """
    Feed Producer class
    use send() to send to any topic
    """

    # Upper bound on re-sends after LeaderNotAvailableError.  Retrying without
    # a bound ends in a recursion-depth error that hides the real failure.
    _MAX_LEADER_RETRIES = 5

    def __init__(self, broker):
        """Connect to ``broker`` and create the underlying producer.

        :param broker: broker connection string handed to KafkaClient.
        :raises KafkaUnavailableError: when the cluster cannot be reached; the
            failure is logged before being re-raised.
        """
        try:
            self.client = KafkaClient(broker)
            self.prod = SimpleProducer(self.client)
        except KafkaUnavailableError:
            log.critical("\nCluster Unavailable %s : Check broker string\n", broker)
            raise

    def send(self, topic, *msgs):
        """Send ``msgs`` to ``topic``, making sure the topic exists if needed.

        When the send fails because the topic has no leader yet, the topic is
        ensured to exist and the send is retried, at most
        ``_MAX_LEADER_RETRIES`` times.

        :param topic: topic to publish to.
        :param msgs: one or more message payloads.
        :raises LeaderNotAvailableError: when the leader is still unavailable
            after the last retry.
        """
        retries_left = self._MAX_LEADER_RETRIES
        while True:
            try:
                self.prod.send_messages(topic, *msgs)
            except LeaderNotAvailableError:
                if retries_left <= 0:
                    raise
                retries_left -= 1
                self.client.ensure_topic_exists(topic)
            else:
                return
class KafkaMonitor:
    """Reads JSON requests from a Kafka topic, validates them and stores them in Redis."""

    def __init__(self, settings):
        """Import the settings module and open the Kafka connection.

        @param settings: settings file name; its last three characters (the
            ``.py`` suffix) are dropped to obtain the module name
        """
        module_name = settings[:-3]
        self.settings = importlib.import_module(module_name)
        # The same Kafka connection serves both monitoring and feeding.
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        """Return the bound handler named ``key``.

        @raise AttributeError: for any name other than the two known handlers
        """
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        if key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        """Connect to Redis, attach the Kafka consumer and build the validator."""
        cfg = self.settings
        self.redis_conn = redis.Redis(host=cfg.REDIS_HOST, port=cfg.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(cfg.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       cfg.KAFKA_GROUP,
                                       cfg.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(cfg.SCHEMA_METHOD)
        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Returns a validator class that also fills in schema defaults

        The "properties" check is wrapped so that, once the stock check has
        reported its errors, every property whose subschema carries a
        "default" is set on the instance unless already present.
        '''
        stock_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            # Report the regular validation errors first...
            for error in stock_properties(validator, properties, instance,
                                          schema):
                yield error
            # ...then apply the defaults.
            for name, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(name, subschema["default"])

        overrides = {"properties": set_defaults}
        return validators.extend(validator_class, overrides)

    def handle_crawl_request(self, dict):
        '''
        Stores a valid crawl request in Redis

        The pickled request goes into the sorted set "<spiderid>:queue" with
        its negated priority; when the request has an 'expires' entry, that
        value is stored under a "timeout:..." key as well.

        @param dict: a valid dictionary object
        '''
        queue_key = "{sid}:queue".format(sid=dict['spiderid'])
        pickled = pickle.dumps(dict, protocol=-1)
        self.redis_conn.zadd(queue_key, pickled, -dict['priority'])

        if 'expires' in dict:
            timeout_key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(timeout_key, dict['expires'])

        print('Added crawl to Redis')

    def handle_action_request(self, dict):
        '''
        Stores a valid action request in Redis

        The request's uuid is stored under "<action>:<spiderid>:<appid>",
        extended by ":<crawlid>" when the request carries a crawlid.

        @param dict: The valid dictionary object
        '''
        parts = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])
        if "crawlid" in dict:
            parts = parts + ":" + dict['crawlid']

        self.redis_conn.set(parts, dict['uuid'])
        print('Added action to Redis')

    def _main_loop(self):
        '''
        Endless loop that polls the Kafka topic, validates each incoming
        message against the schema and hands valid ones to the result method
        '''
        while True:
            try:
                for envelope in self.consumer.get_messages():
                    if envelope is None:
                        break
                    try:
                        request = json.loads(envelope.message.value)
                        try:
                            self.validator(self.schema).validate(request)
                            self.result_method(request)
                        except ValidationError:
                            print("invalid json received")
                    except ValueError:
                        print("bad json recieved")
            except OffsetOutOfRangeError:
                # The stored offset is no longer valid: jump to the tail.
                self.consumer.seek(0, 2)

            time.sleep(.01)

    def run(self):
        '''
        Runs setup, loads the schema file and enters the main loop
        '''
        self.setup()
        # Deliberately unguarded: a malformed schema file should fail loudly.
        with open(self.settings.SCHEMA) as schema_file:
            self.schema = json.load(schema_file)
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)

        print("=> feeding JSON request into {0}...".format(topic))
        print(json.dumps(json_item, indent=4))

        self.kafka_conn.ensure_topic_exists(topic)
        payload = json.dumps(json_item)
        producer.send_messages(topic, payload)
        print("=> done feeding request.")
max_tries = 10 is_connected = False while not is_connected: print("Retrying to start consumer client: {0!s}".format(num_tries), flush=True) time.sleep(5) try: client = KafkaClient(kafka_host) is_connected = True except (LeaderNotAvailableError, ConnectionError, KafkaUnavailableError,) as leader_err2: num_tries += 1 if num_tries == max_tries: raise leader_err2 # except Exception as err: # print("Error starting consumer client: " + str(err), flush=True) # raise err client.ensure_topic_exists(topic) # Create consumer try: consumer = SimpleConsumer(client, consumer_group, topic) except LeaderNotAvailableError as leader_err: num_tries = 0 max_tries = 10 is_connected = False while not is_connected: print("Retrying to start consumer: {0!s}".format(num_tries), flush=True) time.sleep(5) try: consumer = SimpleConsumer(client, consumer_group, topic) is_connected = True except LeaderNotAvailableError as leader_err2:
class BusAdapter(object): ''' The BusAdapter class is intended to be imported to bus modules. Instances of this class provide the software bus illusion over Kafka. Public methods are: * publish() * waitForMessage() * subscribeToTopic() * unSubscribeFromTopic() * addTopicListener() * removeTopicListener() * mySubscriptions() * returnError() * close() A minimal consumer module looks like this: :: # A callback function: def printMessage(topicName, msgText, msgOffset): print('Msg[%s]: %s' % (topicName, msgText)) bus = BusAdapter() # Subscribe to a topic, passing the callback function: bus.subscribeToTopic('exampleTopic', printMessage) while True: # do anything you like time.sleep(10) A corresponding minimal producer module would be like this: :: bus = BusAdapter() while True: # Read one line from console: msgText = raw_input("Type a message to send: ('Q' to end.): ") if msgText == 'Q': break else: bus.publish(msgText, 'exampleTopic') For better structured, but equivalent examples, see :py:class:`Example Producer <kafka_bus_python.example_producer.BusModuleProducer>` and :py:class:`Example Consumer <kafka_bus_python.example_consumer.BusModuleConsumer>`. Clients of this class may install multiple listeners for any given topic. The publish() method may be used asynchronously, just to send a message to subscribing modules on the bus, or synchronously like a remote procedure call. The BusAdapter wraps payloads into a JSON structure as follows: :: 'id' : <RFC 4122 UUID Version 4> # e.g. 'b0f4259e-3d01-44bd-9eb3-25981c2dc643' 'type' : {req | resp} 'status' : { OK | ERROR } 'time' : <ISO 8601> # e.g. '2015-05-31T17:13:41.957350' 'content': <text> It is the responsibility of listener functions to strip this header away, if desired. For an example see echo_service.EchoServer's echoRequestDelivery() method. 
''' _LEGAL_MSG_TYPES = ['req', 'resp'] _LEGAL_STATUS = ['OK', 'ERROR'] _DEFAULT_KAFKA_LISTEN_PORT = 9092 _KAFKA_SERVERS = [('localhost', _DEFAULT_KAFKA_LISTEN_PORT), ('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT), ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT), ] # _KAFKA_SERVERS = [('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT), # ('localhost', _DEFAULT_KAFKA_LISTEN_PORT), # ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT), # ] # Remember whether logging has been initialized (class var!): _loggingInitialized = False _logger = None def __init__(self, kafkaHost=None, kafkaPort=None, loggingLevel=logging.DEBUG, logFile=None, kafkaGroupId='school_bus' ): ''' Initialize communications with Kafka. :param kafkaHost: hostname or ip address of host where Kafka server runs. If None, then BusAdapter._KAFKA_SERVERS are tried in turn. :type kafkaHost: {string | None} :param kafkaPort: port at which Kafka expects clients to come in. if None, then BusAdapter._DEFAULT_KAFKA_LISTEN_PORT is used. :type kafkaPort: {int | None} :param loggingLevel: detail of logging :type loggingLevel: {logging.DEBUG | logging.INFO | logging.ERROR} :param logFile: file to which log is written; concole, if NONE :type logFile: {string | None} :param kafkaGroupId: name under which message offset management is stored [by Kafka in zookeeper]. Different groups of bus modules will have different sets of message offsets recorded. You can leave this default. :type kafkaGroupId: string ''' if kafkaPort is None: kafkaPort = BusAdapter._DEFAULT_KAFKA_LISTEN_PORT self.port = kafkaPort self.kafkaGroupId = kafkaGroupId self._setupLogging(loggingLevel, logFile) for hostPortTuple in BusAdapter._KAFKA_SERVERS: self.logDebug('Contacting Kafka server at %s:%s...' % hostPortTuple) try: self.kafkaClient = KafkaClient("%s:%s" % hostPortTuple) except KafkaUnavailableError: # Have we just contacted the last of the available # servers? 
if hostPortTuple == BusAdapter._KAFKA_SERVERS[-1]: raise KafkaUnavailableError("No Kafka server found running at any of %s." % str(BusAdapter._KAFKA_SERVERS)) else: continue self.logDebug('Successfully contacted Kafka server at %s:%s...' % hostPortTuple) # If succeeded, init the 'bootstrap_servers' array # referenced in topic_waiter.py: self.bootstrapServers = ['%s:%s' % hostPortTuple] # Don't try any other servers: break self.producer = SimpleProducer(self.kafkaClient) # Create a function that has the first method-arg # 'self' already built in. That new function is then # called with just the remaining positional/keyword parms. # In this case: see method :func:`addTopicListener`. # This way we can by default pass :func:`_deliverResult` to a # _TopicWaiter instance, and thereby cause it to invoke our # _deliverResult() *method* (which takes the hidden 'self.' # Yet other callers to subscribeToTopic() can specify # a *function* which only takes the non-self parameters # specified in method :func:`addTopicListener`. self.resultCallback = partial(self._deliverResult) # A function that will be called when the result to # a synchronous call arrives: self.syncResultWaiter = partial(self._awaitSynchronousReturn) # Dict mapping topic names to thread objects that listen # to the respective topic. Used by subscribeToTopic() and # unsubscribeFromTopic(): self.listenerThreads = {} # Dict mapping topic names to event objects that provide # communication between the topic's thread and the main # thread. Used in awaitMessage(): self.topicEvents = {} # Dict used for synchronous calls: the dict maps # msg UUIDs to the results of a call. Set in # _awaitSynchronousReturn(), and emptied in publish() self.resDict = {} # -------------------------- Pulic Methods --------------------- def publish(self, busMessage, topicName=None, sync=False, msgId=None, msgType='req', timeout=None, auth=None): ''' Publish either a string or a BusMessage object. 
If busMessage is a string, then the caller is responsible for ensuring that the string is UTF-8, and a topic name must be provided. If busMessage is a BusMessage object, then that object contains all the required information. In this case, parameter topicName overrides a topic name that might be stored in the BusMessage. Messages are wrapped in a JSON structure that provides 'id', 'type', 'time', and 'content' fields. The 'content' field will contain the message payload. Two ways of using this method: asynchronously, and synchronously. In asynchronous invocation the passed-in message is published, and this method returns immediately. For this type of invocation just provide argument busMessage, and possibly topicName, if busMessage is a string. Synchronous invocation is just like a remote procedure call. In synchronous invocation the passed-in message is published, and this method will wait for a return message that carries the same message ID, and is of message type 'resp'. This method then returns the **content** of the returned message; the surrounding wrapper (time/msgId/msgType...) is stripped. :param busMessage: string or BusMessage to publish :type busMessage: {string | BusMessage} :param topicName: name of topic to publish to. If None, then parameter must be a BusMessage object that contains an associated topic name. :type topicName: {string | None} :param sync: if True, call will not return till answer received, or timeout (if given) has expired). :type sync: boolean :param msgId: if this publish() call is a response to a prior request, the request message's ID must be the id of the response. In that case the caller can use this parameter to provide the ID. If None, a new message ID is generated. :type msgId: string :param msgType: value for the message type field of the outgoing message. Usually this is 'req', but when calling publish() to return a result to a prior request, then set this argument to 'resp'. 
:param timeout: timeout after which synchronous call should time out. if sync is False, the timeout parameter is ignored. :type timeout: float :param auth: reserved for later authentication mechanism. :type auth: not yet known :return: value is only defined for synchronous invocation. :rtype: string :raises ValueError: if targeted topic name is not provided in a msg object, or explicitly in the topicName parameter. :raises ValueError: if illegal message type is passed in. :raises BadInformation: if Kafka does not recognize the provided topic **and** Kafka is not configured to create topics on the fly. :raises SyncCallTimedOut: if no response is received to a synchronous call within the provided timeout period. :raises SyncCallRuntimeError: if a message received in response to a synchronous call cannot be parsed. ''' if not isinstance(busMessage, BusMessage): # We were passed a raw string to send. The topic name # to publish to better be given: if topicName is None: raise ValueError('Attempt to publish a string without specifying a topic name.') msg = busMessage else: # the busMessage parm is a BusMessage instance: # If topicName was given, it overrides any topic name # associated with the BusObject; else: if topicName is None: # Grab topic name from the BusMessage: topicName = busMessage.topicName() # If the BusMessage did not include a topic name: error if topicName is None: raise ValueError('Attempt to publish a BusMessage instance that does not hold a topic name: %s' % str(busMessage)) # Get the serialized, UTF-8 encoded message from the BusMessage: msg = busMessage.content() # Now msg contains the msg text. try: self.kafkaClient.ensure_topic_exists(topicName, timeout=5) except KafkaTimeoutError: raise BadInformation("Topic '%s' is not a recognized topic." 
% topicName) # Create a JSON struct: if msgId is None: msgUuid = str(uuid.uuid4()) else: msgUuid = msgId # Sanity check on message type: if msgType not in BusAdapter._LEGAL_MSG_TYPES: raise ValueError('Legal message types are %s' % str(BusAdapter._LEGAL_MSG_TYPES)) msgDict = dict(zip(['id', 'type', 'time', 'content'], [msgUuid, msgType, datetime.now().isoformat(), msg])) # If synchronous operation requested, wait for response: if sync: # Before publishing the request, must prepare for # a function that will be invoked with the result. # Use instance vars for communication with the result # delivery thread. # Use of these instance vars means that publish # isn't re-entrant. Fine for now: # For the result delivery method to know which msg id # we are waiting for: self.uuidToWaitFor = msgUuid # For the result delivery method to know which topic # we are waiting for: self.topicToWaitFor = topicName # For the result delivery method to put a string # if an error occurs while processing the result # bus message: self.syncResultError = None # Create event that will wake us when result # arrived and has been placed in self.resDict: self.resultArrivedEvent = threading.Event(timeout) # If not subscribed to the topic to which this synchronous # call is being published, then subscribe to it temporarily: wasSubscribed = topicName in self.mySubscriptions() if not wasSubscribed: self.subscribeToTopic(topicName, self.syncResultWaiter) else: self.addTopicListener(topicName, self.syncResultWaiter) # Finally: post the request... self.producer.send_messages(topicName, json.dumps(msgDict)) # ... and wait for the answer message to invoke # self._awaitSynchronousReturn(): resBeforeTimeout = self.resultArrivedEvent.wait(timeout) # Result arrived, and was placed into # self.resDict under the msgUuid. 
Remove the listener # that waited for the result: self.removeTopicListener(topicName, self.syncResultWaiter) # If we weren't subscribed to this topic, then # restore that condition: if not wasSubscribed: self.unsubscribeFromTopic(topicName) # If the 'call' timed out, raise exception: if not resBeforeTimeout: raise SyncCallTimedOut('Synchronous call on topic %s timed out' % topicName) # A result arrived from the call: res = self.resDict.get(msgUuid, None) # No longer need the result to be saved: try: del self.resDict[msgUuid] except KeyError: pass # Check whether awaitSynchronousReturn() placed an # error message into self.syncResultError: if self.syncResultError is not None: raise(SyncCallRuntimeError(self.syncResultError)) return res else: # Not a synchronous call; just publish the request: self.producer.send_messages(topicName, json.dumps(msgDict)) def subscribeToTopic(self, topicName, deliveryCallback=None, kafkaLiveCheckTimeout=30): ''' Fork a new thread that keeps waiting for any messages on the topic of the given name. Stop listening for the topic by calling unsubscribeFromTropic(). For convenience, a deliveryCallback function may be passed, saving a subsequent call to addTopicListener(). See addTopicListener() for details. If deliveryCallback is absent or None, then method _deliverResult() in this class will be used. That method is intended to be a placeholder with no side effects. It is a no-op to call this method multiple times for the same topic. :param topicName: official name of topic to listen for. :type topicName: string :param deliveryCallback: a function that takes two args: a topic name, and a topic content string. :type deliveryCallback: function :param kafkaLiveCheckTimeout: timeout in (fractional) seconds to wait when checking for a live Kafka server being available. 
:type kafkaLiveCheckTimeout: float :raises KafkaServerNotFound: when no Kafka server responds ''' if deliveryCallback is None: deliveryCallback = self.resultCallback if type(deliveryCallback) != types.FunctionType and type(deliveryCallback) != functools.partial: raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback)) try: # Does a thread for this msg already exist? self.listenerThreads[topicName] # Yep (b/c we didn't bomb out). Nothing to do: return except KeyError: # No thread exists for this topic. # Create an event object that the thread will set() # whenever a msg arrives, even if no listeners exist: event = threading.Event() self.topicEvents[topicName] = event # Create the thread that will listen to Kafka; # raises KafkaServerNotFound if necessary: waitThread = _TopicWaiter(topicName, self, self.kafkaGroupId, deliveryCallback=deliveryCallback, eventObj=event, kafkaLiveCheckTimeout=kafkaLiveCheckTimeout) # Remember that this thread listens to the given topic: self.listenerThreads[topicName] = waitThread waitThread.start() def unsubscribeFromTopic(self, topicName): ''' Unsubscribes from topic. Stops the topic's thread, and removes it from bookkeeping so that the Thread object will be garbage collected. Same for the Event object used by the thread to signal message arrival. Calling this method for a topic that is already unsubscribed is a no-op. :param topicName: name of topic to subscribe from :type topicName: string ''' # Delete our record of the Event object used by the thread to # indicate message arrivals: try: del self.topicEvents[topicName] except KeyError: pass try: # Does a thread for this msg even exist? existingWaitThread = self.listenerThreads[topicName] # Yep, it exists. 
Stop it and remove it from # our bookkeeping existingWaitThread.stop() del self.listenerThreads[topicName] except KeyError: # No thread exists for this topic at all, so all done: return def addTopicListener(self, topicName, deliveryCallback): ''' Add a listener function for a topic for which a subscription already exists. Parameter deliverCallback must be a function accepting parameters: topicName, rawResult, msgOffset It is an error to call the method without first having subscribed to the topic. :param topicName: name of topic to add :type topicName: String :param deliveryCallback: function to call when message to this topic arrives :type deliveryCallback: <function(topicName, rawResult, msgOffset) :raises NameError: if caller has not previously subscribed to topicName. ''' if deliveryCallback != types.FunctionType and type(deliveryCallback) != functools.partial: raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback)) try: # Does a thread for this msg already exist? existingWaitThread = self.listenerThreads[topicName] # Yep (b/c we didn't bomb out). Check whether the # given deliveryCallback is already among the listeners # added earlier: try: existingWaitThread.listeners().index(deliveryCallback) # Both, a thread and this callback already exist, do nothing: return except ValueError: pass # Thread exists for this topic, but an additional # callback is being registered: existingWaitThread.addListener(deliveryCallback) return except KeyError: # No thread exists for this topic, so no deliveryCallback # can be added: raise NameError("Attempt to add topic listener %s for topic '%s' without first subscribing to '%s'" % (str(deliveryCallback), topicName, topicName)) def removeTopicListener(self, topicName, deliveryCallback): ''' Remove a topic listener function from a topic. 
It is a no-op to call this method with a topic that has not been subscribed to, or with a deliveryCallback function that was never added to the topic. :param topicName: :type topicName: :param deliveryCallback: :type deliveryCallback: ''' try: # Does a thread for this msg even exist? existingWaitThread = self.listenerThreads[topicName] # Yep, exists (we didn't bomb). Now check whether the # given deliveryCallback was actually added to the listeners # earlier: existingListeners = existingWaitThread.listeners() try: existingListeners.index(deliveryCallback) # The listener to be removed does exist: existingWaitThread.removeListener(deliveryCallback) return except NameError: # This listener isn't registered, so all done: return except KeyError: # No listener thread exists for this topic at all, so all done: return def waitForMessage(self, topicName, timeout=None): ''' Block till a message on the given topic arrives. It is an error to call this method on a topic to which the caller has not previously subscribed. :param topicName: :type topicName: :param timeout: seconds (or fractions of second) to wait. :type timeout: float :returns: True if a message arrived in time, else returnes False :rtype: boolean :raises NameError: on attempt to wait for a topic for which no subscription exists. ''' try: event = self.topicEvents[topicName] return(event.wait(timeout)) except KeyError: raise NameError("Attempt to wait for messages on topic %s, which was never subscribed to." % topicName) def mySubscriptions(self): ''' Return a list of topic names to which this bus adapter is subscribed. :return: List of topics to which caller is subscribed :rtype: [String] ''' return self.topicEvents.keys() def returnError(self, req_key, topicName, errMsg): ''' Convencience method when handling an incoming message. Returns a message that is marked as an error return. :param req_key: key of the incoming message; it will be used in the return message as well. 
:type req_key: String :param topicName: name of topic to use in the return message :type topicName: String :param errMsg: error message to include in the return message :type errMsg: String ''' errMsg = {'resp_key' : req_key, 'type' : 'resp', 'status' : 'ERROR', 'time' : datetime.now().isoformat(), 'content' : errMsg } errMsgJSON = _JSONEncoderBusExtended.makeJSON(errMsg) self.bus.publish(errMsgJSON, topicName) def close(self): ''' Cleanup. All threads are stopped. Kafka connection is closed. ''' for thread in self.listenerThreads.values(): thread.stop() self.listenerThreads.clear() self.topicEvents.clear() self.kafkaClient.close() # -------------------------- Private Methods --------------------- def _deliverResult(self, topicName, rawResult, msgOffset): ''' Simple default message delivery callback. Just prints topic name and content. Override in subclass to get more interesting behavior. Remember, though: you (I believe) need to do the functools.partial trick to create a function for your overriding method that already has 'self' curried out. We may be able to simplify that, because the listening threads do save the BusAdapter objecst that created them. :param topicName: name of topic the msg came from :type topicName: string :param rawResult: the string from the wire; not yet de-serialized :type rawResult: string :param msgOffset: the Kafka queue offset of the message :type msgOffset: int ''' print('Msg at offset %d: %s' % (msgOffset,rawResult)) def _awaitSynchronousReturn(self, topicName, rawResult, msgOffset): ''' A callback for _TopicWaiter. Invoked from a different thread!! This callback is installed by publish() when a synchronous bus 'call' is executed. The main thread, i.e. 
publish() will have delivered the request to the bus, and initialized the following instance variables for us: * self.uuidToWaitFor: the message id an incoming result must have * self.syncResultError: a place for this method to place an error message if necessary * self.resultArrivedEvent: a threading.Event() obj which this method will set() when it's done. :param topicName: name of topic on which a message arrived :type topicName: string :param rawResult: message payload; a JSON string :type rawResult: string :param msgOffset: offset in Kafka system :type msgOffset: int ''' # If this incoming message is the wrong topic, # ignore; this should never happen, b/c this method # is only installed as a listener when we hang for # a synchronous call: if topicName != self.topicToWaitFor: return # Turn msg JSON into a dict: try: thisResDict = json.loads(rawResult) except ValueError: self.syncResultError = 'Bad JSON while waiting for sync response: %s' % rawResult # Tell main thread that answer to synchronous # call arrived, and was processed: self.resultArrivedEvent.set() return # Is this a response msg, and is it the one # we are waiting for? 
thisUuid = thisResDict.get('id', None) thisMsgType = thisResDict.get('type', None) thisContent = thisResDict.get('content', None) if thisUuid == self.uuidToWaitFor and \ thisMsgType == 'resp': # All good; store just the msg content field # in a result dict that's shared with the main # thread: self.resDict[thisUuid] = thisContent # Tell main thread that answer to synchronous # call arrived, and was processed: self.resultArrivedEvent.set() else: # Not the msg we are waiting for: return def _setupLogging(self, loggingLevel, logFile): if BusAdapter._loggingInitialized: # Remove previous file or console handlers, # else we get logging output doubled: BusAdapter._logger.handlers = [] # Set up logging: # A _logger named SchoolBusLog: BusAdapter._logger = logging.getLogger('SchoolBusLog') BusAdapter._logger.setLevel(loggingLevel) # A msg formatter that shows datetime, _logger name, # the log level of the message, and the msg. # The datefmt=None causes ISO8601 to be used: formatter = logging.Formatter(fmt='%(asctime)s-%(name)s-%(levelname)s-%(module)s: %(message)s',datefmt=None) # Create file handler if requested: if logFile is not None: handler = logging.FileHandler(logFile) else: # Create console handler: handler = logging.StreamHandler() handler.setFormatter(formatter) handler.setLevel(loggingLevel) # # create formatter and add it to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') # fh.setFormatter(formatter) # ch.setFormatter(formatter) # Add the handler to the _logger BusAdapter._logger.addHandler(handler) #********************** #BusAdapter._logger.info("Info for you") #BusAdapter._logger.warn("Warning for you") #BusAdapter._logger.debug("Debug for you") #********************** BusAdapter._loggingInitialized = True def logWarn(self, msg): ''' Loccally log a warning message using the Python logging facility. The _logger name is 'SchoolBusLog'. Change format or _logger name by modifying _setupLogging(). 
:param msg: message to log :type msg: String ''' BusAdapter._logger.warn(msg) def logInfo(self, msg): ''' Locally log an info message using the Python logging facility. The _logger name is 'SchoolBusLog'. Change format or _logger name by modifying _setupLogging(). :param msg: message to log :type msg: String ''' BusAdapter._logger.info(msg) def logError(self, msg): ''' Locally log an error message using the Python logging facility. The _logger name is 'SchoolBusLog'. Change format or _logger name by modifying _setupLogging(). :param msg: message to log :type msg: String ''' BusAdapter._logger.error(msg) def logDebug(self, msg): ''' Locally log a debug message using the Python logging facility. The _logger name is 'SchoolBusLog'. Change format or _logger name by modifying _setupLogging(). :param msg: message to log :type msg: String ''' BusAdapter._logger.debug(msg)
class RedisMonitor:
    '''
    Polls Redis for pending action keys ("info:*", "timeout:*", "stop:*"),
    carries each request out against the spider queue and publishes the
    outcome to Kafka.
    '''

    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()
            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests.

        Each matching key has the form info:<spiderid>:<appid>[:<crawlid>];
        the key is deleted once the report has been built, and the report
        is then sent to Kafka.
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            request = {
                'spiderid': elements[1],
                'appid': elements[2],
            }
            if len(elements) == 4:
                request['crawlid'] = elements[3]

            # generate the information requested
            if 'crawlid' in request:
                master = self._build_crawlid_info(master, request)
            else:
                master = self._build_appid_info(master, request)

            self.redis_conn.delete(key)

            if not self._send_to_kafka(master):
                print('Failed to send info to kafka')

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka, on both the per-appid outbound
        topic and the firehose topic.

        @param master: the final dict to send; must contain an 'appid' key
        @returns: True if successfully sent to kafka, False otherwise
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
            prefix=self.topic_prefix,
            appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
            prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)
            return True
        except Exception:
            # best effort: report the failure and let the caller decide
            print(traceback.format_exc())
        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object

        @param master: the master dict
        @param dict: the dict object received ('spiderid' and 'appid' keys)
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] != dict['appid']:
                    continue

                crawlid = item['crawlid']
                # add new crawlid to master dict
                if crawlid not in master['crawlids']:
                    master['crawlids'][crawlid] = {
                        'total': 0,
                        'high_priority': -9999,
                        'low_priority': 9999,
                    }
                    timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                        sid=dict['spiderid'],
                        aid=dict['appid'],
                        cid=crawlid)
                    if self.redis_conn.exists(timeout_key):
                        master['crawlids'][crawlid]['expires'] = \
                            self.redis_conn.get(timeout_key)
                    master['total_crawlids'] += 1

                entry = master['crawlids'][crawlid]
                if item['priority'] > entry['high_priority']:
                    entry['high_priority'] = item['priority']
                if item['priority'] < entry['low_priority']:
                    entry['low_priority'] = item['priority']

                entry['total'] += 1
                master['total_pending'] += 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore

        @param key: name of the sorted set to read
        @return: dict mapping (negated) score -> list of unpickled items
        '''
        # keys based on score
        sortedDict = {}
        # this doesnt return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            # SECURITY NOTE(review): pickle.loads executes arbitrary code if
            # the queue contents are attacker-controlled; confirm that only
            # trusted writers can reach this Redis instance.
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]
            sortedDict.setdefault(my_score, []).append(my_item)

        return sortedDict

    def _build_crawlid_info(self, master, dict):
        '''
        Builds the crawlid info object

        @param master: the master dict
        @param dict: the dict object received ('spiderid', 'appid' and
            'crawlid' keys)
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
            sid=dict['spiderid'],
            aid=dict['appid'],
            cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] != dict['appid'] or \
                        item['crawlid'] != dict['crawlid']:
                    continue

                # priority bounds only appear once a matching item is seen
                if 'high_priority' not in master:
                    master['high_priority'] = -99999
                if 'low_priority' not in master:
                    master['low_priority'] = 99999

                if item['priority'] > master['high_priority']:
                    master['high_priority'] = item['priority']
                if item['priority'] < master['low_priority']:
                    master['low_priority'] = item['priority']

                master['total_pending'] += 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            raw_timeout = self.redis_conn.get(key)
            if raw_timeout is None:
                # the key vanished between SCAN and GET (e.g. removed by
                # _do_stop); float(None) would raise and kill the main loop
                continue
            timeout = float(raw_timeout)
            curr_time = time.time()
            if curr_time <= timeout:
                continue

            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]

            # add crawl to blacklist so it doesnt propagate
            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # everything stored in the queue is now expired
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "expire"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_expired'] = result

            self.redis_conn.delete(key)

            if not self._send_to_kafka(extras):
                print('Failed to send expired ack to kafka')

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                    sid=spiderid,
                    aid=appid,
                    cid=crawlid)
                self.redis_conn.delete(timeout_key)
            else:
                print('Failed to send stop ack to kafka')

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = 0
        for _ in range(3):
            total += self._mini_purge(spiderid, appid, crawlid)
        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for entry in self.redis_conn.zscan_iter(match_string):
            item_key = entry[0]
            # SECURITY NOTE(review): see _get_bin(); pickle.loads is only
            # safe on trusted queue contents.
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged += 1

        return total_purged
'name': 'id', 'type': 'int' }, { 'name': 'random', 'type': 'int' }, { 'name': 'data', 'type': 'string' }, ], })) kafka = KafkaClient(kafkaConnect) kafka.ensure_topic_exists(topic) producer = SimpleProducer(kafka) for x in xrange(maxRecords): writer = avro.io.DatumWriter(schema) bytes_writer = io.BytesIO() encoder = avro.io.BinaryEncoder(bytes_writer) writer.write( { 'id': x, 'random': randint(1, 3), 'data': str(uuid.uuid4().get_hex().upper()[0:20]) }, encoder) raw_bytes = bytes_writer.getvalue() producer.send_messages(topic, raw_bytes)
class KafkaMonitor:
    '''
    Reads JSON requests from the incoming Kafka topic, validates each one
    against the JSON schemas of the loaded plugins and hands it to every
    plugin whose schema accepts it, keeping optional stats in Redis.
    '''

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        @return: the class object named by the last dotted component
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:]
        m = __import__(cl[:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py

        Fills self.plugins_dict with {rank: {'instance': ..., 'schema': ...}}
        ordered by rank (the value configured for the plugin in PLUGINS).
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)

            schema_path = self.settings['PLUGIN_DIR'] + instance.schema
            with open(schema_path) as the_file:
                the_schema = json.load(the_file)

            self.plugins_dict[plugins[key]] = {
                'instance': instance,
                'schema': the_schema,
            }

        self.plugins_dict = OrderedDict(
            sorted(self.plugins_dict.items(), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                # named `window` so the `time` module is not shadowed
                window = getattr(StatsCollector, item)
                self.stats_dict['total'][window] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key1, t=window),
                        window=window,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][window] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key2, t=window),
                        window=window,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"
                                  .format(i=item))
            except AttributeError:
                self.logger.warning("Unable to find Stats Time '{s}'"
                                    .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    # named `window` so the `time` module is not shadowed
                    window = getattr(StatsCollector, item)
                    self.stats_dict['plugins'][plugin_name][window] = \
                        StatsCollector.get_rolling_time_window(
                            redis_conn=redis_conn,
                            key='{k}:{t}'.format(k=temp_key, t=window),
                            window=window,
                            cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug(
                        "Set up {p} plugin Stats Collector '{i}'"
                        .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"
                                        .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"
                              .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections

        Exits the process if Kafka is unavailable or the connection is not
        established within KAFKA_CONN_TIMEOUT.
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )

        @param validator_class: the jsonschema validator class to extend
        @return: a validator class that also fills in schema defaults
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            # .items() is valid on both Python 2 and Python 3
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        '''
        Drains the currently available messages from the consumer, offering
        each parsed message to the plugins in order until one of them
        returns None from handle().
        '''
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__,
                                the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            # schema mismatch: try the next plugin
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn(
                            "Did not find schema to validate "
                            "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                # NOTE(review): this also catches a ValueError raised by a
                # plugin's handle(), which is then reported as bad JSON.
                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the raw message string for the counter
        '''
        # timestamp suffix makes every message a distinct lifetime entry
        string = string + str(time.time())

        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object (dict) or raw string for
            HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the stat counters of one plugin

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter; a 'ts' key
            is added to it in place
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"
                              .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = \
                        self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
class KafkaMonitor:
    '''
    Consumes JSON requests from a Kafka topic, validates each request
    against the JSON schema of every loaded plugin and passes it to the
    plugins whose schema accepts it. Counters for total, failed and
    per-plugin requests are kept through StatsCollector when Redis is
    reachable.
    '''

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        # the logger is created later by setup()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        @return: the attribute named by the last dotted component
        @raise ImportError: if cl has no module part or the module
            cannot be imported
        '''
        module_name, _, classname = cl.rpartition(".")
        if not module_name:
            # slicing an undotted name used to import a truncated module
            # name; fail with a message that names the real problem
            raise ImportError(
                "'{0}' is not in 'module.ClassName' notation".format(cl))
        m = __import__(module_name, globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py

        Reads the PLUGINS setting, whose keys are dotted class paths and
        whose values are used as the ordering key of the plugin (a value
        of None disables the plugin). The result is stored in
        self.plugins_dict, ordered by that value.
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            # plugin setup is skipped under unit test
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) \
                    as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        # iterate plugins in ascending order of their configured value
        self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(),
                                               key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # Compare against None instead of truthiness so that an explicit
        # False still overrides the value from the settings file.
        # negate because logger wants True for std out
        if log_file is not None:
            my_output = not log_file
        else:
            my_output = self.settings['LOG_STDOUT']
        my_json = json if json is not None else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection

        Leaves self.stats_dict empty (no stats are collected) when the
        Redis connection check fails.
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warning("Failed to connect to Redis in "
                                "StatsCollector Setup, no stats will be "
                                "collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                # named 'window' so the time module is not shadowed
                window = getattr(StatsCollector, item)

                self.stats_dict['total'][window] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key1, t=window),
                        window=window,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][window] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key2, t=window),
                        window=window,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"
                                  .format(i=item))
            except AttributeError:
                # the configured name is not an attribute of StatsCollector
                self.logger.warning("Unable to find Stats Time '{s}'"
                                    .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        Requires self.plugins_dict, so plugins must be loaded first.

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    # named 'window' so the time module is not shadowed
                    window = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][window] = \
                        StatsCollector.get_rolling_time_window(
                            redis_conn=redis_conn,
                            key='{k}:{t}'.format(k=temp_key, t=window),
                            window=window,
                            cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug(
                        "Set up {p} plugin Stats Collector '{i}'"
                        .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"
                                        .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"
                              .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections

        Exits the process with status 1 when Kafka is unavailable or the
        wrapped setup does not report success.
        '''
        # NOTE(review): MethodTimer.timeout presumably returns its second
        # argument (False) when the call exceeds the timeout -- confirm
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True
        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )

        @param validator_class: the jsonschema validator class to extend
        @return: a validator class that also fills in schema defaults
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            # run the stock "properties" validation first
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            # items() instead of iteritems() so this runs on Python 2 and 3
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            # a STATS_DUMP of 0 disables the periodic dump
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        '''
        Reads the currently available messages from the consumer, parses
        each as JSON and passes it to every plugin whose schema validates
        it, stopping at the first plugin whose handle() returns None.
        Unparseable or unmatched messages are logged and counted as
        failures.
        '''
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)

                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__,
                                the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            # schema does not match, try the next plugin
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warning("Did not find schema to "
                                            "validate request",
                                            extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        # the timestamp suffix makes the value passed to the lifetime
        # counter differ between calls
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter; a dict
            gets a 'ts' key added in place, a str gets the timestamp
            appended
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the stat counters of a single plugin

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter; a 'ts'
            key is added to it in place
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"
                              .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            # 'fail' is created together with 'total'
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = \
                        self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        # plugins must be loaded before the per-plugin stats are set up
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
class KafkaDatawakeLookaheadSpout(Spout):
    """
    Spout that reads crawler output messages from a Kafka topic and emits
    the ones belonging to the configured application.
    """

    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        """
        Load the deployment settings and attach a consumer to the crawler
        output topic, positioned at the tail of the queue.

        Any failure is logged at error level together with its traceback
        and then re-raised.
        """
        try:
            deployment = stormconf['topology.deployment']
            self.settings = all_settings.get_settings(deployment)
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            banner = ('KafkaDatawakeLookaheadSpout initialized with topic ='
                      + self.topic + ' conn_pool=' + self.conn_pool)
            self.log(banner)

            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(
                self.kafka,
                self.group,
                self.topic,
                max_buffer_size=None,
            )
            # move to the tail of the queue
            self.consumer.seek(0, 2)
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        Fetch one message from the consumer and emit it when its appid
        matches the configured one.

        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body = html,
                 attrs = input['attrs']
            )

        :return: (url, status, headers, flags, body, timestamp, source,context)
        """
        fetched = self.consumer.get_messages(timeout=None)
        record = json.loads(fetched[0].message.value)

        # messages of other applications are dropped without emitting
        if record['appid'] != self.settings["appid"]:
            return

        url = record['url']
        printable_url = url.encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + record['crawlid']
                 + " url: " + printable_url)

        context = dict(source='datawake-lookahead',
                       domain=record['attrs']['domain'])
        # the headers and flags slots are emitted as empty strings
        self.emit([
            url,
            record['status_code'],
            '',
            '',
            record['body'],
            record['timestamp'],
            context['source'],
            context,
        ])
class KafkaMonitor:
    '''
    Reads JSON requests from a Kafka topic, validates them against a
    single JSON schema and stores valid requests in Redis through the
    handler named by the SCHEMA_METHOD setting.
    '''

    def __init__(self, settings):
        '''
        @param settings: the settings file name, with or without the
            trailing .py
        '''
        # dynamic import of settings file
        # remove the .py from the filename, but only when it is really
        # there so a bare module name is not truncated
        if settings.endswith('.py'):
            settings = settings[:-3]
        self.settings = importlib.import_module(settings)

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        '''
        Returns the handler method for a configured handler name

        @param key: 'handle_crawl_request' or 'handle_action_request'
        @raise AttributeError: for any other key
        '''
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        '''
        Connects to Redis and Kafka and prepares the result handler and
        the schema validator
        '''
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )

        @param validator_class: the jsonschema validator class to extend
        @return: a validator class that also fills in schema defaults
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            # run the stock "properties" validation first
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            # items() instead of iteritems() so this runs on Python 2 and 3
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        # the crawl id is an optional last key component
        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError:
                            print("invalid json received")

                    except ValueError:
                        print("bad json recieved")
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print("=> feeding JSON request into {0}...".format(topic))
        print(json.dumps(json_item, indent=4))
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print("=> done feeding request.")
# `args` and `maxRecords` are defined earlier in the script.
kafkaConnect = args["k"]
topic = args["t"]
quiet = args["q"]

# Avro record schema of the generated test messages
schema = avro.schema.parse(json.dumps({
    'name': 'kafkatest',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': 'id', 'type': 'int'},
        {'name': 'random', 'type': 'int'},
        {'name': 'data', 'type': 'string'},
    ],
}))

kafka = KafkaClient(kafkaConnect)
kafka.ensure_topic_exists(topic)
producer = SimpleProducer(kafka)

writer = avro.io.DatumWriter(schema)

for x in xrange(maxRecords):
    # Use a fresh buffer for every record: with one shared BytesIO,
    # getvalue() returned all records written so far, so each message
    # repeated every earlier record in front of the new one.
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(
        {
            'id': x,
            'random': randint(1, 3),
            # first 20 characters of an upper-cased random UUID hex string
            'data': str(uuid.uuid4().get_hex().upper()[0:20]),
        },
        encoder)
    raw_bytes = bytes_writer.getvalue()
    producer.send_messages(topic, raw_bytes)
    if not quiet:
        print("Sent message ID: " + str(x))