class Node(Daemon):
    """
    Node is started up on the remote instance via the bootstrapping process
    for that instance.

    The node is responsible for tracking active streams and managing the
    workers that process the jobs from those streams. If a stream goes idle
    (ie, there are no more jobs in the stream's queue and all workers have
    died) then Node will stop tracking the stream. If jobs re-appear on the
    stream Node will spawn new workers to process those jobs. If a new stream
    appears Node will spawn new workers to process the jobs on that stream.

    Each worker is an independent concurrent process that inherits the stream
    to process from the Node.
    """

    def __init__(self, queue, qauthkey, mpps=5, dfs=None, dauthkey=None,
                 logdir=curdir, piddir=curdir, **properties):
        """
        Initialize the Node's I/O streams and connect to the Queue and/or DFS.

        queue      -- address of the Queue manager (passed to SyncManager).
        qauthkey   -- authkey used for the Queue connection.
        mpps       -- upper bound on live workers per stream.
        dfs        -- address of the DFS manager, or None to run without DFS.
        dauthkey   -- authkey used for the DFS connection.
        logdir     -- directory for the <ClassName>.out/.err/.in files.
        piddir     -- directory for the <ClassName>.pid file.
        properties -- extra keyword arguments; handed to every Worker and
                      reported in the status sent to DFS.

        NOTE: connect() is called here and retries until it succeeds, so the
        constructor blocks while the Queue (or DFS) is unreachable.
        """
        self.id = getipaddress()
        self.queue = queue
        self.qauthkey = qauthkey
        self.mpps = mpps
        self.dfs = dfs
        self.dauthkey = dauthkey
        self.properties = properties
        # shared with every Worker; process() shuts the node down once it
        # becomes non-zero
        self.shutdown = Value('i', 0)
        self.workers = {}
        self.alive = True
        self.start_time = datetime.utcnow()

        self.connect()

        name = self.__class__.__name__
        super(Node, self).__init__(
            pidfile=path.join(piddir, name + ".pid"),
            stdout=path.join(logdir, name + ".out"),
            stderr=path.join(logdir, name + ".err"),
            stdin=path.join(logdir, name + ".in")
        )

    @staticmethod
    def _log(*parts):
        """Write parts, space separated, as a single line on stdout."""
        # function-scope import: only `stderr` is known to be imported by
        # the module
        import sys
        sys.stdout.write(" ".join("%s" % (part,) for part in parts) + "\n")

    @staticmethod
    def _log_error(*parts):
        """Write parts, space separated, as a single line on stderr."""
        stderr.write(" ".join("%s" % (part,) for part in parts) + "\n")

    @staticmethod
    def _forget_cached_connection(address):
        """Drop the connection BaseProxy has cached for the given address."""
        # BaseProxy class has thread local storage which caches the
        # connection, which is reused for future connections causing
        # "broken pipe" errors on creating a new manager.
        if address in BaseProxy._address_to_local:
            cached = BaseProxy._address_to_local[address][0]
            if hasattr(cached, 'connection'):
                del cached.connection

    def connect(self):
        """
        Connects to the Queue and/or DFS on the host/port for which the Node
        was initialized.

        The DFS connection is skipped when no DFS address was given (None)
        or when the address contains a None component.
        """
        self.qconnect()
        # dfs defaults to None, and `None not in None` raises TypeError
        if self.dfs is not None and None not in self.dfs:
            self.dconnect()

    def qconnect(self):
        """
        Attempts to connect to the Queue on the host/port for which the Node
        was initialized.

        The remote methods used by the Node are registered on SyncManager
        first. If no connection can be made, Node will keep attempting to
        connect (once a second) until a connection can be established or the
        Node is no longer alive.
        """
        self._forget_cached_connection(self.queue)

        # register handlers
        SyncManager.register("get_streams")
        SyncManager.register("get_queue")
        SyncManager.register("get_store")
        SyncManager.register("get_properties")

        self._log("connecting to queue", self.queue)
        while self.alive:
            try:
                self.impq = SyncManager(address=self.queue, authkey=self.qauthkey)
                self.impq.connect()
                self._log("connected to queue", self.queue)
                break
            except (EOFError, IOError, SocketError) as e:
                self._log("could not connect ...trying again", str(e))
                sleep(1)

    def dconnect(self):
        """
        Attempts to connect to the DFS on the host/port for which the Node
        was initialized.

        The remote method used by the Node is registered on SyncManager
        first. If no connection can be made, Node will keep attempting to
        connect (once a second) until a connection can be established or the
        Node is no longer alive.
        """
        self._forget_cached_connection(self.dfs)

        # register handlers
        SyncManager.register("get_nodes")

        self._log("connecting to dfs", self.dfs)
        while self.alive:
            try:
                # the presence of self.impd is what process() and stop() use
                # to decide whether to report to DFS
                self.impd = SyncManager(address=self.dfs, authkey=self.dauthkey)
                self.impd.connect()
                self._log("connected to dfs", self.dfs)
                break
            except (EOFError, IOError, SocketError) as e:
                self._log("could not connect ...trying again", str(e))
                sleep(1)

    def process(self):
        """
        Starts tracking streams.

        Every stream published by the Queue is tracked, and workers are
        spawned to process jobs from the stream's queue. A stream that
        disappears from the Queue is no longer tracked. Node limits the
        number of live workers per stream to the configured amount (mpps).

        If Node is connected to DFS, a status update (streams, workers,
        uptime, idle time, ...) is written to the DFS nodes dict on every
        cycle, roughly once a second.

        The loop ends when the Node is no longer alive or when a worker
        raises the shared shutdown flag; stop() is then called.
        """
        self._log("processing", self.mpps)

        # get list of streams proxies
        streams = self.impq.get_streams()
        streams_tracking = {}

        # if reporting to DFS, status is published through its shared dict
        nodes = self.impd.get_nodes() if hasattr(self, 'impd') else None

        idle_time = datetime.utcnow()

        while self.alive:

            # one snapshot of the published stream ids per cycle; keys() is a
            # call on the remote proxy, so avoid repeating it per stream
            active_ids = list(streams.keys())

            # TODO: need to think through this more.
            # Idea: when self.properties is non-empty, only track streams
            # whose properties (self.impq.get_properties(stream_id)) are a
            # subset of self.properties, ignoring the stream's "id" property
            # unless the Node itself was given an "id" property.

            # start tracking streams we are not currently tracking
            for stream_id in active_ids:
                if stream_id in streams_tracking:
                    continue
                self._log("tracking stream", stream_id)
                streams_tracking[stream_id] = (
                    self.impq.get_queue(stream_id),
                    self.impq.get_store(stream_id),
                    self.properties,
                )

            # stop tracking streams which are no longer active
            # (iterate over a copy: entries are removed inside the loop)
            for stream_id in list(streams_tracking):
                if stream_id not in active_ids:
                    self._log('stopped tracking stream', stream_id)
                    streams_tracking.pop(stream_id)

            # stop tracking workers which are no longer active; any live
            # worker counts as activity
            for pid, worker in list(self.workers.items()):
                if worker.is_alive():
                    idle_time = datetime.utcnow()
                else:
                    self.workers.pop(pid)

            # create workers for streams we are currently tracking
            for stream_id, (queue, store, properties) in list(streams_tracking.items()):

                qsize = queue.qsize()
                busy = sum(1 for w in self.workers.values() if w.stream_id == stream_id)
                # never more workers than queued jobs or free slots, and
                # never a negative count
                num_stream_workers = max(0, min(qsize, self.mpps - busy))
                if not num_stream_workers:
                    continue

                self._log("creating %s workers for %s" % (num_stream_workers, stream_id))
                for i in range(1, num_stream_workers + 1):
                    worker = Worker(self.id, stream_id, queue, store, properties, self.shutdown)
                    worker.start()
                    self.workers[worker.pid] = worker
                    idle_time = datetime.utcnow()
                    self._log("created worker", i, worker.pid, stream_id)

            now = datetime.utcnow()
            status = Status(
                mpps=self.mpps,
                streams=len(streams_tracking),
                workers=len(self.workers),
                starttime=self.start_time,
                uptime=now - self.start_time,
                lastactivity=idle_time,
                idletime=now - idle_time,
                properties=self.properties,
                pid=getpid()
            )

            if nodes is not None:
                nodes.update([(self.id, status)])

            # if a worker has been blocked then stop all workers and shutdown
            # this will then cause the Node to go idle and DFS will shut it
            # down and restart a new node to take its place
            if self.shutdown.value:
                self._log_error("node blocked", self.id)
                self.alive = False

            sleep(1)

        self.stop()

    def stop(self):
        """
        Starts the shutdown process for the Node.

        Waits for all workers to finish their activity. If Node is connected
        to DFS it de-registers itself there; de-registration is best effort,
        so an unreachable DFS does not prevent the shutdown from completing.
        """
        # wait for workers to finish before shutting down
        self._log("shutting down node...")
        self.alive = False

        for pid, worker in list(self.workers.items()):
            self._log("waiting for worker:", pid, worker.stream_id)
            worker.join()

        # if reporting to DFS, remove this node from its shared dict
        if hasattr(self, 'impd'):
            self._log("de-registering nodes with dfs")
            try:
                nodes = self.impd.get_nodes()
                del nodes[self.id]
            except KeyError:
                self._log_error("node not registered with dfs", self.id)
            except (EOFError, IOError, SocketError) as e:
                # a dead DFS connection must not abort the shutdown
                self._log_error("could not de-register node with dfs", self.id, str(e))

        self._log("node shutdown complete.")
        super(Node, self).stop()

    def run(self):
        """
        Starts processing the streams which match the given properties of
        the Node.

        If a connection error between Node and Queue/DFS occurs, Node will
        continually try to re-establish the connection. A KeyboardInterrupt
        shuts the Node down.
        """
        while self.alive:
            try:
                self.process()
            except KeyboardInterrupt:
                self.stop()
            except Exception as e:
                self._log_error("queue/dfs communication error", str(e))
                self.connect()
            sleep(1)