# Standard-library imports inferred from the usage below; the aliasing of
# encode/jdumps/mdumps is an assumption consistent with how they are called.
from base64 import b64encode as encode
from datetime import datetime
from json import dumps as jdumps
from marshal import dumps as mdumps
from multiprocessing import Value
from multiprocessing.managers import BaseProxy, SyncManager
from os import curdir, getpid, makedirs, path
from socket import error as SocketError
from sys import stderr, stdout
from threading import Lock, Thread, currentThread
from time import sleep
from types import FunctionType
from uuid import uuid1
from zlib import compress

# Project-local imports -- these module paths are assumptions about the
# package layout and may need adjusting to match the actual project:
from daemon import Daemon
from job import Job, JobEncoder
from status import Status
from worker import Worker
from network import getipaddress


class Node(Daemon):
   """
   Node is started up on the remote instance via the bootstrapping process for
   that instance. The node is responsible for tracking active streams and
   managing the workers that process the jobs from those streams. If a stream
   goes idle (i.e., there are no more jobs in the stream's queue and all
   workers have died) then Node will stop tracking the stream. If jobs
   re-appear on the stream, Node will spawn new workers to process those jobs.
   If a new stream appears, Node will spawn new workers to process the jobs on
   that stream. Each worker is an independent concurrent process that inherits
   the stream to process from the Node.
   """

   def __init__(self, queue, qauthkey, mpps= 5, dfs= None, dauthkey= None, logdir= curdir, piddir= curdir, **properties):
      """Initialize the Node's I/O streams and connect to the Queue and/or DFS."""

      self.id= getipaddress()
      self.queue= queue
      self.qauthkey= qauthkey
      self.mpps= mpps
      self.dfs= dfs
      self.dauthkey= dauthkey
      self.properties= properties
      self.shutdown= Value('i', 0)
      self.workers= {}
      self.alive= True
      self.start_time= datetime.utcnow()

      self.connect()

      super(Node, self).__init__(
         pidfile= path.join(piddir, self.__class__.__name__ + ".pid"),
         stdout= path.join(logdir, self.__class__.__name__ + ".out"),
         stderr= path.join(logdir, self.__class__.__name__ + ".err"),
         stdin= path.join(logdir, self.__class__.__name__ + ".in")
      )

   def connect(self):
      """Connects to the Queue and, if one was configured, the DFS for which the Node was initialized."""

      self.qconnect()
      if self.dfs is not None and None not in self.dfs:
         self.dconnect()

   def qconnect(self):
      """
      Attempts to connect to the Queue on the host/port for which the Node was
      initialized. If no connection can be made, Node will keep attempting to
      connect until a connection can be established. Once a connection is
      established the requested remote methods will be registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread local storage which caches the connection,
      # which is reused for future connections causing "broken pipe" errors on
      # creating a new manager.
      if self.queue in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.queue][0], 'connection'):
            del BaseProxy._address_to_local[self.queue][0].connection

      # register handlers
      SyncManager.register("get_streams")
      SyncManager.register("get_queue")
      SyncManager.register("get_store")
      SyncManager.register("get_properties")

      print "connecting to queue", self.queue
      while self.alive:
         try:
            self.impq= SyncManager(address= self.queue, authkey= self.qauthkey)
            self.impq.connect()
            print "connected to queue", self.queue
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)
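   # The remote methods registered in qconnect() become proxies on self.impq,
   # e.g. (illustrative):
   #    streams= self.impq.get_streams()         # shared {stream_id: ...} dict
   #    queue= self.impq.get_queue(stream_id)    # per-stream priority queue
   #    store= self.impq.get_store(stream_id)    # per-stream job/result store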
   def dconnect(self):
      """
      Attempts to connect to the DFS on the host/port for which the Node was
      initialized. If no connection can be made, Node will keep attempting to
      connect until a connection can be established. Once a connection is
      established the requested remote methods will be registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread local storage which caches the connection,
      # which is reused for future connections causing "broken pipe" errors on
      # creating a new manager.
      if self.dfs in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.dfs][0], 'connection'):
            del BaseProxy._address_to_local[self.dfs][0].connection

      # register handlers
      SyncManager.register("get_nodes")

      print "connecting to dfs", self.dfs
      while self.alive:
         try:
            self.impd= SyncManager(address= self.dfs, authkey= self.dauthkey)
            self.impd.connect()
            print "connected to dfs", self.dfs
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)

   def process(self):
      """
      Starts tracking streams. When a stream is found which matches the Node's
      criteria, workers are assigned to the stream and spawned to start
      processing jobs from the stream's queue. When the stream goes idle and
      all workers for that stream have died, the Node will stop tracking the
      stream until new jobs appear on it. Node will limit the number of
      workers it can spawn per stream to the configured amount. If Node was
      started with the --dfs option then status updates about how many
      streams, workers and jobs are being processed will continually be sent
      back to DFS at a configurable rate.
      """

      print "processing", self.mpps

      # get list of stream proxies
      streams= self.impq.get_streams()
      streams_tracking= {}

      # if reporting to DFS
      # track nodes via shared dict else maintain local dict
      if hasattr(self, 'impd'):
         nodes= self.impd.get_nodes()

      idle_time= datetime.utcnow()
      while self.alive:

         # get list of streams we are not currently tracking
         streams_to_track= filter(lambda stream_id: stream_id not in streams_tracking.keys(), streams.keys())

         # if we are only tracking streams with specific properties
         # TODO: need to think through this more
         """
         if len(self.properties):

            # get properties for all the streams we are tracking
            stream_properties= [dict(self.impq.get_properties(stream_id)) for stream_id in streams_to_track]

            # filter out streams we want to track based on matching subsets of properties
            if "id" in self.properties:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(sp.items()).issubset(self.properties.items()), stream_properties))
            else:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(filter(lambda (property_name, property_value): property_name != 'id', sp.items())).issubset(self.properties.items()), stream_properties))
         """

         for stream_id in streams_to_track:
            print "tracking stream", stream_id
            streams_tracking.update([(stream_id, (self.impq.get_queue(stream_id), self.impq.get_store(stream_id), self.properties))])

         # stop tracking streams which are no longer active
         for stream_id in streams_tracking.keys():
            if stream_id not in streams.keys():
               print 'stopped tracking stream', stream_id
               streams_tracking.pop(stream_id)

         # stop tracking workers which are no longer active
         for (pid, worker) in self.workers.items():
            if not worker.is_alive():
               #print "worker dead", pid, worker.stream_id
               self.workers.pop(pid)
            else:
               idle_time= datetime.utcnow()

         # create workers for streams we are currently tracking
         for (stream_id, (queue, store, properties)) in streams_tracking.items():
            qsize= queue.qsize()
            stream_workers= filter(lambda w: w.stream_id == stream_id, self.workers.values())
            num_stream_workers= min(qsize, self.mpps - len(stream_workers))
            if num_stream_workers:
               print "creating %s workers for %s" % (num_stream_workers, stream_id)
               for i in range(1, num_stream_workers + 1):
                  worker= Worker(self.id, stream_id, queue, store, properties, self.shutdown)
                  worker.start()
                  self.workers.update([(worker.pid, worker)])
                  idle_time= datetime.utcnow()
                  print "created worker", i, worker.pid, stream_id
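         # Worked example with illustrative numbers: with mpps= 5, a stream
         # whose queue holds 12 jobs and which still has 2 live workers gets
         # min(12, 5 - 2) == 3 new workers, capping it at 5 concurrent workers.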
worker", i, worker.pid, stream_id status= Status( mpps= self.mpps, streams= len(streams_tracking), workers= len(self.workers), starttime= self.start_time, uptime= datetime.utcnow() - self.start_time, lastactivity= idle_time, idletime= datetime.utcnow() - idle_time, properties= self.properties, pid= getpid() ) if hasattr(self, 'impd'): nodes.update([(self.id, status)]) # if a worker has been blocked then stop all workers and shutdown # this will then cause the Node to go idle and DFS will shut it down # and restart a new node to take it's place if self.shutdown.value: print >> stderr, "node blocked", self.id self.alive= False sleep(1) self.stop() def stop(self): """ Starts the shutdown process for the Node. Waits for all workers to finish their activity. If Node was started with the --dfs option then it will de-register itself with DFS. """ # wait for workers to finish before shutting down print "shutting down node..." self.alive= False for (pid, worker) in self.workers.items(): print "waiting for worker:", pid, worker.stream_id worker.join() # if reporting to DFS # track nodes via shared dict else maintain local dict if hasattr(self, 'impd'): print "de-registering nodes with dfs" nodes= self.impd.get_nodes() try: del nodes[self.id] except KeyError: print >> stderr, "node not registered with dfs", self.id print "node shutdown complete." super(Node, self).stop() def run(self): """ Starts processing the streams which match the given properties of the Node. If a connection error between Node and Queue/DFs occurs Node will continually try to re-establish connection. """ while self.alive: try: self.process() except (KeyboardInterrupt, Exception) as e: if type(e) == KeyboardInterrupt: self.stop() else: print >> stderr, "queue/dfs communication error", str(e) self.connect() sleep(1)
# module-level counter used by the @startup/@process/@shutdown decorators
# below to record the declaration order of the decorated methods
_declaration_order= 0


class Impetus(object):
   """
   Multi-threaded library for interfacing with the Impetus system. Hides
   threading considerations from the client. Determines callback methods
   through introspection if callbacks are not explicitly stated. Decorators
   are provided for the client to indicate methods which run on the remote
   nodes and local process methods which consume the results. Creates a single
   stream per instance. The client can create additional streams through the
   Queue's remote methods via the "impq" handler.
   """

   statuses= ("forked", "processed")

   def __init__(self, address, authkey, taskdir= "tasks", id= None, **properties):
      """Creates a stream and retrieves the stream's priority queue and data-store."""

      self.id= id if id else str(uuid1())
      self.ipaddress= getipaddress()
      self.address= address
      self.taskdir= path.join(taskdir, self.id)
      self.properties= properties

      self.impq= SyncManager(address= self.address, authkey= authkey)
      self.impq.register("get_streams")
      self.impq.register("create_stream")
      self.impq.register("delete_stream")
      self.impq.register("get_store")
      self.impq.register("get_queue")
      self.impq.connect()

      self.jobs= []
      self.impq.create_stream(id= self.id, ipaddress= self.ipaddress, **properties)
      self.store= self.impq.get_store(id= self.id)
      self.queue= self.impq.get_queue(id= self.id)
      self.alive= True
      self._current_thread= None
      self._lock= Lock()
      self.threads= []
      self.errors= {}
      self.ready= {}
      self._progress= {}

      try:
         makedirs(self.taskdir)
      except OSError:
         # task directory may already exist
         pass

   def __del__(self):
      """Deletes the stream that was created during initialization."""

      self.impq.delete_stream(self.id)

   @staticmethod
   def node(target):
      """
      All methods that are to run on remote nodes must be static methods, as
      the context in which the method was defined cannot be serialized.
      """

      return target

   @staticmethod
   def startup(target):
      """
      Sets up the startup method for the object to run as a thread.
      """

      def _process(self):
         target(self)

      global _declaration_order
      _process.order= _declaration_order
      _declaration_order+= 1
      return _process

   @staticmethod
   def shutdown(target):
      """
      Sets up the shutdown method to be executed after all threads have been
      terminated. The ready and errors parameters will contain a dict of
      file-handles pointing to the results files (i.e.,
      ../tasks/<task_id>/<method>.ok and .err) for each @process method.
      """

      def _shutdown(self):
         target(self, self.ready, self.errors, self._progress)

      global _declaration_order
      _shutdown.order= _declaration_order
      return _shutdown
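   # The @startup and @process methods are chained in declaration order into a
   # pipeline of threads, each consuming the jobs forked by its predecessor,
   # e.g. (names illustrative):
   #
   #    begin -> parse_results -> report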
""" def _process(self): current_thread= currentThread() if current_thread.name == 'MainThread': return previous_thread= current_thread.previous_thread while self.alive: self._thread_regulator(current_thread, previous_thread) with self._lock: jobs= filter(lambda job: job.get("callback") == current_thread.name, self.store.values()) ready= filter(lambda job: job.get("status") == "ready", jobs) errors= filter(lambda job: job.get("status") == "error", jobs) for job in ready: self.ready[current_thread.name].write(encode(compress(jdumps(job, cls= JobEncoder))) + "\n") self.store.pop(job.get("id")) for job in errors: self.errors[current_thread.name].write(encode(compress(jdumps(job, cls= JobEncoder))) + "\n") self.store.pop(job.get("id")) if len(ready) or len(errors): target(self, ready, errors) self._thread_progress(current_thread.name, "processed", len(ready) + len(errors)) self._show_progress(current_thread) if len(self.store) == 0 and previous_thread != None and previous_thread.is_alive() == False: print "%s %s completed" % (datetime.utcnow(), current_thread.name) stdout.flush() self.alive= False sleep(0.01) global _declaration_order _process.order= _declaration_order _declaration_order+= 1 return _process def fork(self, target, args, callback= None, priority= None, job_id= None, **properties): """ Turns the target method to be forked into byte-code and creates a Job. The Job is initialized to the starting state and placed the the streams priorty queue for execution. """ if self.properties.get('mss'): stall_time= 1 while len(self.store) > int(self.properties.get('mss')): print "throttling size %s exceeds mss %s" % (len(self.store), self.properties.get('mss')) sleep(stall_time) stall_time+= 1 if stall_time >= 10: break current_thread= currentThread() job= Job( client= {"id": self.id, "ipaddress": self.ipaddress}, name= target.func_name, code= encode(compress(mdumps(target.func_code))), args= args, callback= callback if callback else current_thread.next_thread.name, result= None, transport= None, **properties ) if priority: setattr(job, "priority", priority) self.store.update([(job.get("id"), job)]) self.queue.put([(job.get("priority"), job.get("id"))]) #print "forked", len(self.store) self.jobs.append(job.get("id")) self._thread_progress(current_thread.name, "forked", 1) return job.get("id") def _thread_progress(self, name, status, count): """ Keeps track of how many jobs the current thread has forked/processed. """ with self._lock: progress= self._progress.get(name, dict([(s, 0) for s in self.statuses])) progress.update([(status, progress.get(status, 0) + count)]) self._progress.update([(name, progress)]) def _show_progress(self, current_thread): """Displays the current threads progress to stdout.""" msg= [] with self._lock: for thread in self.threads: progress= self._progress.get(thread.name, dict([(s, 0) for s in self.statuses])) msg.append("%s %s/%s -> " % (thread.name, progress.get("forked"), progress.get("processed"))) print "thread: %s via %s" % (''.join(msg)[:-4], current_thread.name) def _thread_regulator(self, current_thread, previous_thread): """ Regulates the current thread so all threads have an eventual chance to run. Thread scheduling is handled by the operating-system. If the operating-system repeatively schedules the same thread than that thread is immediately put to sleep so the operating-system can schedule a new thread. 
""" stall_time= 1 while self._current_thread == current_thread: #print "stalling:", current_thread.name, stall_time sleep(stall_time) stall_time+= 1 if stall_time >= 10: break if current_thread.name == self.threads[-1].name and previous_thread != None and previous_thread.is_alive() == False: with self._lock: self._current_thread= self.threads[0] with self._lock: #print "setting current thread", current_thread.name self._current_thread= current_thread def _create_thread(self, name, method): """ Creates thread for the @process method as well as error/ready file handlers for which all jobs in an error/ready state are written to. All threads are maintained in an internal thread list. """ thread= Thread(target= method, name= name, args= (self, )) self.errors[name]= open(path.join(self.taskdir, '.'.join((name, "err"))), 'ab+') self.ready[name]= open(path.join(self.taskdir, '.'.join((name, "ok"))), 'ab+') return thread def _link_threads(self, threads): """ Creates previous/next properties for each thread based on the threads declaration order. """ for i in range(len(threads)): setattr(threads[i], "previous_thread", threads[i-1] if i > 0 else None) setattr(threads[i], "next_thread", threads[i+1] if i < len(threads)-1 else None) return threads[0] def _start_threads(self, threads): """Starts all threads based on their delcaration order.""" [thread.start() for thread in threads] [thread.join() for thread in threads] def run(self): self.threads= [self._create_thread(name, method) for (name, method) in sorted(filter(lambda (name, method): type(method) == FunctionType and method.__name__ == "_process", self.__class__.__dict__.items()), key= lambda (name, method): method.order)]