class DNACurveNode(ApplicationNode):
    """
    Sample application node that runs a DNA curve analysis by handing a
    DNACurveTaskSystem to the computation framework.
    """
    implements(IApp)

    factor = IntItem('dnasample', 'factor', 1,
        """How many workloads does a single task get assigned, in our a workload is considered a row""")

    def app_init(self):
        """
        Initialization hook invoked right before the main entry (used in
        place of the constructor). Defers entirely to the parent class.
        """
        super(DNACurveNode, self).app_init()

    def app_main(self):
        """
        Main entry of the application; returns whatever the parent
        implementation returns.
        """
        return super(DNACurveNode, self).app_main()

    def get_task_system(self):
        """
        Build the task system to submit. Called from the base class once we
        are connected to a MasterNode and can send computation tasks over.

        Also records the start timestamp used by work_finished() to report
        the total run time.
        """
        self.start_time = time.time()
        sequence = "ATGCAAATTG" * 1000
        system = DNACurveTaskSystem(sequence, "trifonov", name="Example",
                                    maxlen=1024 * 1024, factor=self.factor)
        self.dna_system = system
        return system

    def work_finished(self, result, task_system):
        """
        Called when the work is done; `result` is what our ITaskSystem sent
        back. Prints the elapsed time and then stops the main loop.
        """
        try:
            elapsed = time.time() - self.start_time
            print("Total time: {}".format(elapsed))
        except:
            # Reporting is best effort; the main loop is stopped regardless.
            traceback.print_exc()
        self.shutdown_main_loop()

    def push_tasksystem_response(self, result):
        """
        The ITaskSystem was accepted by the framework.
        """
        self.log.info("Tasks system send to computation framework")

    def push_tasksystem_failed(self, result):
        """
        The ITaskSystem could not be pushed to the computation framework.
        Logs the failure and, when present, the "t" entry of `result`.
        """
        self.log.error("Tasks system failed to be send to framework!")
        # Nothing more to report unless the result carries a traceback
        if "t" not in result:
            return
        self.log.error(result["t"])
class Pickler(Component):
    """
    Class responsible for pickling and unpickling objects.

    Offers plain (string), gzip-file and encrypted/base64 variants. Every
    pickling failure is reported as PickleException and every unpickling
    failure as UnpickleException.

    NOTE(security): unpickling executes arbitrary code embedded in the
    payload. The unpickle_* methods must only be fed data from trusted
    peers; the encrypted variant relies on `secret` being changed from its
    default value.
    """
    implements(IPickler)

    pickle_protocol = IntItem(
        'pickler', 'protocol', pickle.HIGHEST_PROTOCOL,
        """Protocol used when pickling, by default pickle.HIGHEST_PROTOCOL""")
    secret = ConfigItem(
        'pickler', 'secret', 'JhTv535Vg385V',
        """Default salt used on decrypting encrypting a pickle""")
    # salt size in bytes
    salt_size = IntItem('pickler', 'salt_size', 16,
                        """Size of the salt used in the encryption process""")
    # number of iterations in the key generation
    num_iterations = IntItem(
        'pickler', 'num_iterations', 20,
        """Number of iterations used in the key generation""")
    # the size multiple required for AES
    aes_padding = IntItem('pickler', 'aes_padding', 16,
                          """Padding used for AES encryption""")

    def __init__(self):
        """
        Create the crypto helper from the configured salt size, iteration
        count and AES padding, and warn when the default secret is in use.
        """
        super(Pickler, self).__init__()
        self.crypto_helper = CryptoHelper(self.salt_size, self.num_iterations,
                                          self.aes_padding)
        if self.secret == Pickler.secret.default.decode('utf-8'):
            self.log.warn(
                "Pickler using default secret, please setup you own to avoid security vulnerabilities!"
            )

    def pickle_f(self, fname, obj):
        """
        Pickle an object into a gzip-compressed file.

        :param fname: path of the file to write
        :param obj: object to pickle
        :raises PickleException: if the file cannot be written or the
            object cannot be pickled
        """
        try:
            # The context manager guarantees the gzip stream is flushed and
            # closed even when pickling fails half way through.
            with gzip.open(fname, "wb") as stream:
                pickle.dump(obj, stream, self.pickle_protocol)
        except Exception:
            raise PickleException()

    def unpickle_f(self, fname):
        """
        Unpickle an object from a gzip-compressed file.

        :param fname: path of the file to read
        :returns: the unpickled object
        :raises UnpickleException: if the file cannot be read or decoded
        """
        try:
            with gzip.open(fname, "rb") as stream:
                return pickle.load(stream)
        except Exception:
            raise UnpickleException()

    def pickle_s(self, obj):
        """
        Pickle an object and return the pickled string.

        :raises PickleException: if the object cannot be pickled
        """
        try:
            return pickle.dumps(obj, protocol=self.pickle_protocol)
        except Exception:
            raise PickleException()

    def pickle_encode_s(self, obj):
        """
        Pickle an object, encrypt it with the configured secret and return
        the result base64 encoded.

        :raises PickleException: if pickling, encryption or encoding fails
        """
        try:
            encrypted = self.crypto_helper.encrypt(self.pickle_s(obj),
                                                   self.secret)
            return base64.b64encode(encrypted)
        except Exception:
            raise PickleException()

    def unpickle_s(self, pickle_string):
        """
        Unpickle a string and return the object.

        :raises UnpickleException: if the string cannot be unpickled
        """
        try:
            return pickle.loads(pickle_string)
        except Exception:
            raise UnpickleException()

    def unpickle_decode_s(self, pickle_string):
        """
        Decode a base64 string, decrypt it with the configured secret and
        return the unpickled object.

        :raises UnpickleException: if decoding, decryption or unpickling
            fails
        """
        try:
            decrypted = self.crypto_helper.decrypt(
                base64.b64decode(pickle_string), self.secret)
            return self.unpickle_s(decrypted)
        except Exception:
            raise UnpickleException()
class ZMQTaskManager(Component, threading.Thread):
    """
    Simple task manager used in simple single job applications.

    A listener thread pulls tasks from the master's backend port and puts
    them on a multiprocessing queue that is consumed by a pool of
    TaskProcess workers. Results are published on the queue returned by
    get_results_queue().
    """
    implements(ITaskManager)

    num_workers = IntItem('ZMQTaskManager', 'num_workers', -1,
        """Number of worker processed to be created, -1 will spawn as much as physical cores.""")
    master_backend_port = HostItem('ZMQTaskManager', 'master_backend_port', 'localhost:5001',
        """Masters backend port where we will request tasks.""")

    def __init__(self):
        threading.Thread.__init__(self)
        Component.__init__(self)

        # Some thread related stuff
        self.daemon = True
        self.kill_switch = False

        # Context used by the listener socket created in run()
        self.context = zmq.Context()

        # Initialize base manager stuff
        self._num_workers = 0
        self.results = multiprocessing.JoinableQueue()

    def init(self, identity, address):
        """
        Initialize the manager.

        :param identity: identity handed to every worker process
        :param address: (host, port) pair handed to every worker process
        """
        self.identity = identity
        self.host = address[0]
        self.port = address[1]

        self._create_workers(self.num_workers)

        context = zmq.Context()
        self.ventilator_send = context.socket(zmq.PUSH)
        self.ventilator_send.bind("tcp://127.0.0.1:%d" % WORKER_PORT)

    def _create_workers(self, requested):
        """
        Create a fresh task queue and one (not yet started) TaskProcess per
        worker. A non-positive `requested` means one worker per CPU.
        """
        self._num_workers = requested
        if self._num_workers <= 0:
            self._num_workers = multiprocessing.cpu_count()

        self.tasks = multiprocessing.JoinableQueue()
        self.processes = [TaskProcess(self.results, i, self.tasks, self.identity, self.host, self.port)
                          for i in range(self._num_workers)]

    def _start_workers(self):
        """
        Start every worker process of the current pool as a daemon.
        """
        for worker in self.processes:
            worker.daemon = True
            worker.start()

    def get_num_workers(self):
        """
        Return the number of workers we use for our processing
        """
        return self._num_workers

    def start(self):
        """
        Start the listener thread and our worker processes
        """
        threading.Thread.start(self)
        self._start_workers()

    def stop(self):
        """
        Stop our worker processes and the listener thread
        """
        self.log.info("Shutting down ZMQTaskManager")

        # One poison pill per worker
        for _ in range(self._num_workers):
            self.tasks.put(None)

        # Poison for result listener
        self.results.put(None)

        # Kill our own thread; run() leaves its loop on
        # zmq.ContextTerminated once the context is terminated.
        self.kill_switch = True
        self.context.term()
        # NOTE(review): Thread.join takes seconds, so this waits up to
        # 5000 s; the value looks like it was meant as milliseconds - confirm.
        self.join(5000)
        self.log.info("ZMQTaskManager shutdown finished")

    def run(self):
        """
        Listener thread body: pull tasks from the master's backend port and
        hand them to the workers until stopped.
        """
        self.log.info("ZMQTaskManager started")

        # Create and connect to our scheduler socket
        self.socket = self.context.socket(zmq.PULL)
        self.socket.setsockopt(zmq.LINGER, 0)
        self.socket.set_hwm(0)
        self.socket.connect('tcp://{host}:{port}'.format(host=self.master_backend_port[0],
                                                         port=self.master_backend_port[1]))

        # Start receiving messages
        while not self.kill_switch:
            try:
                next_task = receive_from_zmq_zipped(self.socket)
                self.push_task(next_task)
            except zmq.ContextTerminated:
                break
            except zmq.ZMQError as e:
                if e.errno != zmq.EAGAIN:
                    break
                # EAGAIN: no message was ready, keep polling
            except Exception:
                # Keep the listener alive on unexpected errors
                traceback.print_exc()

        self.socket.close()
        self.log.info("ZMQTaskManager stopped")

    def update_pool(self, _num_workers=-1):
        """
        Set the number of workers the task manager should use.

        Only the worker pool is replaced: a new task queue and new worker
        processes are created and started, then the previous workers
        receive their poison pills. The listener thread and its sockets are
        left untouched because a thread cannot be started twice.

        init() must have been called before, since the new workers need
        the stored identity and address.

        :param _num_workers: new pool size; a non-positive value spawns one
            worker per CPU
        """
        retired_tasks = getattr(self, "tasks", None)
        retired_count = self._num_workers

        # Swap in the new pool first so tasks arriving in the meantime are
        # queued for the new workers rather than behind the poison pills.
        self._create_workers(_num_workers)
        self._start_workers()

        if retired_tasks is not None:
            for _ in range(retired_count):
                retired_tasks.put(None)

    def push_task(self, task):
        """
        Push a task that should be completed by the workers.

        Always returns True; a failure to queue the task is only reported
        through a printed traceback.
        """
        try:
            self.tasks.put(task)
        except Exception:
            traceback.print_exc()
        return True

    def wait_for_all(self):
        """
        Wait until all tasks has been finished (currently a no-op)
        """
        pass

    def get_results_queue(self):
        """
        Return a reference to the result queue
        """
        return self.results

    def task_finished(self, task, result, error):
        """
        Called once a task has been performed
        """
        task.finished(result, error)
class ZMQTaskScheduler(Component, threading.Thread):
    """
    Different task scheduler implementation using ZMQ push/pull sockets.
    Uses a simple round-robin mechanism to handle multiple slaves.

    Tasks are pushed to the frontend (PULL) socket through a local,
    lock-protected PUSH socket; the scheduler thread forwards every
    message to the backend (PUSH) socket where slaves receive them.
    """
    implements(ITaskScheduler)

    frontend_port = IntItem('ZMQTaskScheduler', 'frontend_port', 5000,
        """Frontend port used to send tasks to the scheduler""")
    backend_port = IntItem('ZMQTaskScheduler', 'backend_port', 5001,
        """Backend port used to send tasks to the scheduler. Slaves will receive tasks on it.""")

    def __init__(self):
        threading.Thread.__init__(self)
        Component.__init__(self)
        self.stats = Stats.getInstance()

        # Some thread related stuff
        self.daemon = True
        self.kill_switch = False

        # The socket framework
        self.context = zmq.Context()
        self.frontend = self.context.socket(zmq.PULL)
        self.frontend.bind('tcp://*:{port}'.format(port=self.frontend_port))
        self.frontend.setsockopt(zmq.LINGER, 0)
        self.frontend.set_hwm(0)

        self.backend = self.context.socket(zmq.PUSH)
        self.backend.bind('tcp://*:{port}'.format(port=self.backend_port))
        self.backend.setsockopt(zmq.LINGER, 0)
        self.backend.set_hwm(0)

        # The poller is used to poll for incoming messages on the frontend
        self.poll = zmq.Poller()
        self.poll.register(self.frontend, zmq.POLLIN)

        # Socket connected locally to the frontend to send tasks; it is
        # guarded by self.lock so it can be used from several threads
        self.frontend_push = self.context.socket(zmq.PUSH)
        self.frontend_push.connect('tcp://localhost:{port}'.format(port=self.frontend_port))
        self.frontend_push.setsockopt(zmq.LINGER, 0)
        self.frontend_push.set_hwm(0)

        # Our lock used to protect the frontend_push socket
        self.lock = threading.Lock()

    def setup(self, master):
        """
        Remember the master node and start the scheduler thread.
        """
        self.master = master
        self.start()

    def run(self):
        """
        Scheduler thread body: forward every frontend message to the
        backend until stopped, then close all sockets and the context.

        A message taken from the frontend is held until the backend accepts
        it. Sending it straight away with NOBLOCK would raise EAGAIN while
        the backend cannot accept it (for example when no slave is
        connected) and the task would be lost.
        """
        self.log.info("ZMQTaskScheduler started")

        # Message received from the frontend but not yet forwarded
        pending = None

        while not self.kill_switch:
            try:
                if pending is None:
                    sockets = dict(self.poll.poll(1000))
                    if self.frontend in sockets:
                        pending = self.frontend.recv(flags=zmq.NOBLOCK)

                # Bounded wait so the kill switch is still checked regularly
                if pending is not None and self.backend.poll(1000, zmq.POLLOUT):
                    self.backend.send(pending, flags=zmq.NOBLOCK)
                    pending = None
            except zmq.Again:
                # Timeout just fired, no problem! A pending message is kept
                # and retried on the next iteration.
                pass
            except KeyboardInterrupt:
                break
            except zmq.ContextTerminated:
                break
            except zmq.ZMQError as e:
                if e.errno != zmq.EAGAIN:
                    break
                # EAGAIN: no message was ready
            except Exception:
                # Not really good to just continue but safer for now!
                traceback.print_exc()

        if pending is not None:
            self.log.error("ZMQTaskScheduler stopped with an undelivered task")

        self.frontend.close()
        self.backend.close()
        with self.lock:
            self.frontend_push.close()
        self.context.term()
        self.log.info("ZMQTaskScheduler stopped")

    def stop(self):
        """
        Ask the scheduler thread to stop and wait for it to finish.
        """
        self.log.info("Shutting down ZMQTaskScheduler")
        self.kill_switch = True
        # NOTE(review): Thread.join takes seconds, so this waits up to
        # 5000 s; the value looks like it was meant as milliseconds - confirm.
        self.join(5000)
        self.log.info("ZMQTaskScheduler shutdown finished")

    def start_system(self, task_system):
        """
        Start an incoming task system by pushing all tasks it generates
        """
        self.push_tasks(task_system.generate_tasks(self.master))

    def _push_task(self, task):
        """
        No lock variant of push task method; the caller must hold self.lock
        """
        send_to_zmq_zipped(self.frontend_push, task)

    def push_tasks(self, tasks):
        """
        Push all tasks on the global task queue
        """
        with self.lock:
            # DO NOT USE push_task to queue tasks! It would be a deadlock!
            for task in tasks:
                self._push_task(task)

    def push_task(self, task):
        """
        Put a task on the global task queue. Falsy tasks are ignored.
        """
        with self.lock:
            # Do not poison ourselves!
            if task:
                self._push_task(task)

    def rate_slaves(self):
        """
        Update slaves (not implemented)
        """
        pass

    def _tasked_pushed(self, slave_id):
        """
        A slave has acquired a new task, update its rank (not implemented)
        """
        pass

    def task_finished(self, task, result, error):
        """
        A slave has finished a task; notify the task itself
        """
        task.finished(result, error)
class MandlebrotSimpleNode(ApplicationNode):
    """
    Application node that distributes the computation of the mandlebrot
    set by pushing plain tasks itself instead of using a task system.
    """
    implements(IApp)

    use_optimized_task = BoolItem(
        'mandlebrotsample', 'use_optimized_task', True,
        """Should we use the data optimized task or the lazy task""")
    send_task_batch = BoolItem(
        'mandlebrotsample', 'task_batch', True,
        """Should we send all tasks one by one or should we batch them into a hughe list"""
    )
    factor = IntItem(
        'mandlebrotsample', 'factor', 1,
        """How many workloads does a single task get assigned, in our a workload is considered a row"""
    )
    iters = IntItem('mandlebrotsample', 'iters', 20,
                    """Mandlebrot iterations per pixel""")
    height = IntItem('mandlebrotsample', 'height', 1024,
                     """Height of the mandlebrot set image""")
    width = IntItem('mandlebrotsample', 'width', 1536,
                    """Width of the mandlebrot set image""")

    def app_init(self):
        """
        Initialization hook invoked right before the main entry (used in
        place of the constructor). Defers entirely to the parent class.
        """
        super(MandlebrotSimpleNode, self).app_init()

    def app_main(self):
        """
        Main entry of the application; returns the parent's result.
        """
        return super(MandlebrotSimpleNode, self).app_main()

    def get_task_system(self):
        """
        Called from the base class once connected to a MasterNode. This
        node handles its tasks on its own, so no task system is created.
        """
        return None

    def start_processing(self):
        """
        Called when the app is not using an ITaskSystem: creates the tasks,
        hands them to the framework and keeps track of the task flow.
        """
        self.log.info("Starting computation")
        if self.send_task_batch:
            self.log.info(" Task batching enabled")
        self.start_time = time.time()
        self.image = np.zeros((self.height, self.width), dtype=np.uint8)

        # Area of the complex plane that gets rendered
        self.min_x = -2.0
        self.max_x = 1.0
        self.min_y = -1.0
        self.max_y = 1.0
        self.pixel_size_x = (self.max_x - self.min_x) / self.width
        self.pixel_size_y = (self.max_y - self.min_y) / self.height

        # Job handling (very optimistic :D)
        self.jobs = 0
        self.finished_jobs = 0
        job_list = []

        def dispatch(task):
            # Collect the task for the batch or send it right away
            if self.send_task_batch:
                job_list.append(task)
            else:
                self.push_task(task)

        def lazy_task(column, pixels):
            # The system_id is forced to None while the client id is set
            return MandlebrotTask("mandle_{}".format(column), None,
                                  self.node_id_str, iters=self.iters,
                                  workload=pixels)

        workload = []
        rows = 0
        x = 0
        if self.use_optimized_task:
            num_tasks, reminder = divmod(self.width, self.factor)
            self.jobs = num_tasks + reminder
            for i in xrange(0, self.jobs):
                dispatch(
                    MandlebrotTaskOptimized("m", None, self.node_id_str,
                                            iters=self.iters,
                                            start_x=i,
                                            rows=self.factor,
                                            cols=self.height,
                                            pixel_size_x=self.pixel_size_x,
                                            pixel_size_y=self.pixel_size_y,
                                            min_x=self.min_x,
                                            min_y=self.min_y))
        else:
            for x in range(self.width):
                # Distribute using rows
                rows += 1
                real = self.min_x + x * self.pixel_size_x
                for y in range(self.height):
                    imag = self.min_y + y * self.pixel_size_y
                    workload.append((x, y, real, imag, self.iters))

                # Every self.factor rows the gathered workload becomes a task
                if rows != self.factor:
                    continue
                dispatch(lazy_task(x, workload))
                self.jobs += 1
                workload = []
                rows = 0

            # Add last task with rest of workload
            if workload:
                dispatch(lazy_task(x, workload))
                self.jobs += 1

        if self.send_task_batch:
            self.jobs = len(job_list)

        # Send batch or check for eventual end condition
        if self.send_task_batch:
            self.push_tasks(job_list)
        else:
            # Check in case we are already done!
            self.check_finished()

    def task_finished(self, task, result, error):
        """
        Called when a task has been done: merges its pixels into the image
        and checks whether everything is finished.
        """
        # result maps x -> {y -> value}
        if result:
            for x, column in result.iteritems():
                for y, value in column.iteritems():
                    self.image[y, x] = value

        self.finished_jobs += 1
        self.check_finished()

    def check_finished(self):
        """
        When all jobs are finished: report the elapsed time, stop the main
        loop and display the image.
        """
        if self.finished_jobs != self.jobs:
            return

        self.log.info("All tasks finished!!")
        elapsed = time.time() - self.start_time
        print("Calculated in {} seconds!".format(elapsed))
        self.shutdown_main_loop()
        imshow(self.image)
        show()

    def push_task_response(self, result):
        """
        A Task was added to the computation framework (nothing to do)
        """
        pass

    def push_task_failed(self, result):
        """
        A Task could not be added to the computation framework
        """
        self.log.info("Failed to send task send to computation framework")

    def push_tasks_response(self, result):
        """
        A set of Tasks was added to the computation framework
        """
        self.log.info("Tasks send to computation framework")

    def push_tasks_failed(self, result):
        """
        A set of Tasks could not be added to the computation framework
        """
        self.log.info("Failed to send tasks send to computation framework")
class GenericTaskManager(Component):
    """
    Simple task manager used in simple single job applications.

    Tasks pushed through push_task() are put on a multiprocessing queue
    consumed by a pool of TaskProcess workers. Results are published on the
    queue returned by get_results_queue().
    """
    implements(ITaskManager)

    num_workers = IntItem('GenericTaskManager', 'num_workers', -1,
        """Number of worker processed to be created, -1 will spawn as much as physical cores.""")

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)

        # Initialize base manager stuff
        self._num_workers = 0
        self.results = multiprocessing.JoinableQueue()

    def init(self, identity, address):
        """
        Initialize the manager.

        :param identity: identity handed to every worker process
        :param address: (host, port) pair handed to every worker process
        """
        self.identity = identity
        self.host = address[0]
        self.port = address[1]

        self._create_workers(self.num_workers)

        context = zmq.Context()
        self.ventilator_send = context.socket(zmq.PUSH)
        self.ventilator_send.bind("tcp://127.0.0.1:%d" % WORKER_PORT)

    def _create_workers(self, requested):
        """
        Create a fresh task queue and one (not yet started) TaskProcess per
        worker. A non-positive `requested` means one worker per CPU.
        """
        self._num_workers = requested
        if self._num_workers <= 0:
            self._num_workers = multiprocessing.cpu_count()

        self.tasks = multiprocessing.JoinableQueue()
        self.processes = [TaskProcess(self.results, i, self.tasks, self.identity, self.host, self.port)
                          for i in range(self._num_workers)]

    def _start_workers(self):
        """
        Start every worker process of the current pool as a daemon.
        """
        for worker in self.processes:
            worker.daemon = True
            worker.start()

    def get_num_workers(self):
        """
        Return the number of workers we use for our processing
        """
        return self._num_workers

    def start(self):
        """
        Start our worker processes
        """
        self._start_workers()

    def stop(self):
        """
        Stop our worker processes
        """
        # One poison pill per worker
        for _ in range(self._num_workers):
            print("Adding task")
            self.tasks.put(None)

        # Poison for result listener
        self.results.put(None)

    def update_pool(self, _num_workers=-1):
        """
        Set the number of workers the task manager should use.

        A new task queue and new worker processes are created and started,
        then the previous workers receive their poison pills. The result
        queue is kept, so whoever listens on it is not disturbed.

        init() must have been called before, since the new workers need
        the stored identity and address.

        :param _num_workers: new pool size; a non-positive value spawns one
            worker per CPU
        """
        retired_tasks = getattr(self, "tasks", None)
        retired_count = self._num_workers

        # Swap in the new pool first so tasks pushed in the meantime are
        # queued for the new workers rather than behind the poison pills.
        self._create_workers(_num_workers)
        self._start_workers()

        if retired_tasks is not None:
            for _ in range(retired_count):
                retired_tasks.put(None)

    def push_task(self, task):
        """
        Push a task that should be completed by the workers.

        Always returns True; a failure to queue the task is only reported
        through a printed traceback.
        """
        try:
            self.tasks.put(task)
        except Exception:
            traceback.print_exc()
        return True

    def wait_for_all(self):
        """
        Wait until all tasks has been finished (currently a no-op)
        """
        pass

    def get_results_queue(self):
        """
        Return a reference to the result queue
        """
        return self.results

    def task_finished(self, task, result, error):
        """
        Called once a task has been performed
        """
        task.finished(result, error)
class MandlebrotNode(ApplicationNode):
    """
    Application node that distributes the computation of the mandlebrot
    set through an autonomous task system.
    """
    implements(IApp)

    use_optimized_task = BoolItem(
        'mandlebrotsample', 'use_optimized_task', True,
        """Should we use the data optimized task or the lazy task""")
    factor = IntItem(
        'mandlebrotsample', 'factor', 1,
        """How many workloads does a single task get assigned, in our a workload is considered a row"""
    )
    iters = IntItem('mandlebrotsample', 'iters', 20,
                    """Mandlebrot iterations per pixel""")
    height = IntItem('mandlebrotsample', 'height', 1024,
                     """Height of the mandlebrot set image""")
    width = IntItem('mandlebrotsample', 'width', 1536,
                    """Width of the mandlebrot set image""")

    def app_init(self):
        """
        Initialization hook invoked right before the main entry (used in
        place of the constructor). Defers entirely to the parent class.
        """
        super(MandlebrotNode, self).app_init()

    def app_main(self):
        """
        Main entry of the application; returns the parent's result.
        """
        return super(MandlebrotNode, self).app_main()

    def get_task_system(self):
        """
        Build the task system to submit. Called from the base class once we
        are connected to a MasterNode and can send computation tasks over.

        Also records the start timestamp used by work_finished().
        """
        self.start_time = time.time()
        system = MandlebrotTaskSystem(-2.0, 1.0, -1.0, 1.0,
                                      self.height, self.width,
                                      self.iters, self.factor,
                                      self.use_optimized_task)
        self.system = system
        return system

    def work_finished(self, result, task_system):
        """
        Called when the work is done; `result` is what our ITaskSystem sent
        back. Reports the elapsed time, stops the main loop and then lets
        the task system post-process the result.
        """
        elapsed = time.time() - self.start_time
        print("Total time: {}".format(elapsed))
        self.shutdown_main_loop()

        # Reassemble the result so it can be processed further
        try:
            blank = np.zeros((self.height, self.width), dtype=np.uint8)
            self.system.image = blank
            self.system.do_post_run(result)
        except:
            traceback.print_exc()

    def push_tasksystem_response(self, result):
        """
        The ITaskSystem was accepted by the framework.
        """
        self.log.info("Tasks system send to computation framework")

    def push_tasksystem_failed(self, result):
        """
        The ITaskSystem could not be pushed to the computation framework.
        Logs the failure and, when present, the "t" entry of `result`.
        """
        self.log.error("Tasks system failed to be send to framework!")
        # Nothing more to report unless the result carries a traceback
        if "t" not in result:
            return
        self.log.error(result["t"])
class BaseNode(object):
    """
    Base node, all nodes will be at least of this type. Responsible for
    hosting and exposing a simple API apart from listening on a TCP port
    for socket interactions.

    The class relies on attributes it does not define itself (`log`,
    `compmgr`, `node_id_str`, `has_master`, `register_with_master`,
    `unregister_from_master`, `rpc_call_success`, `rpc_call_failed`);
    they have to be supplied by the concrete node class.
    """

    port = IntItem('node', 'port', 8080,
                   """Port of the API interface with this node""")
    use_gzip = BoolItem(
        'node', 'use_gzip', True,
        """Check if we should gzip all interactions (recommended)""")
    pickler = ExtensionPointItem(
        'Node', 'pickler', IPickler, 'Pickler',
        """Pickler class used by the whole framework""")
    proxy_api = IntItem('node', 'proxy_api', 1,
                        """API version used for any client JSON RPC calls""")
    proxy_username = ConfigItem(
        'node', 'proxy_username', '',
        """Username used when performing API client calls""")
    proxy_password = ConfigItem(
        'node', 'proxy_password', '',
        """Password used when performing API client calls""")
    heartbeat_timer = FloatItem(
        'node', 'heartbeat_timer', 5.0,
        """Timer used to send periodically heartbeats to the master""")
    stats_dump_timer = FloatItem(
        'node', 'stats_dump_timer', 30.0,
        """Timer used to dump stats into the log. -1 will never dump stats.""")
    secret = ConfigItem(
        'node', 'crypot_secret', 'JhTv535Vg385V',
        """Default salt used on decrypting encrypting a pickle""")
    # salt size in bytes
    salt_size = IntItem('node', 'crypot_salt_size', 16,
                        """Size of the salt used in the encryption process""")
    # number of iterations in the key generation
    num_iterations = IntItem(
        'node', 'crypot_num_iterations', 20,
        """Number of iterations used in the key generation""")
    # the size multiple required for AES
    aes_padding = IntItem('node', 'crypot_aes_padding', 16,
                          """Padding used for AES encryption""")

    # Flat (url pattern, handler name) sequence handed to the API thread
    urls = (
        # Get and basic API handling (not versioned!)
        '/', 'index_get',
        '/ping/', 'ping_get',
        '/ping', 'ping_get',
        '/status/', 'status_get',
        '/status', 'status_get',
        '/stats/', 'stats_get',
        '/stats', 'stats_get',
        # Post API handling of version 1
        '/api/1/', 'APIHandlerV1',
        '/api/1', 'APIHandlerV1')

    def app_init(self):
        """
        Initialize application just before running it
        """
        self.lock_cache = RWLockCache()

    def app_main(self):
        """
        Launch a concurrent application: build the JSON-RPC API, start the
        API thread and reset the timers used by the main loop.

        :returns: APP_RET_CODE_SUCCESS
        """
        # Generate rest API
        self.generate_api()

        # Now run our API listener
        self.log.debug("Hosting application on port %d" % (self.port))

        # Get a ref to our stats helper
        self.stats = Stats.getInstance()

        # Create crypto helper used for network communication
        self.crypto_helper = CryptoHelper(self.salt_size, self.num_iterations,
                                          self.aes_padding)

        # Make sure the URL proxy knows us
        global global_hook
        global_hook = GlobalHook({'node': self})

        # TODO: the api should only be there for the master node and used
        # for node registration and heartbeats. Each node will have a
        # socket while slave nodes will have a local server too. These
        # servers are no web servers because they are too expensive!
        # TODO: refactor the server and add a client which is connected
        # with the server through a normal socket. The master server will
        # act only as a controller and distribute work using a better
        # performing mechanism (UDP?). Use async calls for heartbeats.
        # TODO: create the server the way ppserver does it; try a
        # multithreaded pool to handle connections instead of threads.
        self.api_thread = api_thread(self.log, self.urls, self.port,
                                     self.use_gzip)
        self.api_thread.daemon = True
        self.api_thread.start()

        self.heartbeat_threshold = self.heartbeat_timer
        self.current_time = 0
        self.last_time = 0
        self.last_delta_time = 0
        self.stats_dump_threshold = self.stats_dump_timer

        # Bool flag used to control the main loop
        self.kill_received = False

        # Give us some time until its up
        time.sleep(0.5)
        return APP_RET_CODE_SUCCESS

    def stop_api_thread(self):
        """
        Stop the API thread started by app_main()
        """
        self.api_thread.stop()

    def main_loop(self):
        """
        Run on_update() with the elapsed time of every iteration until
        shutdown_main_loop() is called or a KeyboardInterrupt is received.
        Any other exception is logged and the loop continues.
        """
        # Register with master before anything
        if self.has_master():
            self.register_with_master()

        self.last_time = time.time()
        while not self.kill_received:
            try:
                # Calculate delta time for this frame
                self.current_time = time.time()
                delta_time = self.current_time - self.last_time
                self.on_update(delta_time)

                # Save last time
                self.last_time = self.current_time
                self.last_delta_time = delta_time
            except KeyboardInterrupt:
                # Unregistering is best effort, we exit either way
                try:
                    if self.has_master():
                        self.unregister_from_master()
                except Exception:
                    traceback.print_exc()
                self.log.info("Exiting main loop")
                self.kill_received = True
            except Exception as e:
                traceback.print_exc()
                self.log.error("Mainloop exception: %s" % (e))

        self.log.info("Main loop exited!")

    def shutdown_main_loop(self):
        """
        Ask main_loop() to exit after the current iteration
        """
        self.kill_received = True

    def on_update(self, delta_time):
        """
        Per-iteration hook of the main loop; dumps the stats to the log
        every `stats_dump_timer` seconds.

        :param delta_time: seconds elapsed since the previous iteration
        """
        # Only dump if requested
        if self.stats_dump_timer > 0:
            self.stats_dump_threshold -= delta_time
            if self.stats_dump_threshold < 0:
                self.stats.dump_stats(self.log)
                self.stats_dump_threshold = self.stats_dump_timer

    def generate_api(self):
        """
        Create the version 1 JSON-RPC service and register the remote
        calls `ping`, `status` and `api` on it.
        """
        # API service handler for version 1 (only version for now)
        self.api_service_v1 = SimpleJSONRPCService(api_version=1)

        @jsonremote(self.api_service_v1)
        def ping(request):
            return "pong"

        @jsonremote(self.api_service_v1)
        def status(request):
            return self.status()

        @jsonremote(self.api_service_v1)
        def api(request):
            return self.api_service_v1.api()

    def ping(self):
        """
        Liveness answer
        """
        return "pong"

    def index(self):
        """
        Answer for the index page
        """
        return "OK"

    def status(self):
        """
        Return a dict with the node class name and the system info
        """
        status = {
            'node': self.__class__.__name__,
            'systeminfo': self.compmgr.systeminfo
        }
        return status

    def get_stats(self):
        """
        Return everything the stats helper has collected
        """
        return self.stats.dump_all()

    def create_node_proxy(self, url):
        """
        Create a new json proxy instance used by the node when acting as a
        client role
        """
        return NodeProxy(
            pyjsonrpc.HttpClient(
                url=("http://%s/api/%d") % (url, self.proxy_api),
                username=self.proxy_username,
                password=self.proxy_password), self.log,
            self.rpc_call_success, self.rpc_call_failed)

    def create_tcp_proxy(self, host, port):
        """
        Create a JSON TCP socket proxy instance to a server.

        :returns: tuple (proxy, underlying client)
        """
        tcp_client = TCPServerProxyZMQ(self.node_id_str, host, port, self.log)
        return TCPProxy(tcp_client, self.log), tcp_client

    def create_tcp_client_proxy(self, sock, request):
        """
        Create a JSON TCP socket proxy instance to a client
        """
        return TCPProxyZMQ(sock, request, self.log)

    def create_tcp_client_proxy_zmq(self, context, identity):
        """
        Create a JSON TCP socket proxy instance to a client
        """
        return TCPProxy(TCPClientProxyZMQ(context, identity, self.log),
                        self.log)
class ExpensiveSimpleNode(ApplicationNode):
    """
    Application node that distributes a set of artificially expensive
    tasks by pushing plain tasks itself instead of using a task system.
    """
    implements(IApp)

    send_task_batch = BoolItem(
        'expensivesample', 'task_batch', True,
        """Should we send all tasks one by one or should we batch them into a hughe list"""
    )
    time_per_task = IntItem(
        'expensivesample', 'time_per_task', 1,
        """Time each task will perform on doing nothind (active wait) to simulate an expensive computation"""
    )
    num_tasks = IntItem('expensivesample', 'num_tasks', 8,
                        """Number of tasks that must be performend""")

    def app_init(self):
        """
        Initialization hook invoked right before the main entry (used in
        place of the constructor). Defers entirely to the parent class.
        """
        super(ExpensiveSimpleNode, self).app_init()

    def app_main(self):
        """
        Main entry of the application; returns the parent's result.
        """
        return super(ExpensiveSimpleNode, self).app_main()

    def get_task_system(self):
        """
        Called from the base class once connected to a MasterNode. This
        node handles its tasks on its own, so no task system is created.
        """
        return None

    def start_processing(self):
        """
        Called when the app is not using an ITaskSystem: creates the tasks
        and hands them to the framework, either as one batch or one by one.
        """
        self.log.info("Starting computation")
        if self.send_task_batch:
            self.log.info(" Task batching enabled")
        self.start_time = time.time()
        self.finished_jobs = 0

        if not self.send_task_batch:
            for index in range(self.num_tasks):
                task = ExpensiveTask("expensive_{}".format(index), None,
                                     self.node_id_str,
                                     sleep_time=self.time_per_task)
                self.push_task(task)
            self.check_finished()
            return

        batch = []
        for index in range(self.num_tasks):
            batch.append(
                ExpensiveTask("expensive_{}".format(index), None,
                              self.node_id_str,
                              sleep_time=self.time_per_task))
        self.push_tasks(batch)

    def task_finished(self, task, result, error):
        """
        Called when a task has been done
        """
        self.finished_jobs += 1
        self.check_finished()

    def check_finished(self):
        """
        Log the progress and, once every task is finished, report the
        timings and stop the main loop.
        """
        self.log.info("%d -> %d" % (self.finished_jobs, self.num_tasks))
        if self.finished_jobs != self.num_tasks:
            return

        self.log.info("All tasks finished!!")
        end_time = time.time() - self.start_time
        self.log.info("Total time: {}".format(end_time))

        # Expected single threaded time and the improvement over it
        expected_time = self.time_per_task * self.num_tasks
        self.log.info(
            "Plain python expected time: {}".format(expected_time))
        improvement = (expected_time / end_time) * 100.0
        self.log.info("Concurrent improvememnet: {}%".format(improvement))
        self.shutdown_main_loop()

    def push_task_response(self, result):
        """
        A Task was added to the computation framework (nothing to do)
        """
        pass

    def push_task_failed(self, result):
        """
        A Task could not be added to the computation framework
        """
        self.log.info("Failed to send task send to computation framework")

    def push_tasks_response(self, result):
        """
        A set of Tasks was added to the computation framework
        """
        self.log.info("Tasks send to computation framework")

    def push_tasks_failed(self, result):
        """
        A set of Tasks could not be added to the computation framework
        """
        self.log.info("Failed to send tasks send to computation framework")
class ExpensiveNode(ApplicationNode):
    """
    Application node that distributes the computation of an expensive task
    through a task system.
    """
    implements(IApp)

    time_per_task = IntItem(
        'expensivesample', 'time_per_task', 1,
        """Time each task will perform on doing nothind (active wait) to simulate an expensive computation"""
    )
    num_tasks = IntItem('expensivesample', 'num_tasks', 8,
                        """Number of tasks that must be performend""")

    def app_init(self):
        """
        Initialization hook invoked right before the main entry (used in
        place of the constructor). Defers entirely to the parent class.
        """
        super(ExpensiveNode, self).app_init()

    def app_main(self):
        """
        Main entry of the application; returns the parent's result.
        """
        return super(ExpensiveNode, self).app_main()

    def get_task_system(self):
        """
        Build the task system to submit. Called from the base class once we
        are connected to a MasterNode and can send computation tasks over.

        Also records the start timestamp used by work_finished().
        """
        self.start_time = time.time()
        system = ExpensiveNodeTaskSystem(self.time_per_task, self.num_tasks)
        self.system = system
        return system

    def work_finished(self, result, task_system):
        """
        Called when the work is done; `result` is what our ITaskSystem sent
        back. Logs the timings and stops the main loop.
        """
        end_time = time.time() - self.start_time
        self.log.info("Total time: {}".format(end_time))

        # Expected single threaded time and the improvement over it
        expected_time = self.time_per_task * self.num_tasks
        self.log.info("Plain python expected time: {}".format(expected_time))
        improvement = (expected_time / end_time) * 100.0
        self.log.info("Concurrent improvememnet: {}%".format(improvement))
        self.shutdown_main_loop()

    def push_tasksystem_response(self, result):
        """
        The ITaskSystem was accepted by the framework.
        """
        self.log.info("Tasks system send to computation framework")

    def push_tasksystem_failed(self, result):
        """
        The ITaskSystem could not be pushed to the computation framework.
        Logs the failure and, when present, the "t" entry of `result`.
        """
        self.log.error("Tasks system failed to be send to framework!")
        # Nothing more to report unless the result carries a traceback
        if "t" not in result:
            return
        self.log.error(result["t"])
class MasterNode(Component, BaseNode):
    """
    A MasterNode is a compute node that can act and be used in computation
    when in standalone mode but is mainly used to distribute jobs along
    registered slaves. Once the jobs of a slave, or its own, are finished
    we will redistribute the results to the responsible client nodes.

    Two registries are kept, one for slaves (node_registry) and one for
    clients (client_registry). Both map a node id to a Bunch describing
    the node, or to None once the node has been unregistered or kicked.
    Empty entries are removed periodically by clean_node_map.
    """
    implements(IApp)

    is_standalone = BoolItem('masternode', 'is_standalone', 'False',
        """Master node is also a slave and a standalone application""")

    inactivity_time_multiplier = IntItem('node', 'inactivity_time_multiplier', 3,
        """Inactivty multiplier multiplies the heartbeat time to ensure inactivity is always several heartbeats""")

    registry_mirror_timer = FloatItem('masternode', 'registry_mirror_timer', 30.0,
        """Timer used to update node registry mirror""")

    registry_cleanup_timer = FloatItem('masternode', 'registry_cleanup_timer', 60.0,
        """Timer used to cleanup the node registry""")

    task_scheduler = ExtensionPointItem('masternode', 'task_scheduler', ITaskScheduler, 'GenericTaskScheduler',
        """Task scheduler used by the master node""")

    master_port = IntItem('node', 'master_port', 8081,
        """Port used by the master node for high-performance communication and dedicated persistent connections""")

    def app_init(self):
        """
        Initialize application just before running it. Creates the ZeroMQ
        server, the slave/client registries with their mirrors and locks,
        the inactivity timers and the task system registry.
        """
        super(MasterNode, self).app_init()

        # Setup our ZeroMQ async server
        self.zmq_server = TCPServerZMQ(self.master_port, self.log, 5)

        # The node registry holds the known slaves, keyed by node id
        self.node_registry = defaultdict(self._default_node)
        self.registry_lock = self.lock_cache.registry_lock
        self.node_cleanup_threshold = self.registry_cleanup_timer
        self.task_scheduler.setup(self)

        # Our client registry
        self.client_registry = defaultdict(self._default_node)
        self.client_registry_lock = self.lock_cache.client_registry_lock

        # The registry mirror is a copy refreshed from time to time. We
        # use a different dict so client status requests do not block
        self.node_registry_mirror = {}
        self.registry_mirror_lock = self.lock_cache.registry_mirror_lock
        self.registry_mirror_threshold = self.registry_mirror_timer
        self.registry_mirror_dirty = True

        # Client registry mirror
        self.client_registry_mirror = {}
        self.client_registry_mirror_lock = self.lock_cache.client_registry_mirror_lock

        # Timers controlling inactivity handling of a node, being it a
        # slave or a client. A node is unregistered after three times
        # the inactivity time.
        self.inactivity_timer = self.heartbeat_timer * self.inactivity_time_multiplier
        self.inactivity_unregister_timer = self.inactivity_timer * 3
        self.inactivity_threshold = self.inactivity_timer

        self.test_timer = 1
        self.test_app_id = uuid.uuid1()

        # Our task system registry, keyed by system id
        self.tasksystem_registry = defaultdict(self._default_tasksystem)
        self.tasksystem_lock = self.lock_cache.tasksystem_lock

    def app_main(self):
        """
        Launch a concurrent application. Runs the base entry point, and if
        it succeeded starts the ZeroMQ server, blocks in the main loop and
        finally shuts everything down. Returns the base entry point result.
        """
        self.log.info("Initializing MasterNode")
        result = super(MasterNode, self).app_main()
        if result not in SUCCESS_RET_CODES:
            return result

        # Start the main server
        self.zmq_server.start()

        # Enter main loop
        self.main_loop()

        # Stop all threads and processes
        self.zmq_server.stop()
        self.notify_shutdown()
        self.stop_api_thread()
        self.task_scheduler.stop()

        return result

    def handle_echo(self, sock, address):
        """
        Echo every line read from sock back to it until the peer stops
        sending data. The peer address is printed first.
        """
        print(address)
        fp = sock.makefile()
        try:
            while True:
                line = fp.readline()
                if not line:
                    break
                fp.write(line)
                fp.flush()
        finally:
            # Release the file wrapper we created around the socket
            fp.close()

    def stop_master_thread(self):
        """
        Stop the master thread.

        NOTE(review): app_init of this class does not create
        self.master_thread, so it must be set elsewhere before this is
        called.
        """
        self.master_thread.stop()

    def generate_api(self):
        """
        Create all rpc methods the node requires. The remote functions
        are registered through the jsonremote/tcpremote decorators and
        forward to the methods of this instance.
        """
        super(MasterNode, self).generate_api()
        # NOTE(review): the whole remote API is assumed to be guarded by
        # the standalone check - confirm against the original layout
        if not self.is_standalone:
            @jsonremote(self.api_service_v1)
            def register_slave(request, node_id, port, data):
                self.stats.add_avg('register_slave')
                return self.register_node(node_id, web.ctx['ip'], port, data, NodeType.slave)

            @tcpremote(self.zmq_server, name='register_slave')
            def register_slave_tcp(handler, request, node_id):
                self.stats.add_avg('register_slave_tcp')
                return self.register_node_tcp(handler, request, node_id, NodeType.slave)

            @jsonremote(self.api_service_v1)
            def register_client(request, node_id, port, data):
                self.stats.add_avg('register_client')
                return self.register_node(node_id, web.ctx['ip'], port, data, NodeType.client)

            @tcpremote(self.zmq_server, name='register_client')
            def register_client_tcp(handler, request, node_id):
                self.stats.add_avg('register_client_tcp')
                return self.register_node_tcp(handler, request, node_id, NodeType.client)

            @jsonremote(self.api_service_v1)
            def unregister_slave(request, node_id):
                self.stats.add_avg('unregister_slave')
                return self.unregister_node(node_id, NodeType.slave)

            @jsonremote(self.api_service_v1)
            def unregister_client(request, node_id):
                self.stats.add_avg('unregister_client')
                return self.unregister_node(node_id, NodeType.client)

            @jsonremote(self.api_service_v1)
            def heartbeat_slave(request, node_id):
                self.stats.add_avg('heartbeat_slave')
                return self.heartbeat(node_id, NodeType.slave)

            @jsonremote(self.api_service_v1)
            def heartbeat_client(request, node_id):
                self.stats.add_avg('heartbeat_client')
                return self.heartbeat(node_id, NodeType.client)

            @tcpremote(self.zmq_server)
            def task_finished(handler, request, task, result, error):
                self.stats.add_avg('task_finished')
                self.task_finished(task, result, error)
                # This is an end method for the interaction
                raise NoResponseRequired()

            @tcpremote(self.zmq_server)
            def push_task_response(handler, request, result):
                # TODO: Handle failure when result is False!
                pass

            @tcpremote(self.zmq_server)
            def push_task_failed(handler, request, result):
                # TODO: Handle failure when pushing tasks failed!
                pass

            @tcpremote(self.zmq_server)
            def push_tasksystem(handler, request, tasksystem):
                """
                Push an application onto the computation framework
                """
                self.stats.add_avg('push_tasksystem')
                return self.push_tasksystem(request, tasksystem)

            @tcpremote(self.zmq_server)
            def push_task(handler, request, task):
                """
                Push a task onto the computation framework
                """
                self.stats.add_avg('push_task')
                return self.push_task(request, task)

            @tcpremote(self.zmq_server)
            def push_tasks(handler, request, tasks):
                """
                Push a set of tasks onto the computation framework. Stops
                at the first task that is rejected. Returns False if a
                task was rejected or if tasks is not a list.
                """
                self.stats.add_avg('push_tasks')
                if not isinstance(tasks, list):
                    # Nothing was pushed, do not report success
                    return False
                return all(self.push_task(request, task) for task in tasks)

            @tcpremote(self.zmq_server)
            def test_method(handler, request):
                print("test_method from {}".format(request))
                raise NoResponseRequired()

    def _generate_status_dict(self, node):
        """
        Reduce a registry entry to the values exposed through status()
        """
        return {'type': node.type, 'state': node.state}

    def _without_empty_entries(self, registry):
        """
        Return a plain dict copy of registry without its falsy values,
        that is without unregistered (None) or empty entries
        """
        return {k: v for k, v in registry.iteritems() if v}

    def status(self):
        """
        Extend the ComputeNode status with the type and state of every
        slave ('nodes') and client ('clients') found in the mirrors
        """
        status = ComputeNode.status(self)
        with self.registry_mirror_lock.readlock:
            status['nodes'] = {
                k: self._generate_status_dict(v)
                for k, v in self.node_registry_mirror.iteritems() if v}
        with self.client_registry_mirror_lock.readlock:
            status['clients'] = {
                k: self._generate_status_dict(v)
                for k, v in self.client_registry_mirror.iteritems() if v}
        return status

    def on_update(self, delta_time):
        """
        Tick the mirror, inactivity and cleanup countdowns by delta_time
        and run the matching maintenance once a countdown drops below 0
        """
        super(MasterNode, self).on_update(delta_time)

        # Update mirror
        self.registry_mirror_threshold -= delta_time
        if self.registry_mirror_threshold < 0:
            self.update_registry_mirror()
            self.registry_mirror_threshold = self.registry_mirror_timer

        # Handle inactive nodes or cleanup empty nodes, never both in
        # the same update
        self.inactivity_threshold -= delta_time
        self.node_cleanup_threshold -= delta_time
        if self.inactivity_threshold < 0:
            self.update_inactive_nodes()
            self.inactivity_threshold = self.inactivity_timer
        elif self.node_cleanup_threshold < 0:
            self.clean_node_map()
            self.node_cleanup_threshold = self.registry_cleanup_timer

    def has_master(self):
        """
        Check if the node has a master or not. Master node has no master
        itself
        """
        return False

    def _handle_timeout(self, node):
        """
        Handle state for a given node checking the node's heartbeat
        timestamp. An active node that exceeded the inactivity time is
        set to inactive, an inactive node that exceeded the unregister
        time is dropped. Returns the node, or None if it was dropped.
        """
        ellapsed_time = self.current_time - node['heartbeat']
        if node['state'] == NodeState.active and ellapsed_time > self.inactivity_timer:
            self.log.info("Node %s set to inactive (t:%f)" % (node['node_id'], ellapsed_time))
            node['state'] = NodeState.inactive
            self.set_registry_dirty()
        elif node['state'] == NodeState.inactive and ellapsed_time > self.inactivity_unregister_timer:
            # Delete node! Too much time inactive!
            self.log.info("Node %s kicked from system! To much time of inactivity! (t:%f)" % (node['node_id'], ellapsed_time))
            self.set_registry_dirty()
            return None
        return node

    def set_registry_dirty(self):
        """
        Set the registry dirty, this will force an update of the task
        scheduler
        """
        self.registry_mirror_dirty = True
        self.update_scheduler()

    def update_scheduler(self):
        """
        Update task scheduler with the current list of slaves
        """
        self.task_scheduler.rate_slaves()

    def update_inactive_nodes(self):
        """
        Called when we check for inactive nodes, those that have not sent
        any heartbeat for a while. Both registries are rebuilt, nodes that
        got kicked are kept as None until the next cleanup.
        """
        self.log.info("Checking for inactive nodes...")
        with self.registry_lock.writelock:
            self.node_registry = {
                k: self._handle_timeout(v)
                for k, v in self.node_registry.iteritems() if v}
        with self.client_registry_lock.writelock:
            self.client_registry = {
                k: self._handle_timeout(v)
                for k, v in self.client_registry.iteritems() if v}

    def update_registry_mirror(self):
        """
        Update the registry mirrors with a copy of the registries. Used to
        expose a copy dict to the public. Does nothing unless the registry
        has been marked dirty.
        """
        if not self.registry_mirror_dirty:
            return
        self.log.info("Updating node registry mirror...")
        with self.registry_mirror_lock.writelock:
            self.node_registry_mirror = self._without_empty_entries(self.node_registry)
        with self.client_registry_mirror_lock.writelock:
            self.client_registry_mirror = self._without_empty_entries(self.client_registry)
        self.registry_mirror_dirty = False

    def clean_node_map(self):
        """
        Clean node map for any empty node values.
        """
        self.log.info("Cleaning node registry...")
        with self.registry_lock.writelock:
            self.node_registry = self._without_empty_entries(self.node_registry)
        with self.client_registry_lock.writelock:
            self.client_registry = self._without_empty_entries(self.client_registry)

    def get_node_id_no_lock(self, url):
        """
        Return the id of the slave registered with url or None. The
        caller is responsible for holding the registry lock.
        """
        return next((k for k, v in self.node_registry.iteritems() if v and v.url == url), None)

    def get_node_id(self, url):
        """
        Return a node id given an url
        """
        with self.registry_lock.readlock:
            node_id = self.get_node_id_no_lock(url)
        return node_id

    def get_client_id_no_lock(self, url):
        """
        Return the id of the client registered with url or None. The
        caller is responsible for holding the client registry lock.
        """
        return next((k for k, v in self.client_registry.iteritems() if v and v.url == url), None)

    def get_client_id(self, url):
        """
        Return a client id given an url
        """
        with self.client_registry_lock.readlock:
            node_id = self.get_client_id_no_lock(url)
        return node_id

    def get_node(self, url):
        """
        Get a slave representation given an url, None if not registered
        """
        node = None
        with self.registry_lock.readlock:
            node_id = self.get_node_id_no_lock(url)
            if node_id:
                node = self.node_registry[node_id]
        return node

    def get_client(self, url):
        """
        Get a client representation given an url, None if not registered
        """
        node = None
        # Clients live in their own registry guarded by their own lock
        with self.client_registry_lock.readlock:
            node_id = self.get_client_id_no_lock(url)
            if node_id:
                node = self.client_registry[node_id]
        return node

    def _default_node(self):
        """
        Default value of the node registries: an empty, falsy entry
        """
        return {}

    def _default_tasksystem(self):
        """
        Default value of the task system registry
        """
        return Bunch({})

    def _default_slave_bunch(self):
        """
        Create a blank registry entry for a slave
        """
        return Bunch({'node_id': '', 'url': '', 'ip': '', 'port': 0,
                      'type': NodeType.slave, 'state': NodeState.inactive,
                      'heartbeat': 0, 'proxy': None, 'workers': 0,
                      'tasks': 0, 'rating': 0.0, 'handler': None})

    def _default_client_bunch(self):
        """
        Create a blank registry entry for a client
        """
        return Bunch({'node_id': '', 'url': '', 'ip': '', 'port': 0,
                      'type': NodeType.client, 'state': NodeState.inactive,
                      'heartbeat': 0, 'proxy': None, 'handler': None})

    def _claim_entry_no_lock(self, registry, known_id, node_id, factory):
        """
        Return the registry entry to use for a registering node. known_id
        is the id currently registered for the node's url, if any. The
        caller must hold the write lock of registry.
        """
        if not known_id:
            # First registration from this url, create a fresh entry
            node = registry[node_id] = factory()
            return node
        # This is a node that is registering again so reuse it
        node = registry[known_id]
        if known_id != node_id:
            # The node came back with a new id. Move the entry so the key
            # matches the id we hand back, otherwise lookups by id fail.
            del registry[known_id]
            registry[node_id] = node
        return node

    def _reset_entry(self, node, node_id, ip, port, url, node_type):
        """
        Fill the values shared by slaves and clients on (re)registration.
        The node stays pending until it registers through the tcp channel.
        """
        node.node_id = node_id
        node.url = url
        node.ip = ip
        node.port = port
        node.type = node_type
        node.proxy = self.create_node_proxy(url)
        node.state = NodeState.pending
        node.heartbeat = time.time()
        node.handler = None
        node.tcp_proxy = None

    def register_node(self, node_id, ip, port, data, node_type):
        """
        Register a node within our node map.

        node_id is the id the node wants to be known by, ip and port
        build its url, data holds extra values ('workers' is required
        for slaves) and node_type selects the registry. Returns a dict
        with the node 'id' and the master 'port'. If anything fails the
        node is unregistered again and the error is re-raised,
        NotImplementedError is raised for unknown node types.
        """
        try:
            # TODO: CHECK ALL CLIENT DATA!
            url = ("%s:%d") % (ip, port)
            if NodeType.slave == node_type:
                with self.registry_lock.writelock:
                    # We already hold the write lock, so use the lookup
                    # that does not acquire the lock again
                    node = self._claim_entry_no_lock(
                        self.node_registry, self.get_node_id_no_lock(url),
                        node_id, self._default_slave_bunch)
                    self._reset_entry(node, node_id, ip, port, url, node_type)

                    # Add slave data
                    node.workers = data['workers']
                    node.tasks = 0
                    # Rating goes from [0, ..) 0 is the best rating and
                    # so a suitable candidate
                    node.rating = 0

                    # Make sure the mirror updates properly
                    self.set_registry_dirty()
                    # Send back the generated id
                    return {'id': node.node_id, 'port': self.master_port}
            elif NodeType.client == node_type:
                with self.client_registry_lock.writelock:
                    node = self._claim_entry_no_lock(
                        self.client_registry, self.get_client_id_no_lock(url),
                        node_id, self._default_client_bunch)
                    self._reset_entry(node, node_id, ip, port, url, node_type)

                    # Make sure the mirror updates properly
                    self.set_registry_dirty()
                    # Send back the generated id
                    return {'id': node.node_id, 'port': self.master_port}
            else:
                raise NotImplementedError("Unkown node")
        except Exception:
            traceback.print_exc()
            # Make sure to cleanup node from node map!
            if node_id:
                self.unregister_node(node_id, node_type)
            # Bare raise keeps the traceback of the original failure
            raise

    def unregister_node(self, node_id, node_type):
        """
        Unregister a node within our node map. The entry is set to None
        and removed on the next cleanup. For clients the task systems
        they pushed are dropped as well. Returns True if the node was
        registered, raises NotImplementedError for unknown node types.
        """
        if NodeType.slave == node_type:
            with self.registry_lock.writelock:
                if node_id in self.node_registry:
                    self.node_registry[node_id] = None
                    # Make sure we let the mirror update
                    self.set_registry_dirty()
                    return True
            return False
        elif NodeType.client == node_type:
            with self.client_registry_lock.writelock:
                if node_id in self.client_registry:
                    self.client_registry[node_id] = None
                    # Get rid of any registered task system. Systems are
                    # keyed by their system id, so also look for the
                    # entries owned by this client.
                    with self.tasksystem_lock.writelock:
                        stale_ids = [
                            system_id
                            for system_id, entry in self.tasksystem_registry.iteritems()
                            if system_id == node_id or (entry and entry.client_id == node_id)]
                        for system_id in stale_ids:
                            del self.tasksystem_registry[system_id]
                    # Make sure we let the mirror update
                    self.set_registry_dirty()
                    return True
            return False
        else:
            raise NotImplementedError("Unkown node")

    def register_node_tcp(self, handler, request, node_id, node_type):
        """
        A node has just registered itself through the compute channel.
        Stores the handler and a tcp proxy in the node's entry and sets
        it active. Returns True if the handshake worked, False if the
        node is not registered. Raises NotImplementedError for unknown
        node types.
        """
        if NodeType.slave == node_type:
            with self.registry_lock.writelock:
                # Unregistered nodes stay in the registry as None until
                # the next cleanup, so check the entry and not the key
                node = self.node_registry.get(node_id)
                if node:
                    # The handler is shared between many client sockets!
                    node.handler = handler
                    node.socket = handler.worker
                    node.tcp_proxy = self.create_tcp_client_proxy_zmq(self.zmq_server.context, request)
                    node.state = NodeState.active
                    # Let the slave know that the handshake worked
                    return True
            return False
        elif NodeType.client == node_type:
            with self.client_registry_lock.writelock:
                node = self.client_registry.get(node_id)
                if node:
                    # The handler is shared between many client sockets!
                    node.handler = handler
                    node.socket = handler.worker
                    node.tcp_proxy = self.create_tcp_client_proxy_zmq(self.zmq_server.context, request)
                    node.state = NodeState.active
                    # Save some data within the handler itself
                    handler.node_id = node_id
                    handler.node_type = NodeType.client
                    # Let the client know that the handshake worked
                    return True
            return False
        else:
            raise NotImplementedError("Unkown node")

    def _notify_registry_shutdown(self, registry):
        """
        Tell every node of registry that has a proxy that the master is
        gone. Failures are logged and do not stop the notification of the
        remaining nodes. The caller must hold the lock of registry.
        """
        for node_id, node in registry.iteritems():
            if node and node.proxy:
                try:
                    node.proxy.master_disconnected()
                except Exception as exc:
                    # Best effort: the node may already be unreachable
                    self.log.warn("Failed to notify shutdown to node %s: %s" % (node_id, exc))

    def notify_shutdown(self):
        """
        Notify a global shutdown to all nodes
        """
        with self.registry_lock.readlock:
            self._notify_registry_shutdown(self.node_registry)
        with self.client_registry_lock.readlock:
            self._notify_registry_shutdown(self.client_registry)

    def _beat_no_lock(self, registry, node_id):
        """
        Refresh the heartbeat of node_id in registry and reactivate it if
        it was inactive. Returns False if the node is not registered. The
        caller must hold the write lock of registry.
        """
        node = registry.get(node_id)
        if not node:
            # Unknown, unregistered or kicked node
            return False
        node.heartbeat = time.time()
        if node.state == NodeState.inactive:
            node.state = NodeState.active
        return True

    def heartbeat(self, node_id, node_type):
        """
        We just received a nice beat from a node, update its last
        heartbeat timestamp to prevent timeouts. Returns True if the
        node is registered, raises NotImplementedError for unknown node
        types.
        """
        if NodeType.slave == node_type:
            with self.registry_lock.writelock:
                return self._beat_no_lock(self.node_registry, node_id)
        elif NodeType.client == node_type:
            with self.client_registry_lock.writelock:
                return self._beat_no_lock(self.client_registry, node_id)
        else:
            raise NotImplementedError("Unkown node")

    def rpc_call_failed(self, proxy, method, reason):
        """
        Called when an RPC call failed for an unexpected reason
        """
        self.log.info("Method %s failed because of %s" % (method, reason))

    def rpc_call_success(self, proxy, method, result):
        """
        Called when an RPC call succeeded. Returns result untouched.
        """
        self.log.info("Method %s succeded with %s" % (method, result))
        return result

    def push_tasksystem(self, request, tasksystem):
        """
        We received a task system from a client. Save out the system
        itself for later access, initialize it and hand it over to the
        task scheduler. request is used as the id of the owning client.
        Returns False if a system with the same id is already registered.
        """
        # Easier access
        node_id = request

        with self.tasksystem_lock.writelock:
            # No re-registering!
            system_id = tasksystem.system_id
            if system_id in self.tasksystem_registry:
                return False

            # Save out the registry
            system_entry = self.tasksystem_registry[system_id] = self._default_tasksystem()
            system_entry.system = tasksystem
            system_entry.client_id = node_id
            system_entry.system_id = system_id

            # Now gather tasks and push them to the system
            # NOTE(review): assumed to run while holding the lock -
            # confirm against the original layout
            system_entry.system.log = self.log
            system_entry.system.init_system(self)
            self.task_scheduler.start_system(system_entry.system)
            return True

    def push_task(self, request, task):
        """
        We received a task from a client, add it to the system to be
        processed. Returns False if task is not a Task.
        """
        if isinstance(task, Task):
            self.task_scheduler.push_task(task)
            return True
        return False

    def _client_tcp_proxy_no_lock(self, client_id):
        """
        Return the tcp proxy of a client or None if the client is not
        registered or did not register through the tcp channel yet. The
        caller must hold the client registry lock.
        """
        client = self.client_registry.get(client_id)
        if not client:
            return None
        return client.tcp_proxy

    def task_finished(self, task, result, error):
        """
        Called when a task has finished its computation. task is the
        finished task, result what it computed and error the failure if
        any. Tasks of a task system are handed to their system, and once
        the system is complete its gathered results are sent to the
        owning client and the system is unregistered.
        """
        # If the task does not specify an ITaskSystem id it is a single
        # executed task which is not controlled by a dedicated autonomous
        # system on the master
        if task.system_id is None:
            client_id = task.client_id
            with self.client_registry_lock.readlock:
                proxy = self._client_tcp_proxy_no_lock(client_id)
                if proxy is not None:
                    proxy.task_finished(task.task_id, result, error)
            return

        # If we do have a system id let it process it instead
        with self.tasksystem_lock.writelock:
            if task.system_id not in self.tasksystem_registry:
                return
            system_entry = self.tasksystem_registry[task.system_id]
            system_entry.system.task_finished(self, task, result, error)

            # Inform scheduler of the task
            self.task_scheduler.task_finished(task, result, error)

            # Check for end
            if not system_entry.system.is_complete(self):
                return
            try:
                # Gather results
                final_results = system_entry.system.gather_result(self)

                # Send to client proxy the results
                client_id = system_entry.client_id
                with self.client_registry_lock.readlock:
                    proxy = self._client_tcp_proxy_no_lock(client_id)
                    if proxy is not None:
                        proxy.work_finished(final_results, system_entry.system.system_id)
            finally:
                # The system is done, even if delivering the results failed
                del self.tasksystem_registry[task.system_id]