class Initiator(Monitor):

    def __init__(self, rootdir):
        Monitor.__init__(self, rootdir)
        self.remaproot = rootdir
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None
        self.bpub = None
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()
        self.jobid = None
        self.refreshed = 0
        self.job_status = "waiting"
        self.rejectedtasks = {}
        self.completedtasks = {}
        self.tasks = {}
        self.allocatedtasks = {}
        self.jobtype = "not_started"
        self.priority = 0
        self.parallellism = 1
        self.manager = None
        self.last_check = time.time()

    def load_plugin(self, name):
        try:
            mod = __import__("module_%s" % name)
            return mod
        except ImportError as ie:
            raise RemapException("No such worker type: %s" % (name))

    # -------
    # Broker handling
    # -------
    def setup_broker(self):
        self.brokerChanged = False
        if self.bsub != None:
            self.bsub.close()
            self.bsub = None
            self.apply_timeouts()
        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return
        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address))
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "global")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "local")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "notlocal")
        if self.jobid != None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "tracker")
        self.apply_timeouts()
        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address))
        logger.info("Broker setup complete")

    def apply_timeouts(self):
        if self.bsub != None:
            rcv_timeout = 100
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)

    def cb_broker_changed(self, broker_address):
        logger.info("Received new broker address: %s" % (broker_address))
        self.broker_address = broker_address
        self.brokerChanged = True

    def forward_to_broker(self, msg):
        if self.bpub != None:
            try:
                self.bpub.send(msg)
            except nn.NanoMsgAPIError as e:
                pass

    def process_broker_messages(self):
        if self.bsub == None:
            # No broker is known yet.
            if self.brokerChanged:
                logger.info("The broker configuration changed.")
                self.setup_broker()
                if self.bsub == None:
                    logger.info("Failed broker setup.")
                    return False
            else:
                return False
        try:
            # Grab next msg from broker if any
            msg = self.bsub.recv()
            if msg != None and len(msg) > 0:
                msgprefix, data = remap_utils.unpack_msg(msg)
                recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)
                if msgtype == "complete":
                    self.update_corecomplete(recipientid, senderid, data)
                if msgtype == "corestatus":
                    self.update_corestatus(recipientid, senderid, data)
                if msgtype == "raisehand":
                    self.update_hands(recipientid, senderid, data)
                return True
            else:
                return False
        except nn.NanoMsgAPIError as e:
            return False

    # -------
    # Messaging handling
    # -------
    def update_corestatus(self, recipientid, senderid, data):
        if self.manager != None:
            key = self.manager.get_work_key(data)
            if key in self.allocatedtasks:
                job = self.allocatedtasks[key]
                job["ts_finish"] = time.time() + 7

    def update_corecomplete(self, recipientid, senderid, data):
        if self.manager != None:
            key = self.manager.get_work_key(data)
            logger.info("Job %s completed." % (key))
            if key in self.allocatedtasks:
                job = self.allocatedtasks[key]
                task = self.tasks[key]
                self.completedtasks[key] = task
                del self.tasks[key]
                del self.allocatedtasks[key]
                logger.info("%d tasks left, %d tasks committed, %d tasks complete, %d tasks failed." %
                            (len(self.tasks), len(self.allocatedtasks),
                             len(self.completedtasks), len(self.rejectedtasks)))

    def update_hands(self, recipientid, senderid, data):
        # "%s.raisehand.%s"%( senderid, self.nodeid ), {"cores":3,"interruptable":0}
        if senderid in self.nodes:
            self.nodes[senderid]["avail"] = data
        else:
            self.nodes[senderid] = {}
            self.nodes[senderid]["avail"] = data

    # -------
    # Job management
    # -------
    def start_job(self, jobdata):
        if self.job_status != "waiting":
            raise RemapException("A job is currently in progress on this monitor")
        if "type" not in jobdata:
            raise RemapException("Must have job type specified")
        if "priority" not in jobdata:
            raise RemapException("Must have priority specified")
        if "parallellism" not in jobdata:
            raise RemapException("Must have parallellism specified")
        self.job_status = "preparing"
        self.prepare_start = time.time()
        self.jobtype = jobdata["type"]
        self.priority = jobdata["priority"]
        self.parallellism = jobdata["parallellism"]
        plugin = self.load_plugin(self.jobtype)
        self.rejectedtasks = {}
        self.completedtasks = {}
        if self.jobid != None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, self.jobid)
        if "jobid" in jobdata:
            self.jobid = jobdata["jobid"]
            del jobdata["jobid"]
        else:
            self.jobid = remap_utils.unique_id()
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        if "app" not in jobdata:
            raise RemapException("The name of the app must be provided")
        if jobdata["app"] not in self.list_apps():
            raise RemapException("No such application: %s" % (jobdata["app"]))
        config = {"jobid": self.jobid, "remaproot": self.remaproot}
        logger.info("Started a new job: %s" % (self.jobid))
        self.manager = plugin.create_manager(jobdata, config)
        if (time.time() - self.refreshed) > 60:
            # Not refreshed > 60s
            self.refresh_nodes(self.priority)
            # Wait for a bunch of nodes to advertise themselves
            r = Timer(1.0, self.resume, ())
            r.start()
        else:
            self.resume()
        return {"jobid": self.jobid}

    def resume(self):
        self.manager.prepare()
        logger.info("Starting a %s job" % (self.jobtype))
        self.planner = JobPlanner(self.manager.config_file)
        self.tasks = self.manager.plan_jobs(self.planner)
        logger.info("Found %d tasks to execute" % (len(self.tasks)))
        numnodes, self.allocatedtasks = self.planner.distribute_jobs_over_nodes(
            self.tasks, {}, self.nodes, self.parallellism)
        if len(self.allocatedtasks) == 0:
            logger.error("No nodes found to distribute the tasks.")
            self.job_status = "waiting"
            return
        if self.manager.all_hands_on_deck():
            if len(self.allocatedtasks) != len(self.tasks):
                raise RemapException("Not enough cores available. Have %d, need %d." %
                                     (len(self.allocatedtasks), len(self.tasks)))
        logger.info("%d new tasks distributed over %d nodes." %
                    (len(self.allocatedtasks), numnodes))
        self.job_status = "executing"
        self.outbound_work(self.allocatedtasks)

    # In outbound_work we update our local "jobs" data with timestamps
    # for when they were sent out, and send the task data to the nodes.
    def outbound_work(self, jobs):
        nodes = {}
        for key, job in jobs.items():
            nodeid = job["nodeid"]
            # Stamp every outgoing job; check_progress() relies on ts_finish.
            job["ts_start"] = time.time()
            job["ts_finish"] = time.time() + 7
            if nodeid in nodes:
                nodes[nodeid]["cores"].append(job["jobdata"])
            else:
                tasks = {}
                tasklist = []
                tasklist.append(job["jobdata"])
                tasks["cores"] = tasklist
                tasks["priority"] = self.priority
                nodes[nodeid] = tasks
        for nodeid, tasks in nodes.items():
            msg = remap_utils.pack_msg("%s.jobstart.%s" % (nodeid, self.jobid), tasks)
            self.forward_to_broker(msg)

    def check_progress(self):
        if self.manager != None:
            if self.manager.module_tracks_progress():
                if not self.manager.check_progress(len(self.tasks)):
                    self.manager.finish()
                    self.manager = None
                    self.job_status = "waiting"
                    logger.info("Vertex job complete")
            else:
                if time.time() - self.last_check <= 4:
                    return
                newtime = time.time()
                kill_list = []
                for key, job in self.allocatedtasks.items():
                    if newtime > job["ts_finish"]:
                        # This job hasn't been updated, probably dead.
                        jobdata = job["jobdata"]
                        # Update the task with an attempt + 1
                        task = self.tasks[key]
                        task["attempts"] = task["attempts"] + 1
                        nodeid = job["nodeid"]
                        logger.info("Task %s failed on node %s. Reattempting elsewhere" % (key, nodeid))
                        if task["attempts"] > 4:
                            # 5 attempts so far. Let's cancel it.
                            logger.warn("Task %s failed 5 attempts. Cancelling and rejecting it." % (key))
                            del self.tasks[key]
                            kill_list.append(key)
                            self.rejectedtasks[key] = task
                for key in kill_list:
                    del self.allocatedtasks[key]
                # Now also check if there are jobs that can be started.
                if len(self.tasks) > 0:
                    numnodes, new_allocations = self.planner.distribute_jobs_over_nodes(
                        self.tasks, self.allocatedtasks, self.nodes, self.parallellism)
                    if numnodes > 0:
                        logger.info("%d new tasks distributed over %d nodes" %
                                    (len(new_allocations), numnodes))
                        self.outbound_work(new_allocations)
                        self.allocatedtasks.update(new_allocations)
                if self.job_status == "executing" and len(self.tasks) == 0 and len(self.allocatedtasks) == 0:
                    # Finished all work.
                    self.job_status = "waiting"
                    self.manager.finish()
                    self.manager = None
                logger.info("%d jobs left, %d jobs committed, %d jobs complete, %d jobs failed." %
                            (len(self.tasks), len(self.allocatedtasks),
                             len(self.completedtasks), len(self.rejectedtasks)))
                if self.job_status == "preparing" and time.time() - self.prepare_start > 5:
                    # Still preparing after 5 seconds? Quit it.
                    self.job_status = "waiting"
                    if self.manager != None:
                        self.manager.finish()
                        self.manager = None
                    logger.info("Cancelled job in progress.")
        else:
            # No manager.
            if self.job_status != "waiting":
                self.job_status = "waiting"
                logger.info("Resolving inconsistent state.")
        self.last_check = time.time()

    # -------
    # Node management
    # -------
    def refresh_nodes(self, priority):
        self.nodes = {}
        self.priority = priority
        self.refreshed = time.time()
        msg = remap_utils.pack_msg("local.showhands.%s" % (self.jobid),
                                   {"priority": self.priority})
        self.forward_to_broker(msg)
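
# --- Illustrative sketch, not part of the original source ---
# A minimal example of the jobdata dict that Initiator.start_job() expects,
# derived only from the keys validated above ("type", "priority",
# "parallellism", "app", optional "jobid"). The concrete values, the
# "wordcount" app name and the "/home/remap" root are assumptions.
if __name__ == "__main__":
    example_jobdata = {
        "type": "mapreduce",       # load_plugin() resolves this to module_mapreduce
        "priority": 1,
        "parallellism": 4,
        "app": "wordcount",        # must be one of self.list_apps()
        # "jobid": "job-0001",     # optional; otherwise remap_utils.unique_id() is used
    }
    # Requires a running broker and node daemons; shown here for shape only.
    # initiator = Initiator("/home/remap")
    # print(initiator.start_job(example_jobdata))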
class Vertex(WorkerBase):

    def __init__(self, app, appconfig, workdata):
        WorkerBase.__init__(self, app, appconfig, workdata)
        self.surveyorname = workdata["hostname"]
        self.vsub = nn.Socket(nn.SUB, domain=nn.AF_SP)
        self.vpub = nn.Socket(nn.PUB, domain=nn.AF_SP)
        self.broker_address = None
        self.bonjour = BonjourResolver("_vertexremap._tcp", self.cb_broker_changed)
        self.bonjour.start()
        inputfile = os.path.join(self.remaproot, "data", self.workdata["inputfile"])
        outputdir = os.path.join(self.remaproot, "job", self.jobid, "part")
        self.input = self.app.create_vertex_reader(inputfile)
        self.outputdir = outputdir
        self.partitions = {}
        self.mode = MODE_IDLE
        self.surveyor = nn.Socket(nn.RESPONDENT)
        self.surveyor.connect("tcp://%s:8688" % (self.surveyorname))
        # 50 ms receive timeout on the surveyor socket
        self.surveyor.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 50)
        self.vertices = {}
        logger.info("Waiting to get vertex broker host from bonjour")
        self.ready = False

    def cb_broker_changed(self, broker_address):
        logger.info("Received vertex broker address: %s" % (broker_address))
        if self.broker_address != None:
            return
        self.broker_address = broker_address
        # Vertex broker pub and sub
        self.vpubc = self.vpub.connect("tcp://%s:8689" % (self.broker_address))
        self.vsubc = self.vsub.connect("tcp://%s:8690" % (self.broker_address))
        # 2 seconds max
        self.vsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 2000)
        logger.info("Vertex broker setup complete")
        for value in self.input.read():
            key, vertex = self.app.prepare(value)
            if key == None or vertex == None:
                continue
            # Store the vertex by id in a dict with 2 lists for messages
            self.vertices[key] = (vertex, [], [])
            self.vsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, key)
        logger.info("Ready for processing")
        self.ready = True

    def module_manages_progress(self):
        return True

    def result(self):
        return "complete", {"inputfile": self.workdata["inputfile"]}

    def forward(self, id, msg):
        # Forward to the vertex broker
        self.vpub.send(id + " " + msg)

    def subscribe(self, topic):
        self.vsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, topic)

    def unsubscribe(self, topic):
        self.vsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, topic)

    # This function performs the actual work. The *state* is kept in the initiator
    # daemon only, so a worker directly responds to whatever the surveyor tells it to do.
    def work(self):
        if not self.ready:
            return True
        surveyormsg = None
        try:
            surveyormsg = remap_utils.decode(self.surveyor.recv())
        except nn.NanoMsgAPIError as e:
            return True
        if surveyormsg[0] == 'S':
            # Shift messages
            if self.mode != MODE_MSGS:
                self.mode = MODE_MSGS
                # We haven't done this in a previous step. Due to recovery, the
                # initiator may use this to get others up to speed.
                for key, (vertex, messages, messagesNext) in self.vertices.items():
                    self.vertices[key] = (vertex, messagesNext, [])
            self.surveyor.send("D")
            return True
        if surveyormsg[0] == 'H':
            self.mode = MODE_HALT
            logger.info("Halting core.")
            self.surveyor.close()
            return False
        if surveyormsg[0] == 'P':
            if self.mode != MODE_PROCESS:
                # First time in this state; grab all messages and
                # allocate them to the vertex queues.
                self.mode = MODE_PROCESS
                logger.info("Processing messages 1")
                while True:
                    try:
                        msg = self.vsub.recv()
                        prefix, data = remap_utils.unpack_msg(msg)
                        if prefix in self.vertices:
                            # This vertex is indeed on this host. Add the message
                            # to its new msg list for the next iteration.
                            vertex, messages, messagesNext = self.vertices[prefix]
                            messagesNext.append(data)
                    except nn.NanoMsgAPIError as e:
                        logger.error("No more messages available.")
                        break
            else:
                logger.info("Processing messages 2")
                # Doing this twice makes no difference. The second time around,
                # just throw away all messages.
                while True:
                    try:
                        msg = self.vsub.recv()
                        print("Received and thrown away: ", msg)
                    except nn.NanoMsgAPIError as e:
                        logger.error("No more messages available.")
                        break
            self.surveyor.send("D")
            return True
        self.mode = MODE_RUN
        self.superstep = int(surveyormsg)
        mainHalt = True
        for key, (vertex, messages, messagesNext) in self.vertices.items():
            vertex, halt = self.app.compute(self.forward, self.subscribe, self.unsubscribe,
                                            self.superstep, vertex, messages)
            if vertex != None:
                # Store the new vertex object in its place, maintaining the
                # messagesNext list as we know it.
                self.vertices[key] = (vertex, [], messagesNext)
            if not halt:
                mainHalt = False
        if mainHalt:
            self.surveyor.send("H")
        else:
            self.surveyor.send("D")
        return True
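
# --- Illustrative sketch, not part of the original source ---
# work() above drives a Pregel-style superstep: for each local vertex it calls
# self.app.compute(forward, subscribe, unsubscribe, superstep, vertex, messages)
# and expects (new_vertex, halt) back. Below is a minimal compute function for
# max-value propagation; the vertex layout ({"value", "edges"}) and the string
# message payloads are assumptions, not the project's actual app API.
def compute(forward, subscribe, unsubscribe, superstep, vertex, messages):
    # Adopt the largest value seen so far: our own, or anything a neighbour
    # sent us during the previous superstep.
    new_value = max([vertex["value"]] + [int(m) for m in messages])
    changed = new_value != vertex["value"]
    vertex["value"] = new_value
    if changed or superstep == 0:
        # forward() publishes "<vertex id> <payload>" to the vertex broker,
        # so every vertex subscribed to its own id picks the message up.
        for neighbour in vertex["edges"]:
            forward(neighbour, str(new_value))
        return vertex, False   # keep running
    return vertex, True        # vote to halt; work() sends "H" when all halt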
class NodeDaemon(object):

    def __init__(self, remaproot):
        self.remaproot = remaproot
        self.cores = {}
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None
        self.bpub = None
        self.tot_m_rcv = 0
        self.hw = NodeHardware()
        self.nodeid = remap_utils.node_id()
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()
        self.coresChanged = False

    # Create a bi-directional communication channel, where the node daemon
    # 'shouts' into the room even when contacting a single core, but the cores
    # only send written messages back to the shouter with the megaphone
    # (embarrassing protocol).
    def setup_bus(self):
        self.lsub = nn.Socket(nn.SUB)
        self.lsub.bind("ipc:///tmp/node_pub.ipc")
        self.lsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "")
        self.lpub = nn.Socket(nn.PUB)
        self.lpub.bind("ipc:///tmp/node_sub.ipc")

    def apply_timeouts(self):
        if self.bsub == None:
            rcv_timeout = 100
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)
        else:
            rcv_timeout = 100
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)
            rcv_timeout = 0
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)

    def cb_broker_changed(self, broker_address):
        logger.info("Received new broker address: %s" % (broker_address))
        self.broker_address = broker_address
        self.brokerChanged = True

    def setup_broker(self):
        self.brokerChanged = False
        if self.bsub != None:
            self.bsub.close()
            self.bsub = None
            self.apply_timeouts()
        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return
        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address))
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "global")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "local")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "notlocal")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.nodeid)
        self.apply_timeouts()
        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address))
        logger.info("Broker setup complete")

    def process_bus_messages(self):
        try:
            msg = self.lsub.recv()
            msgprefix, data = remap_utils.unpack_msg(msg)
            if len(msgprefix) == 0:
                return True
            recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)
            if msgtype[0] == '_':
                # Node message
                self.process_core_message(msgtype, senderid, data)
            elif msgtype == "corestatus":
                if senderid in self.cores:
                    coredata = self.cores[senderid]
                    coredata["ts_last_seen"] = time.time()
                    coredata["progress"] = data["progress"]
                    logger.info("Core %s progressed %d" % (senderid, coredata["progress"]))
                    self.forward_to_broker(msg)
            elif msgtype == "complete":
                if senderid in self.cores:
                    coredata = self.cores[senderid]
                    logger.info("Core %s completed the job" % (senderid))
                    self.forward_to_broker(msg)
                    del self.cores[senderid]
                    self.coresChanged = True
            else:
                # Forward to the broker instead
                self.forward_to_broker(msg)
            return True
        except nn.NanoMsgAPIError as e:
            return False

    def process_core_message(self, msgtype, senderid, data):
        if msgtype == "_hello":
            self.process_hello(data)
        if msgtype == "_todo":
            self.process_todo(senderid, data)
        if msgtype == "_status":
            self.process_status(senderid, data)
        if msgtype == "_sub":
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, data["prefix"])
        if msgtype == "_unsub":
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, data["prefix"])

    def forward_to_broker(self, msg):
        if self.bpub != None:
            try:
                self.bpub.send(msg)
            except nn.NanoMsgAPIError as e:
                pass

    # This processes a message where a core announces itself and wants to
    # get a core id to start existing on the network.
    def process_hello(self, data):
        msgid = remap_utils.safe_get(data, "msgid")
        pid = remap_utils.safe_get(data, "pid")
        priority = remap_utils.safe_get(data, "priority")
        coreid = remap_utils.core_id(self.nodeid, pid)
        self.cores[coreid] = {
            "coreid": coreid,
            "ts_last_seen": time.time(),
            "progress": -1,
            "pid": pid,
            "priority": priority
        }
        msg = remap_utils.pack_msg("%s._hey.%s" % (coreid, self.nodeid),
                                   {"msgid": msgid, "coreid": coreid})
        logger.info("A core registered %s" % (coreid))
        self.lpub.send(msg)

    def process_todo(self, senderid, data):
        coredata = self.cores[senderid]
        work = self.hw.grab_work_item()
        if work != None:
            msg = remap_utils.pack_msg("%s._work.%s" % (senderid, self.nodeid), work)
            logger.info("A core was given some work to do: %s" % (senderid))
            self.lpub.send(msg)

    def process_status(self, senderid, data):
        coredata = self.cores[senderid]
        coredata["ts_last_seen"] = time.time()

    def process_broker_messages(self):
        if self.bsub == None:
            # No broker is known yet.
            if self.brokerChanged:
                logger.info("The broker configuration changed.")
                self.setup_broker()
                if self.bsub == None:
                    logger.info("Failed broker setup.")
                    return False
            else:
                return False
        try:
            # Grab the next msg from the broker, if any
            msg = self.bsub.recv()
            self.tot_m_rcv = self.tot_m_rcv + 1
            if msg == None or len(msg) == 0:
                return False
            msgprefix, data = remap_utils.unpack_msg(msg)
            recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)
            if msgtype == "showhands":
                self.handle_showhands(recipientid, senderid, data)
            elif msgtype == "jobstart":
                # if recipientid == self.nodeid:
                self.handle_jobstart(recipientid, senderid, data)
            else:
                # Forward to all cores for their processing.
                self.lpub.send(msg)
            return True
        except nn.NanoMsgAPIError as e:
            return False

    def purge_inactive_cores(self, new_ts):
        kill_list = []
        for key, coredata in self.cores.items():
            last_ts = coredata["ts_last_seen"]
            if (new_ts - last_ts) > remap_constants.THR_STATUS_DELAY:
                logger.info("Core %s missed a status report." % (key))
            if (new_ts - last_ts) > remap_constants.MAX_STATUS_DELAY:
                logger.info("Core %s is considered dead." % (key))
                kill_list.append(key)
                # Add code here to kill the core process, just in case.
        for key in kill_list:
            del self.cores[key]

    def maybe_send_status(self):
        if self.coresChanged:
            self.handle_showhands("tracker", "unknown", {"priority": 0})
            self.coresChanged = False

    # Request re-registration of the core processes currently on the bus;
    # this allows a failover restart of this node daemon.
    def req_registration(self):
        msg = remap_utils.pack_msg("node._plzreg.%s" % (self.nodeid), {})
        self.lpub.send(msg)

    # Some app initiator requests processing capacity
    def handle_showhands(self, recipientid, senderid, data):
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        if avail > 0 or interruptable > 0:
            logger.info("Volunteering with %d cores, %d interruptable" % (avail, interruptable))
            msg = remap_utils.pack_msg("tracker.raisehand.%s" % (self.nodeid),
                                       {"free": avail, "interruptable": interruptable})
            self.forward_to_broker(msg)

    # Some app initiator wants this node to start work
    def handle_jobstart(self, recipientid, senderid, data):
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        numcores = len(remap_utils.safe_get(data, "cores"))
        if (avail + interruptable) >= numcores:
            logger.info("Starting job with %d cores" % (numcores))
            if not self.hw.start_job(self.remaproot, senderid, numcores, data):
                logger.error("Error starting job")
        else:
            # Something changed in the meantime. Reject.
            logger.info("Initiator requested %d cores, %d can be committed. Rejecting" %
                        (numcores, avail + interruptable))
            msg = remap_utils.pack_msg("%s.rejectjob.%s" % (senderid, self.nodeid), {})
            self.forward_to_broker(msg)
        self.coresChanged = True
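
# --- Illustrative sketch, not part of the original source ---
# The core-side half of the registration handshake served by process_hello()
# above. The ipc endpoints mirror setup_bus() (crossed over from the core's
# point of view) and the helpers are the same remap_utils calls used above,
# but the "node._hello.unknown" prefix and the overall flow are assumptions.
import os
import time

def register_core():
    cpub = nn.Socket(nn.PUB)
    cpub.connect("ipc:///tmp/node_pub.ipc")     # the daemon listens here (lsub)
    csub = nn.Socket(nn.SUB)
    csub.connect("ipc:///tmp/node_sub.ipc")     # the daemon publishes here (lpub)
    csub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "")
    csub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 1000)
    time.sleep(0.1)  # give the PUB/SUB pair a moment to wire up

    msgid = remap_utils.unique_id()
    hello = remap_utils.pack_msg("node._hello.unknown",
                                 {"msgid": msgid, "pid": os.getpid(), "priority": 0})
    cpub.send(hello)

    # The daemon replies with "<coreid>._hey.<nodeid>" carrying our msgid;
    # a receive timeout raises nn.NanoMsgAPIError, left unhandled in this sketch.
    while True:
        prefix, data = remap_utils.unpack_msg(csub.recv())
        if data.get("msgid") == msgid:
            return data["coreid"]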