def resume(self):
    """Plan the job's tasks and distribute them over the known nodes.

    Sets job_status to "waiting" when no node accepts work, otherwise to
    "executing" once the initial allocations have been sent out.

    Raises:
        RemapException: when the manager requires every task to start at
            once ("all hands on deck") and not enough cores are available.
    """
    # NOTE: this definition appeared twice verbatim in the original file;
    # the duplicate has been collapsed into this single definition.
    self.manager.prepare()
    logger.info("Starting a %s job" % (self.jobtype))
    self.planner = JobPlanner(self.manager.config_file)
    self.tasks = self.manager.plan_jobs(self.planner)
    logger.info("Found %d tasks to execute" % (len(self.tasks)))
    numnodes, self.allocatedtasks = self.planner.distribute_jobs_over_nodes(
        self.tasks, {}, self.nodes, self.parallellism)
    if len(self.allocatedtasks) == 0:
        # Nothing could be placed; stay idle until nodes raise their hands.
        logger.error("No nodes found to distribute the tasks.")
        self.job_status = "waiting"
        return
    if self.manager.all_hands_on_deck():
        # Manager demands a full simultaneous start: partial allocation is fatal.
        if len(self.allocatedtasks) != len(self.tasks):
            raise RemapException(
                "Not enough cores available. Have %d, need %d."
                % (len(self.allocatedtasks), len(self.tasks)))
    logger.info("%d new tasks distributed over %d nodes."
                % (len(self.allocatedtasks), numnodes))
    self.job_status = "executing"
    self.outbound_work(self.allocatedtasks)
class Initiator(Monitor):
    """Monitor that initiates jobs: resolves a broker over Bonjour, plans
    tasks via a per-jobtype manager plugin, distributes them over available
    nodes and tracks their progress/failures.
    """

    def __init__(self, rootdir):
        Monitor.__init__(self, rootdir)
        self.remaproot = rootdir
        # Broker connection state; sockets are (re)built in setup_broker().
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None  # SUB socket: messages from the broker
        self.bpub = None  # PUB socket: messages to the broker
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()
        # Job bookkeeping.
        self.jobid = None
        self.refreshed = 0  # time of the last node-availability refresh
        self.job_status = "waiting"
        self.rejectedtasks = {}
        self.completedtasks = {}
        self.tasks = {}
        self.allocatedtasks = {}
        self.jobtype = "not_started"
        self.priority = 0
        self.parallellism = 1
        self.manager = None
        self.last_check = time.time()
        # NOTE(review): self.nodes is first assigned in refresh_nodes() /
        # update_hands(); presumably Monitor.__init__ provides it — confirm.

    def load_plugin(self, name):
        """Import and return the worker module for job type *name*.

        Raises:
            RemapException: when no ``module_<name>`` module exists.
        """
        try:
            return __import__("module_%s" % name)
        except ImportError as ie:
            # Chain the original ImportError for easier debugging.
            raise RemapException("No such worker type: %s" % (name)) from ie

    # -------
    # Broker handling
    # -------
    def setup_broker(self):
        """(Re)connect the SUB/PUB sockets to the current broker address."""
        self.brokerChanged = False
        if self.bsub is not None:
            self.bsub.close()
            self.bsub = None
            self.apply_timeouts()
        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return
        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address))
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "global")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "local")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "notlocal")
        if self.jobid is not None:
            # Re-subscribe to the running job's channel after a reconnect.
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "tracker")
        self.apply_timeouts()
        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address))
        logger.info("Broker setup complete")

    def apply_timeouts(self):
        """Set a short receive timeout so bsub.recv() never blocks for long."""
        if self.bsub is not None:
            rcv_timeout = 100  # milliseconds
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)

    def cb_broker_changed(self, broker_address):
        """Bonjour callback: record the new broker address.

        The sockets are rebuilt lazily by process_broker_messages().
        """
        logger.info("Received new broker address: %s" % (broker_address))
        self.broker_address = broker_address
        self.brokerChanged = True

    def forward_to_broker(self, msg):
        """Best-effort publish of *msg* to the broker; send errors are ignored."""
        if self.bpub is not None:
            try:
                self.bpub.send(msg)
            except nn.NanoMsgAPIError:
                # Deliberate best-effort: dropping a message here is acceptable.
                pass

    def process_broker_messages(self):
        """Process at most one pending broker message.

        Returns:
            bool: True when a message was handled, False otherwise.
        """
        if self.bsub is None:
            # No broker is known yet.
            if self.brokerChanged:
                logger.info("The broker configuration changed.")
                self.setup_broker()
                if self.bsub is None:
                    logger.info("Failed broker setup.")
                    return False
            else:
                return False
        try:
            # Grab next msg from broker if any
            msg = self.bsub.recv()
            if msg is not None and len(msg) > 0:
                msgprefix, data = remap_utils.unpack_msg(msg)
                recipientid, msgtype, senderid = remap_utils.split_prefix(
                    msgprefix)
                if msgtype == "complete":
                    self.update_corecomplete(recipientid, senderid, data)
                if msgtype == "corestatus":
                    self.update_corestatus(recipientid, senderid, data)
                if msgtype == "raisehand":
                    self.update_hands(recipientid, senderid, data)
                return True
            else:
                return False
        except nn.NanoMsgAPIError:
            # recv timed out (RCVTIMEO applied above) or failed; nothing to do.
            return False

    # -------
    # Messaging handling
    # -------
    def update_corestatus(self, recipientid, senderid, data):
        """A core reported liveness: push its failure deadline 7s into the future."""
        if self.manager is not None:
            key = self.manager.get_work_key(data)
            if key in self.allocatedtasks:
                job = self.allocatedtasks[key]
                job["ts_finish"] = time.time() + 7

    def update_corecomplete(self, recipientid, senderid, data):
        """A core finished a task: move it from tasks/allocated to completed."""
        if self.manager is not None:
            key = self.manager.get_work_key(data)
            logger.info("Job %s completed." % (key))
            if key in self.allocatedtasks:
                task = self.tasks[key]
                self.completedtasks[key] = task
                del self.tasks[key]
                del self.allocatedtasks[key]
                logger.info(
                    "%d tasks left, %d tasks committed, %d tasks complete, %d tasks failed."
                    % (len(self.tasks), len(self.allocatedtasks),
                       len(self.completedtasks), len(self.rejectedtasks)))

    def update_hands(self, recipientid, senderid, data):
        """A node advertised its available capacity."""
        # "%s.raisehand.%s"%( senderid, self.nodeid ), {"cores":3,"interruptable":0}
        if senderid in self.nodes:
            self.nodes[senderid]["avail"] = data
        else:
            self.nodes[senderid] = {}
            self.nodes[senderid]["avail"] = data

    # -------
    # Job management
    # -------
    def start_job(self, jobdata):
        """Validate *jobdata*, create the manager plugin and kick off the job.

        Returns:
            dict: {"jobid": <new job id>}.

        Raises:
            RemapException: on a job already in progress or missing/invalid
                job parameters.
        """
        if self.job_status != "waiting":
            raise RemapException(
                "A job is currently in progress on this monitor")
        if "type" not in jobdata:
            raise RemapException("Must have job type specified")
        if "priority" not in jobdata:
            raise RemapException("Must have priority specified")
        if "parallellism" not in jobdata:
            raise RemapException("Must have parallellism specified")
        self.job_status = "preparing"
        self.prepare_start = time.time()
        self.jobtype = jobdata["type"]
        self.priority = jobdata["priority"]
        self.parallellism = jobdata["parallellism"]
        plugin = self.load_plugin(self.jobtype)
        self.rejectedtasks = {}
        self.completedtasks = {}
        if self.jobid is not None:
            # Stop listening to the previous job's channel.
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, self.jobid)
        if "jobid" in jobdata:
            self.jobid = jobdata["jobid"]
            del jobdata["jobid"]
        else:
            self.jobid = remap_utils.unique_id()
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        if "app" not in jobdata:
            raise RemapException("The name of the app must be provided")
        if jobdata["app"] not in self.list_apps():
            raise RemapException("No such application: %s" % (jobdata["app"]))
        config = {"jobid": self.jobid, "remaproot": self.remaproot}
        logger.info("Started a new job: %s" % (self.jobid))
        self.manager = plugin.create_manager(jobdata, config)
        if (time.time() - self.refreshed) > 60:
            # Not refreshed > 60s
            self.refresh_nodes(self.priority)
            # Wait for a bunch of nodes to advertise themselves
            r = Timer(1.0, self.resume, ())
            r.start()
        else:
            self.resume()
        return {"jobid": self.jobid}

    def resume(self):
        """Plan the job's tasks and distribute them over the known nodes.

        Raises:
            RemapException: when the manager requires all tasks to start at
                once and not enough cores are available.
        """
        self.manager.prepare()
        logger.info("Starting a %s job" % (self.jobtype))
        self.planner = JobPlanner(self.manager.config_file)
        self.tasks = self.manager.plan_jobs(self.planner)
        logger.info("Found %d tasks to execute" % (len(self.tasks)))
        numnodes, self.allocatedtasks = self.planner.distribute_jobs_over_nodes(
            self.tasks, {}, self.nodes, self.parallellism)
        if len(self.allocatedtasks) == 0:
            logger.error("No nodes found to distribute the tasks.")
            self.job_status = "waiting"
            return
        if self.manager.all_hands_on_deck():
            # Partial allocation is fatal for all-at-once jobs.
            if len(self.allocatedtasks) != len(self.tasks):
                raise RemapException(
                    "Not enough cores available. Have %d, need %d."
                    % (len(self.allocatedtasks), len(self.tasks)))
        logger.info("%d new tasks distributed over %d nodes."
                    % (len(self.allocatedtasks), numnodes))
        self.job_status = "executing"
        self.outbound_work(self.allocatedtasks)

    # In outbound work we update our local "jobs" data with timestamps
    # when they were sent out and send the task data to nodes.
    def outbound_work(self, jobs):
        """Group *jobs* per node, stamp their deadlines and publish them."""
        nodes = {}
        for key, job in jobs.items():
            nodeid = job["nodeid"]
            # BUGFIX: stamp every job. The original only stamped the first
            # job per node, so later jobs on the same node had no ts_finish
            # for check_progress to inspect.
            job["ts_start"] = time.time()
            job["ts_finish"] = time.time() + 7
            if nodeid in nodes:
                nodes[nodeid]["cores"].append(job["jobdata"])
            else:
                tasks = {}
                tasklist = []
                tasklist.append(job["jobdata"])
                tasks["cores"] = tasklist
                tasks["priority"] = self.priority
                nodes[nodeid] = tasks
        for nodeid, tasks in nodes.items():
            msg = remap_utils.pack_msg("%s.jobstart.%s" % (nodeid, self.jobid),
                                       tasks)
            self.forward_to_broker(msg)

    def check_progress(self):
        """Periodic tick: detect dead tasks, reattempt or reject them,
        hand out new allocations and finish/cancel the job when appropriate."""
        if self.manager is not None:
            if self.manager.module_tracks_progress():
                # The plugin tracks its own progress; just poll it.
                if not self.manager.check_progress(len(self.tasks)):
                    self.manager.finish()
                    self.manager = None
                    self.job_status = "waiting"
                    logger.info("Vertex job complete")
            else:
                if time.time() - self.last_check <= 4:
                    return
                newtime = time.time()
                kill_list = []
                for key, job in self.allocatedtasks.items():
                    if newtime > job["ts_finish"]:
                        # This job hasn't been updated, probably dead.
                        # Update tasks with an attempt + 1
                        task = self.tasks[key]
                        task["attempts"] = task["attempts"] + 1
                        nodeid = job["nodeid"]
                        logger.info(
                            "Task %s failed on node %s. Reattempting elsewhere"
                            % (key, nodeid))
                        if task["attempts"] > 4:
                            # 5 attempts so far. let's cancel it.
                            logger.warning(
                                "Task %s failed 5 attempts. Cancelling file to reject."
                                % (key))
                            del self.tasks[key]
                            kill_list.append(key)
                            self.rejectedtasks[key] = task
                # Deletions deferred to avoid mutating the dict while iterating.
                for key in kill_list:
                    del self.allocatedtasks[key]
                # Now also check if there are jobs that can be started
                if len(self.tasks) > 0:
                    numnodes, new_allocations = self.planner.distribute_jobs_over_nodes(
                        self.tasks, self.allocatedtasks, self.nodes,
                        self.parallellism)
                    if numnodes > 0:
                        logger.info("%d new tasks distributed over %d nodes"
                                    % (len(new_allocations), numnodes))
                        self.outbound_work(new_allocations)
                        self.allocatedtasks.update(new_allocations)
                if (self.job_status == "executing" and len(self.tasks) == 0
                        and len(self.allocatedtasks) == 0):
                    # finished all work
                    self.job_status = "waiting"
                    self.manager.finish()
                    self.manager = None
                    logger.info(
                        "%d jobs left, %d jobs committed, %d jobs complete, %d jobs failed."
                        % (len(self.tasks), len(self.allocatedtasks),
                           len(self.completedtasks), len(self.rejectedtasks)))
            if (self.job_status == "preparing"
                    and time.time() - self.prepare_start > 5):
                # over 5 seconds? quit it
                self.job_status = "waiting"
                if self.manager is not None:
                    self.manager.finish()
                    self.manager = None
                logger.info("Cancelled job in progress.")
        else:
            # no manager.
            if self.job_status != "waiting":
                self.job_status = "waiting"
                logger.info("Resolving inconsistent state.")
        self.last_check = time.time()

    # -------
    # Node management
    # -------
    def refresh_nodes(self, priority):
        """Forget known nodes and ask the broker's local nodes to raise hands."""
        self.nodes = {}
        self.priority = priority
        self.refreshed = time.time()
        msg = remap_utils.pack_msg("local.showhands.%s" % (self.jobid),
                                   {"priority": self.priority})
        self.forward_to_broker(msg)
class Initiator(Monitor):
    """Monitor that initiates jobs: resolves a broker over Bonjour, plans
    tasks via a per-jobtype manager plugin, distributes them over available
    nodes and tracks their progress/failures.

    NOTE(review): this class definition appeared twice in the original file;
    this later definition shadows the earlier one. Consider removing one copy.
    """

    def __init__(self, rootdir):
        Monitor.__init__(self, rootdir)
        self.remaproot = rootdir
        # Broker connection state; sockets are (re)built in setup_broker().
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None  # SUB socket: messages from the broker
        self.bpub = None  # PUB socket: messages to the broker
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()
        # Job bookkeeping.
        self.jobid = None
        self.refreshed = 0  # time of the last node-availability refresh
        self.job_status = "waiting"
        self.rejectedtasks = {}
        self.completedtasks = {}
        self.tasks = {}
        self.allocatedtasks = {}
        self.jobtype = "not_started"
        self.priority = 0
        self.parallellism = 1
        self.manager = None
        self.last_check = time.time()
        # NOTE(review): self.nodes is first assigned in refresh_nodes() /
        # update_hands(); presumably Monitor.__init__ provides it — confirm.

    def load_plugin(self, name):
        """Import and return the worker module for job type *name*.

        Raises:
            RemapException: when no ``module_<name>`` module exists.
        """
        try:
            return __import__("module_%s" % name)
        except ImportError as ie:
            # Chain the original ImportError for easier debugging.
            raise RemapException("No such worker type: %s" % (name)) from ie

    # -------
    # Broker handling
    # -------
    def setup_broker(self):
        """(Re)connect the SUB/PUB sockets to the current broker address."""
        self.brokerChanged = False
        if self.bsub is not None:
            self.bsub.close()
            self.bsub = None
            self.apply_timeouts()
        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return
        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address))
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "global")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "local")
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "notlocal")
        if self.jobid is not None:
            # Re-subscribe to the running job's channel after a reconnect.
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "tracker")
        self.apply_timeouts()
        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address))
        logger.info("Broker setup complete")

    def apply_timeouts(self):
        """Set a short receive timeout so bsub.recv() never blocks for long."""
        if self.bsub is not None:
            rcv_timeout = 100  # milliseconds
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, rcv_timeout)

    def cb_broker_changed(self, broker_address):
        """Bonjour callback: record the new broker address.

        The sockets are rebuilt lazily by process_broker_messages().
        """
        logger.info("Received new broker address: %s" % (broker_address))
        self.broker_address = broker_address
        self.brokerChanged = True

    def forward_to_broker(self, msg):
        """Best-effort publish of *msg* to the broker; send errors are ignored."""
        if self.bpub is not None:
            try:
                self.bpub.send(msg)
            except nn.NanoMsgAPIError:
                # Deliberate best-effort: dropping a message here is acceptable.
                pass

    def process_broker_messages(self):
        """Process at most one pending broker message.

        Returns:
            bool: True when a message was handled, False otherwise.
        """
        if self.bsub is None:
            # No broker is known yet.
            if self.brokerChanged:
                logger.info("The broker configuration changed.")
                self.setup_broker()
                if self.bsub is None:
                    logger.info("Failed broker setup.")
                    return False
            else:
                return False
        try:
            # Grab next msg from broker if any
            msg = self.bsub.recv()
            if msg is not None and len(msg) > 0:
                msgprefix, data = remap_utils.unpack_msg(msg)
                recipientid, msgtype, senderid = remap_utils.split_prefix(
                    msgprefix)
                if msgtype == "complete":
                    self.update_corecomplete(recipientid, senderid, data)
                if msgtype == "corestatus":
                    self.update_corestatus(recipientid, senderid, data)
                if msgtype == "raisehand":
                    self.update_hands(recipientid, senderid, data)
                return True
            else:
                return False
        except nn.NanoMsgAPIError:
            # recv timed out (RCVTIMEO applied above) or failed; nothing to do.
            return False

    # -------
    # Messaging handling
    # -------
    def update_corestatus(self, recipientid, senderid, data):
        """A core reported liveness: push its failure deadline 7s into the future."""
        if self.manager is not None:
            key = self.manager.get_work_key(data)
            if key in self.allocatedtasks:
                job = self.allocatedtasks[key]
                job["ts_finish"] = time.time() + 7

    def update_corecomplete(self, recipientid, senderid, data):
        """A core finished a task: move it from tasks/allocated to completed."""
        if self.manager is not None:
            key = self.manager.get_work_key(data)
            logger.info("Job %s completed." % (key))
            if key in self.allocatedtasks:
                task = self.tasks[key]
                self.completedtasks[key] = task
                del self.tasks[key]
                del self.allocatedtasks[key]
                logger.info(
                    "%d tasks left, %d tasks committed, %d tasks complete, %d tasks failed."
                    % (len(self.tasks), len(self.allocatedtasks),
                       len(self.completedtasks), len(self.rejectedtasks)))

    def update_hands(self, recipientid, senderid, data):
        """A node advertised its available capacity."""
        # "%s.raisehand.%s"%( senderid, self.nodeid ), {"cores":3,"interruptable":0}
        if senderid in self.nodes:
            self.nodes[senderid]["avail"] = data
        else:
            self.nodes[senderid] = {}
            self.nodes[senderid]["avail"] = data

    # -------
    # Job management
    # -------
    def start_job(self, jobdata):
        """Validate *jobdata*, create the manager plugin and kick off the job.

        Returns:
            dict: {"jobid": <new job id>}.

        Raises:
            RemapException: on a job already in progress or missing/invalid
                job parameters.
        """
        if self.job_status != "waiting":
            raise RemapException(
                "A job is currently in progress on this monitor")
        if "type" not in jobdata:
            raise RemapException("Must have job type specified")
        if "priority" not in jobdata:
            raise RemapException("Must have priority specified")
        if "parallellism" not in jobdata:
            raise RemapException("Must have parallellism specified")
        self.job_status = "preparing"
        self.prepare_start = time.time()
        self.jobtype = jobdata["type"]
        self.priority = jobdata["priority"]
        self.parallellism = jobdata["parallellism"]
        plugin = self.load_plugin(self.jobtype)
        self.rejectedtasks = {}
        self.completedtasks = {}
        if self.jobid is not None:
            # Stop listening to the previous job's channel.
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, self.jobid)
        if "jobid" in jobdata:
            self.jobid = jobdata["jobid"]
            del jobdata["jobid"]
        else:
            self.jobid = remap_utils.unique_id()
        self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, self.jobid)
        if "app" not in jobdata:
            raise RemapException("The name of the app must be provided")
        if jobdata["app"] not in self.list_apps():
            raise RemapException("No such application: %s" % (jobdata["app"]))
        config = {"jobid": self.jobid, "remaproot": self.remaproot}
        logger.info("Started a new job: %s" % (self.jobid))
        self.manager = plugin.create_manager(jobdata, config)
        if (time.time() - self.refreshed) > 60:
            # Not refreshed > 60s
            self.refresh_nodes(self.priority)
            # Wait for a bunch of nodes to advertise themselves
            r = Timer(1.0, self.resume, ())
            r.start()
        else:
            self.resume()
        return {"jobid": self.jobid}

    def resume(self):
        """Plan the job's tasks and distribute them over the known nodes.

        Raises:
            RemapException: when the manager requires all tasks to start at
                once and not enough cores are available.
        """
        self.manager.prepare()
        logger.info("Starting a %s job" % (self.jobtype))
        self.planner = JobPlanner(self.manager.config_file)
        self.tasks = self.manager.plan_jobs(self.planner)
        logger.info("Found %d tasks to execute" % (len(self.tasks)))
        numnodes, self.allocatedtasks = self.planner.distribute_jobs_over_nodes(
            self.tasks, {}, self.nodes, self.parallellism)
        if len(self.allocatedtasks) == 0:
            logger.error("No nodes found to distribute the tasks.")
            self.job_status = "waiting"
            return
        if self.manager.all_hands_on_deck():
            # Partial allocation is fatal for all-at-once jobs.
            if len(self.allocatedtasks) != len(self.tasks):
                raise RemapException(
                    "Not enough cores available. Have %d, need %d."
                    % (len(self.allocatedtasks), len(self.tasks)))
        logger.info("%d new tasks distributed over %d nodes."
                    % (len(self.allocatedtasks), numnodes))
        self.job_status = "executing"
        self.outbound_work(self.allocatedtasks)

    # In outbound work we update our local "jobs" data with timestamps
    # when they were sent out and send the task data to nodes.
    def outbound_work(self, jobs):
        """Group *jobs* per node, stamp their deadlines and publish them."""
        nodes = {}
        for key, job in jobs.items():
            nodeid = job["nodeid"]
            # BUGFIX: stamp every job. The original only stamped the first
            # job per node, so later jobs on the same node had no ts_finish
            # for check_progress to inspect.
            job["ts_start"] = time.time()
            job["ts_finish"] = time.time() + 7
            if nodeid in nodes:
                nodes[nodeid]["cores"].append(job["jobdata"])
            else:
                tasks = {}
                tasklist = []
                tasklist.append(job["jobdata"])
                tasks["cores"] = tasklist
                tasks["priority"] = self.priority
                nodes[nodeid] = tasks
        for nodeid, tasks in nodes.items():
            msg = remap_utils.pack_msg("%s.jobstart.%s" % (nodeid, self.jobid),
                                       tasks)
            self.forward_to_broker(msg)

    def check_progress(self):
        """Periodic tick: detect dead tasks, reattempt or reject them,
        hand out new allocations and finish/cancel the job when appropriate."""
        if self.manager is not None:
            if self.manager.module_tracks_progress():
                # The plugin tracks its own progress; just poll it.
                if not self.manager.check_progress(len(self.tasks)):
                    self.manager.finish()
                    self.manager = None
                    self.job_status = "waiting"
                    logger.info("Vertex job complete")
            else:
                if time.time() - self.last_check <= 4:
                    return
                newtime = time.time()
                kill_list = []
                for key, job in self.allocatedtasks.items():
                    if newtime > job["ts_finish"]:
                        # This job hasn't been updated, probably dead.
                        # Update tasks with an attempt + 1
                        task = self.tasks[key]
                        task["attempts"] = task["attempts"] + 1
                        nodeid = job["nodeid"]
                        logger.info(
                            "Task %s failed on node %s. Reattempting elsewhere"
                            % (key, nodeid))
                        if task["attempts"] > 4:
                            # 5 attempts so far. let's cancel it.
                            logger.warning(
                                "Task %s failed 5 attempts. Cancelling file to reject."
                                % (key))
                            del self.tasks[key]
                            kill_list.append(key)
                            self.rejectedtasks[key] = task
                # Deletions deferred to avoid mutating the dict while iterating.
                for key in kill_list:
                    del self.allocatedtasks[key]
                # Now also check if there are jobs that can be started
                if len(self.tasks) > 0:
                    numnodes, new_allocations = self.planner.distribute_jobs_over_nodes(
                        self.tasks, self.allocatedtasks, self.nodes,
                        self.parallellism)
                    if numnodes > 0:
                        logger.info("%d new tasks distributed over %d nodes"
                                    % (len(new_allocations), numnodes))
                        self.outbound_work(new_allocations)
                        self.allocatedtasks.update(new_allocations)
                if (self.job_status == "executing" and len(self.tasks) == 0
                        and len(self.allocatedtasks) == 0):
                    # finished all work
                    self.job_status = "waiting"
                    self.manager.finish()
                    self.manager = None
                    logger.info(
                        "%d jobs left, %d jobs committed, %d jobs complete, %d jobs failed."
                        % (len(self.tasks), len(self.allocatedtasks),
                           len(self.completedtasks), len(self.rejectedtasks)))
            if (self.job_status == "preparing"
                    and time.time() - self.prepare_start > 5):
                # over 5 seconds? quit it
                self.job_status = "waiting"
                if self.manager is not None:
                    self.manager.finish()
                    self.manager = None
                logger.info("Cancelled job in progress.")
        else:
            # no manager.
            if self.job_status != "waiting":
                self.job_status = "waiting"
                logger.info("Resolving inconsistent state.")
        self.last_check = time.time()

    # -------
    # Node management
    # -------
    def refresh_nodes(self, priority):
        """Forget known nodes and ask the broker's local nodes to raise hands."""
        self.nodes = {}
        self.priority = priority
        self.refreshed = time.time()
        msg = remap_utils.pack_msg("local.showhands.%s" % (self.jobid),
                                   {"priority": self.priority})
        self.forward_to_broker(msg)