class DADispatcher(DAServer):
    """
    Service for job status bookkeeping.
    Protocol:
      SRV: send READY
      CLT: send START, EXIT, or DONE
    """

    # All states a job can be in; used as wire-protocol messages and as
    # status-file suffixes in printStatusWeb.
    STATES = ["UNKNOWN", "CREATED", "PENDING", "RUNNING", "DONE", "FAILED", "EXITED"]
    # Supported submission back ends.
    CLUSTERS = ["lsf", "interactive", "local"]
    # Cluster to retry on when a job pends too long; "" means no fallback.
    FALLBACK = {"lsf": "interactive", "interactive": "", "local": ""}
    # Per-cluster cap on concurrently active jobs.
    MAXACTIVE = {"lsf": 40, "interactive": 10, "local": 20}

    class JobInfo(object):
        """
        Struct for job information.
        """

        def __init__(self, name_):
            self.name = name_
            # Random 4-letter key used to authenticate the remote job's
            # connections back to this server.
            self.key = string.join(random.sample(string.ascii_lowercase, 4), "")
            self.cluster = ""
            # Cluster-dependent handle: LSF job ID string, Terminal object,
            # or subprocess.Popen object (see submit()).
            self.proc = None
            self.node = ""
            self.state = "CREATED"
            # Timestamp of last heartbeat received from the job.
            self.lastHB = 0

    def __init__(self, workspace_, resubmit=False, terminal=None):
        """
        Initialize the dispatcher over the given workspace directory.
        If resubmit is True, FAILED/EXITED jobs are queued again.
        An existing Terminal may be passed; otherwise one is opened on TERMNODE.
        """
        DAServer.__init__(self, "dispatch")

        self._workspace = workspace_
        self._webDir = ""
        # jobName -> JobInfo for every job known to the dispatcher.
        self._jobInfo = {}
        self._resubmit = resubmit
        # Per-cluster queues of jobs waiting to be submitted / currently active.
        self._readyJobs = dict([(k, []) for k in DADispatcher.CLUSTERS])
        self._activeJobs = dict([(k, []) for k in DADispatcher.CLUSTERS])
        # Protects the job queues and JobInfo state transitions.
        self._lock = threading.Lock()
        # Signaled whenever any job changes state; dispatch() waits on it.
        self._stateChanged = threading.Event()
        if terminal:
            self._terminal = terminal
        else:
            self._terminal = Terminal(TERMNODE, verbose=True)

        # Extra submission options per cluster (e.g. bsub flags for "lsf").
        self.options = dict([(k, "") for k in DADispatcher.CLUSTERS])

    def __del__(self):
        # Best-effort cleanup: kill everything still active on any cluster.
        for cluster in DADispatcher.CLUSTERS:
            for jobInfo in self._activeJobs[cluster]:
                self.kill(jobInfo)

    def authenticate(self, jobName_, key_):
        """Return True iff key_ matches the key issued to jobName_."""
        try:
            return key_ == self._jobInfo[jobName_].key
        except:
            # Unknown job name (KeyError) or any other failure -> reject.
            return False

    def canServe(self, jobName_):
        """Return 1 if jobName_ is a known job, -1 otherwise (DAServer protocol)."""
        if jobName_ in self._jobInfo:
            return 1
        else:
            return -1

    def serve(self, request_, jobName_):
        """
        Handle one message from a running job: either a heartbeat ("HB")
        or a state-change notification (one of STATES). Updates bookkeeping
        and wakes the dispatch loop.
        """
        jobInfo = self._jobInfo[jobName_]
        # Any contact counts as a heartbeat.
        jobInfo.lastHB = time.time()

        try:
            response = request_.recv(1024)
            request_.send("OK")
            if response == "HB":
                self.log("Heart beat from", jobName_)
                return
            elif response in DADispatcher.STATES:
                self.log("Set state", jobName_, response)
            else:
                raise Exception()
        except:
            # Garbled / unrecognized message: record state as UNKNOWN.
            response = "UNKNOWN"

        with self._lock:
            try:
                jobInfo.state = response
                finished = False
                if jobInfo.state == "DONE":
                    self._activeJobs[jobInfo.cluster].remove(jobInfo)
                    finished = True
                elif jobInfo.state == "FAILED":
                    self._activeJobs[jobInfo.cluster].remove(jobInfo)
                    if self._resubmit:
                        # Re-queue failed job on the same cluster.
                        self._readyJobs[jobInfo.cluster].append(jobInfo)
                    finished = True

                if finished:
                    # Release the cluster-specific process handle.
                    if jobInfo.cluster == "interactive":
                        jobInfo.proc.close()
                    elif jobInfo.cluster == "local":
                        # Reap the subprocess to avoid a zombie.
                        jobInfo.proc.communicate()
            except:
                self.log("Exception while serving", jobName_, "\n", excDump())

        if jobInfo.state == "FAILED":
            # Touch an empty .fail marker file for this job.
            with open(self._workspace + "/logs/" + jobName_ + ".fail", "w") as failLog:
                pass

        self._stateChanged.set()

    def createJob(self, jobName_, cluster_, append=True):
        """
        Register a new JobInfo for jobName_ on cluster_. When append is True
        the job is also queued for submission. Returns the JobInfo.
        """
        jobInfo = DADispatcher.JobInfo(jobName_)
        jobInfo.cluster = cluster_
        self._jobInfo[jobName_] = jobInfo
        if append:
            self._readyJobs[cluster_].append(jobInfo)
        if DEBUG:
            self.log("Created", jobName_)
        return jobInfo

    def submitOne(self, cluster, logdir=""):
        """
        Pop one ready job for the given cluster and submit it, respecting
        MAXACTIVE. Returns True on successful submission, False otherwise
        (queue empty, cluster full, or submission failure - in which case
        the job is put back on the ready queue).
        """
        # Unlocked pre-check; the pop below is done under the lock.
        if len(self._activeJobs[cluster]) >= DADispatcher.MAXACTIVE[cluster] or len(self._readyJobs[cluster]) == 0:
            return False

        with self._lock:
            try:
                jobInfo = self._readyJobs[cluster].pop(0)
            except IndexError:
                # Queue was drained by another thread between check and pop.
                return False

        if DEBUG:
            self.log("submit", jobInfo.name)

        if self.submit(jobInfo, logdir):
            with self._lock:
                self._activeJobs[cluster].append(jobInfo)
            return True
        else:
            # Submission failed: return the job to the ready queue.
            with self._lock:
                self._readyJobs[cluster].append(jobInfo)
            return False

    def submit(self, jobInfo_, logdir=""):
        """
        Actually launch the job on its cluster.
        - lsf: bsub via the shared terminal; proc holds the LSF job ID.
        - interactive: open a Terminal (optionally load-balanced across
          addresses reported by `host TERMNODE`); proc holds the Terminal.
        - local: subprocess on this machine; proc holds the Popen object.
        Returns True on success, False on any failure.
        """
        self.log("Submitting job ", jobInfo_.name)

        if not logdir:
            logdir = self._workspace + "/logs"

        try:
            if jobInfo_.cluster == "lsf":
                command = "bsub -J {jobName} -o {log} -cwd '$TMPDIR' {options} 'source {environment};darun.py {workspace} {jobName} {key}'".format(
                    jobName=jobInfo_.name,
                    log=logdir + "/" + jobInfo_.name + ".log",
                    options=self.options["lsf"],
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    key=jobInfo_.key,
                )

                self.log(command)

                bsubout = self._terminal.communicate(command)

                success = False
                if len(bsubout) != 0 and "submitted" in bsubout[0]:
                    # bsub reports "Job <ID> is submitted ..."; extract the ID.
                    matches = re.search("<([0-9]+)>", bsubout[0])
                    if matches:
                        success = True

                if not success:
                    self.log("bsub failed")
                    raise Exception

                self.log("lxbatch job ID for {0} is {1}".format(jobInfo_.name, matches.group(1)))

                proc = matches.group(1)
                node = ""

            elif jobInfo_.cluster == "interactive":
                node = TERMNODE

                if LOADBALANCE:
                    # Resolve TERMNODE and pick an address with no open
                    # terminal yet, to spread interactive jobs across hosts.
                    hostProc = subprocess.Popen(["host", TERMNODE], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out, err = hostProc.communicate()
                    for line in out.split("\n"):
                        if "has address" in line:
                            addr = line.split()[3]
                            for term in Terminal.OPENTERMS:
                                if term.addr == addr:
                                    break
                            else:
                                # No open terminal on this address: use it.
                                node = addr
                                break

                command = "cd $TMPDIR;source {environment};darun.py -p {workspace} {jobName} {key} >> {log} 2>&1;exit".format(
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    jobName=jobInfo_.name,
                    key=jobInfo_.key,
                    log=logdir + "/" + jobInfo_.name + ".log",
                )

                self.log(node + ":", command)

                term = Terminal(node)
                term.write(command)

                self.log("Command issued to", term.node)

                proc = term
                node = term.node

            elif jobInfo_.cluster == "local":
                command = "cd {tmpdir};source {environment};darun.py -p {workspace} {jobName} {key} >> {log} 2>&1".format(
                    tmpdir=TMPDIR,
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    jobName=jobInfo_.name,
                    key=jobInfo_.key,
                    log=logdir + "/" + jobInfo_.name + ".log",
                )

                self.log(command)

                proc = subprocess.Popen(
                    command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
                )  # stdout will be redirected to a log file within the job

                self.log("Subprocess started")

                node = "localhost"

        except:
            # Any failure above means the job was not submitted.
            return False

        with self._lock:
            jobInfo_.proc = proc
            jobInfo_.state = "PENDING"
            jobInfo_.node = node
            jobInfo_.lastHB = time.time()

        self._stateChanged.set()
        return True

    def kill(self, jobInfo_):
        """
        Terminate the job's cluster-specific process and forget the job
        (removes its entry from self._jobInfo).
        """
        if jobInfo_.cluster == "lsf":
            # proc is the LSF job ID string here.
            response = self._terminal.communicate("bkill {0}".format(jobInfo_.proc))
            for line in response:
                self.log(line)
        elif jobInfo_.cluster == "interactive":
            if jobInfo_.proc.isOpen():
                jobInfo_.proc.close(force=True)
        elif jobInfo_.cluster == "local":
            if jobInfo_.proc.poll() is None:
                jobInfo_.proc.terminate()

        try:
            self._jobInfo.pop(jobInfo_.name)
        except:
            self.log("Exception while trying to remove", jobInfo_.name)

    def dispatch(self, logdir=""):
        """
        Main loop: keep submitting ready jobs until no job is ready or
        active. Every time 60 seconds pass with no state change, probe for
        stale jobs (missing heartbeats, jobs vanished from LSF) and handle
        them: mark EXITED, optionally resubmit, and move long-pending jobs
        to their FALLBACK cluster. Also runs the status monitor thread.
        """
        monitorTerminate = threading.Event()
        monitorThread = threading.Thread(target=self.monitor, args=(monitorTerminate,), name="monitor")
        monitorThread.daemon = True
        monitorThread.start()

        while True:
            # Submit as long as something gets through; only fall into the
            # wait/stale-check below when nothing could be submitted.
            submitted = False
            for cluster in DADispatcher.CLUSTERS:
                if self.submitOne(cluster, logdir):
                    submitted = True

            if submitted:
                continue

            with self._lock:
                nReady = reduce(lambda x, y: x + y, map(len, self._readyJobs.values()))
                nActive = reduce(lambda x, y: x + y, map(len, self._activeJobs.values()))

            if nReady == 0 and nActive == 0:
                # All jobs accounted for: leave the dispatch loop.
                break

            self._stateChanged.wait(60.0)

            if not self._stateChanged.isSet():  # timeout
                self.log("No job changed state in the last 60 seconds. Now checking for stale jobs..")

                exited = []    # jobs considered dead -> mark EXITED
                longpend = []  # jobs pending too long -> candidate for fallback

                if len(self._activeJobs["lsf"]) != 0:
                    # Ask LSF which of our jobs it still knows about.
                    lsfNodes = {}
                    try:
                        # Skip the bjobs header line.
                        response = self._terminal.communicate("bjobs")[1:]
                        if DEBUG:
                            print "bjobs", response
                        for line in response:
                            id = line.split()[0]
                            node = line.split()[5]
                            lsfNodes[id] = node
                    except:
                        self.log("Job status query failed")

                    with self._lock:
                        for jobInfo in self._activeJobs["lsf"]:
                            # Two different tasks - if id is in the list of ids, set the node name.
                            # If not, the job may have exited abnormally - check state.
                            if jobInfo.proc in lsfNodes:
                                if not jobInfo.node:
                                    jobInfo.node = lsfNodes[jobInfo.proc]
                                if jobInfo.lastHB < time.time() - 120:
                                    self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                                    if jobInfo.state == "PENDING":
                                        longpend.append(jobInfo)
                                    elif jobInfo.state == "RUNNING":
                                        exited.append(jobInfo)
                            else:
                                self.log(jobInfo.name, "disappeared from LSF job list")
                                exited.append(jobInfo)

                for jobInfo in self._activeJobs["interactive"]:
                    # Closed terminal means the job is gone.
                    if not jobInfo.proc.isOpen():
                        exited.append(jobInfo)
                    elif jobInfo.lastHB < time.time() - 120:
                        self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                        if jobInfo.state == "PENDING":
                            longpend.append(jobInfo)
                        elif jobInfo.state == "RUNNING":
                            exited.append(jobInfo)

                for jobInfo in self._activeJobs["local"]:
                    # poll() is not None means the subprocess terminated.
                    if jobInfo.proc.poll() is not None:
                        exited.append(jobInfo)
                    elif jobInfo.lastHB < time.time() - 120:
                        self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                        if jobInfo.state == "PENDING":
                            longpend.append(jobInfo)
                        elif jobInfo.state == "RUNNING":
                            exited.append(jobInfo)

                for jobInfo in exited:
                    self.log("Set state", jobInfo.name, "EXITED")
                    self.kill(jobInfo)  # removes from self._jobInfo
                    jobInfo.state = "EXITED"
                    self._activeJobs[jobInfo.cluster].remove(jobInfo)

                for jobInfo in exited:
                    # Touch an empty .fail marker, as in serve() for FAILED jobs.
                    with open(self._workspace + "/logs/" + jobInfo.name + ".fail", "w") as failLog:
                        pass

                resubmit = []
                if self._resubmit:
                    resubmit += exited

                # Free slots per cluster, used to decide fallback moves.
                available = dict(
                    [(c, DADispatcher.MAXACTIVE[c] - len(self._activeJobs[c])) for c in DADispatcher.CLUSTERS]
                )

                for jobInfo in longpend:
                    fallback = DADispatcher.FALLBACK[jobInfo.cluster]
                    if fallback and available[fallback] > 0:
                        # This job did not start in time and there is a space in the fallback queue
                        with self._lock:
                            self.kill(jobInfo)  # removes from self._jobInfo
                            jobInfo.state = "EXITED"
                            self._activeJobs[jobInfo.cluster].remove(jobInfo)
                        jobInfo.cluster = fallback
                        resubmit.append(jobInfo)
                        available[fallback] -= 1

                for jobInfo in resubmit:
                    # Fresh JobInfo (new key) for the resubmission; not queued,
                    # because we submit it directly here.
                    newJobInfo = self.createJob(jobInfo.name, jobInfo.cluster, append=False)
                    if self.submit(newJobInfo, logdir):
                        with self._lock:
                            self._activeJobs[jobInfo.cluster].append(newJobInfo)
                    else:
                        with self._lock:
                            self._readyJobs[jobInfo.cluster].append(newJobInfo)

                self._stateChanged.set()
                time.sleep(1)  # allow the monitor thread to catch up

            self._stateChanged.clear()

        # No more jobs: shut the monitor thread down and wait for it.
        monitorTerminate.set()
        self._stateChanged.set()
        monitorThread.join()

    def monitor(self, _terminate):
        """
        Background thread: refresh the terminal status line on every state
        change (or every 10 s), and the web status pages at most once per
        minute, until _terminate is set.
        """
        self.printStatus()
        self.printStatusWeb()
        lastWebUpdate = time.time()
        while True:
            self._stateChanged.wait(10.0)
            if _terminate.isSet():
                break
            self.printStatus()
            if time.time() > lastWebUpdate + 60.0:
                self.printStatusWeb()
                lastWebUpdate = time.time()

    def countJobs(self):
        """Return a dict mapping each state in STATES to the number of jobs in it."""
        jobCounts = dict((key, 0) for key in DADispatcher.STATES)
        with self._lock:
            for jobInfo in self._jobInfo.values():
                jobCounts[jobInfo.state] += 1
        return jobCounts

    def setWebDir(self, dir_):
        """Set (and create, if needed) the directory for web status output."""
        self._webDir = dir_
        try:
            os.mkdir(self._webDir)
        except OSError:
            # Directory already exists (or cannot be created) - keep going.
            pass

    def printStatus(self):
        """Rewrite the in-place terminal status line with per-state job counts."""
        jobCounts = self.countJobs()

        line = ""
        for state in DADispatcher.STATES:
            line += " {state}: {n}".format(state=state, n=jobCounts[state])
        # \r to overwrite the previous status line; trailing blanks clear leftovers.
        line = "\r" + line
        line += " " * 10
        sys.stdout.write(line)
        if DEBUG:
            sys.stdout.write("\n")
        sys.stdout.flush()

    def printStatusWeb(self, copyLogs=False):
        """
        Refresh the web status area: optionally mirror log files, write a
        one-time summary.dat, and maintain one "<job>.<STATE>" marker file
        per job under <webDir>/status, renamed as states change.
        """
        if not self._webDir:
            return

        if copyLogs:
            logDir = self._webDir + "/logs"
            if not os.path.exists(logDir):
                os.mkdir(logDir)
            for fileName in os.listdir(self._workspace + "/logs"):
                sourceName = self._workspace + "/logs/" + fileName
                destName = logDir + "/" + fileName
                # Copy only new or updated files.
                if not os.path.exists(destName) or os.stat(sourceName).st_mtime > os.stat(destName).st_mtime:
                    shutil.copy(sourceName, destName)

        allJobs = self._jobInfo.keys()

        summaryName = self._webDir + "/summary.dat"
        if not os.path.exists(summaryName):
            with open(summaryName, "w") as summaryFile:
                # add more info in the future?
                summaryFile.write("workspace = " + self._workspace)

        statusDir = self._webDir + "/status"

        if not os.path.exists(statusDir):
            # First time: seed one UNKNOWN marker file per known job.
            os.makedirs(statusDir)
            for job in allJobs:
                open(statusDir + "/" + job + ".UNKNOWN", "w").close()

        with self._lock:
            for statusFile in os.listdir(statusDir):
                jobName = statusFile[: statusFile.rfind(".")]
                if jobName not in self._jobInfo:
                    continue
                current = statusFile[statusFile.rfind(".") + 1 :]
                actual = self._jobInfo[jobName].state
                if current != actual:
                    # Rename the marker so its suffix reflects the current state.
                    os.rename(statusDir + "/" + statusFile, statusDir + "/" + jobName + "." + actual)
# Output / reducer configuration from the command-line options.
jobConfig["outputFile"] = options.outputFile.strip()
jobConfig["addSuffix"] = not options.noSuffix
jobConfig["reducer"] = options.reducer.strip()
jobConfig["maxSize"] = options.maxSize

# A reducer needs somewhere to write its merged output.
if jobConfig["reducer"] != "None" and not jobConfig["outputFile"]:
    raise RuntimeError("Reducer requires output file name specification")

### OPEN WORKSPACE ###

os.mkdir(workspace)
os.mkdir(workspace + "/inputs")
os.mkdir(workspace + "/logs")

if options.environment.strip():
    # Run the user-supplied environment setup command remotely and record
    # its output so jobs can source it later.
    cmds = terminal.communicate(options.environment.strip())
    with open(workspace + "/environment", "w") as envFile:
        for cmd in cmds:
            envFile.write(cmd + "\n")

print "Using {0} as workspace".format(workspace)

### RUNTIME-SPECIFIC CONFIGURATIONS ###

jobConfig["taskID"] = taskID
jobConfig["serverHost"] = os.environ["HOSTNAME"]
jobConfig["serverPort"] = tcpServer.server_address[1]
jobConfig["serverWorkDir"] = TMPDIR + "/" + taskID
jobConfig["logDir"] = HTMLDIR + "/" + taskID + "/logs"  # In principle log directory can be anywhere; we are choosing it to be directly in the HTMLDIR for convenience