def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration: %s, rescue jobs frequency: %s, max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed, reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running job ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"])
    assert idealJobTime > 0.0
    
    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
        jobsToIssue = []
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")
            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    updatedJobFiles.remove(job.attrib["file"])
                    
                    #Reset the log files for the job.
open(job.attrib["slave_log_file"], 'w').close() open(job.attrib["log_file"], 'w').close() job.attrib["colour"] = "grey" #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued #issueJobs([ job ], jobIDsToJobsHash, batchSystem) jobsToIssue.append(job) else: logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash))) elif job.attrib["colour"] == "black": #Job has finished okay logger.debug("Job: %s has finished okay" % job.attrib["file"]) #Deal with stats if stats: system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"])) open(job.attrib["stats"], 'w').close() #Reset the stats file if job.find("messages") != None: for message in job.find("messages").findall("message"): logger.info("Received the following message from job: %s" % message.attrib["message"]) job.remove(job.find("messages")) childCount = int(job.attrib["child_count"]) blackChildCount = int(job.attrib["black_child_count"]) assert childCount == blackChildCount #Has no currently running child jobs #Launch any unborn children unbornChildren = job.find("children") unbornChild = unbornChildren.find("child") if unbornChild != None: #We must give birth to the unborn children logger.debug("Job: %s has children to schedule" % job.attrib["file"]) newChildren = [] while unbornChild != None: cummulativeChildTime = float(unbornChild.attrib["time"]) newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config) totalJobFiles += 1 updatedJobFiles.add(newJob.attrib["file"]) newChildren.append(newJob) unbornChildren.remove(unbornChild) unbornChild = unbornChildren.find("child") #This was code to aggregate groups of children into one job, but we don't do this now #while cummulativeChildTime < idealJobTime and unbornChild != None: #Cummulate a series of children into each job as a stack of jobs (to balance cost of parellelism with cost of running stuff in serially). # cummulativeChildTime += float(unbornChild.attrib["time"]) # ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy()) # unbornChildren.remove(unbornChild) # unbornChild = unbornChildren.find("child") newJob.attrib["total_time"] = str(cummulativeChildTime) updatedJobFiles.remove(job.attrib["file"]) job.attrib["child_count"] = str(childCount + len(newChildren)) job.attrib["colour"] = "blue" #Blue - has children running. 
                    writeJobs([ job ] + newChildren) #Check point
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetryCount = int(job.attrib["remaining_retry_count"])
                if remainingRetryCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetryCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    assert remainingRetryCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.
        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing jobs, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)
        
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and their statuses
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID):
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobIDsToJobsHash[jobID])
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID)
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")
    
    return totalJobFiles #Returns number of failed jobs
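
#For reference, a hedged summary (not part of the original module) of the job
#"colour" lifecycle that mainLoop above and the slave code below walk each job
#file through. The transitions are inferred from the branches in this file;
#nothing beyond them is assumed.
#
#  white -> grey   (master issues the job to the batch system)
#  grey  -> black  (slave ran the command successfully)
#        -> red    (slave ran the command and it failed)
#  black -> blue   (master schedules the job's unborn children)
#        -> white  (master re-issues the job's next follow-on)
#        -> grey   (slave chains the next follow-on itself, see main() below)
#        -> dead   (no children and no follow-ons remain)
#  blue  -> black  (all children have finished)
#  red   -> white  (retries remain)
#  dead is terminal: the master deletes the job file.
#
#This hypothetical helper just checks a proposed transition against that table:
_JOB_COLOUR_TRANSITIONS = {
    "white": ("grey",),
    "grey": ("black", "red"),
    "black": ("blue", "white", "grey", "dead"),
    "blue": ("black",),
    "red": ("white",),
    "dead": (),
}

def isValidColourTransition(oldColour, newColour):
    """Returns True if the transition appears in the inferred lifecycle table."""
    return newColour in _JOB_COLOUR_TRANSITIONS.get(oldColour, ())
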
def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats):
    """Runs a job.
    """
    from jobTree.src.bioio import getTempFile
    from jobTree.src.bioio import getTempDirectory
    from jobTree.src.bioio import logger
    from jobTree.src.bioio import system
    from jobTree.src.bioio import getTotalCpuTime
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    
    #Copy the job file to be edited
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log level for the job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]
    
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    
    #Temp file dirs for job.
    localTempDir = getTempDirectory(rootDir=localSlaveTempDir)
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this tells the running job how much
    #cpu and memory it has, according to the batch system)
    tempJob.attrib["available_memory"] = str(memoryAvailable)
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    if stats != None:
        tempJob.attrib["stats"] = getTempFile(rootDir=localSlaveTempDir)
        os.remove(tempJob.attrib["stats"])
    
    #Now write the temp job file
    tempFile = getTempFile(rootDir=localSlaveTempDir)
    fileHandle = open(tempFile, 'w')
    tree = ET.ElementTree(tempJob)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Copied the job files ready for the job")
    
    if "JOB_FILE" not in command:
        logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
        job.attrib["colour"] = "red" #Update the colour
    else:
        #First load the environment for the job.
        fileHandle = open(job.attrib["environment_file"], 'r')
        environment = cPickle.load(fileHandle)
        fileHandle.close()
        logger.info("Loaded the environment for the process")
        
        #Run the actual command
        tempLogFile = getTempFile(suffix=".log", rootDir=localSlaveTempDir)
        fileHandle = open(tempLogFile, 'w')
        finalCommand = command.replace("JOB_FILE", tempFile)
        if stats != None:
            startTime = time.time()
            startClock = getTotalCpuTime()
        process = subprocess.Popen(finalCommand, shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
        
        #Copy across the log file
        system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
        i = sts[1]
        logger.info("Ran the job command=%s with exit status %i" % (finalCommand, i))
        
        if i == 0:
            logger.info("Passed the job, okay")
            
            if stats != None:
                jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime),
                                                       "clock":str(getTotalCpuTime() - startClock) })
                if os.path.exists(tempJob.attrib["stats"]):
                    jobTag.append(ET.parse(tempJob.attrib["stats"]).getroot())
            
            tempJob = ET.parse(tempFile).getroot()
            job.attrib["colour"] = "black" #Update the colour
            
            #Deal with any logging messages directed at the master
            if tempJob.find("messages") != None:
                messages = job.find("messages")
                if messages == None:
                    messages = ET.SubElement(job, "messages")
                for messageTag in tempJob.find("messages").findall("message"):
                    messages.append(messageTag)
            
            #Update the runtime of the stack..
            totalRuntime = float(job.attrib["total_time"]) #This is the estimated runtime of the jobs on the followOn stack
            runtime = float(jobToRun.attrib["time"])
            totalRuntime -= runtime
            if totalRuntime < 0.0:
                totalRuntime = 0.0
            
            #The children
            children = job.find("children")
            assert len(children.findall("child")) == 0
            assert tempJob.find("children") != None
            for child in tempJob.find("children").findall("child"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
                ET.SubElement(children, "child", { "command":child.attrib["command"],
                                                   "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                logger.info("Making a child with command: %s" % (child.attrib["command"]))
            
            #The follow on command
            followOns = job.find("followOns")
            followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
            if tempJob.attrib.has_key("command"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
                ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"],
                                                       "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                ##Add the runtime to the total runtime..
                totalRuntime += compTime
                logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
            elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
                ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
                logger.info("Making a stub follow on job")
            
            #Write back the runtime, after adding the follow on time and subtracting the time of the run job.
            job.attrib["total_time"] = str(totalRuntime)
        else:
            logger.info("Failed the job")
            job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s" % (localSlaveTempDir))
    logger.info("Cleaned up by removing the temp job file (the copy) and the temporary file directory for the job")
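
#A minimal sketch (not part of the original module) of the JOB_FILE contract that
#processJob enforces above: the command string must contain the literal token
#JOB_FILE, which is replaced with the path of the temporary job file before the
#command is run; commands without the token are marked red without being run.
#The command and path in the example are made up.

def substituteJobFile(command, jobFilePath):
    """Mirrors the command.replace("JOB_FILE", tempFile) call in processJob."""
    assert "JOB_FILE" in command, "processJob marks such a job red instead of running it"
    return command.replace("JOB_FILE", jobFilePath)

#Example: substituteJobFile("python slave.py --job JOB_FILE", "/tmp/t0/job.xml")
#returns "python slave.py --job /tmp/t0/job.xml".
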
def execute(self, jobFile):
    job = ET.parse(jobFile).getroot()
    setLogLevel(job.attrib["log_level"])
    logger.info("Setup logging with level: %s" % job.attrib["log_level"])
    self.tempDirAccessed = False
    self.localTempDir = job.attrib["local_temp_dir"]
    self.globalTempDir = job.attrib["global_temp_dir"]
    maxTime = float(job.attrib["job_time"])
    memory = int(job.attrib["available_memory"])
    cpu = int(job.attrib["available_cpu"])
    
    if job.attrib.has_key("stats"):
        stats = ET.Element("stack")
        startTime = time.time()
        startClock = time.clock()
    else:
        stats = None
    
    newChildren = [] #List to add all the children to before we package them off into stacks
    newChildCommands = [] #Ditto for the child commands
    newFollowOns = [] #Ditto for the follow-ons
    while self.hasRemaining():
        if stats != None: #Getting the runtime of the stats module
            targetStartTime = time.time()
            targetStartClock = getTotalCpuTime()
        
        target = self.popTarget()
        target.setStack(self)
        #Debug check that we have the right amount of CPU and memory for the job in hand
        targetMemory = target.getMemory()
        if targetMemory != sys.maxint:
            assert targetMemory <= memory
        targetCpu = target.getCpu()
        if targetCpu != sys.maxint:
            assert targetCpu <= cpu
        
        #Run the target
        target.run()
        
        #Cleanup after the target
        if self.tempDirAccessed:
            system("rm -rf %s/*" % self.localTempDir)
            self.tempDirAccessed = False
        
        #Handle the follow on
        followOn = target.getFollowOn()
        #if target.__class__ != CleanupGlobalTempDirTarget and followOn == None:
        #    followOn = CleanupGlobalTempDirTarget()
        if followOn != None: #Target to get rid of follow on when done.
            if target.isGlobalTempDirSet():
                followOn.setGlobalTempDir(target.getGlobalTempDir())
            newFollowOns.append(followOn)
        
        #Now add the children to the newChildren stack
        newChildren += target.getChildren()
        
        #Now add the child commands to the newChildCommands stack
        newChildCommands += target.getChildCommands()
        
        if stats != None:
            ET.SubElement(stats, "target", { "time":str(time.time() - targetStartTime),
                                             "clock":str(getTotalCpuTime() - targetStartClock),
                                             "class":".".join((target.__class__.__name__,)),
                                             "e_time":str(target.getRunTime()) })
        
        for message in target.getMasterLoggingMessages():
            if job.find("messages") == None:
                ET.SubElement(job, "messages")
            ET.SubElement(job.find("messages"), "message", { "message": message })
    
    #######
    #Now build the new stacks and corresponding jobs
    #######
    
    #First add all the follow ons to the existing stack and make it a follow on job for job-tree
    assert not self.hasRemaining()
    
    #First sort out the follow on job
    if len(newFollowOns) > 0: #If we have follow ons
        followOnRuntime = sum([ followOn.getRunTime() for followOn in newFollowOns ])
        
        if followOnRuntime > maxTime: #We create a parallel list of follow ons
            followOnStack = Stack(ParallelFollowOnTarget(newFollowOns))
        else:
            followOnStack = Stack(newFollowOns.pop())
            while len(newFollowOns) > 0:
                followOnStack.addTarget(newFollowOns.pop())
        
        job.attrib["command"] = followOnStack.makeRunnable(self.globalTempDir)
        job.attrib["time"] = str(followOnStack.getRunTime())
        
        followOnMemory = followOnStack.getMemory()
        assert not job.attrib.has_key("memory")
        if followOnMemory != sys.maxint:
            job.attrib["memory"] = str(followOnMemory)
        assert not job.attrib.has_key("cpu")
        followOnCpu = followOnStack.getCpu()
        if followOnCpu != sys.maxint:
            job.attrib["cpu"] = str(followOnCpu)
    
    #Now build stacks of children..
    childrenTag = job.find("children")
    while len(newChildren) > 0:
        childStack = Stack(newChildren.pop())
        while len(newChildren) > 0 and childStack.getRunTime() <= maxTime:
            childStack.addTarget(newChildren.pop())
        childJob = ET.SubElement(childrenTag, "child", { "command":childStack.makeRunnable(self.globalTempDir),
                                                         "time":str(childStack.getRunTime()) })
        childMemory = childStack.getMemory()
        assert not childJob.attrib.has_key("memory")
        if childMemory != sys.maxint:
            childJob.attrib["memory"] = str(childMemory)
        assert not childJob.attrib.has_key("cpu")
        childCpu = childStack.getCpu()
        if childCpu != sys.maxint:
            childJob.attrib["cpu"] = str(childCpu)
    
    #Now build jobs for each child command
    for childCommand, runTime in newChildCommands:
        ET.SubElement(childrenTag, "child", { "command":str(childCommand),
                                              "time":str(runTime) })
    
    #Now write the updated job file
    fileHandle = open(jobFile, 'w')
    ET.ElementTree(job).write(fileHandle)
    fileHandle.close()
    
    #Finish up the stats
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()
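
#A simplified sketch (not part of the original module) of the child-packing
#policy used in execute above: children are popped onto a stack until the
#stack's estimated runtime exceeds maxTime, so cheap targets are chained
#serially in one batch-system job while expensive ones get a job of their own.
#Targets here are stand-ins represented by their runtimes alone.

def packByRuntime(runTimes, maxTime):
    """Groups runtimes the way execute groups targets into child Stacks."""
    stacks = []
    remaining = list(runTimes)
    while remaining:
        stack = [ remaining.pop() ]
        #Mirrors: while len(newChildren) > 0 and childStack.getRunTime() <= maxTime
        while remaining and sum(stack) <= maxTime:
            stack.append(remaining.pop())
        stacks.append(stack)
    return stacks

#Example: packByRuntime([1, 1, 1, 10], maxTime=5) returns [[10], [1, 1, 1]].
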
def main():
    sys.path += [ sys.argv[1] ]
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from jobTree.src.bioio import getBasicOptionParser
    from jobTree.src.bioio import parseBasicOptions
    from jobTree.src.bioio import logger
    from jobTree.src.bioio import addLoggingFileHandler
    from jobTree.src.bioio import setLogLevel
    from jobTree.src.bioio import getTotalCpuTime
    from jobTree.src.master import writeJobs
    
    ##########################################
    #Construct the arguments.
    ##########################################
    
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile",
                      help="Job file containing command to run",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    assert len(args) == 0
    
    ##########################################
    #Parse the job.
    ##########################################
    
    job = ET.parse(options.jobFile).getroot()
    
    ##########################################
    #Setup the logging
    ##########################################
    
    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")
    
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if job.attrib.has_key("stats"):
        startTime = time.time()
        startClock = time.clock()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #Run the script.
    ##########################################
    
    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    
    jobToRun = job.find("followOns").findall("followOn")[-1]
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    while True:
        processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats)
        
        if job.attrib["colour"] != "black":
            logger.info("Exiting the slave because of a failed job")
            break
        
        totalRuntime = float(job.attrib["total_time"]) #This is the estimated runtime of the jobs on the followOn stack
        
        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")
        #childRuntime = sum([ float(child.attrib["time"]) for child in childrenList ])
        
        if len(childrenList) >= 2: # or totalRuntime + childRuntime > maxTime: #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, it's got %i children" % len(childrenList))
            break
        
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        #assert totalRuntime <= maxTime + 1 #The plus one second is to avoid unimportant rounding errors
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0
        
        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break
        
        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break
        
        ##Update the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")
    
    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")
    
    logger.info("Finished running the chain of jobs on this node")
    
    ##########################################
    #Finish up the stats
    ##########################################
    
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()
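
#A minimal sketch (not part of the original module) summarising when the while
#loop in main() above stops chaining follow-on jobs on the current node. The
#function and its arguments are hypothetical; they mirror the values main()
#reads from the job XML, and the check order is simplified relative to the
#actual control flow.

def shouldStopChaining(colour, numChildren, numFollowOns,
                       neededMemory, neededCpu, memoryAvailable, cpuAvailable):
    """Returns True if the slave should hand control back to the master."""
    if colour != "black": #The last job failed
        return True
    if numChildren >= 2: #Multiple children must be issued in parallel by the master
        return True
    if numFollowOns == 0: #The follow-on stack is exhausted
        return True
    #The next follow-on needs more resources than this node was allocated
    return neededMemory > memoryAvailable or neededCpu > cpuAvailable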