def parasolRestart():
    """Function starts the parasol hub and node.

    Stops parasol, then launches a paraNode and a paraHub and reads the
    output of "parasol status" looking for a "Nodes dead" line whose last
    token is zero. After every failed attempt parasol is stopped again and
    the function sleeps for five seconds before retrying, so the call only
    returns once a restart has been seen to work.
    """
    parasolStop()
    while True:
        machineList = os.path.join(workflowRootPath(), "workflow", "jobTree", "machineList")
        os.system("paraNode start -hub=localhost")
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        dead = True
        try:
            popen("parasol status", tempFile)
            #The context manager closes the handle even if parsing a line fails
            with open(tempFile, 'r') as fileHandle:
                for line in fileHandle:
                    if "Nodes dead" in line:
                        print(line)
                        if int(line.split()[-1]) == 0:
                            dead = False
        except RuntimeError:
            #Presumably raised by popen when the status command fails (TODO confirm);
            #either way we leave dead as True and try the restart again.
            pass
        finally:
            #Always remove the scratch file, whatever happened above
            os.remove(tempFile)
        if not dead:
            break
        logger.info("Tried to restart the parasol process, but failed, will try again")
        parasolStop()
        time.sleep(5)
    logger.info("Restarted the parasol process")
def issueJobs(self, jobCommands):
    """Issues parasol with job commands.

    jobCommands is an iterable of (jobCommand, memory, cpu, logFile) tuples,
    none of whose memory, cpu or logFile entries may be None. Each command is
    submitted with "parasol add job" and re-submitted every five seconds
    until parasol prints a line starting with "your job <id>".

    Returns a dictionary mapping the integer parasol job id to the job
    command that was issued under that id.
    """
    #Compiled once here rather than once per job
    pattern = re.compile("your job ([0-9]+).*")
    issuedJobs = {}
    for jobCommand, memory, cpu, logFile in jobCommands:
        assert memory is not None
        assert cpu is not None
        assert logFile is not None
        command = "parasol -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (memory, cpu, self.parasolResultsFile, jobCommand)
        while True:
            popenParasolCommand(command, self.scratchFile)
            with open(self.scratchFile, 'r') as fileHandle:
                line = fileHandle.readline()
            match = pattern.match(line)
            if match is not None:
                #This is because parasol add job will return success, even if the job was not properly issued!
                break
            logger.info("We failed to properly add the job, we will try again after a sleep")
            time.sleep(5)
        jobID = int(match.group(1))
        logger.debug("Got the job id: %s from line: %s" % (jobID, line))
        #Dictionary membership test, no need to build the list of keys
        assert jobID not in issuedJobs
        issuedJobs[jobID] = jobCommand
        logger.debug("Issued the job command: %s with job id: %i " % (command, jobID))
    return issuedJobs
def parseJobFile(absFileName):
    """Parses the job file and returns the root element of its XML.

    If reading the file raises an IOError the problem is logged and None is
    returned, so that callers can skip the file.
    """
    try:
        tree = ET.parse(absFileName)
    except IOError:
        logger.info("Encountered error while parsing job file %s, so we will ignore it" % absFileName)
        return None
    return tree.getroot()
def run(self):
    """Builds a file tree in the global temp dir, then schedules the jobs that use it.

    A ChildTarget is added for a pointer to the tree and a DestructFileTree
    target is set as the follow on.
    """
    ##########################################
    #Setup a file tree.
    ##########################################

    #The tree lives in a randomly named directory of the global temp dir
    treeDir = os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString())
    fileTree = TempFileTree(treeDir)
    rootFile = fileTree.getTempFile()
    makeFileTree(rootFile, self.depth, fileTree)
    pointerFile = fileTree.getTempFile()
    makeTreePointer(rootFile, pointerFile)
    logger.info("We've set up the file tree")

    ##########################################
    #Issue the child and follow on jobs
    ##########################################

    self.addChildTarget(ChildTarget(pointerFile))
    self.setFollowOnTarget(DestructFileTree(fileTree))
    logger.info("We've added the child target and finished SetupFileTree.run()")
def setupTempFileTrees(config):
    """Load the temp file trees

    Replaces the four directory attributes of config, in place, with a
    TempFileTree constructed from the value each attribute held before.
    """
    treeKeys = ("job_file_dir", "temp_dir_dir", "log_file_dir", "slave_log_file_dir")
    for key in treeKeys:
        config.attrib[key] = TempFileTree(config.attrib[key])
    logger.info("Setup the temp file trees")
def run (self):
    """Restarts parasol, then keeps killing a parasol/master process at random intervals.

    Each cycle sleeps for a randomly chosen whole number of seconds below
    240; the method returns as soon as self.kill is found set after a sleep.
    """
    parasolRestart()
    while True:
        pause = random.choice(xrange(240))
        time.sleep(pause)
        if self.kill == True:
            break
        logger.info("Going to kill a parasol/master process")
        killMasterAndParasol()
    return
def loadEnvironment(config):
    """Puts the environment in the pickle file.

    Dumps os.environ of this process, with cPickle, into the file named by
    the "environment_file" attribute of config.
    """
    #The with block closes the file even if the dump raises
    with open(config.attrib["environment_file"], 'w') as fileHandle:
        cPickle.dump(os.environ, fileHandle)
    logger.info("Written the environment for the jobs to the environment file")
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Adds the first job to to the jobtree.

    When memory or cpu is None the "default_memory" / "default_cpu" attribute
    of config is used instead. The job is created with createJob and written
    with writeJobs; memory and cpu are stored as integer strings and time as
    a float string.
    """
    logger.info("Adding the first job")
    if memory is None:
        memory = config.attrib["default_memory"]
    if cpu is None:
        cpu = config.attrib["default_cpu"]
    jobAttributes = {
        "command": command,
        "memory": str(int(memory)),
        "cpu": str(int(cpu)),
        "time": str(float(time)),
    }
    firstJob = createJob(jobAttributes, None, config)
    writeJobs([firstJob])
    logger.info("Added the first job")
def killJobs(self, jobIDs):
    """Removes the given parasol job ids, repeating until none of them is still issued.

    Every pass asks parasol to remove each job, then compares the ids with
    self.getIssuedJobIDs(). If any is still present the method sleeps for
    five seconds, logs the failure and tries again.
    """
    while True:
        for jobID in jobIDs:
            exitValue = popenParasolCommand("parasol remove job %i" % jobID,
                                            tmpFileForStdOut=self.scratchFile, runUntilSuccessful=None)
            logger.info("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue))
        runningJobs = self.getIssuedJobIDs()
        #We are done once no requested id is left among the issued jobs
        stillIssued = set(jobIDs).intersection(runningJobs)
        if not stillIssued:
            return
        time.sleep(5)
        logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
def checkFileTreeCounts(rootFile):
    """Check the file tree produced by the test.

    Recursively counts the leaves below rootFile (a node without "child"
    elements counts as one) and asserts that the total equals the "count"
    attribute of the node. Returns the total.
    """
    tree = ET.parse(rootFile).getroot()
    children = tree.find("children").findall("child")
    if children:
        total = 0
        for child in children:
            total += checkFileTreeCounts(child.attrib["file"])
    else:
        total = 1
    expected = int(tree.attrib["count"])
    logger.info("File tree counts: %i %i" % (total, expected))
    assert total == expected
    return total
def runJobTree(command, jobTreeDir, logLevel="DEBUG", retryCount=0, batchSystem="single_machine", rescueJobFrequency=None):
    """A convenience function for running job tree from within a python script.

    Builds a "jobTree" command line from the arguments and runs it with
    system(). The --rescueJobsFrequency option is only added when
    rescueJobFrequency is not None.
    """
    rescueJobFrequencyString = ""
    if rescueJobFrequency is not None:
        rescueJobFrequencyString = "--rescueJobsFrequency %s" % float(rescueJobFrequency)
    template = "jobTree --command \"%s\" --jobTree %s --logLevel %s " \
               "--retryCount %i --batchSystem %s %s"
    command = template % (command, jobTreeDir, logLevel, retryCount,
                          batchSystem, rescueJobFrequencyString)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the jobtree apparently okay")
def restartFailedJobs(config, jobFiles):
    """Resets the retry count of every job file that exists and makes failed jobs runnable again.

    Each job gets the "retry_count" of config as its remaining retry count.
    Jobs coloured red or grey are turned white, and every job is written
    back with writeJobs.
    """
    for absFileName in jobFiles:
        if not os.path.isfile(absFileName):
            continue
        job = ET.parse(absFileName).getroot()
        logger.info("Restarting job: %s" % job.attrib["file"])
        job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
        #Red jobs failed. A grey job is a leaf that failed when the system
        #went down, but whose status did not get updated.
        if job.attrib["colour"] in ("red", "grey"):
            job.attrib["colour"] = "white"
        writeJobs([job])
def reloadJobTree(jobTree):
    """Load the job tree from a dir.

    Asserts that the directory holds the expected files and sub-directories,
    parses config.xml, sets up the temp file trees and loads the batch
    system. Returns the tuple (config, batchSystem).
    """
    logger.info("The job tree appears to already exist, so we'll reload it")
    #A valid job tree must contain the config file, a pickle file which encodes
    #the path environment of the job and a file which is updated with the
    #number of jobs that have been run.
    for requiredFile in ("config.xml", "environ.pickle", "jobNumber.xml"):
        assert os.path.isfile(os.path.join(jobTree, requiredFile))
    #It must also have directories for the jobs, for the temporary directories
    #(for jobs to make temp files in), for the log files and for the slave log files.
    for requiredDir in ("jobs", "tempDirDir", "logFileDir", "slaveLogFileDir"):
        assert os.path.isdir(os.path.join(jobTree, requiredDir))
    configFile = os.path.join(jobTree, "config.xml")
    config = ET.parse(configFile).getroot()
    setupTempFileTrees(config)
    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
def killMasterAndParasol():
    """Method to destroy master process

    Scans the output of "ps -a" and, on a coin flip per matching line, sends
    a kill to the first chosen paraNode/paraHub or jobTreeMaster.py process.
    At most one process is killed, after which parasol is restarted.
    """
    tempFile = getTempFile()
    popen("ps -a", tempFile)
    fileHandle = open(tempFile, 'r')
    #Example parasol state lines:
    #67401 ttys002 0:00.06 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    #67403 ttys002 0:00.65 /Users/benedictpaten/kent/src/parasol/bin/paraHub -log=/tmp/hub.2009-07-08.log machineList subnet=127.0.0
    #68573 ttys002 0:00.00 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    for line in fileHandle:
        isParasolLine = 'paraNode' in line or 'paraHub' in line
        if isParasolLine:
            if random.random() > 0.5:
                #The process id is the first column of the ps output
                pid = int(line.split()[0])
                exitValue = os.system("kill %i" % pid)
                logger.info("Tried to kill parasol process: %i, line: %s, exit value: %i" % (pid, line, exitValue))
                break
        elif 'jobTreeMaster.py' in line:
            logger.info("Have job tree master line")
            if random.random() > 0.5:
                pid = int(line.split()[0])
                exitValue = os.system("kill %i" % pid)
                logger.info("Tried to kill master process: %i, line: %s, exit value: %i" % (pid, line, exitValue))
                break
    fileHandle.close()
    os.remove(tempFile)
    parasolRestart()
def testJobTree_Parasol(self):
    """Runs a test program using the job tree, whilst constantly restarting parasol
    by killing the nodes.

    For each of self.testNo rounds a ParasolAndMasterKiller is started and
    the job tree command is re-run until it exits with status zero and
    checkEndStateOfJobTree reports that the job tree is finished.
    """
    for test in xrange(self.testNo): #Does not run this test when doing short testing
        jobTreeCommand, fileTreeRootFile = setupJobTree(self.tempFileTree, self.jobTreeDir,
                                                        "parasol", depth=self.depth)
        jobTreeCommand += " --rescueJobsFrequency 20"
        #Run the job while the killer keeps disrupting parasol and the master
        killer = ParasolAndMasterKiller()
        killer.start()
        finished = False
        while not finished:
            #Keep re-running the master until it exits cleanly
            exitStatus = None
            while exitStatus != 0:
                process = subprocess.Popen(jobTreeCommand, shell=True)
                exitStatus = os.waitpid(process.pid, 0)[1]
                if exitStatus == 0:
                    logger.info("The job tree master ended, with an okay exit value (using parasol)")
                else:
                    logger.info("The job tree master ended with an error exit value, restarting: %i" % exitStatus)
            if checkEndStateOfJobTree(self.jobTreeDir): #Check the state of the job files
                finished = True
            else:
                jobTreeCommand = "jobTree --jobTree %s --logDebug" % self.jobTreeDir
        checkFileTreeCounts(fileTreeRootFile)
        os.system("rm -rf %s" % self.jobTreeDir)
        killer.stopKilling()
        logger.info("Test done okay")
def setupJobTree(tempFileTree, jobTreeDir, batchSystem, depth=2):
    """Sets up a job tree using the jobTreeSetup.py command.

    Creates a file tree of the given depth and a pointer to it inside
    tempFileTree, then builds the jobTree command line that runs the test
    job with a randomly chosen retry count between 1 and 9.

    Returns the tuple (jobTreeCommand, fileTreeRootFile).
    """
    #Setup a job
    retryCount = random.choice(xrange(1,10))

    logger.info("Setup the basic files for the test")

    fileTreeRootFile = tempFileTree.getTempFile()
    makeFileTree(fileTreeRootFile, depth, tempFileTree)
    pointerTempFile = tempFileTree.getTempFile()
    treePointerFile = makeTreePointer(fileTreeRootFile, pointerTempFile)

    #Setup the job
    command = "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % (treePointerFile)
    template = "jobTree --jobTree %s --retryCount %i " \
               "--command '%s' --logLevel=INFO --maxJobDuration 100 --batchSystem %s"
    jobTreeCommand = template % (jobTreeDir, retryCount, command, batchSystem)

    logger.info("Setup the job okay")
    return (jobTreeCommand, fileTreeRootFile)
def testJobTree(testNo, depth, tempFileTree, jobTreeDir, batchSystem):
    """Runs a test program using the job tree with the given batch system.

    For each of testNo rounds a fresh job tree is set up and its command is
    re-run, asserting a zero exit status every time, until
    checkEndStateOfJobTree says that it is finished. The file tree counts
    are then checked and the job tree directory is deleted.
    """
    for test in xrange(testNo):
        jobTreeCommand, fileTreeRootFile = setupJobTree(tempFileTree, jobTreeDir,
                                                        batchSystem, depth=depth)
        #Run the job
        finished = False
        while not finished:
            print("job tree command %s" % jobTreeCommand)

            process = subprocess.Popen(jobTreeCommand, shell=True)
            exitStatus = os.waitpid(process.pid, 0)[1]
            assert exitStatus == 0
            logger.info("The job tree master ended, with an okay exit value")

            if checkEndStateOfJobTree(jobTreeDir): #Check the state of the job files, exit if none
                finished = True
            else:
                jobTreeCommand = "jobTree --jobTree %s --logInfo" % jobTreeDir

        checkFileTreeCounts(fileTreeRootFile)
        os.system("rm -rf %s" % jobTreeDir)
        logger.info("Test done okay")
def createJobTree(options):
    """Creates a new job tree directory and its configuration from the parsed options.

    Makes the directory options.jobTree (which is first turned into an
    absolute path), fills in a "config" element, loads the batch system,
    writes config.xml and the job number file and sets up the temp file
    trees. Returns the tuple (config, batchSystem).
    """
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = os.path.abspath(options.jobTree)
    os.mkdir(options.jobTree)
    config = ET.Element("config")

    #The files and directories that live inside the job tree
    locations = (
        ("environment_file", "environ.pickle"),
        ("job_number_file", "jobNumber.xml"),
        ("job_file_dir", "jobs"),
        ("temp_dir_dir", "tempDirDir"),
        ("log_file_dir", "logFileDir"),
        ("slave_log_file_dir", "slaveLogFileDir"),
        ("results_file", "results.txt"),
        ("scratch_file", "scratch.txt"),
    )
    for key, name in locations:
        config.set(key, os.path.join(options.jobTree, name))

    #The numeric options are normalised through int/float before being stored as strings
    config.set("retry_count", str(int(options.retryCount)))
    config.set("max_job_duration", str(float(options.maxJobDuration)))
    config.set("batch_system", options.batchSystem)
    config.set("job_time", str(float(options.jobTime)))
    config.set("max_log_file_size", str(int(options.maxLogFileSize)))
    config.set("default_memory", str(int(options.defaultMemory)))
    config.set("default_cpu", str(int(options.defaultCpu)))
    config.set("max_jobs", str(int(options.maxJobs)))
    config.set("max_threads", str(int(options.maxThreads)))

    if options.stats:
        #Start the stats file with its opening tag
        config.set("stats", os.path.join(options.jobTree, "stats.xml"))
        statsHandle = open(config.attrib["stats"], 'w')
        statsHandle.write("<stats>")
        statsHandle.close()

    #Load the batch system.
    batchSystem = loadTheBatchSystem(config)

    #Set the two parameters determining the polling frequency of the system.
    #The batch system supplies the default, an explicit option overrides it.
    config.set("wait_duration", str(float(batchSystem.getWaitDuration())))
    if options.waitDuration is not None:
        config.set("wait_duration", str(float(options.waitDuration)))

    config.set("rescue_jobs_frequency", str(float(batchSystem.getRescueJobFrequency())))
    if options.rescueJobsFrequency is not None:
        config.set("rescue_jobs_frequency", str(float(options.rescueJobsFrequency)))

    #Write the config file to disk
    configHandle = open(os.path.join(options.jobTree, "config.xml"), 'w')
    ET.ElementTree(config).write(configHandle)
    configHandle.close()
    logger.info("Written the config file")

    #Set up the jobNumber file
    jobNumberElement = ET.Element("job_number", { "job_number":'0' })
    jobNumberHandle = open(config.attrib["job_number_file"], 'w')
    ET.ElementTree(jobNumberElement).write(jobNumberHandle)
    jobNumberHandle.close()

    #Setup the temp file trees.
    setupTempFileTrees(config)

    logger.info("Finished the job tree setup")
    return config, batchSystem
def main():
    """Kills the jobs of an existing job tree.

    Reads config.xml from the directory given with --jobTree, loads the batch
    system it names and asks that batch system to kill every job it still
    reports as issued. Takes no positional arguments.
    """
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")

    parser.add_option("--jobTree", dest="jobTree",
                      help="Directory containing the job tree to kill")

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0 #This program takes no arguments
    assert options.jobTree is not None #The jobtree should not be null
    assert os.path.isdir(options.jobTree) #The job tree must exist if we are going to kill it.
    logger.info("Starting routine to kill running jobs in the jobTree: %s" % options.jobTree)
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    batchSystem = loadTheBatchSystem(config) #This should automatically kill the existing jobs.. so we're good.
    #Just in case we do it again. killJobs iterates over its argument, so it
    #must be given the collection of job ids and not one bare id.
    issuedJobIDs = batchSystem.getIssuedJobIDs()
    if len(issuedJobIDs) > 0:
        batchSystem.killJobs(issuedJobIDs)
    logger.info("All jobs SHOULD have been killed")
def loadTheBatchSystem(config):
    """Load the batch system.

    Instantiates the batch system named by the "batch_system" attribute of
    config and returns it. For the acid test the "retry_count" attribute of
    config is also overwritten with 32.

    Raises a RuntimeError if the name is not recognised.
    """
    name = config.attrib["batch_system"]
    if name == "parasol":
        batchSystem = ParasolBatchSystem(config)
        logger.info("Using the parasol batch system")
    elif name in ("single_machine", "singleMachine"):
        batchSystem = SingleMachineBatchSystem(config)
        logger.info("Using the single machine batch system")
    elif name in ("gridengine", "gridEngine"):
        batchSystem = GridengineBatchSystem(config)
        logger.info("Using the grid engine machine batch system")
    elif name in ("acid_test", "acidTest"):
        batchSystem = SingleMachineBatchSystem(config, workerClass=BadWorker)
        #The chance that a job does not complete after 32 goes in one in 4 billion,
        #so you need a lot of jobs before this becomes probable
        config.attrib["retry_count"] = str(32)
    else:
        raise RuntimeError("Unrecognised batch system: %s" % name)
    return batchSystem
def __init__(self, config):
    """Sets up the parasol batch system and clears out any previously issued jobs.

    The results file named in config is truncated, every job currently
    reported by getIssuedJobIDs is killed, and after a five second pause the
    results file is truncated once more and reopened for reading.
    """
    AbstractBatchSystem.__init__(self, config) #Call the parent constructor
    #Keep the name of the results file for the pstat2 command..
    self.parasolResultsFile = config.attrib["results_file"]
    #Reset the job queue and results (initially, we do this again once we've killed the jobs)
    #Opening for writing loses any previous state in the file, and ensures the file exists
    self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
    self.parasolResultsFileHandle.close()
    #Patterns for a queued job line and for a running job line
    self.queuePattern = re.compile(r"q\s+([0-9]+)")
    self.runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
    #The scratch file
    self.scratchFile = self.config.attrib["scratch_file"]
    #Kill any jobs on the current stack
    existingJobIDs = self.getIssuedJobIDs()
    self.killJobs(existingJobIDs)
    logger.info("Going to sleep for a few seconds to kill any existing jobs")
    time.sleep(5) #Give batch system a second to sort itself out.
    logger.info("Removed any old jobs from the queue")
    #Reset the job queue and results, truncating the file exactly as above
    self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
    self.parasolResultsFileHandle.close()
    #From here on the results are read through this handle
    self.parasolResultsFileHandle = open(self.parasolResultsFile, 'r')
    logger.info("Reset the results queue")
def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats):
    """Runs the command of jobToRun and updates job with the outcome.

    A temporary job file is written for the command, which is run with its
    "JOB_FILE" token replaced by the path of that file and with the pickled
    environment named in job. On a zero exit status job is coloured black
    and the children and follow on found in the re-read temporary job file
    are appended to job; otherwise, or if the command has no "JOB_FILE"
    token, job is coloured red. The local temporary directory is removed at
    the end.

    memoryAvailable and cpuAvailable are passed on to the command through the
    temporary job file. If stats is not None a "job" element holding the wall
    and cpu time of a successful command is appended to it.
    """
    from workflow.jobTree.lib.bioio import getTempFile
    from workflow.jobTree.lib.bioio import getTempDirectory
    from workflow.jobTree.lib.bioio import logger
    from workflow.jobTree.lib.bioio import system
    from workflow.jobTree.lib.bioio import getTotalCpuTime

    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]

    #Copy the job file to be edited
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    #Log level for the job, and the time length of an 'ideal' job before
    #further parallelism is required
    for key in ("log_level", "job_time"):
        tempJob.attrib[key] = job.attrib[key]

    #Dir to put all the temp files in, with the temp dir of the job nested inside it.
    localSlaveTempDir = getTempDirectory()
    tempJob.attrib["local_temp_dir"] = getTempDirectory(rootDir=localSlaveTempDir)

    #The global temp dirs of each level of the follow on stack are kept separate.
    globalTempRoot = job.attrib["global_temp_dir"]
    depth = len(job.find("followOns").findall("followOn"))
    levelDir = os.path.join(globalTempRoot, str(depth))
    tempJob.attrib["global_temp_dir"] = levelDir
    if not os.path.isdir(levelDir):
        os.mkdir(levelDir)
        os.chmod(levelDir, 0o777)
    #Remove the dir of the level above, the one above that must not exist at all
    nextLevelDir = os.path.join(globalTempRoot, str(depth+1))
    if os.path.isdir(nextLevelDir):
        system("rm -rf %s" % nextLevelDir)
    assert not os.path.isdir(os.path.join(globalTempRoot, str(depth+2)))

    #Deal with memory and cpu requirements (this pass tells the running job how
    #much cpu and memory they have, according to the batch system
    tempJob.attrib["available_memory"] = str(memoryAvailable)
    tempJob.attrib["available_cpu"] = str(cpuAvailable)

    statsFile = None
    if stats is not None:
        #Only the name of the stats file is handed on, the file itself is removed again
        statsFile = getTempFile(rootDir=localSlaveTempDir)
        tempJob.attrib["stats"] = statsFile
        os.remove(statsFile)

    #Now write the temp job file
    tempFile = getTempFile(rootDir=localSlaveTempDir)
    jobHandle = open(tempFile, 'w')
    ET.ElementTree(tempJob).write(jobHandle)
    jobHandle.close()
    logger.info("Copied the jobs files ready for the job")

    if "JOB_FILE" not in command:
        logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
        job.attrib["colour"] = "red" #Update the colour
    else:
        #First load the environment for the job.
        environmentHandle = open(job.attrib["environment_file"], 'r')
        environment = cPickle.load(environmentHandle)
        environmentHandle.close()
        logger.info("Loaded the environment for the process")

        #Run the actual command, sending both its output streams to a temporary log file
        tempLogFile = getTempFile(suffix=".log", rootDir=localSlaveTempDir)
        logHandle = open(tempLogFile, 'w')
        finalCommand = command.replace("JOB_FILE", tempFile)
        if stats is not None:
            startTime = time.time()
            startClock = getTotalCpuTime()
        process = subprocess.Popen(finalCommand, shell=True, stdout=logHandle,
                                   stderr=subprocess.STDOUT, env=environment)
        exitStatus = os.waitpid(process.pid, 0)[1]
        logHandle.close()
        truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))

        #Copy across the log file
        system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
        logger.info("Ran the job command=%s with exit status %i" % (finalCommand, exitStatus))

        if exitStatus != 0:
            logger.info("Failed the job")
            job.attrib["colour"] = "red" #Update the colour
        else:
            logger.info("Passed the job, okay")

            if stats is not None:
                timings = { "time":str(time.time() - startTime),
                            "clock":str(getTotalCpuTime() - startClock) }
                jobTag = ET.SubElement(stats, "job", timings)
                if os.path.exists(statsFile):
                    jobTag.append(ET.parse(statsFile).getroot())

            #Re-read the temporary job file, which the command was given as its JOB_FILE
            tempJob = ET.parse(tempFile).getroot()
            job.attrib["colour"] = "black" #Update the colour

            #Update the runtime of the stack..
            #This is the estimate runtime of the jobs on the followon stack
            totalRuntime = float(job.attrib["total_time"]) - float(jobToRun.attrib["time"])
            if totalRuntime < 0.0:
                totalRuntime = 0.0

            #The children
            children = job.find("children")
            assert len(children.findall("child")) == 0
            assert tempJob.find("children") is not None
            for child in tempJob.find("children").findall("child"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
                childAttributes = { "command":child.attrib["command"],
                                    "time":str(compTime),
                                    "memory":str(memory),
                                    "cpu":str(cpu) }
                ET.SubElement(children, "child", childAttributes)
                logger.info("Making a child with command: %s" % (child.attrib["command"]))

            #The follow on command
            followOns = job.find("followOns")
            followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
            if "command" in tempJob.attrib:
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
                followOnAttributes = { "command":tempJob.attrib["command"],
                                       "time":str(compTime),
                                       "memory":str(memory),
                                       "cpu":str(cpu) }
                ET.SubElement(followOns, "followOn", followOnAttributes)
                ##Add the runtime to the total runtime..
                totalRuntime += compTime
                logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
            elif len(tempJob.find("children").findall("child")) != 0:
                #This is to keep the stack of follow on jobs consistent.
                stubAttributes = { "command":"echo JOB_FILE", "time":"0",
                                   "memory":"1000000", "cpu":"1" }
                ET.SubElement(followOns, "followOn", stubAttributes)
                logger.info("Making a stub follow on job")
            #Write back the runtime, after adding the follow on time and
            #subtracting the time of the run job.
            job.attrib["total_time"] = str(totalRuntime)

    #Clean up
    system("rm -rf %s" % (localSlaveTempDir))
    logger.info("Cleaned up by removing temp jobfile (the copy), and the temporary file directory for the job")
def main():
    """Runs a chain of jobs from one job file on this node.

    The first command line argument is a directory that is appended to
    sys.path, so that the workflow package can be imported; it is then
    removed from sys.argv. The job file given with --job is parsed and the
    job on top of its follow on stack is run with processJob. While a job
    passes and leaves fewer than two children, the children are moved onto
    the follow on stack and the next follow on is run, provided it needs no
    more memory or cpu than the first job had. The job file is written back
    with writeJobs, and if the job has a "stats" attribute the wall and cpu
    time of the slave are written to that file.
    """
    sys.path += [ sys.argv[1] ]
    #Drop the argument by position, list.remove would delete the first equal
    #value, which need not be the one at index 1
    del sys.argv[1]

    #Now we can import all the stuff..
    from workflow.jobTree.lib.bioio import getBasicOptionParser
    from workflow.jobTree.lib.bioio import parseBasicOptions
    from workflow.jobTree.lib.bioio import logger
    from workflow.jobTree.lib.bioio import addLoggingFileHandler
    from workflow.jobTree.lib.bioio import setLogLevel
    from workflow.jobTree.lib.bioio import getTotalCpuTime
    from workflow.jobTree.lib.master import writeJobs

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")

    parser.add_option("--job", dest="jobFile",
                      help="Job file containing command to run",
                      default="None")

    options, args = parseBasicOptions(parser)
    assert len(args) == 0

    ##########################################
    #Parse the job.
    ##########################################

    job = ET.parse(options.jobFile).getroot()

    ##########################################
    #Setup the logging
    ##########################################

    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")

    ##########################################
    #Setup the stats, if requested
    ##########################################

    if "stats" in job.attrib:
        startTime = time.time()
        #Must come from the same clock that is read again when the stats are
        #finished below, the difference is meaningless otherwise
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None

    ##########################################
    #Run the script.
    ##########################################

    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    jobToRun = job.find("followOns").findall("followOn")[-1]
    #The memory and cpu of the first job bound every later job of the chain
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    while True:
        processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats)

        if job.attrib["colour"] != "black":
            logger.info("Exiting the slave because of a failed job")
            break

        #This is the estimate runtime of the jobs on the followon stack
        totalRuntime = float(job.attrib["total_time"])

        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")

        if len(childrenList) >= 2:
            #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, its got %i children" % len(childrenList))
            break

        #Move any child onto the follow on stack, so that it is run next
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0

        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break

        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break

        ##Updated the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")

    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")

    logger.info("Finished running the chain of jobs on this node")

    ##########################################
    #Finish up the stats
    ##########################################

    if stats is not None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        with open(job.attrib["stats"], 'w') as fileHandle:
            ET.ElementTree(stats).write(fileHandle)
def main():
    """First command of the job tree test: spawns a child job for every child of the tree node.

    Reads the job file and the tree pointer file named on the command line,
    adds one child command per child of the tree node to the job, sets a
    jobTreeTest_CommandSecond.py command on the job and writes the job file
    back. It then writes one line to each of std err and std out, and exits
    with status 1 on roughly one run in ten and with status 0 otherwise.
    """
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    parser.add_option("--job", dest="jobFile",
                      help="Job file containing command to run", default="None")
    parser.add_option("--treePointer", dest="treePointerFile",
                      help="File containing pointer to the tree data", default="None")
    options, args = parseBasicOptions(parser)
    logger.info("Parsed the input arguments")

    job = ET.parse(options.jobFile).getroot()
    setLogLevel(job.attrib["log_level"])
    logger.info("Parsed the job XML")

    treePointer = ET.parse(options.treePointerFile).getroot()
    logger.info("Parsed the tree pointer XML")

    tree = ET.parse(treePointer.attrib["file"]).getroot()
    logger.info("Parsed the tree XML")

    for treeChild in tree.find("children").findall("child"):
        # Make the child tree pointer file
        pointerTempFile = getTempFile(rootDir=job.attrib["global_temp_dir"])
        childTreePointerFile = makeTreePointer(treeChild.attrib["file"], pointerTempFile)
        # Make the child command
        unbornChild = ET.SubElement(job.find("children"), "child")
        unbornChild.set("command",
                        "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % (childTreePointerFile,))
        # Most, but not all, of the children are given a time
        if random.random() > 0.2:
            unbornChild.set("time", str(random.random() * 10))
        # Record the child tree pointer on the tree pointer
        ET.SubElement(treePointer.find("children"), "child", {"file": childTreePointerFile})

    job.set("command",
            "jobTreeTest_CommandSecond.py --treePointer %s --job JOB_FILE" % (options.treePointerFile,))
    logger.info("Made new command")

    jobHandle = open(options.jobFile, "w")
    ET.ElementTree(job).write(jobHandle)
    jobHandle.close()
    logger.info("Updated the job file")

    # These lines should end up in the logs
    sys.stderr.write("Checking that we can report to std err" + "\n")
    print("Checking that we can report to std out")

    if random.random() > 0.9:
        logger.info("Going to fail the job")
        sys.exit(1)
    logger.info("Going to pass the job done okay")
    sys.exit(0)
def main():
    """Reports the state of the job tree.

    Prints how many job files the job tree holds and how many of them have
    each colour. With --verbose the log file of every red job is printed as
    well. With --failIfNotComplete the exit value is 1 if any job file is
    left in the job tree.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    usage = ("usage: %prog [options] \nThe colours returned indicate the state of the job.\n"
             "\twhite: job has not been started yet\n"
             "\tgrey: job is issued to batch system\n"
             "\tred: job failed\n"
             "\tblue: job has children currently being processed\n"
             "\tblack: job has finished and will be processed (transient state)\n"
             "\tdead: job is totally finished and is awaiting deletion (transient state)")
    parser = getBasicOptionParser(usage, "%prog 0.1")

    parser.add_option("--jobTree", dest="jobTree",
                      help="Directory containing the job tree")

    parser.add_option("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of errors",
                      default=False)

    parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if job tree jobs not all completed",
                      default=False)

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    assert options.jobTree is not None
    assert os.path.isdir(options.jobTree) #The given job dir tree must exist.
    assert os.path.isfile(os.path.join(options.jobTree, "config.xml")) #A valid job tree must contain the config file
    #A job tree must have directories for the jobs, for the temporary directories
    #(for jobs to make temp files in), for the log files and for the slave log files.
    for requiredDir in ("jobs", "tempDirDir", "logFileDir", "slaveLogFileDir"):
        assert os.path.isdir(os.path.join(options.jobTree, requiredDir))

    ##########################################
    #Read the total job number
    ##########################################

    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    colours = {}
    jobFiles = TempFileTree(config.attrib["job_file_dir"]).listFiles()
    if len(jobFiles) > 0:
        logger.info("Collating the colours of the job tree")
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job is None:
                #The file could not be read, so it is left out of the counts
                continue
            colour = job.attrib["colour"]
            if colour not in colours:
                colours[colour] = 0
            colours[colour] += 1
    else:
        logger.info("There are no jobs to collate")

    print("There are %i jobs currently in job tree: %s" % (len(jobFiles), options.jobTree))

    for colour in colours.keys():
        print("\tColour: %s, number of jobs: %s" % (colour, colours[colour]))

    if options.verbose: #Verbose currently means outputting the files that have failed.
        def printLine(string):
            print(string)
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job is None or job.attrib["colour"] != "red":
                continue
            if os.path.isfile(job.attrib["log_file"]):
                logFile(job.attrib["log_file"], printLine)
            else:
                logger.info("Log file for job %s is not present" % job.attrib["file"])

    if len(jobFiles) != 0 and options.failIfNotComplete:
        sys.exit(1)
def main():
    """Reports stats on the job-tree, use in conjunction with --stats options to jobTree.

    Reads config.xml and stats.xml from the directory given by --jobTree,
    collates time/clock/wait statistics for slaves, jobs, targets and each
    target class into a "collated_stats" XML element, prints it pretty-printed
    to standard out and, if --outputFile is given, also writes it to that file.

    Raises:
        RuntimeError: if the job tree was not specified, does not exist, has
            no config.xml, or has no stats.xml.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser("usage: %prog", "%prog 0.1")

    parser.add_option("--jobTree", dest="jobTree",
                      help="Directory containing the job tree")

    parser.add_option("--outputFile", dest="outputFile", default=None,
                      help="File in which to write results")

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    if options.jobTree is None:
        raise RuntimeError("You did not specify the job-tree")
    if not os.path.isdir(options.jobTree):
        raise RuntimeError("The given job dir tree does not exist: %s" % options.jobTree)
    if not os.path.isfile(os.path.join(options.jobTree, "config.xml")):
        raise RuntimeError("A valid job tree must contain the config file")
    if not os.path.isfile(os.path.join(options.jobTree, "stats.xml")):
        raise RuntimeError("The job-tree was run without the --stats flag, so no stats were created")

    ##########################################
    #Read the stats and config
    ##########################################

    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    stats = ET.parse(os.path.join(options.jobTree, "stats.xml")).getroot()

    ##########################################
    #Collate the stats and report
    ##########################################

    def fn(element, items, itemName):
        """Adds a child called itemName to element summarising the "time",
        "clock" and wait (time - clock) attributes of items; returns the child.
        """
        itemTimes = sorted(float(item.attrib["time"]) for item in items)
        itemClocks = sorted(float(item.attrib["clock"]) for item in items)
        itemWaits = sorted(float(item.attrib["time"]) - float(item.attrib["clock"]) for item in items)
        if len(itemTimes) == 0: #Avoid dividing by zero / min of nothing when there are no items
            itemTimes.append(0)
            itemClocks.append(0)
            itemWaits.append(0)
        attribs = { "total_number":str(len(items)) }
        for suffix, values in (("time", itemTimes), ("clock", itemClocks), ("wait", itemWaits)):
            attribs["total_%s" % suffix] = str(sum(values))
            #Floor division, as the median index must be an integer
            attribs["median_%s" % suffix] = str(values[len(values)//2])
            attribs["average_%s" % suffix] = str(sum(values)/len(values))
            attribs["min_%s" % suffix] = str(min(values))
            attribs["max_%s" % suffix] = str(max(values))
        return ET.SubElement(element, itemName, attribs)

    def fn2(element, containingItems, containingItemName, getFn):
        """Annotates element with the median/average/min/max number of items
        that getFn returns for each of the containingItems.
        """
        itemCounts = sorted(len(getFn(containingItem)) for containingItem in containingItems)
        if len(itemCounts) == 0:
            itemCounts.append(0)
        element.attrib["median_number_per_%s" % containingItemName] = str(itemCounts[len(itemCounts)//2])
        element.attrib["average_number_per_%s" % containingItemName] = str(float(sum(itemCounts))/len(itemCounts))
        element.attrib["min_number_per_%s" % containingItemName] = str(min(itemCounts))
        element.attrib["max_number_per_%s" % containingItemName] = str(max(itemCounts))

    totalTime = stats.find("total_time")
    if totalTime is None: #Hack to allow it to work on unfinished jobtrees.
        totalTime = ET.SubElement(stats, "total_time", { "time":"0.0", "clock":"0.0"})

    collatedStatsTag = ET.Element("collated_stats", { "total_run_time":totalTime.attrib["time"],
                                                     "total_clock":totalTime.attrib["clock"],
                                                     "batch_system":config.attrib["batch_system"],
                                                     "job_time":config.attrib["job_time"],
                                                     "default_memory":config.attrib["default_memory"],
                                                     "default_cpu":config.attrib["default_cpu"],
                                                     "max_jobs":config.attrib["max_jobs"],
                                                     "max_threads":config.attrib["max_threads"] })

    #Add slave info
    slaves = stats.findall("slave")
    fn(collatedStatsTag, slaves, "slave")

    #Add job info
    def fn3(slave):
        return slave.findall("job")
    jobs = []
    for slave in slaves:
        jobs += fn3(slave)
    fn2(fn(collatedStatsTag, jobs, "job"), slaves, "slave", fn3)

    #Add aggregated target info
    def fn4(job):
        targets = []
        for stack in job.findall("stack"):
            targets += stack.findall("target")
        return targets
    targets = []
    for job in jobs:
        targets += fn4(job)
    fn2(fn(collatedStatsTag, targets, "target"), jobs, "job", fn4)

    #Get info for each target, grouping the targets by class in a single pass
    targetsByClass = {}
    for target in targets:
        targetsByClass.setdefault(target.attrib["class"], []).append(target)

    targetTypesTag = ET.SubElement(collatedStatsTag, "target_types")
    for targetName, targetTypes in targetsByClass.items():
        targetTypeTag = fn(targetTypesTag, targetTypes, targetName)
        estimatedRunTimes = [ float(target.attrib["e_time"]) for target in targetTypes ]
        #targetTypes is never empty here, so the division is safe
        targetTypeTag.attrib["estimated_time"] = str(sum(estimatedRunTimes)/len(estimatedRunTimes))

    def prettify(elem):
        """Return a pretty-printed XML string for the Element.
        """
        rough_string = ET.tostring(elem, 'utf-8')
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent="  ")

    prettyStats = prettify(collatedStatsTag)

    #Now dump it all out to file
    if options.outputFile is not None:
        fileHandle = open(options.outputFile, 'w')
        try:
            fileHandle.write(prettyStats)
        finally:
            fileHandle.close() #Close the file even if the write fails

    #Now dump onto the screen
    print(prettyStats)
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.

    Each pass reads every job file in the set needing attention and acts on
    the job's colour:

      white - reset its log files, mark it grey and queue it for issue to the
              batch system, provided the "max_jobs" limit is not reached;
      black - append its stats (if stats are enabled), then either create job
              files for its unborn children (making it blue), reset it to
              white if it has follow-ons, or mark it dead and update its parent;
      red   - log its log files, then retry it (white) while retries remain;
      dead  - delete it.

    The loop then collects finished jobs from the batch system, periodically
    rescues over-long and missing jobs, and sleeps. It exits when no jobs are
    issued and no job needs attention.

    Args:
        config: the job tree config element; settings are read from
            config.attrib ("wait_duration", "rescue_jobs_frequency",
            "max_job_duration", "job_time", "max_jobs", "retry_count", optional
            "stats"). config.attrib["job_file_dir"] must provide listFiles().
        batchSystem: the batch system; getIssuedJobIDs() and getUpdatedJobs()
            are called on it here and it is passed on to the issuing helpers.

    Returns:
        The number of job files remaining when the loop exits (the log message
        on exit describes these as failed jobs and their dependents).
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))

    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")

    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")

    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")

    #Get jobs that were running, or that had failed reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")

    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")

    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.

    idealJobTime = float(config.attrib["job_time"])
    assert idealJobTime > 0.0

    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1

    stats = "stats" in config.attrib
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()

    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))

            jobsToIssue = []
            for jobFile in list(updatedJobFiles): #Iterate over a copy, as the set is modified in the loop
                job = ET.parse(jobFile).getroot()
                assert job.attrib["colour"] not in ("grey", "blue")

                if job.attrib["colour"] == "white": #Get ready to start the job
                    #Jobs queued in this pass are only issued after the for loop,
                    #so they must be counted here or the max_jobs limit could be
                    #exceeded within a single pass.
                    issuedCount = len(jobIDsToJobsHash) + len(jobsToIssue)
                    if issuedCount < maxIssuedJobs:
                        logger.debug("Job: %s is being started" % job.attrib["file"])
                        updatedJobFiles.remove(job.attrib["file"])

                        #Reset the log files for the job.
                        open(job.attrib["slave_log_file"], 'w').close()
                        open(job.attrib["log_file"], 'w').close()

                        job.attrib["colour"] = "grey"
                        #The job is written and issued after the for loop, in one batch.
                        jobsToIssue.append(job)
                    else:
                        logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], issuedCount))

                elif job.attrib["colour"] == "black": #Job has finished okay
                    logger.debug("Job: %s has finished okay" % job.attrib["file"])
                    #Deal with stats
                    if stats:
                        system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                        open(job.attrib["stats"], 'w').close() #Reset the stats file
                    childCount = int(job.attrib["child_count"])
                    blackChildCount = int(job.attrib["black_child_count"])
                    assert childCount == blackChildCount #Has no currently running child jobs
                    #Launch any unborn children
                    unbornChildren = job.find("children")
                    unbornChild = unbornChildren.find("child")
                    if unbornChild is not None: #We must give birth to the unborn children
                        logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                        newChildren = []
                        while unbornChild is not None:
                            cummulativeChildTime = float(unbornChild.attrib["time"])

                            newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)

                            totalJobFiles += 1
                            updatedJobFiles.add(newJob.attrib["file"])

                            newChildren.append(newJob)
                            unbornChildren.remove(unbornChild)
                            unbornChild = unbornChildren.find("child")

                            #Children used to be aggregated into one job (as follow-ons) until
                            #their cumulative time reached the ideal job time; we don't do this
                            #now, so each child becomes its own job.
                            newJob.attrib["total_time"] = str(cummulativeChildTime)

                        updatedJobFiles.remove(job.attrib["file"])
                        job.attrib["child_count"] = str(childCount + len(newChildren))
                        job.attrib["colour"] = "blue" #Blue - has children running.
                        writeJobs([ job ] + newChildren ) #Check point

                    elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                        logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                        ##Reset the job run info
                        job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                        job.attrib["colour"] = "white"
                        ##End resetting the job
                        writeJobs([ job ])

                    else: #Job has finished, so we can defer to any parent
                        logger.debug("Job: %s is now dead" % job.attrib["file"])
                        job.attrib["colour"] = "dead"
                        if "parent" in job.attrib:
                            parent = ET.parse(job.attrib["parent"]).getroot()
                            assert parent.attrib["colour"] == "blue"
                            assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                            parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                            if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                                #All the parent's children are done, so the parent needs processing
                                parent.attrib["colour"] = "black"
                                assert parent.attrib["file"] not in updatedJobFiles
                                updatedJobFiles.add(parent.attrib["file"])
                            writeJobs([ job, parent ]) #Check point
                        else:
                            writeJobs([ job ])

                elif job.attrib["colour"] == "red": #Job failed
                    logger.critical("Job: %s failed" % job.attrib["file"])
                    logger.critical("The log file of the failed job")
                    logFile(job.attrib["log_file"], logger.critical)
                    logger.critical("The log file of the slave for the failed job")
                    logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                    #Checks
                    assert len(job.find("children").findall("child")) == 0
                    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])

                    remainingRetryCount = int(job.attrib["remaining_retry_count"])
                    if remainingRetryCount > 0: #Give it another try, maybe there is a bad node somewhere
                        job.attrib["remaining_retry_count"] = str(remainingRetryCount-1)
                        job.attrib["colour"] = "white"
                        logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                        writeJobs([ job ]) #Check point
                    else:
                        assert remainingRetryCount == 0
                        #The job file stays on disk but is no longer revisited
                        updatedJobFiles.remove(job.attrib["file"])
                        logger.critical("Job: %s is completely failed" % job.attrib["file"])

                else:
                    logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                    assert job.attrib["colour"] == "dead"
                    updatedJobFiles.remove(job.attrib["file"])
                    totalJobFiles -= 1
                    deleteJob(job, config) #This could be done earlier, but I like it this way.

            ###End of for loop
            writeJobs(jobsToIssue) #Check point, do this before issuing job, so state is not read until issued
            issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)

        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break

        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            #Nothing else to do until a job finishes
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and their status
            result = updatedJobs[jobID]
            if jobID in jobIDsToJobsHash:
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended sucessfully" % jobIDsToJobsHash[jobID])
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID)

        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)

    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        try:
            fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        finally:
            fileHandle.close() #Close the stats file even if the write fails

    logger.info("Finished the main loop")
    return totalJobFiles #Returns number of failed jobs
def main():
    """Job command that fills in the leaf count of one node of a tree.

    Reads the job file (--job) for its log level and the tree pointer file
    (--treePointer) for the location of the tree file. The node's "count" is
    the sum of the counts in its children's files, or 1 if it has no children.
    The tree file is rewritten with that count and the child pointer files are
    removed. Roughly one run in ten then exits with status 1 on purpose;
    otherwise the exit status is 0.
    """
    optionParser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    optionParser.add_option("--job", dest="jobFile", default="None",
                            help="Job file containing command to run")
    optionParser.add_option("--treePointer", dest="treePointer", default="None",
                            help="File containing pointer to the tree data")
    options, args = parseBasicOptions(optionParser)
    logger.info("Parsed the input arguments")

    #These lines should end up in the logs
    sys.stderr.write("Checking that we can report to std err\n")
    print("Checking that we can report to std out")

    jobRoot = ET.parse(options.jobFile).getroot()
    setLogLevel(jobRoot.attrib["log_level"])
    logger.info("Parsed the job XML")

    treePointer = ET.parse(options.treePointer).getroot()
    logger.info("Parsed the tree pointer XML")

    treeFile = treePointer.attrib["file"]
    tree = ET.parse(treeFile).getroot()
    logger.info("Parsed the tree XML")

    #A node with children has the sum of its children's counts, a leaf counts as one
    childNodes = tree.find("children").findall("child")
    if childNodes:
        leafCount = sum(int(ET.parse(childNode.attrib["file"]).getroot().attrib["count"])
                        for childNode in childNodes)
    else:
        leafCount = 1

    tree.attrib["count"] = str(leafCount)
    logger.info("Calculated the leaf count: %i" % leafCount)

    treeHandle = open(treeFile, 'w')
    ET.ElementTree(tree).write(treeHandle)
    treeHandle.close()
    logger.info("Updated the tree file: %s" % treeFile)

    for childPointer in treePointer.find("children").findall("child"):
        childPointerFile = childPointer.attrib["file"]
        if os.path.isfile(childPointerFile):
            os.remove(childPointerFile)
    logger.info("Removed the child pointer files")

    logger.info("No need to update the job file, as we didn't make anything new!")

    #Deliberately fail some of the time
    if random.random() <= 0.9:
        logger.info("Going to pass the job done okay")
        sys.exit(0)
    logger.info("Going to fail the job")
    sys.exit(1)
def runJobTreeStats(jobTree, outputFile):
    """Runs the jobTreeStats command on the given job tree, asking it to write
    its results to outputFile, then logs that the command was run.
    """
    command = "jobTreeStats --jobTree %s --outputFile %s" % (jobTree, outputFile)
    system(command)
    logger.info("Ran the job-tree stats command apparently okay")
from workflow.jobTree.scriptTree.stack import loadPickleFile

#Parse the command line: --job is the job file handed to the target,
#--target is the file holding the pickled target, and the positional
#arguments are the fully qualified names of the classes the target needs.
parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")

parser.add_option("--job", dest="jobFile",
                  help="Job file containing command to run")

parser.add_option("--target", dest="target",
                  help="File containing a pickled, wrapped instance of target classes")

options, args = parseBasicOptions(parser)
assert options.target is not None

logger.info("Parsed the input arguments")

#Naughty stuff to do the import of the target we need
for className in args:
    #The class name must be interpolated into the message; passing it as an
    #extra argument without a placeholder makes the logging call fail to format.
    logger.info("Loading the class name %s" % className)
    nameParts = className.split(".")
    moduleName = ".".join(nameParts[:-1])
    className = nameParts[-1]
    #Level -1 is Python 2's "try relative, then absolute" import
    _temp = __import__(moduleName, globals(), locals(), [ className ], -1)
    #Bind the bare class name in this namespace
    exec("%s = 1" % className)
    vars()[className] = _temp.__dict__[className]

#NOTE(review): unpickling runs code chosen by the pickle's author, so the file
#given by --target must come from a trusted source.
target = loadPickleFile(options.target)
target.execute(options.jobFile)