示例#1
0
文件: master.py 项目: cvaske/jobTree
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """    
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
	jobsToIssue = []
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")
            
            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    updatedJobFiles.remove(job.attrib["file"])
                    
                    #Reset the log files for the job.
                    open(job.attrib["slave_log_file"], 'w').close()
                    open(job.attrib["log_file"], 'w').close()
                    
                    job.attrib["colour"] = "grey"
                    #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued
                    
                    #issueJobs([ job ], jobIDsToJobsHash, batchSystem)
		    jobsToIssue.append(job)
                else:
                    logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash)))
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.info("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                    newChildren = []
                    while unbornChild != None:
                        cummulativeChildTime = float(unbornChild.attrib["time"])
                        
                        newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)
                        
                        totalJobFiles += 1
                        updatedJobFiles.add(newJob.attrib["file"])
                        
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                        
                        #This was code to aggregate groups of children into one job, but we don't do this now
                        #while cummulativeChildTime < idealJobTime and unbornChild != None: #Cummulate a series of children into each job as a stack of jobs (to balance cost of parellelism with cost of running stuff in serially).
                        #    cummulativeChildTime += float(unbornChild.attrib["time"])
                        #    ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy())    
                        #    unbornChildren.remove(unbornChild)
                        #    unbornChild = unbornChildren.find("child")
                        newJob.attrib["total_time"] = str(cummulativeChildTime)
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetyCount = int(job.attrib["remaining_retry_count"])
                if remainingRetyCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetyCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    assert remainingRetyCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.

        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing job, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)

                
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and there status, 
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended sucessfully" % jobIDsToJobsHash[jobID])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))  
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs
示例#2
0
def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats):
    """Runs a job.
    """
    from jobTree.src.bioio import getTempFile
    from jobTree.src.bioio import getTempDirectory
    from jobTree.src.bioio import logger
    from jobTree.src.bioio import system
    from jobTree.src.bioio import getTotalCpuTime
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    #Copy the job file to be edited
    
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log for job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]
    
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()

    #Temp file dirs for job.
    localTempDir = getTempDirectory(rootDir=localSlaveTempDir)
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this pass tells the running job how much cpu and memory they have,
    #according to the batch system
    tempJob.attrib["available_memory"] = str(memoryAvailable)
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    if stats != None:
        tempJob.attrib["stats"] = getTempFile(rootDir=localSlaveTempDir)
        os.remove(tempJob.attrib["stats"])
    
    #Now write the temp job file
    tempFile = getTempFile(rootDir=localSlaveTempDir)
    fileHandle = open(tempFile, 'w') 
    tree = ET.ElementTree(tempJob)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Copied the jobs files ready for the job")
    
    if "JOB_FILE" not in command:
        logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
        job.attrib["colour"] = "red" #Update the colour
    else:
        #First load the environment for the job.
        fileHandle = open(job.attrib["environment_file"], 'r')
        environment = cPickle.load(fileHandle)
        fileHandle.close()
        logger.info("Loaded the environment for the process")
        
        #Run the actual command
        tempLogFile = getTempFile(suffix=".log", rootDir=localSlaveTempDir)
        fileHandle = open(tempLogFile, 'w')
        finalCommand = command.replace("JOB_FILE", tempFile)
        if stats != None:
            startTime = time.time()
            startClock = getTotalCpuTime()
        process = subprocess.Popen(finalCommand, shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
            
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
        
        #Copy across the log file
        system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
        i = sts[1]
        
        logger.info("Ran the job command=%s with exit status %i" % (finalCommand, i))
        
        if i == 0:
            logger.info("Passed the job, okay")
            
            if stats != None:
                jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime), "clock":str(getTotalCpuTime() - startClock) })
                if os.path.exists(tempJob.attrib["stats"]):
                    jobTag.append(ET.parse(tempJob.attrib["stats"]).getroot())
            
            tempJob = ET.parse(tempFile).getroot()
            job.attrib["colour"] = "black" #Update the colour
            
            #Deal with any logging messages directed at the master
            if tempJob.find("messages") != None:
                messages = job.find("messages")
                if messages == None:
                    messages = ET.SubElement(job, "messages")
                for messageTag in tempJob.find("messages").findall("message"):
                    messages.append(messageTag)
            
            #Update the runtime of the stack..
            totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
            runtime = float(jobToRun.attrib["time"])
            totalRuntime -= runtime
            if totalRuntime < 0.0:
                totalRuntime = 0.0
            
            #The children
            children = job.find("children")
            assert len(children.findall("child")) == 0 #The children
            assert tempJob.find("children") != None
            for child in tempJob.find("children").findall("child"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
                ET.SubElement(children, "child", { "command":child.attrib["command"], 
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                logger.info("Making a child with command: %s" % (child.attrib["command"]))
            
            #The follow on command
            followOns = job.find("followOns")
            followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
            if tempJob.attrib.has_key("command"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
                ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"], 
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                ##Add the runtime to the total runtime..
                totalRuntime += compTime
                logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
                
            elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
                ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
                logger.info("Making a stub follow on job")
            #Write back the runtime, after addin the follow on time and subtracting the time of the run job.
            job.attrib["total_time"] = str(totalRuntime)
        else:
            logger.info("Failed the job")
            job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s" % (localSlaveTempDir))
    logger.info("Cleaned up by removing temp jobfile (the copy), and the temporary file directory for the job")
示例#3
0
文件: stack.py 项目: cvaske/jobTree
 def execute(self, jobFile):
     job = ET.parse(jobFile).getroot()
     setLogLevel(job.attrib["log_level"])
     logger.info("Setup logging with level: %s" % job.attrib["log_level"])
     self.tempDirAccessed = False
     self.localTempDir = job.attrib["local_temp_dir"]
     self.globalTempDir = job.attrib["global_temp_dir"]
     maxTime = float(job.attrib["job_time"])
     memory = int(job.attrib["available_memory"])
     cpu = int(job.attrib["available_cpu"])
     
     if job.attrib.has_key("stats"):
         stats = ET.Element("stack")
         startTime = time.time()
         startClock = time.clock()
     else:
         stats = None
     
     newChildren = [] #List to add all the children to before we package them
     #off into stacks
     newChildCommands = [] #Ditto for the child commands
     newFollowOns = [] #Ditto for the follow-ons 
     while self.hasRemaining():
         if stats != None: #Getting the runtime of the stats module
             targetStartTime = time.time()
             targetStartClock = getTotalCpuTime()
             
         target = self.popTarget()
         target.setStack(self)
         #Debug check that we have the right amount of CPU and memory for the job in hand
         targetMemory = target.getMemory()
         if targetMemory != sys.maxint:
             assert targetMemory <= memory
         targetCpu = target.getCpu()
         if targetCpu != sys.maxint:
             assert targetCpu <= cpu
         #Run the target, first cleanup then run.
         target.run()
         #Cleanup after the target
         if self.tempDirAccessed:
             system("rm -rf %s/*" % self.localTempDir)
             self.tempDirAccessed = False
         #Handle the follow on
         followOn = target.getFollowOn()
         #if target.__class__ != CleanupGlobalTempDirTarget and followOn == None:
         #    followOn = CleanupGlobalTempDirTarget()
         if followOn != None: #Target to get rid of follow on when done.
             if target.isGlobalTempDirSet():
                 followOn.setGlobalTempDir(target.getGlobalTempDir())
             newFollowOns.append(followOn)
         
         #Now add the children to the newChildren stack
         newChildren += target.getChildren()
         
         #Now add the child commands to the newChildCommands stack
         newChildCommands += target.getChildCommands()
         
         if stats != None:
             ET.SubElement(stats, "target", { "time":str(time.time() - targetStartTime), 
                                             "clock":str(getTotalCpuTime() - targetStartClock),
                                             "class":".".join((target.__class__.__name__,)),
                                             "e_time":str(target.getRunTime())})
             
         for message in target.getMasterLoggingMessages():
             if job.find("messages") == None:
                 ET.SubElement(job, "messages")
             ET.SubElement(job.find("messages"), "message", { "message": message} )
     
     #######
     #Now build the new stacks and corresponding jobs
     #######
     
     #First add all the follow ons to the existing stack and make it a follow on job for job-tree
     assert not self.hasRemaining()
     
     #First sort out the follow on job
     if len(newFollowOns) > 0: #If we have follow ons
         followOnRuntime = sum([ followOn.getRunTime() for followOn in newFollowOns ])
         
         if followOnRuntime > maxTime: #We create a parallel list of follow ons
             followOnStack = Stack(ParallelFollowOnTarget(newFollowOns))
         else:
             followOnStack = Stack(newFollowOns.pop())
             while len(newFollowOns) > 0:
                 followOnStack.addTarget(newFollowOns.pop())
     
         job.attrib["command"] = followOnStack.makeRunnable(self.globalTempDir)
         job.attrib["time"] = str(followOnStack.getRunTime())
         followOnMemory = followOnStack.getMemory()
         assert not job.attrib.has_key("memory")
         if followOnMemory != sys.maxint:
             job.attrib["memory"] = str(followOnMemory)
         assert not job.attrib.has_key("cpu")
         followOnCpu = followOnStack.getCpu()
         if followOnCpu != sys.maxint:
             job.attrib["cpu"] = str(followOnCpu)
           
     #Now build stacks of children..
     childrenTag = job.find("children")
     while len(newChildren) > 0:
         childStack = Stack(newChildren.pop())
         while len(newChildren) > 0 and childStack.getRunTime() <= maxTime:
             childStack.addTarget(newChildren.pop())
         childJob = ET.SubElement(childrenTag, "child", { "command":childStack.makeRunnable(self.globalTempDir),
                                           "time":str(childStack.getRunTime()) })
         childMemory = childStack.getMemory()
         assert not childJob.attrib.has_key("memory")
         if childMemory != sys.maxint:
             childJob.attrib["memory"] = str(childMemory)
         assert not childJob.attrib.has_key("cpu")
         childCpu = childStack.getCpu()
         if childCpu != sys.maxint:
             childJob.attrib["cpu"] = str(childCpu)
     
     #Now build jobs for each child command
     for childCommand, runTime in newChildCommands:
         ET.SubElement(childrenTag, "child", { "command":str(childCommand),
                                           "time":str(runTime) })
 
     #Now write the updated job file
     fileHandle = open(jobFile, 'w')
     ET.ElementTree(job).write(fileHandle)
     fileHandle.close()
     
     #Finish up the stats
     if stats != None:
         stats.attrib["time"] = str(time.time() - startTime)
         stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
         fileHandle = open(job.attrib["stats"], 'w')
         ET.ElementTree(stats).write(fileHandle)
         fileHandle.close()
示例#4
0
def main():
    sys.path +=  [ sys.argv[1] ]
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from jobTree.src.bioio import getBasicOptionParser
    from jobTree.src.bioio import parseBasicOptions
    from jobTree.src.bioio import logger
    from jobTree.src.bioio import addLoggingFileHandler
    from jobTree.src.bioio import setLogLevel
    from jobTree.src.bioio import getTotalCpuTime
    
    from jobTree.src.master import writeJobs
    
    ##########################################
    #Construct the arguments.
    ##########################################
    
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    assert len(args) == 0

    ##########################################
    #Parse the job.
    ##########################################
    
    job = ET.parse(options.jobFile).getroot()
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")
    
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if job.attrib.has_key("stats"):
        startTime = time.time()
        startClock = time.clock()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #Run the script.
    ##########################################
    
    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    jobToRun = job.find("followOns").findall("followOn")[-1]
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    while True:
        processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats)
        
        if job.attrib["colour"] != "black":
            logger.info("Exiting the slave because of a failed job")
            break
   
        totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
        
        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")
        #childRuntime = sum([ float(child.attrib["time"]) for child in childrenList ])
            
        if len(childrenList) >= 2: # or totalRuntime + childRuntime > maxTime: #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, its got %i children" % len(childrenList))
            break
        
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        #assert totalRuntime <= maxTime + 1 #The plus one second to avoid unimportant rounding errors
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0
        
        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break
        
        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break
        
        ##Updated the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")
    
    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")
    
    logger.info("Finished running the chain of jobs on this node")
    
    ##########################################
    #Finish up the stats
    ##########################################
    
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()