    def __pollJobTree(self):
        childJobFileToParentJob, childCounts = {}, {}
        updatedJobFiles, shellJobs = set(), set()
        try:
            parseJobFiles(getJobFileDirName(self.jobTreePath),
                          updatedJobFiles, childJobFileToParentJob,
                          childCounts, shellJobs)
            failedJobs = [ job for job in updatedJobFiles | \
                           set(childCounts.keys()) \
                           if job.remainingRetryCount == 0 ]

            self.curActiveJobs = set()
            for job in updatedJobFiles:
                self.curActiveJobs.add(job.getJobFileName())
            self.failedJobs = max(len(failedJobs), self.failedJobs)

        except Exception:
            # If the job files cannot be parsed, fall back to an empty
            # active-job set for this poll.
            self.curActiveJobs = set()

        if len(self.prevActiveJobs) > 0 and len(self.curActiveJobs) > 0 and\
               self.curActiveJobs == self.prevActiveJobs:
            self.sameJobsTime += self.pollTime
        else:
            self.sameJobsTime = 0
            self.prevActiveJobs = set(self.curActiveJobs)
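
A minimal sketch of how the counters maintained above might be consumed; the watch() helper, its maxSameJobsTime threshold, and the status strings are illustrative assumptions, not part of jobTree itself.

import time

def watch(poller, pollJobTree, maxSameJobsTime=3600.0):
    """Run polls until the tree looks finished, failed or stalled.

    poller      -- object carrying pollTime, sameJobsTime, failedJobs and curActiveJobs
    pollJobTree -- callable that performs one poll, e.g. the method above
    """
    while True:
        pollJobTree()
        if poller.failedJobs > 0:
            return "failed"          # at least one job is out of retries
        if not poller.curActiveJobs:
            return "finished"        # no updated job files were found
        if poller.sameJobsTime >= maxSameJobsTime:
            return "stalled"         # the same jobs have been active too long
        time.sleep(poller.pollTime)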
    def jobtree_is_finished(self, jobtree_path):
        """
        See if this jobTree has finished before. Code extracted from the jobTree repo.
        """
        childJobFileToParentJob, childCounts, updatedJobFiles, shellJobs = {}, {}, set(), set()
        parseJobFiles(getJobFileDirName(jobtree_path), updatedJobFiles, childJobFileToParentJob, childCounts, shellJobs)
        return len(updatedJobFiles) == 0
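
A hedged usage sketch; the monitor object and the path below are placeholders, only jobtree_is_finished itself comes from the snippet above.

path = "/data/runs/example_jobTree"        # placeholder jobTree directory
if monitor.jobtree_is_finished(path):      # monitor: any object carrying the method above
    logger.info("jobTree at %s has no updated job files left" % path)
else:
    logger.info("jobTree at %s still has unfinished job files" % path)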
Example #3
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Adds the first job to to the jobtree.
    """
    logger.info("Adding the first job")
    if memory == None or memory == sys.maxint:
        memory = float(config.attrib["default_memory"])
    if cpu == None or cpu == sys.maxint:
        cpu = float(config.attrib["default_cpu"])
    job = Job(command=command, memory=memory, cpu=cpu, 
              tryCount=int(config.attrib["try_count"]), jobDir=getJobFileDirName(config.attrib["job_tree"]))
    job.write()
    logger.info("Added the first job")
Example #4
def reloadJobTree(jobTree):
    """Load the job tree from a dir.
    """
    logger.info("The job tree appears to already exist, so we'll reload it")
    assert os.path.isfile(getConfigFileName(jobTree)) #A valid job tree must contain the config file
    assert os.path.isfile(getEnvironmentFileName(jobTree)) #A valid job tree must contain a pickle file which encodes the path environment of the job
    assert os.path.isdir(getJobFileDirName(jobTree)) #A job tree must have a directory of jobs.
    
    config = ET.parse(getConfigFileName(jobTree)).getroot()
    config.attrib["log_level"] = getLogLevelString()
    writeConfig(config) #This updates the on disk config file with the new logging setting
    
    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
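
A hedged sketch of choosing between the reload and create paths; options is the parsed-options object used throughout these examples, and the directory check mirrors the asserts inside reloadJobTree.

jobTreeDir = absSymPath(options.jobTree)
if os.path.isdir(jobTreeDir):
    config, batchSystem = reloadJobTree(jobTreeDir)    # pick up the existing tree
else:
    config, batchSystem = createJobTree(options)       # build it fresh (next example)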
Example #5
def createJobTree(options):
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))
    config = ET.Element("config")
    config.attrib["log_level"] = getLogLevelString()
    config.attrib["job_tree"] = options.jobTree
    config.attrib["parasol_command"] = options.parasolCommand
    config.attrib["try_count"] = str(int(options.retryCount) + 1)
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))
    config.attrib["max_log_file_size"] = str(int(options.maxLogFileSize))
    config.attrib["default_memory"] = str(int(options.defaultMemory))
    config.attrib["default_cpu"] = str(int(options.defaultCpu))
    config.attrib["max_cpus"] = str(int(options.maxCpus))
    config.attrib["max_memory"] = str(int(options.maxMemory))
    config.attrib["max_threads"] = str(int(options.maxThreads))
    if options.bigBatchSystem != None:
        config.attrib["big_batch_system"] = options.bigBatchSystem
        config.attrib["big_memory_threshold"] = str(
            int(options.bigMemoryThreshold))
        config.attrib["big_cpu_threshold"] = str(int(options.bigCpuThreshold))
        config.attrib["big_max_cpus"] = str(int(options.bigMaxCpus))
        config.attrib["big_max_memory"] = str(int(options.bigMaxMemory))

    if options.stats:
        config.attrib["stats"] = ""
    #Load the batch system.
    batchSystem = loadTheBatchSystem(config, options)
    logger.info("Loaded the batch system %s" % batchSystem)

    #Set the parameters determining the polling frequency of the system.
    config.attrib["rescue_jobs_frequency"] = str(
        float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency != None:
        config.attrib["rescue_jobs_frequency"] = str(
            float(options.rescueJobsFrequency))

    writeConfig(config)

    logger.info("Finished the job tree setup")
    return config, batchSystem
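
For reference, a sketch of the attributes createJobTree reads from its options argument, gathered into one bare namespace; every value below is an illustrative placeholder rather than a jobTree default.

class Options(object):
    """Placeholder namespace carrying the attributes createJobTree expects."""
    jobTree = "./jobTree"
    parasolCommand = "parasol"
    retryCount = 0                 # becomes try_count = 1
    maxJobDuration = 1000000
    batchSystem = "singleMachine"
    jobTime = 30
    maxLogFileSize = 50000
    defaultMemory = 2147483648
    defaultCpu = 1
    maxCpus = 16
    maxMemory = 34359738368
    maxThreads = 4
    bigBatchSystem = None          # skip the optional "big" batch system block
    stats = False
    rescueJobsFrequency = None     # defer to the batch system's own value

config, batchSystem = createJobTree(Options())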
Example #6
def createJobTree(options):
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))
    config = ET.Element("config")
    config.attrib["log_level"] = getLogLevelString()
    config.attrib["job_tree"] = options.jobTree
    config.attrib["parasol_command"] = options.parasolCommand
    config.attrib["try_count"] = str(int(options.retryCount) + 1)
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))
    config.attrib["max_log_file_size"] = str(int(options.maxLogFileSize))
    config.attrib["default_memory"] = str(int(options.defaultMemory))
    config.attrib["default_cpu"] = str(int(options.defaultCpu))
    config.attrib["max_cpus"] = str(int(options.maxCpus))
    config.attrib["max_memory"] = str(int(options.maxMemory))
    config.attrib["max_threads"] = str(int(options.maxThreads))
    if options.bigBatchSystem != None:
        config.attrib["big_batch_system"] = options.bigBatchSystem
        config.attrib["big_memory_threshold"] = str(int(options.bigMemoryThreshold))
        config.attrib["big_cpu_threshold"] = str(int(options.bigCpuThreshold))
        config.attrib["big_max_cpus"] = str(int(options.bigMaxCpus))
        config.attrib["big_max_memory"] = str(int(options.bigMaxMemory))
        
    if options.stats:
        config.attrib["stats"] = ""
    #Load the batch system.
    batchSystem = loadTheBatchSystem(config)
    
    #Set the parameters determining the polling frequency of the system.  
    config.attrib["rescue_jobs_frequency"] = str(float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency != None:
        config.attrib["rescue_jobs_frequency"] = str(float(options.rescueJobsFrequency))
    
    writeConfig(config)
    
    logger.info("Finished the job tree setup")
    return config, batchSystem
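
Because every setting is written as a string attribute on the <config> element, readers convert on the way back out; a minimal sketch, using the same ElementTree and config helpers as the examples above.

config = ET.parse(getConfigFileName(options.jobTree)).getroot()
tryCount = int(config.attrib["try_count"])               # written via str(int(...)) above
defaultMemory = float(config.attrib["default_memory"])   # read back as a float by createFirstJob
statsEnabled = "stats" in config.attrib                  # presence of the key is the flag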
Example #7
    def __pollJobTree(self):
        childJobFileToParentJob, childCounts = {}, {}
        updatedJobFiles, shellJobs = set(), set()
        try:
            parseJobFiles(getJobFileDirName(self.jobTreePath), updatedJobFiles,
                          childJobFileToParentJob, childCounts, shellJobs)
            failedJobs = [ job for job in updatedJobFiles | \
                           set(childCounts.keys()) \
                           if job.remainingRetryCount == 0 ]

            self.curActiveJobs = set()
            for job in updatedJobFiles:
                self.curActiveJobs.add(job.getJobFileName())
            self.failedJobs = max(len(failedJobs), self.failedJobs)

        except Exception:
            # If the job files cannot be parsed, fall back to an empty
            # active-job set for this poll.
            self.curActiveJobs = set()

        if len(self.prevActiveJobs) > 0 and len(self.curActiveJobs) > 0 and\
               self.curActiveJobs == self.prevActiveJobs:
            self.sameJobsTime += self.pollTime
        else:
            self.sameJobsTime = 0
            self.prevActiveJobs = set(self.curActiveJobs)
Example #8
def main():
    """Reports the state of the job tree.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser(
        "usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")

    parser.add_option(
        "--jobTree",
        dest="jobTree",
        help=
        "Directory containing the job tree. The jobTree location can also be specified as the argument to the script. default=%default",
        default='./jobTree')

    parser.add_option(
        "--verbose",
        dest="verbose",
        action="store_true",
        help=
        "Print loads of information, particularly all the log files of jobs that failed. default=%default",
        default=False)

    parser.add_option(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        help=
        "Return exit value of 1 if job tree jobs not all completed. default=%default",
        default=False)

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    assert len(args) <= 1  #Only jobtree may be specified as argument
    if len(args) == 1:  #Allow jobTree directory as arg
        options.jobTree = args[0]

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    assert options.jobTree != None
    assert os.path.isdir(options.jobTree)  #The given job dir tree must exist.
    assert os.path.isfile(getConfigFileName(
        options.jobTree))  #A valid job tree must contain the config file
    assert os.path.isdir(getJobFileDirName(
        options.jobTree))  #A job tree must have a directory of jobs.

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    childJobFileToParentJob, childCounts, updatedJobFiles, shellJobs = {}, {}, set(), set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles,
                  childJobFileToParentJob, childCounts, shellJobs)

    failedJobs = [
        job for job in updatedJobFiles | set(childCounts.keys())
        if job.remainingRetryCount == 0
    ]

    print "There are %i active jobs, %i parent jobs with children, %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) currently in job tree: %s" % \
    (len(updatedJobFiles), len(childCounts), len(failedJobs), len(shellJobs), options.jobTree)

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if os.path.isfile(job.getLogFileName()):
                print "Log file of failed job: %s" % job.getLogFileName()
                logFile(job.getLogFileName(), logger.critical)
            else:
                print "Log file for job %s is not present" % job.getJobFileName(
                )
        if len(failedJobs) == 0:
            print "There are no failed jobs to report"

    if (len(updatedJobFiles) +
            len(childCounts)) != 0 and options.failIfNotComplete:
        sys.exit(1)
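
A hedged sketch of driving the same status check from another script and acting on the --failIfNotComplete exit code; the jobTreeStatus command name is an assumption about how this main() is installed.

import subprocess

returnCode = subprocess.call(["jobTreeStatus", "--jobTree", "./jobTree",  # assumed script name
                              "--failIfNotComplete"])
if returnCode != 0:
    logger.info("job tree ./jobTree still has active or failed jobs")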
Example #9
def main():
    """Reports the state of the job tree.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  
    
    parser = getBasicOptionParser("usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree")
    
    parser.add_option("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of errors. default=%default",
                      default=False)
    
    parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if job tree jobs not all completed. default=%default",
                      default=False)
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    assert len(args) <= 1 #Only jobtree may be specified as argument
    if len(args) == 1: #Allow jobTree directory as arg
        options.jobTree = args[0]
    
    ##########################################
    #Do some checks.
    ##########################################
    
    logger.info("Checking if we have files for job tree")
    assert options.jobTree != None
    assert os.path.isdir(options.jobTree) #The given job dir tree must exist.
    assert os.path.isfile(getConfigFileName(options.jobTree)) #A valid job tree must contain the config file
    assert os.path.isdir(getJobFileDirName(options.jobTree)) #A job tree must have a directory of jobs.
    
    ##########################################
    #Survey the status of the job and report.
    ##########################################  
    
    childJobFileToParentJob, childCounts, updatedJobFiles, shellJobs = {}, {}, set(), set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles, childJobFileToParentJob, childCounts, shellJobs)
    
    failedJobs = [ job for job in updatedJobFiles | set(childCounts.keys()) if job.remainingRetryCount == 0 ]
           
    print "There are %i active jobs, %i parent jobs with children, %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) currently in job tree: %s" % \
    (len(updatedJobFiles), len(childCounts), len(failedJobs), len(shellJobs), options.jobTree)
    
    if options.verbose: #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if os.path.isfile(job.getLogFileName()):
                print "Log file of failed job: %s" % job.getLogFileName()
                logFile(job.getLogFileName(), logger.critical)
            else:
                print "Log file for job %s is not present" % job.getJobFileName() 
        if len(failedJobs) == 0:
            print "There are no failed jobs to report"   
    
    if (len(updatedJobFiles) + len(childCounts)) != 0 and options.failIfNotComplete:
        sys.exit(1)
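
When this main() is imported rather than run as a script, the sys.exit(1) path surfaces as SystemExit; a minimal sketch of wrapping it, where the treeIsComplete helper and the fake argv are assumptions for illustration.

def treeIsComplete(jobTreeDir):
    """Run main() with --failIfNotComplete and report completion as a boolean."""
    sys.argv = ["jobTreeStatus", "--jobTree", jobTreeDir, "--failIfNotComplete"]
    try:
        main()
    except SystemExit as e:
        return e.code == 0       # exit code 1 means jobs are still active or failed
    return True                  # main() returned normally, so the tree is complete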