def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config):
    """Reads the job file of a finished job and updates the master's state.

    jobID is passed to jobBatcher.removeJobID() to obtain the job file. Any
    updating/new files are processed first, a log file left in the job's
    directory is reported, and then the job is either added to
    updatedJobFiles (it still has follow-ons or children) or handed to
    updateParentStatus() because it is considered done.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    # Keep this order: the updating file is processed before the new file.
    hadUpdatingFile = processAnyUpdatingFile(jobFile)
    hadNewFile = processAnyNewFile(jobFile)

    # A log file left in the job's directory is reported as a sign of failure.
    jobDir = os.path.dirname(jobFile)
    if os.path.exists(getJobLogFileName(jobDir)):
        logger.critical("The job seems to have left a log file, indicating failure: %s", jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)

    if not os.path.isfile(jobFile):
        # The job file is gone, so the job is done.
        if resultStatus != 0:
            logger.critical("Despite the batch system claiming failure the job %s seems to have finished and been removed" % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)
        return

    job = Job.read(jobFile)
    assert job not in updatedJobFiles

    # A bad exit status or any left-over new/updating file counts as failure.
    jobFailed = resultStatus != 0 or hadNewFile or hadUpdatingFile
    if jobFailed:
        if not os.path.exists(job.getLogFileName()):
            logger.critical("No log file is present, despite job failing: %s", jobFile)
        setupJobAfterFailure(job, config)

    if len(job.followOnCommands) > 0 or len(job.children) > 0:
        # Now we know the job is done we can add it to the list of updated job files
        updatedJobFiles.add(job)
        logger.debug("Added job: %s to active jobs" % jobFile)
        return

    # Jobs with no children or follow-ons may log to the master, so relay
    # their messages here.
    for message in job.messages:
        logger.critical("Got message from job at time: %s : %s" % (time.time(), message))
    logger.debug("Job has no follow-ons or children despite job file being present so we'll consider it done: %s" % jobFile)
    updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)
def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config):
    """Processes a job reported as finished and records what happens next.

    The job file is obtained from jobBatcher.removeJobID(jobID). If the job
    file still exists it is read; a failed job is passed to
    setupJobAfterFailure(), and a job with follow-ons or children is added
    to updatedJobFiles. Otherwise the job is considered done and
    updateParentStatus() is called for it.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    # The updating file must be handled before the new file.
    sawUpdatingFile = processAnyUpdatingFile(jobFile)
    sawNewFile = processAnyNewFile(jobFile)

    jobDir, _ = os.path.split(jobFile)
    if os.path.exists(getJobLogFileName(jobDir)):
        # A log file left in the job directory is reported as a failure.
        logger.critical(
            "The job seems to have left a log file, indicating failure: %s",
            jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)

    if os.path.isfile(jobFile):
        job = Job.read(jobFile)
        assert job not in updatedJobFiles

        if resultStatus != 0 or sawNewFile or sawUpdatingFile:
            if not os.path.exists(job.getLogFileName()):
                logger.critical(
                    "No log file is present, despite job failing: %s",
                    jobFile)
            setupJobAfterFailure(job, config)

        if len(job.followOnCommands) == 0 and len(job.children) == 0:
            # Jobs with no children or follow-ons may log to the master.
            for message in job.messages:
                logger.critical("Got message from job at time: %s : %s" %
                                (time.time(), message))
            logger.debug(
                "Job has no follow-ons or children despite job file being present so we'll consider it done: %s"
                % jobFile)
            updateParentStatus(jobFile, updatedJobFiles,
                               childJobFileToParentJob, childCounts)
        else:
            # Now we know the job is done we can add it to the list of
            # updated job files.
            updatedJobFiles.add(job)
            logger.debug("Added job: %s to active jobs" % jobFile)
    else:
        # The job file has been removed, so the job is done.
        if resultStatus != 0:
            logger.critical(
                "Despite the batch system claiming failure the job %s seems to have finished and been removed"
                % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob,
                           childCounts)
def main():
    """Reports the state of the job tree.

    Parses the command line, checks that the job tree directory looks valid,
    prints a one-line summary of the jobs found and, with --verbose, the log
    files of jobs that have no retries left. Exits with status 1 when
    --failIfNotComplete is given and active or parent jobs remain.
    """
    # --- Construct the arguments ---
    parser = getBasicOptionParser(
        "usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")
    parser.add_option(
        "--jobTree", dest="jobTree", default='./jobTree',
        help="Directory containing the job tree. The jobTree location can also be specified as the argument to the script. default=%default")
    parser.add_option(
        "--verbose", dest="verbose", action="store_true", default=False,
        help="Print loads of information, particularly all the log files of jobs that failed. default=%default")
    parser.add_option(
        "--failIfNotComplete", dest="failIfNotComplete", action="store_true",
        default=False,
        help="Return exit value of 1 if job tree jobs not all completed. default=%default")

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Only the job tree may be specified as a positional argument.
    assert len(args) <= 1
    if len(args) == 1:
        options.jobTree = args[0]

    # --- Do some checks ---
    logger.info("Checking if we have files for job tree")
    assert options.jobTree is not None
    # The given job tree directory must exist.
    assert os.path.isdir(options.jobTree)
    # A valid job tree must contain the config file.
    assert os.path.isfile(getConfigFileName(options.jobTree))
    # A job tree must have a directory of jobs.
    assert os.path.isdir(getJobFileDirName(options.jobTree))

    # --- Survey the status of the job and report ---
    childJobFileToParentJob = {}
    childCounts = {}
    updatedJobFiles = set()
    shellJobs = set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles,
                  childJobFileToParentJob, childCounts, shellJobs)

    # Jobs with no retries left are reported as totally failed.
    failedJobs = []
    for job in updatedJobFiles | set(childCounts.keys()):
        if job.remainingRetryCount == 0:
            failedJobs.append(job)

    summaryCounts = (len(updatedJobFiles), len(childCounts), len(failedJobs),
                     len(shellJobs), options.jobTree)
    print("There are %i active jobs, %i parent jobs with children, %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) currently in job tree: %s" % summaryCounts)

    if options.verbose:
        # Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if not os.path.isfile(job.getLogFileName()):
                print("Log file for job %s is not present" % job.getJobFileName())
            else:
                print("Log file of failed job: %s" % job.getLogFileName())
                logFile(job.getLogFileName(), logger.critical)
        if len(failedJobs) == 0:
            print("There are no failed jobs to report")

    unfinishedCount = len(updatedJobFiles) + len(childCounts)
    if unfinishedCount != 0 and options.failIfNotComplete:
        sys.exit(1)
def reportJobLogFiles(job):
    """Logs, at critical level, the job's log file and then its slave log file.

    The file names are read from job.attrib["log_file"] and
    job.attrib["slave_log_file"].
    """
    # We log the job log file in the main loop
    reports = (
        ("The log file of the job", "log_file"),
        ("The log file of the slave for the job", "slave_log_file"),
    )
    for heading, attributeName in reports:
        logger.critical(heading)
        logFile(job.attrib[attributeName], logger.critical)
def main():
    """Reports the state of the job tree.

    Parses the command line, checks that the job tree directory has the
    expected layout, counts the jobs by their "colour" attribute and prints
    the totals. With --verbose the log file of every red job is printed.
    Exits with status 1 when --failIfNotComplete is given and any job files
    remain.
    """
    ##########################################
    # Construct the arguments.
    ##########################################

    usage = (
        "usage: %prog [options] \n"
        "The colours returned indicate the state of the job.\n"
        "\twhite: job has not been started yet\n"
        "\tgrey: job is issued to batch system\n"
        "\tred: job failed\n"
        "\tblue: job has children currently being processed\n"
        "\tblack: job has finished and will be processed (transient state)\n"
        "\tdead: job is totally finished and is awaiting deletion (transient state)"
    )
    parser = getBasicOptionParser(usage, "%prog 0.1")

    parser.add_option("--jobTree", dest="jobTree", help="Directory containing the job tree")

    parser.add_option(
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Print loads of information, particularly all the log files of errors. default=%default",
        default=False,
    )

    parser.add_option(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        help="Return exit value of 1 if job tree jobs not all completed. default=%default",
        default=False,
    )

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    # Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    assert options.jobTree is not None
    # The given job tree directory must exist.
    assert os.path.isdir(options.jobTree)
    # A valid job tree must contain the config file.
    assert os.path.isfile(os.path.join(options.jobTree, "config.xml"))
    # A job tree must have a directory of jobs.
    assert os.path.isdir(os.path.join(options.jobTree, "jobs"))
    # A job tree must have a directory of temporary directories (for jobs to
    # make temp files in).
    assert os.path.isdir(os.path.join(options.jobTree, "tempDirDir"))
    # A job tree must have a directory of log files.
    assert os.path.isdir(os.path.join(options.jobTree, "logFileDir"))
    # A job tree must have a directory of slave log files.
    assert os.path.isdir(os.path.join(options.jobTree, "slaveLogFileDir"))

    ##########################################
    # Read the total job number
    ##########################################

    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()

    ##########################################
    # Survey the status of the job and report.
    ##########################################

    colours = {}
    jobFiles = TempFileTree(config.attrib["job_file_dir"]).listFiles()
    if len(jobFiles) > 0:
        logger.info("Collating the colours of the job tree")
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job is not None:
                colour = job.attrib["colour"]
                # dict.get replaces the deprecated has_key test-then-set.
                colours[colour] = colours.get(colour, 0) + 1
    else:
        logger.info("There are no jobs to collate")

    print("There are %i jobs currently in job tree: %s" % (len(jobFiles), options.jobTree))

    for colour in colours:
        print("\tColour: %s, number of jobs: %s" % (colour, colours[colour]))

    if options.verbose:
        # Verbose currently means outputting the files that have failed.

        # Callback handed to logFile so log lines go to stdout; defined once
        # instead of being rebuilt for every red job.
        def printLine(string):
            print(string)

        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job is not None and job.attrib["colour"] == "red":
                if os.path.isfile(job.attrib["log_file"]):
                    logFile(job.attrib["log_file"], printLine)
                else:
                    logger.info("Log file for job %s is not present" % job.attrib["file"])

    if len(jobFiles) != 0 and options.failIfNotComplete:
        sys.exit(1)
def main():
    """Reports the state of the job tree.

    Parses the command line (the job tree may be given with --jobTree or as
    the single positional argument), checks that the directory looks like a
    job tree, prints a summary of the jobs found and, with --verbose, the log
    files of jobs that have no retries left. Exits with status 1 when
    --failIfNotComplete is given and active or parent jobs remain.
    """
    # --- Construct the arguments ---
    parser = getBasicOptionParser(
        "usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")
    parser.add_option(
        "--jobTree", dest="jobTree",
        help="Directory containing the job tree")
    parser.add_option(
        "--verbose", dest="verbose", action="store_true", default=False,
        help="Print loads of information, particularly all the log files of errors. default=%default")
    parser.add_option(
        "--failIfNotComplete", dest="failIfNotComplete", action="store_true",
        default=False,
        help="Return exit value of 1 if job tree jobs not all completed. default=%default")

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Only the job tree may be specified as a positional argument.
    assert len(args) <= 1
    if len(args) == 1:
        options.jobTree = args[0]

    # --- Do some checks ---
    logger.info("Checking if we have files for job tree")
    assert options.jobTree is not None
    # The given job tree directory must exist.
    assert os.path.isdir(options.jobTree)
    # A valid job tree must contain the config file.
    assert os.path.isfile(getConfigFileName(options.jobTree))
    # A job tree must have a directory of jobs.
    assert os.path.isdir(getJobFileDirName(options.jobTree))

    # --- Survey the status of the job and report ---
    childJobFileToParentJob = {}
    childCounts = {}
    updatedJobFiles = set()
    shellJobs = set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles,
                  childJobFileToParentJob, childCounts, shellJobs)

    # Jobs with no retries left are reported as totally failed.
    failedJobs = []
    for job in updatedJobFiles | set(childCounts.keys()):
        if job.remainingRetryCount == 0:
            failedJobs.append(job)

    summaryCounts = (len(updatedJobFiles), len(childCounts), len(failedJobs),
                     len(shellJobs), options.jobTree)
    print("There are %i active jobs, %i parent jobs with children, %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) currently in job tree: %s" % summaryCounts)

    if options.verbose:
        # Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if not os.path.isfile(job.getLogFileName()):
                print("Log file for job %s is not present" % job.getJobFileName())
            else:
                print("Log file of failed job: %s" % job.getLogFileName())
                logFile(job.getLogFileName(), logger.critical)
        if len(failedJobs) == 0:
            print("There are no failed jobs to report")

    unfinishedCount = len(updatedJobFiles) + len(childCounts)
    if unfinishedCount != 0 and options.failIfNotComplete:
        sys.exit(1)