def getStatus(jobStoreName: str) -> str:
    """
    Determine the status of a workflow.

    If the jobstore does not exist, this returns 'QUEUED', assuming it has not
    been created yet.

    Checks for the existence of files created in toil.Leader.run(). In
    toil.Leader.run(), if a workflow completes with failed jobs, 'failed.log'
    is created; otherwise 'succeeded.log' is written. If neither of these
    exists, the leader is still running jobs.

    :return: A string indicating the status of the workflow.
             ['COMPLETED', 'RUNNING', 'ERROR', 'QUEUED']
    :rtype: str
    """
    try:
        jobstore = Toil.resumeJobStore(jobStoreName)
    except NoSuchJobStoreException:
        return 'QUEUED'
    except NoSuchFileException:
        return 'QUEUED'

    try:
        with jobstore.readSharedFileStream('succeeded.log') as successful:
            pass
        return 'COMPLETED'
    except NoSuchFileException:
        try:
            with jobstore.readSharedFileStream('failed.log') as failed:
                pass
            return 'ERROR'
        except NoSuchFileException:
            pass
    return 'RUNNING'
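# A minimal usage sketch, not part of Toil itself: the status strings returned
# by getStatus() above can drive a simple polling loop. The helper name and
# poll interval here are invented for illustration.
import time

def waitForWorkflow(jobStoreName: str, pollSeconds: float = 10.0) -> str:
    """Poll getStatus() until the workflow is no longer QUEUED or RUNNING."""
    while True:
        status = getStatus(jobStoreName)
        if status in ('COMPLETED', 'ERROR'):
            return status
        time.sleep(pollSeconds)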
def _runDebugJob(self, jobCommand, jobID, environment):
    """
    Run the jobCommand right now, in the current thread.
    May only be called in debug-worker mode.
    Assumes resources are available.
    """
    assert self.debugWorker
    # TODO: It is not possible to kill running jobs in forkless mode,
    # because they are run immediately in the main thread.
    info = Info(time.time(), None, None, killIntended=False)
    self.runningJobs[jobID] = info

    if jobCommand.startswith("_toil_worker "):
        # We can actually run in this thread
        jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:]  # Parse command
        jobStore = Toil.resumeJobStore(jobStoreLocator)
        toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
                                 redirectOutputToLogFile=not self.debugWorker)  # Call the worker
    else:
        # Run synchronously. If starting or running the command fails, let the exception stop us.
        subprocess.check_call(jobCommand,
                              shell=True,
                              env=dict(os.environ, **environment))

    self.runningJobs.pop(jobID)
    if not info.killIntended:
        self.outputQueue.put((jobID, 0, time.time() - info.time))
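# For reference, a hedged illustration of the command shape _runDebugJob()
# branches on: "_toil_worker <jobName> <jobStoreLocator> <jobStoreID>". The
# concrete job name, locator, and store ID below are invented.
jobCommandExample = "_toil_worker WhaleJob file:/tmp/my-jobstore kind-WhaleJob/instance-abc123"
if jobCommandExample.startswith("_toil_worker "):
    exampleName, exampleLocator, exampleID = jobCommandExample.split()[1:]
    # exampleName    == "WhaleJob"
    # exampleLocator == "file:/tmp/my-jobstore"
    # exampleID      == "kind-WhaleJob/instance-abc123"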
def main():
    parser = parser_with_common_options(jobstore_option=True)
    parser.add_argument("jobID", nargs=1,
                        help="The job store id of a job within the provided jobstore to run by itself.")
    parser.add_argument("--printJobInfo", nargs=1,
                        help="Return information about this job to the user including preceding jobs, "
                             "inputs, outputs, and runtime from the last known run.")

    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)

    if options.printJobInfo:
        printContentsOfJobStore(jobStorePath=config.jobStore, nameOfJob=options.printJobInfo)

    # TODO: Option to print list of successor jobs
    # TODO: Option to run job within python debugger, allowing step through of arguments
    # idea would be to have option to import pdb and set breakpoint at the start of the user's code

    jobID = options.jobID[0]
    logger.debug(f"Running the following job locally: {jobID}")
    workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
    logger.debug(f"Finished running: {jobID}")
def main():
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    config.jobStore = config.jobStore[5:] if config.jobStore.startswith('file:') else config.jobStore

    # ':' means an aws/google jobstore; use the old (broken?) method
    if ':' in config.jobStore:
        jobStore = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
        # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        batchSystem = Toil.createBatchSystem(jobStore.config)  # Should automatically kill existing jobs, so we're good.
        for jobID in batchSystem.getIssuedBatchJobIDs():  # Just in case we do it again.
            batchSystem.killBatchJobs(jobID)
        logger.info("All jobs SHOULD have been killed")
    # otherwise, kill the pid recorded in the jobstore
    else:
        pid_log = os.path.join(os.path.abspath(config.jobStore), 'pid.log')
        with open(pid_log, 'r') as f:
            pid2kill = f.read().strip()
        try:
            os.kill(int(pid2kill), signal.SIGKILL)
            logger.info("Toil process %s successfully terminated." % str(pid2kill))
        except OSError:
            logger.error("Toil process %s could not be terminated." % str(pid2kill))
            raise
def main():
    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow whose jobs should "
                             "be killed." + jobStoreLocatorHelp)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.resumeJobStore(options.jobStore)

    logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore)
    ####This behaviour is now broken
    batchSystem = Toil.createBatchSystem(jobStore.config)  #This should automatically kill the existing jobs.. so we're good.
    for jobID in batchSystem.getIssuedBatchJobIDs():  #Just in case we do it again.
        batchSystem.killBatchJobs(jobID)
    logger.info("All jobs SHOULD have been killed")
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Parse input args
    jobName = argv[1]
    jobStoreLocator = argv[2]
    jobStoreID = argv[3]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    # Try to monkey-patch boto early so that credentials are cached.
    try:
        import boto
    except ImportError:
        pass
    else:
        # boto is installed, monkey patch it now
        from toil.lib.ec2Credentials import enable_metadata_credential_caching
        enable_metadata_credential_caching()

    jobStore = Toil.resumeJobStore(jobStoreLocator)
    config = jobStore.config

    # Call the worker
    workerScript(jobStore, config, jobName, jobStoreID)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Do a little argument validation, in case someone tries to run us manually.
    if len(argv) < 4:
        if len(argv) < 1:
            sys.stderr.write("Error: Toil worker invoked without its own name\n")
            sys.exit(1)
        else:
            sys.stderr.write("Error: usage: %s JOB_NAME JOB_STORE_LOCATOR JOB_STORE_ID\n" % argv[0])
            sys.exit(1)

    # Parse input args
    jobName = argv[1]
    jobStoreLocator = argv[2]
    jobStoreID = argv[3]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(jobStoreLocator)
    config = jobStore.config

    # Call the worker
    workerScript(jobStore, config, jobName, jobStoreID)
def getPIDStatus(jobStoreName: str) -> str:
    """
    Determine the status of a process with a particular pid.

    Checks to see if a process exists or not.

    :return: A string indicating the status of the PID of the workflow as stored in the jobstore.
    :rtype: str
    """
    try:
        jobstore = Toil.resumeJobStore(jobStoreName)
    except NoSuchJobStoreException:
        return 'QUEUED'
    except NoSuchFileException:
        return 'QUEUED'

    try:
        with jobstore.readSharedFileStream('pid.log') as pidFile:
            pid = int(pidFile.read())
            try:
                os.kill(pid, 0)  # Does not kill process when 0 is passed.
            except OSError:  # Process not found, must be done.
                return 'COMPLETED'
            else:
                return 'RUNNING'
    except NoSuchFileException:
        pass
    return 'QUEUED'
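# The os.kill(pid, 0) call above is the standard POSIX liveness probe: signal 0
# delivers nothing, but the existence and permission checks still run. A
# self-contained sketch of the same technique (the helper name is ours, not Toil's):
import os

def pidExists(pid: int) -> bool:
    """Return True if a process with this PID appears to exist (POSIX only)."""
    try:
        os.kill(pid, 0)          # Signal 0: nothing is sent, but errors are still raised.
    except ProcessLookupError:
        return False             # No such process.
    except PermissionError:
        return True              # Process exists but is owned by another user.
    return True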
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Parse our command line
    options = parse_args(argv)

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(options.jobStoreLocator)
    config = jobStore.config

    with in_contexts(options.context):
        # Call the worker
        exit_code = workerScript(jobStore, config, options.jobName, options.jobStoreID)

    # Exit with its return value
    sys.exit(exit_code)
def toil_jobstore_info(jobstore: str) -> dict:
    """parses a toil jobstore folder"""
    try:
        jobStore = Toil.resumeJobStore(jobstore)
    except NoSuchJobStoreException:
        return {}
    else:
        stats = getStats(jobStore)
        return processData(jobStore.config, stats)
def __init__(self, jobStoreName, specifiedJobs=None):
    self.jobStoreName = jobStoreName
    self.jobStore = Toil.resumeJobStore(jobStoreName)

    if specifiedJobs is None:
        rootJob = self.fetchRootJob()
        logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.')
        self.jobsToReport = self.traverseJobGraph(rootJob)
    else:
        self.jobsToReport = self.fetchUserJobs(specifiedJobs)
def main(): """ Reports stats on the workflow, use with --stats option to toil. """ parser = getBasicOptionParser() initializeOptions(parser) options = parseBasicOptions(parser) checkOptions(options, parser) jobStore = Toil.resumeJobStore(options.jobStore) stats = getStats(jobStore) collatedStatsTag = processData(jobStore.config, stats) reportData(collatedStatsTag, options)
def testMultipleJobsPerWorkerStats(self):
    """
    Tests a case where multiple jobs are run on one worker to ensure that all
    jobs report back their data.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.clean = 'never'
    options.stats = True
    Job.Runner.startToil(RunTwoJobsPerWorker(), options)
    jobStore = Toil.resumeJobStore(options.jobStore)
    stats = getStats(jobStore)
    collatedStats = processData(jobStore.config, stats)
    self.assertTrue(len(collatedStats.job_types) == 2,
                    "Some jobs are not represented in the stats")
def main():
    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow." + jobStoreLocatorHelp)
    parser.add_argument("--localFilePath", nargs=1,
                        help="Location to which to copy job store files.")
    parser.add_argument("--fetch", nargs="+",
                        help="List of job-store files to be copied locally. "
                             "Use either explicit names (i.e. 'data.txt'), or "
                             "specify glob patterns (i.e. '*.txt')")
    parser.add_argument("--listFilesInJobStore",
                        help="Prints a list of the current files in the jobStore.")
    parser.add_argument("--fetchEntireJobStore",
                        help="Copy all job store files into a local directory.")
    parser.add_argument("--useSymlinks",
                        help="Creates symlink 'shortcuts' of files in the localFilePath"
                             " instead of hardlinking or copying, where possible. If this is"
                             " not possible, it will copy the files (shutil.copyfile()).")
    parser.add_argument("--version", action='version', version=version)

    # Load the jobStore
    options = parseBasicOptions(parser)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)
    logger.debug("Connected to job store: %s", config.jobStore)

    if options.fetch:
        # Copy only the listed files locally
        logger.debug("Fetching local files: %s", options.fetch)
        fetchJobStoreFiles(jobStore=jobStore, options=options)
    elif options.fetchEntireJobStore:
        # Copy all jobStore files locally
        logger.debug("Fetching all local files.")
        options.fetch = "*"
        fetchJobStoreFiles(jobStore=jobStore, options=options)

    if options.listFilesInJobStore:
        # Log filenames and create a file containing these names in cwd
        printContentsOfJobStore(jobStorePath=options.jobStore)
def _runWorker(self, jobCommand, jobID, environment):
    """
    Run the jobCommand using the worker and wait for it to finish.
    The worker is forked unless it is a '_toil_worker' job and debugWorker is True.
    """
    startTime = time.time()  # Time job is started
    if self.debugWorker and "_toil_worker" in jobCommand:
        # Run the worker without forking
        jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:]  # Parse command
        jobStore = Toil.resumeJobStore(jobStoreLocator)
        # TODO: The following does not yet properly populate self.runningJobs so it is not possible to kill
        # running jobs in forkless mode - see the "None" value in place of popen
        info = Info(time.time(), None, killIntended=False)
        try:
            self.runningJobs[jobID] = info
            try:
                toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
                                         redirectOutputToLogFile=not self.debugWorker)  # Call the worker
            finally:
                self.runningJobs.pop(jobID)
        finally:
            if not info.killIntended:
                self.outputQueue.put((jobID, 0, time.time() - startTime))
    else:
        with self.popenLock:
            popen = subprocess.Popen(jobCommand,
                                     shell=True,
                                     env=dict(os.environ, **environment))
        info = Info(time.time(), popen, killIntended=False)
        try:
            self.runningJobs[jobID] = info
            try:
                statusCode = popen.wait()
                if statusCode != 0 and not info.killIntended:
                    log.error("Got exit code %i (indicating failure) "
                              "from job %s.", statusCode, self.jobs[jobID])
            finally:
                self.runningJobs.pop(jobID)
        finally:
            if not info.killIntended:
                self.outputQueue.put((jobID, statusCode, time.time() - startTime))
def main() -> None:
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)

    job_store_type, _ = Toil.parseLocator(config.jobStore)

    if job_store_type != 'file':
        # Remote (aws/google) jobstore; use the old (broken?) method
        job_store = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
        # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        # There's no guarantee that the batch system in use can enumerate
        # running jobs belonging to the job store we've attached to. And
        # moreover we don't even bother trying to kill the leader at its
        # recorded PID, even if it is a local process.
        batch_system = Toil.createBatchSystem(job_store.config)  # Should automatically kill existing jobs, so we're good.
        for job_id in batch_system.getIssuedBatchJobIDs():  # Just in case we do it again.
            batch_system.killBatchJobs([job_id])
        logger.info("All jobs SHOULD have been killed")
    else:
        # otherwise, kill the pid recorded in the jobstore.
        # TODO: We assume this is a local PID.
        job_store = Toil.resumeJobStore(config.jobStore)
        assert isinstance(job_store, FileJobStore), "Need a FileJobStore which has a sharedFilesDir"
        pid_log = os.path.join(job_store.sharedFilesDir, 'pid.log')
        with open(pid_log) as f:
            pid_to_kill = f.read().strip()
        try:
            os.kill(int(pid_to_kill), signal.SIGTERM)
            logger.info("Toil process %s successfully terminated." % str(pid_to_kill))
        except OSError:
            logger.error("Toil process %s could not be terminated." % str(pid_to_kill))
            raise
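# Note that this newer variant sends SIGTERM rather than SIGKILL (compare the
# earlier version above), giving the leader a chance to exit cleanly. For
# context, a hedged illustration of the job store locator shapes these kill
# tools branch on ('file' versus remote). The example locators are invented;
# real code should go through Toil.parseLocator() as in the function above.
exampleLocators = [
    "file:/tmp/my-jobstore",          # explicit local file job store
    "/tmp/my-jobstore",               # bare paths are treated as 'file' stores
    "aws:us-west-2:my-jobstore",      # AWS job store: region and name
    "google:my-project:my-jobstore",  # Google job store: project and name
]
for locator in exampleLocators:
    scheme, _, rest = locator.partition(":")
    storeType = scheme if rest else "file"   # no ':' at all -> assume a local path
    print(storeType, rest or locator)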
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store used by the workflow whose jobs should " "be killed." + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) jobStore = Toil.resumeJobStore(options.jobStore) logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore) ####This behaviour is now broken batchSystem = Toil.createBatchSystem(jobStore.config) #This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs(): #Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Parse input args
    jobName = argv[1]
    jobStoreLocator = argv[2]
    jobStoreID = argv[3]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(jobStoreLocator)
    config = jobStore.config

    # Call the worker
    workerScript(jobStore, config, jobName, jobStoreID)
def main() -> None: """Reports stats on the workflow, use with --stats option to toil.""" parser = parser_with_common_options() add_stats_options(parser) options = parser.parse_args() for c in options.categories.split(","): if c.strip() not in category_choices: raise ValueError(f'{c} not in {category_choices}!') options.categories = [ x.strip().lower() for x in options.categories.split(",") ] set_logging_from_options(options) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) stats = getStats(jobStore) collatedStatsTag = processData(jobStore.config, stats) reportData(collatedStatsTag, options)
def main():
    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow." + jobStoreLocatorHelp)
    parser.add_argument("jobID", nargs=1,
                        help="The job store id of a job "
                             "within the provided jobstore to run by itself.")
    parser.add_argument("--printJobInfo", nargs=1,
                        help="Return information about this job to the user"
                             " including preceding jobs, inputs, outputs, and runtime"
                             " from the last known run.")
    parser.add_argument("--version", action='version', version=version)

    # Parse options
    options = parseBasicOptions(parser)
    config = Config()
    config.setOptions(options)

    # Load the job store
    jobStore = Toil.resumeJobStore(config.jobStore)

    if options.printJobInfo:
        printContentsOfJobStore(jobStorePath=options.jobStore, nameOfJob=options.printJobInfo)

    # TODO: Option to print list of successor jobs
    # TODO: Option to run job within python debugger, allowing step through of arguments
    # idea would be to have option to import pdb and set breakpoint at the start of the user's code

    # Run the job locally
    jobID = options.jobID[0]
    logger.debug("Going to run the following job locally: %s", jobID)
    workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
    logger.debug("Ran the following job locally: %s", jobID)
def main(): """Reports the state of a Toil workflow.""" parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--noAggStats", dest="stats", action="store_false", help="Do not print overall, aggregate status of workflow.", default=True) parser.add_argument("--printDot", action="store_true", help="Print dot formatted description of the graph. If using --jobs will " "restrict to subgraph including only those jobs. default=%(default)s", default=False) parser.add_argument("--jobs", nargs='+', help="Restrict reporting to the following jobs (allows subsetting of the report).", default=None) parser.add_argument("--printPerJobStats", action="store_true", help="Print info about each job. default=%(default)s", default=False) parser.add_argument("--printLogs", action="store_true", help="Print the log files of jobs (if they exist). default=%(default)s", default=False) parser.add_argument("--printChildren", action="store_true", help="Print children of each job. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) ########################################## # Gather the jobs to report ########################################## # Gather all jobs in the workflow in jobsToReport if options.jobs == None: rootJob = fetchRootJob(jobStore) logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.') jobsToReport = traverseJobGraph(rootJob, jobStore) # Only gather jobs specified in options.jobs else: jobsToReport = fetchUserJobs(jobStore, jobs=options.jobs) ########################################## # Report on the jobs ########################################## jobStats = report_on_jobs(jobsToReport, jobStore, options) hasChildren = jobStats['hasChildren'] readyToRun = jobStats['readyToRun'] zombies = jobStats['zombies'] hasServices = jobStats['hasServices'] services = jobStats['services'] hasLogFile = jobStats['hasLogFile'] properties = jobStats['properties'] childNumber = jobStats['childNumber'] if options.printPerJobStats: printAggregateJobStats(jobsToReport, properties, childNumber) if options.printLogs: printJobLog(jobsToReport, jobStore) if options.printChildren: printJobChildren(jobsToReport) if options.printDot: print_dot_chart(jobsToReport, jobStore_name=config.jobStore) if options.stats: print('Of the %i jobs considered, ' 'there are %i jobs with children, ' '%i jobs ready to run, ' '%i zombie jobs, ' '%i jobs with services, ' '%i services, ' 'and %i jobs with log files currently in %s.' % (len(jobsToReport), len(hasChildren), len(readyToRun), len(zombies), len(hasServices), len(services), len(hasLogFile), config.jobStore)) if len(jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1)
def main(): logging.basicConfig() ########################################## #Import necessary modules ########################################## # This is assuming that worker.py is at a path ending in "/toil/worker.py". sourcePath = os.path.dirname(os.path.dirname(__file__)) if sourcePath not in sys.path: sys.path.append(sourcePath) #Now we can import all the necessary functions from toil.lib.bioio import setLogLevel from toil.lib.bioio import getTotalCpuTime from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage from toil.job import Job try: import boto except ImportError: pass else: # boto is installed, monkey patch it now from bd2k.util.ec2.credentials import enable_metadata_credential_caching enable_metadata_credential_caching() ########################################## #Input args ########################################## jobStoreLocator = sys.argv[1] jobStoreID = sys.argv[2] ########################################## #Load the jobStore/config file ########################################## jobStore = Toil.resumeJobStore(jobStoreLocator) config = jobStore.config ########################################## #Create the worker killer, if requested ########################################## if config.badWorker > 0 and random.random() < config.badWorker: def badWorker(): #This will randomly kill the worker process at a random time time.sleep(config.badWorkerFailInterval * random.random()) os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT) #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine) t = Thread(target=badWorker) # Ideally this would be a daemon thread but that causes an intermittent (but benign) # exception similar to the one described here: # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd # Our exception is: # Exception in thread Thread-1 (most likely raised during interpreter shutdown): # <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill' # This attribute error is caused by the call os.kill() and apparently unavoidable with a # daemon t.start() ########################################## #Load the environment for the jobGraph ########################################## #First load the environment for the jobGraph. with jobStore.readSharedFileStream("environment.pickle") as fileHandle: environment = cPickle.load(fileHandle) for i in environment: if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"): os.environ[i] = environment[i] # sys.path is used by __import__ to find modules if "PYTHONPATH" in environment: for e in environment["PYTHONPATH"].split(':'): if e != '': sys.path.append(e) setLogLevel(config.logLevel) toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir) ########################################## #Setup the temporary directories. ########################################## # Dir to put all this worker's temp files in. localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir) os.chmod(localWorkerTempDir, 0755) ########################################## #Setup the logging ########################################## #This is mildly tricky because we don't just want to #redirect stdout and stderr for this Python process; we want to redirect it #for this process and all children. Consequently, we can't just replace #sys.stdout and sys.stderr; we need to mess with the underlying OS-level #file descriptors. 
See <http://stackoverflow.com/a/11632982/402891> #When we start, standard input is file descriptor 0, standard output is #file descriptor 1, and standard error is file descriptor 2. #What file do we want to point FDs 1 and 2 to? tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt") #Save the original stdout and stderr (by opening new file descriptors to the #same files) origStdOut = os.dup(1) origStdErr = os.dup(2) #Open the file to send stdout/stderr to. logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND) #Replace standard output with a descriptor for the log file os.dup2(logFh, 1) #Replace standard error with a descriptor for the log file os.dup2(logFh, 2) #Since we only opened the file once, all the descriptors duped from the #original will share offset information, and won't clobber each others' #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't #matter, since O_APPEND seeks to the end of the file before every write, but #maybe there's something odd going on... #Close the descriptor we used to open the file os.close(logFh) for handler in list(logger.handlers): #Remove old handlers logger.removeHandler(handler) #Add the new handler. The sys.stderr stream has been redirected by swapping #the file descriptor out from under it. logger.addHandler(logging.StreamHandler(sys.stderr)) debugging = logging.getLogger().isEnabledFor(logging.DEBUG) ########################################## #Worker log file trapped from here on in ########################################## workerFailed = False statsDict = MagicExpando() statsDict.jobs = [] statsDict.workers.logsToMaster = [] blockFn = lambda: True cleanCacheFn = lambda x: True try: #Put a message at the top of the log, just to make sure it's working. print "---TOIL WORKER OUTPUT LOG---" sys.stdout.flush() #Log the number of open file descriptors so we can tell if we're leaking #them. logger.debug("Next available file descriptor: {}".format( nextOpenDescriptor())) logProcessContext(config, logger) ########################################## #Load the jobGraph ########################################## jobGraph = jobStore.load(jobStoreID) logger.debug("Parsed jobGraph") ########################################## #Cleanup from any earlier invocation of the jobGraph ########################################## if jobGraph.command == None: # Cleanup jobs already finished f = lambda jobs: filter( lambda x: len(x) > 0, map( lambda x: filter(lambda y: jobStore.exists(y.jobStoreID), x ), jobs)) jobGraph.stack = f(jobGraph.stack) jobGraph.services = f(jobGraph.services) logger.debug( "Cleaned up any references to completed successor jobs") #This cleans the old log file which may #have been left if the job is being retried after a job failure. 
oldLogFile = jobGraph.logJobStoreFileID if oldLogFile != None: jobGraph.logJobStoreFileID = None jobStore.update(jobGraph) #Update first, before deleting any files jobStore.deleteFile(oldLogFile) ########################################## # If a checkpoint exists, restart from the checkpoint ########################################## # The job is a checkpoint, and is being restarted after previously completing if jobGraph.checkpoint != None: logger.debug("Job is a checkpoint") if len(jobGraph.stack) > 0 or len( jobGraph.services) > 0 or jobGraph.command != None: if jobGraph.command != None: assert jobGraph.command == jobGraph.checkpoint logger.debug( "Checkpoint job already has command set to run") else: jobGraph.command = jobGraph.checkpoint # Reduce the retry count assert jobGraph.remainingRetryCount >= 0 jobGraph.remainingRetryCount = max( 0, jobGraph.remainingRetryCount - 1) jobStore.update( jobGraph) # Update immediately to ensure that checkpoint # is made before deleting any remaining successors if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0: # If the subtree of successors is not complete restart everything logger.debug( "Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " % (jobGraph.stack, jobGraph.services)) # Delete everything on the stack, as these represent successors to clean # up as we restart the queue def recursiveDelete(jobGraph2): # Recursive walk the stack to delete all remaining jobs for jobs in jobGraph2.stack + jobGraph2.services: for jobNode in jobs: if jobStore.exists(jobNode.jobStoreID): recursiveDelete( jobStore.load(jobNode.jobStoreID)) else: logger.debug( "Job %s has already been deleted", jobNode) if jobGraph2 != jobGraph: logger.debug( "Checkpoint is deleting old successor job: %s", jobGraph2.jobStoreID) jobStore.delete(jobGraph2.jobStoreID) recursiveDelete(jobGraph) jobGraph.stack = [ [], [] ] # Initialise the job to mimic the state of a job # that has been previously serialised but which as yet has no successors jobGraph.services = [] # Empty the services # Update the jobStore to avoid doing this twice on failure and make this clean. jobStore.update(jobGraph) # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean # because of the job being a checkpoint else: logger.debug( "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete." 
) #Delete any remnant files map( jobStore.deleteFile, filter(jobStore.fileExists, jobGraph.checkpointFilesToDelete)) ########################################## #Setup the stats, if requested ########################################## if config.stats: startTime = time.time() startClock = getTotalCpuTime() #Make a temporary file directory for the jobGraph #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir")) startTime = time.time() while True: ########################################## #Run the jobGraph, if there is one ########################################## if jobGraph.command is not None: assert jobGraph.command.startswith("_toil ") logger.debug("Got a command to run: %s" % jobGraph.command) #Load the job job = Job._loadJob(jobGraph.command, jobStore) # If it is a checkpoint job, save the command if job.checkpoint: jobGraph.checkpoint = jobGraph.command # Create a fileStore object for the job fileStore = FileStore.createFileStore( jobStore, jobGraph, localWorkerTempDir, blockFn, caching=not config.disableCaching) with job._executor(jobGraph=jobGraph, stats=statsDict if config.stats else None, fileStore=fileStore): with fileStore.open(job): # Get the next block function and list that will contain any messages blockFn = fileStore._blockFn job._runner(jobGraph=jobGraph, jobStore=jobStore, fileStore=fileStore) # Accumulate messages from this job & any subsequent chained jobs statsDict.workers.logsToMaster += fileStore.loggingMessages else: #The command may be none, in which case #the jobGraph is either a shell ready to be deleted or has #been scheduled after a failure to cleanup break if FileStore._terminateEvent.isSet(): raise RuntimeError("The termination flag is set") ########################################## #Establish if we can run another jobGraph within the worker ########################################## #If no more jobs to run or services not finished, quit if len(jobGraph.stack) == 0 or len( jobGraph.services) > 0 or jobGraph.checkpoint != None: logger.debug( "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s", len(jobGraph.stack), len(jobGraph.services), jobGraph.checkpoint != None) break #Get the next set of jobs to run jobs = jobGraph.stack[-1] assert len(jobs) > 0 #If there are 2 or more jobs to run in parallel we quit if len(jobs) >= 2: logger.debug( "No more jobs can run in series by this worker," " it's got %i children", len(jobs) - 1) break #We check the requirements of the jobGraph to see if we can run it #within the current worker successorJobNode = jobs[0] if successorJobNode.memory > jobGraph.memory: logger.debug( "We need more memory for the next job, so finishing") break if successorJobNode.cores > jobGraph.cores: logger.debug( "We need more cores for the next job, so finishing") break if successorJobNode.disk > jobGraph.disk: logger.debug( "We need more disk for the next job, so finishing") break if successorJobNode.predecessorNumber > 1: logger.debug( "The jobGraph has multiple predecessors, we must return to the leader." 
) break # Load the successor jobGraph successorJobGraph = jobStore.load(successorJobNode.jobStoreID) # Somewhat ugly, but check if job is a checkpoint job and quit if # so if successorJobGraph.command.startswith("_toil "): #Load the job successorJob = Job._loadJob(successorJobGraph.command, jobStore) # Check it is not a checkpoint if successorJob.checkpoint: logger.debug("Next job is checkpoint, so finishing") break ########################################## #We have a single successor job that is not a checkpoint job. #We transplant the successor jobGraph command and stack #into the current jobGraph object so that it can be run #as if it were a command that were part of the current jobGraph. #We can then delete the successor jobGraph in the jobStore, as it is #wholly incorporated into the current jobGraph. ########################################## #Clone the jobGraph and its stack jobGraph = copy.deepcopy(jobGraph) #Remove the successor jobGraph jobGraph.stack.pop() #These should all match up assert successorJobGraph.memory == successorJobNode.memory assert successorJobGraph.cores == successorJobNode.cores assert successorJobGraph.predecessorsFinished == set() assert successorJobGraph.predecessorNumber == 1 assert successorJobGraph.command is not None assert successorJobGraph.jobStoreID == successorJobNode.jobStoreID #Transplant the command and stack to the current jobGraph jobGraph.command = successorJobGraph.command jobGraph.stack += successorJobGraph.stack assert jobGraph.memory >= successorJobGraph.memory assert jobGraph.cores >= successorJobGraph.cores #Build a fileStore to update the job fileStore = FileStore.createFileStore( jobStore, jobGraph, localWorkerTempDir, blockFn, caching=not config.disableCaching) #Update blockFn blockFn = fileStore._blockFn #Add successorJobGraph to those to be deleted fileStore.jobsToDelete.add(successorJobGraph.jobStoreID) #This will update the job once the previous job is done fileStore._updateJobWhenDone() #Clone the jobGraph and its stack again, so that updates to it do #not interfere with this update jobGraph = copy.deepcopy(jobGraph) logger.debug("Starting the next job") ########################################## #Finish up the stats ########################################## if config.stats: totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage() statsDict.workers.time = str(time.time() - startTime) statsDict.workers.clock = str(totalCPUTime - startClock) statsDict.workers.memory = str(totalMemoryUsage) # log the worker log path here so that if the file is truncated the path can still be found logger.info( "Worker log can be found at %s. 
Set --cleanWorkDir to retain this log", localWorkerTempDir) logger.info( "Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime) ########################################## #Trapping where worker goes wrong ########################################## except: #Case that something goes wrong in worker traceback.print_exc() logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname()) FileStore._terminateEvent.set() ########################################## #Wait for the asynchronous chain of writes/updates to finish ########################################## blockFn() ########################################## #All the asynchronous worker/update threads must be finished now, #so safe to test if they completed okay ########################################## if FileStore._terminateEvent.isSet(): jobGraph = jobStore.load(jobStoreID) jobGraph.setupJobAfterFailure(config) workerFailed = True ########################################## #Cleanup ########################################## #Close the worker logging #Flush at the Python level sys.stdout.flush() sys.stderr.flush() #Flush at the OS level os.fsync(1) os.fsync(2) #Close redirected stdout and replace with the original standard output. os.dup2(origStdOut, 1) #Close redirected stderr and replace with the original standard error. os.dup2(origStdErr, 2) #sys.stdout and sys.stderr don't need to be modified at all. We don't need #to call redirectLoggerStreamHandlers since they still log to sys.stderr #Close our extra handles to the original standard output and standard error #streams, so we don't leak file handles. os.close(origStdOut) os.close(origStdErr) #Now our file handles are in exactly the state they were in before. #Copy back the log file to the global dir, if needed if workerFailed: jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID( jobGraph.jobStoreID) with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w: with open(tempWorkerLogPath, "r") as f: if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit: f.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file w.write(f.read()) jobStore.update(jobGraph) elif debugging: # write log messages with open(tempWorkerLogPath, 'r') as logFile: if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit: logFile.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file logMessages = logFile.read().splitlines() statsDict.logs = [ Expando(jobStoreID=jobStoreID, text=logMessage) for logMessage in logMessages ] if (debugging or config.stats or statsDict.workers.logsToMaster ) and not workerFailed: # We have stats/logging to report back jobStore.writeStatsAndLogging(json.dumps(statsDict)) #Remove the temp dir cleanUp = config.cleanWorkDir if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed): shutil.rmtree(localWorkerTempDir) #This must happen after the log file is done with, else there is no place to put the log if (not workerFailed) and jobGraph.command == None and len( jobGraph.stack) == 0 and len(jobGraph.services) == 0: # We can now safely get rid of the jobGraph jobStore.delete(jobGraph.jobStoreID)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) def traverseGraph(jobGraph): foundJobStoreIDs = set() totalJobs = [] def inner(jobGraph): if jobGraph.jobStoreID in foundJobStoreIDs: return foundJobStoreIDs.add(jobGraph.jobStoreID) totalJobs.append(jobGraph) # Traverse jobs in stack for jobs in jobGraph.stack: for successorJobStoreID in [x.jobStoreID for x in jobs]: if (successorJobStoreID not in foundJobStoreIDs and jobStore.exists(successorJobStoreID)): inner(jobStore.load(successorJobStoreID)) # Traverse service jobs for jobs in jobGraph.services: for serviceJobStoreID in [x.jobStoreID for x in jobs]: if jobStore.exists(serviceJobStoreID): assert serviceJobStoreID not in foundJobStoreIDs foundJobStoreIDs.add(serviceJobStoreID) totalJobs.append(jobStore.load(serviceJobStoreID)) inner(jobGraph) return totalJobs logger.info('Traversing the job graph. This may take a couple minutes.') totalJobs = traverseGraph(rootJob) failedJobs = [] hasChildren = [] hasServices = [] services = [] currentlyRunnning = [] for job in totalJobs: if job.logJobStoreFileID is not None: failedJobs.append(job) if job.stack: hasChildren.append(job) elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command: # The job has no children, hasn't failed, and has a command to run. This indicates that the job is # likely currently running, or at least could be run. currentlyRunnning.append(job) if job.services: hasServices.append(job) if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID: # these attributes are only set in service jobs services.append(job) logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, ' 'and %i totally failed jobs currently in %s.' 
% (len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs), config.jobStore)) if currentlyRunnning: logger.info('These %i jobs are currently active: %s', len(currentlyRunnning), ' \n'.join(map(str, currentlyRunnning))) if options.verbose: #Verbose currently means outputting the files that have failed. if failedJobs: msg = "Outputting logs for the %i failed jobs" % (len(failedJobs)) msg += ": %s" % ", ".join((str(failedJob) for failedJob in failedJobs)) for jobNode in failedJobs: job = jobStore.load(jobNode.jobStoreID) msg += "\n=========> Failed job %s \n" % jobNode with job.getLogFileHandle(jobStore) as fH: msg += fH.read() msg += "<=========\n" print(msg) else: print('There are no failed jobs to report.', file=sys.stderr) if totalJobs and options.failIfNotComplete: exit(1) # when the workflow is complete, all jobs will have been removed from job store
def main(): logging.basicConfig() ########################################## #Import necessary modules ########################################## # This is assuming that worker.py is at a path ending in "/toil/worker.py". sourcePath = os.path.dirname(os.path.dirname(__file__)) if sourcePath not in sys.path: sys.path.append(sourcePath) #Now we can import all the necessary functions from toil.lib.bioio import setLogLevel from toil.lib.bioio import getTotalCpuTime from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage from toil.job import Job try: import boto except ImportError: pass else: # boto is installed, monkey patch it now from bd2k.util.ec2.credentials import enable_metadata_credential_caching enable_metadata_credential_caching() ########################################## #Input args ########################################## jobStoreLocator = sys.argv[1] jobStoreID = sys.argv[2] # we really want a list of job names but the ID will suffice if the job graph can't # be loaded. If we can discover the name, we will replace this initial entry listOfJobs = [jobStoreID] ########################################## #Load the jobStore/config file ########################################## jobStore = Toil.resumeJobStore(jobStoreLocator) config = jobStore.config ########################################## #Create the worker killer, if requested ########################################## logFileByteReportLimit = config.maxLogFileSize if config.badWorker > 0 and random.random() < config.badWorker: def badWorker(): #This will randomly kill the worker process at a random time time.sleep(config.badWorkerFailInterval * random.random()) os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT) #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine) t = Thread(target=badWorker) # Ideally this would be a daemon thread but that causes an intermittent (but benign) # exception similar to the one described here: # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd # Our exception is: # Exception in thread Thread-1 (most likely raised during interpreter shutdown): # <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill' # This attribute error is caused by the call os.kill() and apparently unavoidable with a # daemon t.start() ########################################## #Load the environment for the jobGraph ########################################## #First load the environment for the jobGraph. with jobStore.readSharedFileStream("environment.pickle") as fileHandle: environment = cPickle.load(fileHandle) for i in environment: if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"): os.environ[i] = environment[i] # sys.path is used by __import__ to find modules if "PYTHONPATH" in environment: for e in environment["PYTHONPATH"].split(':'): if e != '': sys.path.append(e) setLogLevel(config.logLevel) toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir) ########################################## #Setup the temporary directories. ########################################## # Dir to put all this worker's temp files in. localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir) os.chmod(localWorkerTempDir, 0o755) ########################################## #Setup the logging ########################################## #This is mildly tricky because we don't just want to #redirect stdout and stderr for this Python process; we want to redirect it #for this process and all children. 
Consequently, we can't just replace #sys.stdout and sys.stderr; we need to mess with the underlying OS-level #file descriptors. See <http://stackoverflow.com/a/11632982/402891> #When we start, standard input is file descriptor 0, standard output is #file descriptor 1, and standard error is file descriptor 2. #What file do we want to point FDs 1 and 2 to? tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt") #Save the original stdout and stderr (by opening new file descriptors to the #same files) origStdOut = os.dup(1) origStdErr = os.dup(2) #Open the file to send stdout/stderr to. logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND) #Replace standard output with a descriptor for the log file os.dup2(logFh, 1) #Replace standard error with a descriptor for the log file os.dup2(logFh, 2) #Since we only opened the file once, all the descriptors duped from the #original will share offset information, and won't clobber each others' #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't #matter, since O_APPEND seeks to the end of the file before every write, but #maybe there's something odd going on... #Close the descriptor we used to open the file os.close(logFh) debugging = logging.getLogger().isEnabledFor(logging.DEBUG) ########################################## #Worker log file trapped from here on in ########################################## workerFailed = False statsDict = MagicExpando() statsDict.jobs = [] statsDict.workers.logsToMaster = [] blockFn = lambda : True cleanCacheFn = lambda x : True try: #Put a message at the top of the log, just to make sure it's working. print("---TOIL WORKER OUTPUT LOG---") sys.stdout.flush() #Log the number of open file descriptors so we can tell if we're leaking #them. logger.debug("Next available file descriptor: {}".format( nextOpenDescriptor())) logProcessContext(config) ########################################## #Load the jobGraph ########################################## jobGraph = jobStore.load(jobStoreID) listOfJobs[0] = str(jobGraph) logger.debug("Parsed jobGraph") ########################################## #Cleanup from any earlier invocation of the jobGraph ########################################## if jobGraph.command == None: # Cleanup jobs already finished f = lambda jobs : filter(lambda x : len(x) > 0, map(lambda x : filter(lambda y : jobStore.exists(y.jobStoreID), x), jobs)) jobGraph.stack = f(jobGraph.stack) jobGraph.services = f(jobGraph.services) logger.debug("Cleaned up any references to completed successor jobs") #This cleans the old log file which may #have been left if the job is being retried after a job failure. 
oldLogFile = jobGraph.logJobStoreFileID if oldLogFile != None: jobGraph.logJobStoreFileID = None jobStore.update(jobGraph) #Update first, before deleting any files jobStore.deleteFile(oldLogFile) ########################################## # If a checkpoint exists, restart from the checkpoint ########################################## # The job is a checkpoint, and is being restarted after previously completing if jobGraph.checkpoint != None: logger.debug("Job is a checkpoint") if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0 or jobGraph.command != None: if jobGraph.command != None: assert jobGraph.command == jobGraph.checkpoint logger.debug("Checkpoint job already has command set to run") else: jobGraph.command = jobGraph.checkpoint jobStore.update(jobGraph) # Update immediately to ensure that checkpoint # is made before deleting any remaining successors if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0: # If the subtree of successors is not complete restart everything logger.debug("Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " % (jobGraph.stack, jobGraph.services)) # Delete everything on the stack, as these represent successors to clean # up as we restart the queue def recursiveDelete(jobGraph2): # Recursive walk the stack to delete all remaining jobs for jobs in jobGraph2.stack + jobGraph2.services: for jobNode in jobs: if jobStore.exists(jobNode.jobStoreID): recursiveDelete(jobStore.load(jobNode.jobStoreID)) else: logger.debug("Job %s has already been deleted", jobNode) if jobGraph2 != jobGraph: logger.debug("Checkpoint is deleting old successor job: %s", jobGraph2.jobStoreID) jobStore.delete(jobGraph2.jobStoreID) recursiveDelete(jobGraph) jobGraph.stack = [ [], [] ] # Initialise the job to mimic the state of a job # that has been previously serialised but which as yet has no successors jobGraph.services = [] # Empty the services # Update the jobStore to avoid doing this twice on failure and make this clean. 
                jobStore.update(jobGraph)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint job seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files (map() is lazy on Python 3, so use an explicit loop)
                for checkpointFileID in jobGraph.checkpointFilesToDelete:
                    if jobStore.fileExists(checkpointFileID):
                        jobStore.deleteFile(checkpointFileID)

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobGraph
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################

            if jobGraph.command is not None:
                assert jobGraph.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                      caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobGraph=jobGraph, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                break

            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################

            #If no more jobs to run or services not finished, quit
            if len(jobGraph.stack) == 0 or len(jobGraph.services) > 0 or jobGraph.checkpoint is not None:
                logger.debug("Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                             len(jobGraph.stack), len(jobGraph.services), jobGraph.checkpoint is not None)
                break

            #Get the next set of jobs to run
            jobs = jobGraph.stack[-1]
            assert len(jobs) > 0

            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                             " it's got %i children", len(jobs) - 1)
                break

            #We check the requirements of the jobGraph to see if we can run it
            #within the current worker
            successorJobNode = jobs[0]
            if successorJobNode.memory > jobGraph.memory:
                logger.debug("We need more memory for the next job, so finishing")
                break
            if successorJobNode.cores > jobGraph.cores:
                logger.debug("We need more cores for the next job, so finishing")
                break
            if successorJobNode.disk > jobGraph.disk:
                logger.debug("We need more disk for the next job, so finishing")
                break
            if successorJobNode.preemptable != jobGraph.preemptable:
                logger.debug("Preemptability is different for the next job, returning to the leader")
                break
            if successorJobNode.predecessorNumber > 1:
                logger.debug("The jobGraph has multiple predecessors, we must return to the leader.")
                break

            # Load the successor jobGraph
            successorJobGraph = jobStore.load(successorJobNode.jobStoreID)

            # add the successor to the list of jobs run
            listOfJobs.append(str(successorJobGraph))

            # Somewhat ugly, but check if job is a checkpoint job and quit if so
            if successorJobGraph.command.startswith("_toil "):
                #Load the job
                successorJob = Job._loadJob(successorJobGraph.command, jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################

            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)

            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #These should all match up
            assert successorJobGraph.memory == successorJobNode.memory
            assert successorJobGraph.cores == successorJobNode.cores
            assert successorJobGraph.predecessorsFinished == set()
            assert successorJobGraph.predecessorNumber == 1
            assert successorJobGraph.command is not None
            assert successorJobGraph.jobStoreID == successorJobNode.jobStoreID

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack

            # include some attributes for better identification of chained jobs in
            # logging output
            jobGraph.unitName = successorJobGraph.unitName
            jobGraph.jobName = successorJobGraph.jobName

            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores

            #Build a fileStore to update the job
            fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                  caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore._blockFn

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)

            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()

            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)

            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################

        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)

        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds",
                    time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except:  #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
        FileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if FileStore._terminateEvent.isSet():
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################

    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobGraph.jobStoreID)
        jobGraph.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobGraph)

    elif debugging:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:
        # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobGraph.command is None and len(jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)
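# The tail/head truncation above is easy to misread, so here is a minimal standalone
# sketch of the same rule (not Toil code; the helper name read_truncated_log is
# hypothetical and exists only for illustration). A positive logFileByteReportLimit
# keeps the last N bytes of the worker log, a negative one keeps the first N bytes,
# and 0 disables truncation. Binary mode is used so that end-relative seeks also work
# on Python 3, where text-mode files reject them.
import os

def read_truncated_log(path, byte_limit):
    """Return the log at `path`, truncated to at most abs(byte_limit) bytes."""
    with open(path, 'rb') as f:
        if byte_limit != 0 and os.path.getsize(path) > abs(byte_limit):
            if byte_limit > 0:
                f.seek(-byte_limit, os.SEEK_END)  # keep only the tail of the log
            # for a negative limit we read from the start and stop after abs(byte_limit) bytes
            return f.read(abs(byte_limit)).decode('utf-8', errors='replace')
        return f.read().decode('utf-8', errors='replace')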
def main():
    """Reports the state of the toil.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)
    parser.add_argument("--verbose", dest="verbose", action="store_true",
                        help="Print loads of information, particularly all the log files of "
                             "jobs that failed. default=%(default)s",
                        default=False)
    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None
    config = Config()
    config.setOptions(options)

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(config.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    def traverseGraph(jobGraph):
        foundJobStoreIDs = set()
        totalJobs = []

        def inner(jobGraph):
            if jobGraph.jobStoreID in foundJobStoreIDs:
                return
            foundJobStoreIDs.add(jobGraph.jobStoreID)
            totalJobs.append(jobGraph)

            # Traverse jobs in stack
            for jobs in jobGraph.stack:
                for successorJobStoreID in map(lambda x: x.jobStoreID, jobs):
                    if (successorJobStoreID not in foundJobStoreIDs and jobStore.exists(successorJobStoreID)):
                        inner(jobStore.load(successorJobStoreID))

            # Traverse service jobs
            for jobs in jobGraph.services:
                for serviceJobStoreID in map(lambda x: x.jobStoreID, jobs):
                    if jobStore.exists(serviceJobStoreID):
                        assert serviceJobStoreID not in foundJobStoreIDs
                        foundJobStoreIDs.add(serviceJobStoreID)
                        totalJobs.append(jobStore.load(serviceJobStoreID))

        inner(jobGraph)
        return totalJobs

    logger.info('Traversing the job graph. This may take a couple minutes.')
    totalJobs = traverseGraph(rootJob)

    failedJobs = []
    hasChildren = []
    hasServices = []
    services = []
    currentlyRunning = []

    for job in totalJobs:
        if job.logJobStoreFileID is not None:
            failedJobs.append(job)
        if job.stack:
            hasChildren.append(job)
        elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command:
            # The job has no children, hasn't failed, and has a command to run. This indicates that the job is
            # likely currently running, or at least could be run.
            currentlyRunning.append(job)
        if job.services:
            hasServices.append(job)
        if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID:
            # these attributes are only set in service jobs
            services.append(job)

    logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, '
                'and %i totally failed jobs currently in %s.'
                % (len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs), config.jobStore))

    if currentlyRunning:
        logger.info('These %i jobs are currently active: %s',
                    len(currentlyRunning), ' \n'.join(map(str, currentlyRunning)))

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        if failedJobs:
            msg = "Outputting logs for the %i failed jobs" % (len(failedJobs))
            msg += ": %s" % ", ".join(str(failedJob) for failedJob in failedJobs)
            for jobNode in failedJobs:
                job = jobStore.load(jobNode.jobStoreID)
                msg += "\n=========> Failed job %s \n" % jobNode
                with job.getLogFileHandle(jobStore) as fH:
                    msg += fH.read()
                msg += "<=========\n"
            print(msg)
        else:
            print('There are no failed jobs to report.', file=sys.stderr)

    if totalJobs and options.failIfNotComplete:
        # when the workflow is complete, all jobs will have been removed from the job store
        sys.exit(1)
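# The recursive inner() above can, in principle, hit Python's recursion limit on very
# deep job graphs. Below is a minimal sketch of the same traversal with an explicit
# stack (not Toil code; traverse_graph_iteratively is a hypothetical name, and the
# job store and jobGraph attributes are assumed to behave as in the function above).
def traverse_graph_iteratively(jobStore, rootJob):
    """Collect every reachable job, visiting each jobStoreID once."""
    foundJobStoreIDs = set()
    totalJobs = []
    pending = [rootJob]
    while pending:
        jobGraph = pending.pop()
        if jobGraph.jobStoreID in foundJobStoreIDs:
            continue
        foundJobStoreIDs.add(jobGraph.jobStoreID)
        totalJobs.append(jobGraph)
        # Queue successors from the stack and service jobs alike
        for level in list(jobGraph.stack) + list(jobGraph.services):
            for node in level:
                if node.jobStoreID not in foundJobStoreIDs and jobStore.exists(node.jobStoreID):
                    pending.append(jobStore.load(node.jobStoreID))
    return totalJobs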
def main():
    """Reports the state of the toil.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)
    parser.add_argument("--verbose", dest="verbose", action="store_true",
                        help="Print loads of information, particularly all the log files of "
                             "jobs that failed. default=%(default)s",
                        default=False)
    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    toilState = ToilState(jobStore, rootJob)

    # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0]

    print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
          'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts),
                                len(failedJobs), options.jobStore), file=sys.stderr)

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    logStream(logFileHandle, job.jobStoreID, logger.warning)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr)

        if len(failedJobs) == 0:
            print('There are no failed jobs to report.', file=sys.stderr)

    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
       options.failIfNotComplete:
        sys.exit(1)