Exemplo n.º 1
0
    def getStatus(jobStoreName: str) -> str:
        """
        Report the overall state of the workflow behind a job store.

        A job store that cannot be resumed yet is reported as 'QUEUED'. Otherwise the shared files
        'succeeded.log' and 'failed.log' are probed, in that order; these markers are expected to be
        written by toil.Leader.run() once the workflow finishes (successfully or with failed jobs,
        respectively). If neither marker exists the leader is assumed to still be running jobs.

        :param jobStoreName: Locator of the job store, as accepted by Toil.resumeJobStore().
        :return: One of 'COMPLETED', 'ERROR', 'RUNNING' or 'QUEUED'.
        :rtype: str
        """
        try:
            store = Toil.resumeJobStore(jobStoreName)
        except (NoSuchJobStoreException, NoSuchFileException):
            # Nothing to resume: treat the workflow as not started yet.
            return 'QUEUED'

        # Each marker file maps to the status it implies; the first one found wins.
        markers = (('succeeded.log', 'COMPLETED'), ('failed.log', 'ERROR'))
        for markerName, status in markers:
            try:
                # Only existence matters, so the stream is opened and closed immediately.
                with store.readSharedFileStream(markerName):
                    pass
            except NoSuchFileException:
                continue
            return status
        return 'RUNNING'
Exemplo n.º 2
0
    def _runDebugJob(self, jobCommand, jobID, environment):
        """
        Execute jobCommand synchronously in the calling thread.

        Only valid in debug-worker mode, and assumes the resources for the job
        are already available. Commands starting with "_toil_worker " are run
        in-process through toil_worker.workerScript(); anything else is handed
        to a shell via subprocess.check_call().

        :param jobCommand: The command line of the job to run.
        :param jobID: Key under which the job is tracked in self.runningJobs.
        :param environment: Extra environment variables layered over os.environ
               for shell commands.
        """

        assert self.debugWorker

        # TODO: Jobs cannot be killed in forkless mode because they run
        # immediately in the main thread; the Info record carries no process.
        record = Info(time.time(), None, None, killIntended=False)
        self.runningJobs[jobID] = record

        if not jobCommand.startswith("_toil_worker "):
            # Run synchronously. Any failure to start or run the command
            # propagates to the caller (the job stays registered in that case).
            mergedEnv = dict(os.environ, **environment)
            subprocess.check_call(jobCommand, shell=True, env=mergedEnv)
        else:
            # The worker can run right here: drop the program name and parse
            # the three remaining arguments.
            workerArgs = jobCommand.split()[1:]
            jobName, jobStoreLocator, jobStoreID = workerArgs
            resumedStore = Toil.resumeJobStore(jobStoreLocator)
            toil_worker.workerScript(resumedStore,
                                     resumedStore.config,
                                     jobName,
                                     jobStoreID,
                                     redirectOutputToLogFile=not self.debugWorker)

        # Reached only when the job finished without raising.
        self.runningJobs.pop(jobID)
        if not record.killIntended:
            elapsed = time.time() - record.time
            self.outputQueue.put((jobID, 0, elapsed))
Exemplo n.º 3
0
def main():
    """
    Run one job from an existing job store in the current process.

    Parses the command line (common options plus a job store, a positional
    ``jobID`` and an optional ``--printJobInfo``), resumes the job store and
    invokes the worker on the requested job without redirecting its output.
    """
    parser = parser_with_common_options(jobstore_option=True)
    parser.add_argument("jobID", nargs=1,
                        help="The job store id of a job within the provided jobstore to run by itself.")
    parser.add_argument("--printJobInfo", nargs=1,
                        help="Return information about this job to the user including preceding jobs, "
                             "inputs, outputs, and runtime from the last known run.")

    options = parser.parse_args()
    set_logging_from_options(options)

    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)

    if options.printJobInfo:
        # NOTE(review): with nargs=1 this value is a one-element list, not a
        # string -- confirm that printContentsOfJobStore accepts that.
        printContentsOfJobStore(jobStorePath=config.jobStore, nameOfJob=options.printJobInfo)

    # TODO: Option to print list of successor jobs
    # TODO: Option to run job within python debugger, allowing step through of arguments
    # idea would be to have option to import pdb and set breakpoint at the start of the user's code

    # nargs=1 wraps the positional in a list; take its only element.
    targetJob = options.jobID[0]
    logger.debug(f"Running the following job locally: {targetJob}")
    # The same ID is passed as both the job name and the job store ID.
    workerScript(jobStore, config, targetJob, targetJob, redirectOutputToLogFile=False)
    logger.debug(f"Finished running: {targetJob}")
Exemplo n.º 4
0
def main():
    """
    Kill the Toil workflow associated with a job store.

    For locators that still contain a ':' after stripping a leading 'file:'
    prefix (treated as aws/google job stores), the job store is resumed, a
    batch system is created from its config and every issued batch job is
    killed. For plain file paths, the process ID recorded in ``pid.log``
    inside the job store directory is sent SIGKILL.

    :raises OSError: if the recorded process could not be signalled.
    """
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    # Drop an explicit 'file:' scheme so local job stores are handled as paths.
    config.jobStore = config.jobStore[5:] if config.jobStore.startswith('file:') else config.jobStore

    # ':' means an aws/google jobstore; use the old (broken?) method
    if ':' in config.jobStore:
        jobStore = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
        # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        batchSystem = Toil.createBatchSystem(jobStore.config)  # Should automatically kill existing jobs, so we're good.
        for jobID in batchSystem.getIssuedBatchJobIDs():  # Just in case we do it again.
            # killBatchJobs expects a list of job IDs, not a bare ID.
            batchSystem.killBatchJobs([jobID])
        logger.info("All jobs SHOULD have been killed")
    # otherwise, kill the pid recorded in the jobstore
    else:
        pid_log = os.path.join(os.path.abspath(config.jobStore), 'pid.log')
        with open(pid_log, 'r') as f:
            pid2kill = f.read().strip()
        try:
            os.kill(int(pid2kill), signal.SIGKILL)
            logger.info("Toil process %s successfully terminated." % str(pid2kill))
        except OSError:
            logger.error("Toil process %s could not be terminated." % str(pid2kill))
            raise
Exemplo n.º 5
0
def main():
    """
    Kill the batch jobs of the Toil workflow stored in a given job store.

    Takes the job store locator as a positional command-line argument,
    resumes that job store, builds a batch system from its config and asks
    it to kill every job it reports as issued.
    """
    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help=
        "The location of the job store used by the workflow whose jobs should "
        "be killed." + jobStoreLocatorHelp)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.resumeJobStore(options.jobStore)

    logger.info(
        "Starting routine to kill running jobs in the toil workflow: %s",
        options.jobStore)
    ####This behaviour is now broken
    batchSystem = Toil.createBatchSystem(
        jobStore.config
    )  #This should automatically kill the existing jobs.. so we're good.
    for jobID in batchSystem.getIssuedBatchJobIDs(
    ):  #Just in case we do it again.
        # killBatchJobs expects a list of job IDs, not a bare ID.
        batchSystem.killBatchJobs([jobID])
    logger.info("All jobs SHOULD have been killed")
Exemplo n.º 6
0
def main(argv=None):
    """
    Entry point of the worker process.

    :param argv: Argument vector of the form
           ``[program, jobName, jobStoreLocator, jobStoreID]``; defaults to
           ``sys.argv``.
    :raises IndexError: if fewer than four arguments are supplied.
    """
    if argv is None:
        argv = sys.argv

    # Positional arguments, in the order the worker is invoked with.
    jobName, jobStoreLocator, jobStoreID = argv[1], argv[2], argv[3]

    # Monkey-patch boto as early as possible so that credentials are cached.
    # The import only serves to find out whether boto is installed.
    try:
        import boto
    except ImportError:
        haveBoto = False
    else:
        haveBoto = True
    if haveBoto:
        from toil.lib.ec2Credentials import enable_metadata_credential_caching
        enable_metadata_credential_caching()

    # Load the job store; its stored config drives the worker.
    jobStore = Toil.resumeJobStore(jobStoreLocator)
    workerScript(jobStore, jobStore.config, jobName, jobStoreID)
Exemplo n.º 7
0
def main(argv=None):
    """
    Entry point of the worker process, with basic argument validation.

    :param argv: Argument vector of the form
           ``[program, JOB_NAME, JOB_STORE_LOCATOR, JOB_STORE_ID]``; defaults
           to ``sys.argv``.

    Writes an error to stderr and exits with status 1 when fewer than four
    arguments are given.
    """
    if argv is None:
        argv = sys.argv

    # Guard against manual invocation with too few arguments.
    if len(argv) < 4:
        if len(argv) < 1:
            # Not even a program name to build the usage line from.
            complaint = "Error: Toil worker invoked without its own name\n"
        else:
            complaint = "Error: usage: %s JOB_NAME JOB_STORE_LOCATOR JOB_STORE_ID\n" % argv[0]
        sys.stderr.write(complaint)
        sys.exit(1)

    jobName, jobStoreLocator, jobStoreID = argv[1], argv[2], argv[3]

    # Load the job store; its stored config drives the worker.
    jobStore = Toil.resumeJobStore(jobStoreLocator)
    workerScript(jobStore, jobStore.config, jobName, jobStoreID)
Exemplo n.º 8
0
    def getPIDStatus(jobStoreName: str) -> str:
        """
        Report whether the process recorded in a job store's 'pid.log' is alive.

        A job store that cannot be resumed, or one without a 'pid.log' shared
        file, yields 'QUEUED'. Otherwise the recorded PID is probed with
        signal 0: 'RUNNING' if the probe succeeds, 'COMPLETED' if it raises
        OSError.

        :param jobStoreName: Locator of the job store, as accepted by Toil.resumeJobStore().
        :return: One of 'QUEUED', 'RUNNING' or 'COMPLETED'.
        :rtype: str
        """
        try:
            store = Toil.resumeJobStore(jobStoreName)
        except (NoSuchJobStoreException, NoSuchFileException):
            return 'QUEUED'

        try:
            with store.readSharedFileStream('pid.log') as pidStream:
                recordedPid = int(pidStream.read())
                try:
                    # Signal 0 performs the existence check without delivering a signal.
                    os.kill(recordedPid, 0)
                except OSError:
                    # Treated as "process not found, must be done".
                    # NOTE(review): OSError also covers permission errors for a
                    # live process owned by another user -- confirm this is acceptable.
                    return 'COMPLETED'
                return 'RUNNING'
        except NoSuchFileException:
            pass
        return 'QUEUED'
Exemplo n.º 9
0
def main(argv=None):
    """
    Entry point of the worker process.

    Parses the argument vector with parse_args(), resumes the job store named
    by the parsed options, runs the worker inside the requested contexts and
    exits the process with the worker's return value.

    :param argv: Argument vector to parse; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    # Parse our command line. Every value used below comes from the parsed
    # options, so argv is not indexed directly.
    options = parse_args(argv)

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(options.jobStoreLocator)
    config = jobStore.config

    with in_contexts(options.context):
        # Call the worker
        exit_code = workerScript(jobStore, config, options.jobName,
                                 options.jobStoreID)

    # Exit with its return value
    sys.exit(exit_code)
Exemplo n.º 10
0
def toil_jobstore_info(jobstore: str) -> dict:
    """Collect and process the stats of a Toil job store.

    :param jobstore: Locator of the job store to resume.
    :return: The result of processData() for the job store's stats, or an
             empty dict when the job store does not exist.
    """
    try:
        resumed = Toil.resumeJobStore(jobstore)
    except NoSuchJobStoreException:
        return {}

    collected = getStats(resumed)
    return processData(resumed.config, collected)
Exemplo n.º 11
0
    def __init__(self, jobStoreName, specifiedJobs=None):
        """
        Resume the job store and decide which jobs will be reported on.

        :param jobStoreName: Locator of the job store to resume.
        :param specifiedJobs: Optional job selection handed to fetchUserJobs();
               when None, every job reachable from the root job is gathered.
        """
        self.jobStoreName = jobStoreName
        self.jobStore = Toil.resumeJobStore(jobStoreName)

        if specifiedJobs is not None:
            # The caller restricted the report to particular jobs.
            self.jobsToReport = self.fetchUserJobs(specifiedJobs)
            return

        # No restriction: walk the whole graph starting at the root job.
        root = self.fetchRootJob()
        logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.')
        self.jobsToReport = self.traverseJobGraph(root)
Exemplo n.º 12
0
def main():
    """Report stats on a workflow; use with the --stats option to toil.

    Parses and checks the command-line options, resumes the job store they
    name, collects its stats and reports the processed result.
    """
    parser = getBasicOptionParser()
    initializeOptions(parser)
    options = parseBasicOptions(parser)
    checkOptions(options, parser)

    store = Toil.resumeJobStore(options.jobStore)
    rawStats = getStats(store)
    collated = processData(store.config, rawStats)
    reportData(collated, options)
Exemplo n.º 13
0
 def testMultipleJobsPerWorkerStats(self):
     """
     Check that every job reports its stats when several jobs run on a single worker.
     """
     opts = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     # Keep the job store after the run (so the stats can be read back) and
     # switch stats collection on.
     opts.stats = True
     opts.clean = 'never'
     Job.Runner.startToil(RunTwoJobsPerWorker(), opts)

     store = Toil.resumeJobStore(opts.jobStore)
     rawStats = getStats(store)
     collated = processData(store.config, rawStats)

     jobTypeCount = len(collated.job_types)
     self.assertTrue(jobTypeCount == 2, "Some jobs are not represented in the stats")
Exemplo n.º 14
0
 def testMultipleJobsPerWorkerStats(self):
     """
     Verify that all jobs report their data back when multiple jobs share one worker.
     """
     runOptions = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     # Stats must be enabled, and the job store must survive the run so the
     # stats can be read from it afterwards.
     runOptions.stats = True
     runOptions.clean = 'never'
     Job.Runner.startToil(RunTwoJobsPerWorker(), runOptions)

     resumedStore = Toil.resumeJobStore(runOptions.jobStore)
     gathered = getStats(resumedStore)
     summary = processData(resumedStore.config, gathered)

     sawBothJobTypes = len(summary.job_types) == 2
     self.assertTrue(sawBothJobTypes, "Some jobs are not represented in the stats")
Exemplo n.º 15
0
def main():
    """
    Inspect the files of a job store: fetch some or all of them, or list them.

    Command line: a positional job store locator plus ``--localFilePath``,
    ``--fetch``, ``--listFilesInJobStore``, ``--fetchEntireJobStore`` and
    ``--useSymlinks``. ``--fetch`` takes precedence over
    ``--fetchEntireJobStore``; listing happens independently of fetching.
    """
    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help="The location of the job store used by the workflow." +
        jobStoreLocatorHelp)
    parser.add_argument("--localFilePath",
                        nargs=1,
                        help="Location to which to copy job store files.")
    # The trailing space after "locally." keeps the two sentences from running
    # together once the adjacent literals are concatenated.
    parser.add_argument("--fetch",
                        nargs="+",
                        help="List of job-store files to be copied locally. "
                        "Use either explicit names (i.e. 'data.txt'), or "
                        "specify glob patterns (i.e. '*.txt')")
    parser.add_argument(
        "--listFilesInJobStore",
        help="Prints a list of the current files in the jobStore.")
    parser.add_argument(
        "--fetchEntireJobStore",
        help="Copy all job store files into a local directory.")
    parser.add_argument(
        "--useSymlinks",
        help="Creates symlink 'shortcuts' of files in the localFilePath"
        " instead of hardlinking or copying, where possible.  If this is"
        " not possible, it will copy the files (shutil.copyfile()).")
    parser.add_argument("--version", action='version', version=version)

    # Load the jobStore
    options = parseBasicOptions(parser)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)
    logger.debug("Connected to job store: %s", config.jobStore)

    if options.fetch:
        # Copy only the listed files locally
        logger.debug("Fetching local files: %s", options.fetch)
        fetchJobStoreFiles(jobStore=jobStore, options=options)

    elif options.fetchEntireJobStore:
        # Copy all jobStore files locally by fetching with a match-all pattern
        logger.debug("Fetching all local files.")
        options.fetch = "*"
        fetchJobStoreFiles(jobStore=jobStore, options=options)

    if options.listFilesInJobStore:
        # Log filenames and create a file containing these names in cwd
        printContentsOfJobStore(jobStorePath=options.jobStore)
Exemplo n.º 16
0
 def _runWorker(self, jobCommand, jobID, environment):
     """
     Run the jobCommand using the worker and wait for it to finish.
     The worker is forked unless it is a '_toil_worker' job and
     debugWorker is True.

     While the job runs it is registered in self.runningJobs under jobID.
     Afterwards, unless the job's Info record has killIntended set, a
     ``(jobID, exitStatus, wallTime)`` tuple is put on self.outputQueue.

     :param jobCommand: Command line of the job to run.
     :param jobID: Key used for self.runningJobs (and self.jobs in the error log).
     :param environment: Extra environment variables layered over os.environ
            for the forked process.
     """
     startTime = time.time()  # Time job is started
     # Substring match: any command containing "_toil_worker" qualifies.
     if self.debugWorker and "_toil_worker" in jobCommand:
         # Run the worker without forking
         # The command must consist of exactly four whitespace-separated
         # tokens; the first (the program name) is dropped.
         jobName, jobStoreLocator, jobStoreID = jobCommand.split()[
             1:]  # Parse command
         jobStore = Toil.resumeJobStore(jobStoreLocator)
         # TODO: The following does not yet properly populate self.runningJobs so it is not possible to kill
         # running jobs in forkless mode - see the "None" value in place of popen
         info = Info(time.time(), None, killIntended=False)
         try:
             self.runningJobs[jobID] = info
             try:
                 # self.debugWorker is truthy in this branch, so output
                 # redirection is always disabled here.
                 toil_worker.workerScript(
                     jobStore,
                     jobStore.config,
                     jobName,
                     jobStoreID,
                     redirectOutputToLogFile=not self.debugWorker
                 )  # Call the worker
             finally:
                 self.runningJobs.pop(jobID)
         finally:
             # NOTE(review): exit status 0 is reported even when workerScript
             # raised, because this runs in a finally block -- confirm intended.
             if not info.killIntended:
                 self.outputQueue.put((jobID, 0, time.time() - startTime))
     else:
         # Only the process creation is done while holding popenLock.
         with self.popenLock:
             popen = subprocess.Popen(jobCommand,
                                      shell=True,
                                      env=dict(os.environ, **environment))
         info = Info(time.time(), popen, killIntended=False)
         try:
             self.runningJobs[jobID] = info
             try:
                 # Block until the forked process exits.
                 statusCode = popen.wait()
                 if statusCode != 0 and not info.killIntended:
                     log.error(
                         "Got exit code %i (indicating failure) "
                         "from job %s.", statusCode, self.jobs[jobID])
             finally:
                 self.runningJobs.pop(jobID)
         finally:
             # NOTE(review): statusCode is only bound once popen.wait() has
             # returned; if registering the job or waiting raises, the
             # reference below fails with UnboundLocalError and masks the
             # original exception.
             if not info.killIntended:
                 self.outputQueue.put(
                     (jobID, statusCode, time.time() - startTime))
Exemplo n.º 17
0
def main() -> None:
    """
    Kill the Toil workflow associated with a job store.

    For 'file' job stores the process ID recorded in ``pid.log`` (in the job
    store's shared files directory) is sent SIGTERM. For any other job store
    type, a batch system is created from the stored config and every issued
    batch job is killed.

    :raises OSError: if the recorded process could not be signalled.
    """
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)

    job_store_type, _ = Toil.parseLocator(config.jobStore)
    # Both code paths begin by attaching to the job store.
    job_store = Toil.resumeJobStore(config.jobStore)

    if job_store_type == 'file':
        # Kill the pid recorded in the jobstore.
        # TODO: We assume this is a local PID.
        assert isinstance(job_store, FileJobStore), "Need a FileJobStore which has a sharedFilesDir"
        pid_log = os.path.join(job_store.sharedFilesDir, 'pid.log')
        with open(pid_log) as pid_file:
            pid_to_kill = pid_file.read().strip()
        try:
            os.kill(int(pid_to_kill), signal.SIGTERM)
            logger.info("Toil process %s successfully terminated." % pid_to_kill)
        except OSError:
            logger.error("Toil process %s could not be terminated." % pid_to_kill)
            raise
        return

    # Remote (aws/google) jobstore; use the old (broken?) method
    logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
    # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
    # There's no guarantee that the batch system in use can enumerate
    # running jobs belonging to the job store we've attached to. And
    # moreover we don't even bother trying to kill the leader at its
    # recorded PID, even if it is a local process.
    batch_system = Toil.createBatchSystem(job_store.config)  # Should automatically kill existing jobs, so we're good.
    for issued_id in batch_system.getIssuedBatchJobIDs():  # Just in case we do it again.
        batch_system.killBatchJobs([issued_id])
    logger.info("All jobs SHOULD have been killed")
Exemplo n.º 18
0
def main():
    """
    Kill the batch jobs of the Toil workflow stored in a given job store.

    Takes the job store locator as a positional command-line argument,
    resumes that job store, builds a batch system from its config and asks
    it to kill every job it reports as issued.
    """
    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow whose jobs should "
                             "be killed." + jobStoreLocatorHelp)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.resumeJobStore(options.jobStore)

    logger.info("Starting routine to kill running jobs in the toil workflow: %s", options.jobStore)
    ####This behaviour is now broken
    batchSystem = Toil.createBatchSystem(jobStore.config) #This should automatically kill the existing jobs.. so we're good.
    for jobID in batchSystem.getIssuedBatchJobIDs(): #Just in case we do it again.
        # killBatchJobs expects a list of job IDs, not a bare ID.
        batchSystem.killBatchJobs([jobID])
    logger.info("All jobs SHOULD have been killed")
Exemplo n.º 19
0
def main(argv=None):
    """
    Entry point of the worker process.

    :param argv: Argument vector of the form
           ``[program, jobName, jobStoreLocator, jobStoreID]``; defaults to
           ``sys.argv``.
    :raises IndexError: if fewer than four arguments are supplied.
    """
    if argv is None:
        argv = sys.argv

    # Positional arguments, in the order the worker is invoked with.
    jobName, jobStoreLocator, jobStoreID = argv[1], argv[2], argv[3]

    # Load the job store; its stored config drives the worker.
    resumedStore = Toil.resumeJobStore(jobStoreLocator)
    workerScript(resumedStore, resumedStore.config, jobName, jobStoreID)
Exemplo n.º 20
0
def main() -> None:
    """Reports stats on the workflow, use with --stats option to toil.

    The comma-separated ``categories`` option is validated against
    ``category_choices`` and stored on the options as a list of stripped,
    lower-cased names before the stats are collected and reported.

    :raises ValueError: if a requested category is not a known choice.
    """
    parser = parser_with_common_options()
    add_stats_options(parser)
    options = parser.parse_args()

    normalized = []
    for c in options.categories.split(","):
        stripped = c.strip()
        # Categories are lower-cased below, so accept a name whose lower-cased
        # form is a valid choice as well (e.g. "Time"); anything the exact
        # match accepted before is still accepted.
        if stripped not in category_choices and stripped.lower() not in category_choices:
            raise ValueError(f'{c} not in {category_choices}!')
        normalized.append(stripped.lower())
    options.categories = normalized

    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)
    stats = getStats(jobStore)
    collatedStatsTag = processData(jobStore.config, stats)
    reportData(collatedStatsTag, options)
Exemplo n.º 21
0
def main():
    """
    Run one job from an existing job store in the current process.

    Command line: a positional job store locator, a positional ``jobID`` and
    an optional ``--printJobInfo``. The job store is resumed and the worker
    is invoked on the requested job without redirecting its output.
    """
    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow." + jobStoreLocatorHelp)
    parser.add_argument("jobID", nargs=1,
                        help="The job store id of a job within the provided jobstore to run by itself.")
    parser.add_argument("--printJobInfo", nargs=1,
                        help="Return information about this job to the user including preceding jobs, "
                             "inputs, outputs, and runtime from the last known run.")
    parser.add_argument("--version", action='version', version=version)

    # Parse options
    options = parseBasicOptions(parser)
    config = Config()
    config.setOptions(options)

    # Load the job store
    jobStore = Toil.resumeJobStore(config.jobStore)

    if options.printJobInfo:
        # NOTE(review): with nargs=1 this value is a one-element list, not a
        # string -- confirm that printContentsOfJobStore accepts that.
        printContentsOfJobStore(jobStorePath=options.jobStore, nameOfJob=options.printJobInfo)

    # TODO: Option to print list of successor jobs
    # TODO: Option to run job within python debugger, allowing step through of arguments
    # idea would be to have option to import pdb and set breakpoint at the start of the user's code

    # nargs=1 wraps the positional in a list; take its only element.
    targetJob = options.jobID[0]
    logger.debug("Going to run the following job locally: %s", targetJob)
    # The same ID is passed as both the job name and the job store ID.
    workerScript(jobStore, config, targetJob, targetJob, redirectOutputToLogFile=False)
    logger.debug("Ran the following job locally: %s", targetJob)
Exemplo n.º 22
0
def main():
    """Reports the state of a Toil workflow.

    Resumes the job store named on the command line, gathers either every job
    reachable from the root job or only those given with ``--jobs``, and then
    prints whichever reports were requested (per-job stats, logs, children,
    dot graph, aggregate summary).

    Exits with status 1 when ``--failIfNotComplete`` is given and at least one
    job is still present in the job store.
    """
    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument("--failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)

    parser.add_argument("--noAggStats", dest="stats", action="store_false",
                        help="Do not print overall, aggregate status of workflow.",
                        default=True)

    parser.add_argument("--printDot", action="store_true",
                        help="Print dot formatted description of the graph. If using --jobs will "
                             "restrict to subgraph including only those jobs. default=%(default)s",
                        default=False)

    parser.add_argument("--jobs", nargs='+',
                        help="Restrict reporting to the following jobs (allows subsetting of the report).",
                        default=None)

    parser.add_argument("--printPerJobStats", action="store_true",
                        help="Print info about each job. default=%(default)s",
                        default=False)

    parser.add_argument("--printLogs", action="store_true",
                        help="Print the log files of jobs (if they exist). default=%(default)s",
                        default=False)

    parser.add_argument("--printChildren", action="store_true",
                        help="Print children of each job. default=%(default)s",
                        default=False)

    parser.add_argument("--version", action='version', version=version)

    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)

    ##########################################
    # Gather the jobs to report
    ##########################################

    # Gather all jobs in the workflow in jobsToReport
    if options.jobs is None:
        rootJob = fetchRootJob(jobStore)
        logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.')
        jobsToReport = traverseJobGraph(rootJob, jobStore)

    # Only gather jobs specified in options.jobs
    else:
        jobsToReport = fetchUserJobs(jobStore, jobs=options.jobs)

    ##########################################
    # Report on the jobs
    ##########################################

    jobStats = report_on_jobs(jobsToReport, jobStore, options)

    hasChildren = jobStats['hasChildren']
    readyToRun = jobStats['readyToRun']
    zombies = jobStats['zombies']
    hasServices = jobStats['hasServices']
    services = jobStats['services']
    hasLogFile = jobStats['hasLogFile']
    properties = jobStats['properties']
    childNumber = jobStats['childNumber']

    if options.printPerJobStats:
        printAggregateJobStats(jobsToReport, properties, childNumber)

    if options.printLogs:
        printJobLog(jobsToReport, jobStore)

    if options.printChildren:
        printJobChildren(jobsToReport)

    if options.printDot:
        print_dot_chart(jobsToReport, jobStore_name=config.jobStore)

    if options.stats:
        print('Of the %i jobs considered, '
           'there are %i jobs with children, '
           '%i jobs ready to run, '
           '%i zombie jobs, '
           '%i jobs with services, '
           '%i services, '
           'and %i jobs with log files currently in %s.' %
            (len(jobsToReport), len(hasChildren), len(readyToRun), len(zombies),
             len(hasServices), len(services), len(hasLogFile), config.jobStore))

    if len(jobsToReport) > 0 and options.failIfNotComplete:
        # Upon workflow completion, all jobs will have been removed from job store
        # sys.exit is used rather than the site-provided exit() helper, which
        # is not guaranteed to exist (e.g. under python -S).
        sys.exit(1)
Exemplo n.º 23
0
def main():
    """Entry point of a worker process: run one job and any chainable successors.

    Command-line arguments (read directly from ``sys.argv``):

    * ``sys.argv[1]`` -- locator passed to ``Toil.resumeJobStore``.
    * ``sys.argv[2]`` -- job store ID of the jobGraph to load and run.

    Outline of what the function does, in order:

    1. Resumes the job store and reads its ``config``.
    2. If ``config.badWorker`` is set, may start a thread that SIGKILLs this
       process after a random delay.
    3. Restores the environment saved in the shared file
       ``environment.pickle`` and extends ``sys.path`` from ``PYTHONPATH``.
    4. Creates a per-worker temporary directory and redirects file
       descriptors 1 and 2 into ``worker_log.txt`` inside it.
    5. Loads the jobGraph, cleans up state from earlier attempts, handles
       checkpoint restarts, then runs the job and keeps chaining single
       successors for as long as the checks in the loop allow.
    6. Restores the original stdout/stderr, stores the worker log in the job
       store on failure, reports stats/log messages, removes the temporary
       directory according to ``config.cleanWorkDir`` and deletes the
       jobGraph if nothing is left to do for it.

    NOTE(review): names such as ``Toil``, ``FileStore``, ``cPickle``,
    ``logger``, ``MagicExpando``, ``Expando``, ``nextOpenDescriptor``,
    ``logProcessContext`` and ``logFileByteReportLimit`` are not bound inside
    this function; presumably they are module-level names -- confirm.
    """
    logging.basicConfig()

    ##########################################
    #Import necessary modules
    ##########################################

    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)

    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.job import Job
    try:
        import boto
    except ImportError:
        # boto is optional; without it there is nothing to patch.
        pass
    else:
        # boto is installed, monkey patch it now
        from bd2k.util.ec2.credentials import enable_metadata_credential_caching
        enable_metadata_credential_caching()
    ##########################################
    #Input args
    ##########################################

    # No validation is performed: a missing argument raises IndexError here.
    jobStoreLocator = sys.argv[1]
    jobStoreID = sys.argv[2]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(jobStoreLocator)
    config = jobStore.config

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    # With probability config.badWorker this worker is made to kill itself.
    if config.badWorker > 0 and random.random() < config.badWorker:

        def badWorker():
            #This will randomly kill the worker process at a random time
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL)  #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)

        t = Thread(target=badWorker)
        # Ideally this would be a daemon thread but that causes an intermittent (but benign)
        # exception similar to the one described here:
        # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd
        # Our exception is:
        #    Exception in thread Thread-1 (most likely raised during interpreter shutdown):
        #    <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill'
        # This attribute error is caused by the call os.kill() and apparently unavoidable with a
        # daemon
        t.start()

    ##########################################
    #Load the environment for the jobGraph
    ##########################################

    #First load the environment for the jobGraph.
    # NOTE(review): unpickling executes arbitrary code if the stream is
    # attacker-controlled; this assumes the job store is trusted -- confirm.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    # Copy every saved variable except those naming host-specific settings.
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################

    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    # Python 2 octal literal: rwxr-xr-x.
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)

    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)

    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...

    #Close the descriptor we used to open the file
    os.close(logFh)

    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in list(logger.handlers):  #Remove old handlers
        logger.removeHandler(handler)

    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    # Decides further down whether the whole worker log is reported back.
    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    # Default no-op; replaced by fileStore._blockFn once a file store exists.
    blockFn = lambda: True
    # NOTE(review): cleanCacheFn is not referenced again within this function.
    cleanCacheFn = lambda x: True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()

        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        logProcessContext(config, logger)

        ##########################################
        #Load the jobGraph
        ##########################################

        jobGraph = jobStore.load(jobStoreID)
        logger.debug("Parsed jobGraph")

        ##########################################
        #Cleanup from any earlier invocation of the jobGraph
        ##########################################

        if jobGraph.command == None:
            # Cleanup jobs already finished
            # f keeps, within each inner list, only the entries whose job
            # still exists in the job store, then drops inner lists that end
            # up empty. It relies on Python 2 map/filter returning lists.
            f = lambda jobs: filter(
                lambda x: len(x) > 0,
                map(
                    lambda x: filter(lambda y: jobStore.exists(y.jobStoreID), x
                                     ), jobs))
            jobGraph.stack = f(jobGraph.stack)
            jobGraph.services = f(jobGraph.services)
            logger.debug(
                "Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may
        #have been left if the job is being retried after a job failure.
        oldLogFile = jobGraph.logJobStoreFileID
        if oldLogFile != None:
            jobGraph.logJobStoreFileID = None
            jobStore.update(jobGraph)  #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobGraph.checkpoint != None:
            logger.debug("Job is a checkpoint")
            if len(jobGraph.stack) > 0 or len(
                    jobGraph.services) > 0 or jobGraph.command != None:
                if jobGraph.command != None:
                    assert jobGraph.command == jobGraph.checkpoint
                    logger.debug(
                        "Checkpoint job already has command set to run")
                else:
                    # Re-arm the job with the command saved as its checkpoint.
                    jobGraph.command = jobGraph.checkpoint

                # Reduce the retry count
                assert jobGraph.remainingRetryCount >= 0
                jobGraph.remainingRetryCount = max(
                    0, jobGraph.remainingRetryCount - 1)

                jobStore.update(
                    jobGraph)  # Update immediately to ensure that checkpoint
                # is made before deleting any remaining successors

                if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0:
                    # If the subtree of successors is not complete restart everything
                    logger.debug(
                        "Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s "
                        % (jobGraph.stack, jobGraph.services))

                    # Delete everything on the stack, as these represent successors to clean
                    # up as we restart the queue
                    def recursiveDelete(jobGraph2):
                        # Recursive walk the stack to delete all remaining jobs
                        for jobs in jobGraph2.stack + jobGraph2.services:
                            for jobNode in jobs:
                                if jobStore.exists(jobNode.jobStoreID):
                                    recursiveDelete(
                                        jobStore.load(jobNode.jobStoreID))
                                else:
                                    logger.debug(
                                        "Job %s has already been deleted",
                                        jobNode)
                        # The checkpoint job itself (the outer jobGraph) is kept.
                        if jobGraph2 != jobGraph:
                            logger.debug(
                                "Checkpoint is deleting old successor job: %s",
                                jobGraph2.jobStoreID)
                            jobStore.delete(jobGraph2.jobStoreID)

                    recursiveDelete(jobGraph)

                    jobGraph.stack = [
                        [], []
                    ]  # Initialise the job to mimic the state of a job
                    # that has been previously serialised but which as yet has no successors

                    jobGraph.services = []  # Empty the services

                    # Update the jobStore to avoid doing this twice on failure and make this clean.
                    jobStore.update(jobGraph)

            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug(
                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
                )
                #Delete any remnant files
                # map is used for its side effect; this relies on Python 2's
                # eager map (a lazy map would delete nothing).
                map(
                    jobStore.deleteFile,
                    filter(jobStore.fileExists,
                           jobGraph.checkpointFilesToDelete))

        ##########################################
        #Setup the stats, if requested
        ##########################################

        # NOTE(review): startTime is assigned again unconditionally a few
        # lines below, so only startClock is specific to this branch.
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobGraph
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        startTime = time.time()
        # Each iteration runs the current command, then tries to adopt a
        # single successor; any condition that prevents chaining breaks out.
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################

            if jobGraph.command is not None:
                assert jobGraph.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = FileStore.createFileStore(
                    jobStore,
                    jobGraph,
                    localWorkerTempDir,
                    blockFn,
                    caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobGraph=jobGraph,
                                    jobStore=jobStore,
                                    fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                break

            # Raising here routes into the failure handling in the except
            # clause below.
            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################

            #If no more jobs to run or services not finished, quit
            if len(jobGraph.stack) == 0 or len(
                    jobGraph.services) > 0 or jobGraph.checkpoint != None:
                logger.debug(
                    "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                    len(jobGraph.stack), len(jobGraph.services),
                    jobGraph.checkpoint != None)
                break

            #Get the next set of jobs to run
            jobs = jobGraph.stack[-1]
            assert len(jobs) > 0

            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug(
                    "No more jobs can run in series by this worker,"
                    " it's got %i children",
                    len(jobs) - 1)
                break

            #We check the requirements of the jobGraph to see if we can run it
            #within the current worker
            successorJobNode = jobs[0]
            if successorJobNode.memory > jobGraph.memory:
                logger.debug(
                    "We need more memory for the next job, so finishing")
                break
            if successorJobNode.cores > jobGraph.cores:
                logger.debug(
                    "We need more cores for the next job, so finishing")
                break
            if successorJobNode.disk > jobGraph.disk:
                logger.debug(
                    "We need more disk for the next job, so finishing")
                break
            if successorJobNode.predecessorNumber > 1:
                logger.debug(
                    "The jobGraph has multiple predecessors, we must return to the leader."
                )
                break

            # Load the successor jobGraph
            successorJobGraph = jobStore.load(successorJobNode.jobStoreID)

            # Somewhat ugly, but check if job is a checkpoint job and quit if
            # so
            if successorJobGraph.command.startswith("_toil "):
                #Load the job
                successorJob = Job._loadJob(successorJobGraph.command,
                                            jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################

            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)

            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #These should all match up
            assert successorJobGraph.memory == successorJobNode.memory
            assert successorJobGraph.cores == successorJobNode.cores
            assert successorJobGraph.predecessorsFinished == set()
            assert successorJobGraph.predecessorNumber == 1
            assert successorJobGraph.command is not None
            assert successorJobGraph.jobStoreID == successorJobNode.jobStoreID

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack
            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores

            #Build a fileStore to update the job
            fileStore = FileStore.createFileStore(
                jobStore,
                jobGraph,
                localWorkerTempDir,
                blockFn,
                caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore._blockFn

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)

            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()

            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)

            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info(
            "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
            localWorkerTempDir)
        logger.info(
            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
            time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    # NOTE(review): a bare except also traps SystemExit and KeyboardInterrupt;
    # the failure is recorded through the terminate event rather than re-raised.
    except:  #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s",
                     socket.gethostname())
        FileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if FileStore._terminateEvent.isSet():
        # Reload the stored copy of the job rather than reusing the in-memory
        # object, then let it record the failure.
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################

    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(
            jobGraph.jobStoreID)
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                # Only the tail of an oversized log is copied (whence=2 seeks
                # relative to the end of the file).
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                    f.seek(-logFileByteReportLimit,
                           2)  # seek to last tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobGraph)

    elif debugging:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                logFile.seek(-logFileByteReportLimit,
                             2)  # seek to last tooBig bytes of file
            logMessages = logFile.read().splitlines()
        # One entry per log line, each tagged with this worker's job store ID.
        statsDict.logs = [
            Expando(jobStoreID=jobStoreID, text=logMessage)
            for logMessage in logMessages
        ]

    if (debugging or config.stats or statsDict.workers.logsToMaster
        ) and not workerFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess'
                               and not workerFailed) or (cleanUp == 'onError'
                                                         and workerFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobGraph.command == None and len(
            jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)
Exemplo n.º 24
0
def main():
    """Report the state of a Toil workflow held in a job store.

    Parses the command line, resumes the job store named by the ``jobStore``
    positional argument, walks the job graph from the root job and logs
    counts of unfinished jobs, jobs with children, jobs with services,
    service jobs and jobs that left a log file. With ``--verbose`` the log
    files of those jobs are printed as well.

    Exit behaviour:

    * Called without any arguments: prints the usage text and exits with 0.
    * Root job cannot be loaded (``JobException``): prints a message to
      stderr and exits with 0.
    * ``--failIfNotComplete`` given and jobs remain in the job store: exits
      with 1.
    """

    ##########################################
    # Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s",
                      default=False)

    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                      default=False)
    parser.add_argument("--version", action='version', version=version)

    # Show the usage text when invoked with no arguments. This has to happen
    # before the options are parsed: "jobStore" is a required positional, so
    # (assuming parseBasicOptions parses sys.argv with this parser) parsing an
    # empty command line would exit with an error before help could be shown.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    ##########################################
    # Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None
    config = Config()
    config.setOptions(options)

    ##########################################
    # Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(config.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    def traverseGraph(jobGraph):
        """Return every job reachable from jobGraph that still exists in the job store."""
        foundJobStoreIDs = set()
        totalJobs = []

        def inner(jobGraph):
            # Each job is visited once, even if reachable along several paths.
            if jobGraph.jobStoreID in foundJobStoreIDs:
                return
            foundJobStoreIDs.add(jobGraph.jobStoreID)
            totalJobs.append(jobGraph)

            # Traverse jobs in stack
            for jobs in jobGraph.stack:
                for successor in jobs:
                    successorJobStoreID = successor.jobStoreID
                    if (successorJobStoreID not in foundJobStoreIDs
                            and jobStore.exists(successorJobStoreID)):
                        inner(jobStore.load(successorJobStoreID))

            # Traverse service jobs; these are recorded but not descended into.
            for jobs in jobGraph.services:
                for service in jobs:
                    serviceJobStoreID = service.jobStoreID
                    if jobStore.exists(serviceJobStoreID):
                        assert serviceJobStoreID not in foundJobStoreIDs
                        foundJobStoreIDs.add(serviceJobStoreID)
                        totalJobs.append(jobStore.load(serviceJobStoreID))

        inner(jobGraph)
        return totalJobs

    logger.info('Traversing the job graph. This may take a couple minutes.')
    totalJobs = traverseGraph(rootJob)

    failedJobs = []
    hasChildren = []
    hasServices = []
    services = []
    currentlyRunning = []

    for job in totalJobs:
        # A stored log file is what marks a job as failed in this report.
        if job.logJobStoreFileID is not None:
            failedJobs.append(job)
        if job.stack:
            hasChildren.append(job)
        # NOTE(review): logJobStoreFileID is compared against None above, so
        # the "!= 0" test below looks like it is always true; the intended
        # check could not be established from this file, so it is unchanged.
        elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command:
            # The job has no children, hasn't failed, and has a command to run. This indicates that the job is
            # likely currently running, or at least could be run.
            currentlyRunning.append(job)
        if job.services:
            hasServices.append(job)
        if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID:
            # these attributes are only set in service jobs
            services.append(job)

    logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, '
                'and %i totally failed jobs currently in %s.',
                len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs),
                config.jobStore)

    if currentlyRunning:
        logger.info('These %i jobs are currently active: %s',
                    len(currentlyRunning), ' \n'.join(map(str, currentlyRunning)))

    if options.verbose:  # Verbose currently means outputting the files that have failed.
        if failedJobs:
            # Collect the pieces and join once instead of growing a string.
            parts = ["Outputting logs for the %i failed jobs" % (len(failedJobs)),
                     ": %s" % ", ".join(str(failedJob) for failedJob in failedJobs)]
            for jobNode in failedJobs:
                # Load a fresh copy of the job from the store before reading its log.
                failedJob = jobStore.load(jobNode.jobStoreID)
                parts.append("\n=========> Failed job %s \n" % jobNode)
                with failedJob.getLogFileHandle(jobStore) as fH:
                    parts.append(fH.read())
                parts.append("<=========\n")
            print("".join(parts))
        else:
            print('There are no failed jobs to report.', file=sys.stderr)

    if totalJobs and options.failIfNotComplete:
        # When the workflow is complete, all jobs will have been removed from
        # the job store. sys.exit is used rather than the builtin exit(),
        # which is only installed by the site module.
        sys.exit(1)
Exemplo n.º 25
0
def main():
    """
    Entry point of a Toil worker process.

    Reads two positional command line arguments: ``sys.argv[1]`` is the job store
    locator and ``sys.argv[2]`` is the ID of the job graph to run.

    The function proceeds in these stages:

    1. Resume the job store and read its config.
    2. If ``config.badWorker`` is set, possibly start a thread that SIGKILLs this
       process after a random delay.
    3. Apply the environment stored in the shared file ``environment.pickle``.
    4. Create a per-worker temp dir and redirect file descriptors 1 and 2 into
       ``worker_log.txt`` inside it.
    5. Load the job graph, clean up state left by earlier attempts, and handle
       checkpoint restarts.
    6. Run the job's command and keep chaining single successors into this process
       for as long as the checks in the loop allow.
    7. Restore the original stdout/stderr, publish the log (on failure) or
       stats/log messages (otherwise) to the job store, optionally remove the temp
       dir, and delete the job graph if it is completely finished.

    Any exception raised in stages 5-6 is caught, printed to the worker log, and
    turned into a "worker failed" state via ``FileStore._terminateEvent``.
    """
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.job import Job
    try:
        import boto
    except ImportError:
        pass
    else:
        # boto is installed, monkey patch it now
        from bd2k.util.ec2.credentials import enable_metadata_credential_caching
        enable_metadata_credential_caching()
    ##########################################
    #Input args
    ##########################################
    
    jobStoreLocator = sys.argv[1]
    jobStoreID = sys.argv[2]
    # we really want a list of job names but the ID will suffice if the job graph can't
    # be loaded. If we can discover the name, we will replace this initial entry
    listOfJobs = [jobStoreID]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = Toil.resumeJobStore(jobStoreLocator)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    # Limit used at the end of this function when copying the worker log; the sign
    # selects which seek() branch is taken and 0 disables truncation.
    logFileByteReportLimit = config.maxLogFileSize

    # config.badWorker is compared against random.random(), i.e. it acts as the
    # probability that this worker gets a killer thread.
    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will randomly kill the worker process at a random time 
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        # Ideally this would be a daemon thread but that causes an intermittent (but benign)
        # exception similar to the one described here:
        # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd
        # Our exception is:
        #    Exception in thread Thread-1 (most likely raised during interpreter shutdown):
        #    <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill'
        # This attribute error is caused by the call os.kill() and apparently unavoidable with a
        # daemon
        t.start()

    ##########################################
    #Load the environment for the jobGraph
    ##########################################
    
    #First load the environment for the jobGraph.
    # NOTE(review): unpickling executes arbitrary code from the stream; this presumes
    # the job store contents are trusted - confirm.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    # Copy every stored variable into this process, except the host-specific ones
    # listed below.
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    # Remember whether DEBUG logging is on; it decides below whether the worker log
    # is shipped back with the stats even when the worker succeeded.
    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    # Default no-op; replaced by fileStore._blockFn once a file store has been
    # created, and called after the try block to wait for pending updates.
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        logProcessContext(config)

        ##########################################
        #Load the jobGraph
        ##########################################
        
        jobGraph = jobStore.load(jobStoreID)
        listOfJobs[0] = str(jobGraph)
        logger.debug("Parsed jobGraph")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobGraph
        ##########################################
        
        if jobGraph.command == None:
            # Cleanup jobs already finished
            # f drops job nodes that no longer exist in the job store and then drops
            # any inner list that became empty.
            # NOTE(review): this relies on filter()/map() returning lists (len() is
            # applied to the inner filter result); on Python 3 they are lazy
            # iterators - confirm which interpreter this runs on.
            f = lambda jobs : filter(lambda x : len(x) > 0, map(lambda x :
                                    filter(lambda y : jobStore.exists(y.jobStoreID), x), jobs))
            jobGraph.stack = f(jobGraph.stack)
            jobGraph.services = f(jobGraph.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may 
        #have been left if the job is being retried after a job failure.
        oldLogFile = jobGraph.logJobStoreFileID
        if oldLogFile != None:
            jobGraph.logJobStoreFileID = None
            jobStore.update(jobGraph) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobGraph.checkpoint != None:
            logger.debug("Job is a checkpoint")
            if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0 or jobGraph.command != None:
                if jobGraph.command != None:
                    assert jobGraph.command == jobGraph.checkpoint
                    logger.debug("Checkpoint job already has command set to run")
                else:
                    jobGraph.command = jobGraph.checkpoint

                jobStore.update(jobGraph) # Update immediately to ensure that checkpoint
                # is made before deleting any remaining successors

                if len(jobGraph.stack) > 0 or len(jobGraph.services) > 0:
                    # If the subtree of successors is not complete restart everything
                    logger.debug("Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " %
                                 (jobGraph.stack, jobGraph.services))

                    # Delete everything on the stack, as these represent successors to clean
                    # up as we restart the queue
                    def recursiveDelete(jobGraph2):
                        # Recursive walk the stack to delete all remaining jobs
                        for jobs in jobGraph2.stack + jobGraph2.services:
                            for jobNode in jobs:
                                if jobStore.exists(jobNode.jobStoreID):
                                    recursiveDelete(jobStore.load(jobNode.jobStoreID))
                                else:
                                    logger.debug("Job %s has already been deleted", jobNode)
                        # The checkpoint job itself (the root of this walk) is kept;
                        # only its descendants are deleted.
                        if jobGraph2 != jobGraph:
                            logger.debug("Checkpoint is deleting old successor job: %s", jobGraph2.jobStoreID)
                            jobStore.delete(jobGraph2.jobStoreID)
                    recursiveDelete(jobGraph)

                    jobGraph.stack = [ [], [] ] # Initialise the job to mimic the state of a job
                    # that has been previously serialised but which as yet has no successors

                    jobGraph.services = [] # Empty the services

                    # Update the jobStore to avoid doing this twice on failure and make this clean.
                    jobStore.update(jobGraph)

            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files
                # NOTE(review): map() is used only for its side effect; on Python 3 it
                # is lazy and nothing would be deleted - confirm the interpreter.
                map(jobStore.deleteFile, filter(jobStore.fileExists, jobGraph.checkpointFilesToDelete))

        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobGraph
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        # startTime is (re)assigned unconditionally here, so the assignment inside the
        # config.stats branch above is overwritten; startClock exists only when
        # config.stats is set.
        startTime = time.time()
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################
            
            if jobGraph.command is not None:
                assert jobGraph.command.startswith( "_toil " )
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                      caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobGraph=jobGraph, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                break
            
            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################
            
            #If no more jobs to run or services not finished, quit
            if len(jobGraph.stack) == 0 or len(jobGraph.services) > 0 or jobGraph.checkpoint != None:
                logger.debug("Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                             len(jobGraph.stack), len(jobGraph.services), jobGraph.checkpoint != None)
                break
            
            #Get the next set of jobs to run
            jobs = jobGraph.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobGraph to see if we can run it
            #within the current worker
            successorJobNode = jobs[0]
            if successorJobNode.memory > jobGraph.memory:
                logger.debug("We need more memory for the next job, so finishing")
                break
            if successorJobNode.cores > jobGraph.cores:
                logger.debug("We need more cores for the next job, so finishing")
                break
            if successorJobNode.disk > jobGraph.disk:
                logger.debug("We need more disk for the next job, so finishing")
                break
            if successorJobNode.preemptable != jobGraph.preemptable:
                logger.debug("Preemptability is different for the next job, returning to the leader")
                break
            if successorJobNode.predecessorNumber > 1:
                logger.debug("The jobGraph has multiple predecessors, we must return to the leader.")
                break

            # Load the successor jobGraph
            successorJobGraph = jobStore.load(successorJobNode.jobStoreID)

            # add the successor to the list of jobs run
            listOfJobs.append(str(successorJobGraph))

            # Somewhat ugly, but check if job is a checkpoint job and quit if
            # so
            if successorJobGraph.command.startswith( "_toil " ):
                #Load the job
                successorJob = Job._loadJob(successorJobGraph.command, jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################
            
            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)
            
            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #These should all match up
            assert successorJobGraph.memory == successorJobNode.memory
            assert successorJobGraph.cores == successorJobNode.cores
            assert successorJobGraph.predecessorsFinished == set()
            assert successorJobGraph.predecessorNumber == 1
            assert successorJobGraph.command is not None
            assert successorJobGraph.jobStoreID == successorJobNode.jobStoreID

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack
            # include some attributes for better identification of chained jobs in
            # logging output
            jobGraph.unitName = successorJobGraph.unitName
            jobGraph.jobName = successorJobGraph.jobName
            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores
            
            #Build a fileStore to update the job
            fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                  caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore._blockFn

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()
            
            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)
            
            logger.debug("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    # NOTE(review): the bare except also traps SystemExit and KeyboardInterrupt, not
    # just job errors - confirm this is intended.
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
        FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if FileStore._terminateEvent.isSet():
        # Reload the job graph from the job store rather than reusing the in-memory
        # copy, then let it record the failure.
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobGraph.jobStoreID)
        jobGraph.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                # Chained comparison: true when size > limit AND limit != 0.
                # NOTE(review): the limit < 0 branch seeks to a negative absolute
                # offset, which looks invalid for seek(); and end-relative seeks on a
                # text-mode file are rejected on Python 3 - confirm intended behavior.
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobGraph)

    elif debugging:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            # Same truncation rule as in the failure branch above.
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobGraph.command == None and len(jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)
Exemplo n.º 26
0
def main():
    """Report the state of a Toil workflow held in a job store.

    Command line: a positional ``jobStore`` locator plus the optional flags
    ``--verbose``, ``--failIfNotComplete`` and ``--version``.

    The job graph reachable from the root job is walked and a summary is
    logged: the number of unfinished jobs, jobs with children, jobs with
    services, service jobs, and jobs that have a failure log.  With
    ``--verbose`` the logs of the failed jobs are printed to stdout.

    Exit status:
      * 0 if the root job cannot be loaded (``JobException``), which is
        reported as a successfully completed workflow;
      * 1 if ``--failIfNotComplete`` was given and any job is still present
        in the job store;
      * otherwise the function simply returns.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help="The location of a job store that holds the information about the "
        "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument(
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s",
        default=False)

    parser.add_argument(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        help=
        "Return exit value of 1 if toil jobs not all completed. default=%(default)s",
        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    # NOTE(review): this check runs after parsing. If parseBasicOptions already
    # rejects an empty command line (the positional jobStore is required) it
    # can never fire - confirm, and consider moving it above the parse.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None
    config = Config()
    config.setOptions(options)

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(config.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print(
            'The root job of the job store is absent, the workflow completed successfully.',
            file=sys.stderr)
        sys.exit(0)

    def traverseGraph(jobGraph):
        """Return every job graph reachable from jobGraph that still exists.

        Successors on each job's stack are followed recursively; service jobs
        are loaded and recorded but not descended into.  Each job store ID is
        visited at most once.
        """
        foundJobStoreIDs = set()
        totalJobs = []

        def inner(jobGraph):
            if jobGraph.jobStoreID in foundJobStoreIDs:
                return
            foundJobStoreIDs.add(jobGraph.jobStoreID)
            totalJobs.append(jobGraph)
            # Traverse jobs in stack
            for jobs in jobGraph.stack:
                for successor in jobs:
                    successorJobStoreID = successor.jobStoreID
                    if (successorJobStoreID not in foundJobStoreIDs
                            and jobStore.exists(successorJobStoreID)):
                        inner(jobStore.load(successorJobStoreID))

            # Traverse service jobs
            for jobs in jobGraph.services:
                for service in jobs:
                    serviceJobStoreID = service.jobStoreID
                    if jobStore.exists(serviceJobStoreID):
                        assert serviceJobStoreID not in foundJobStoreIDs
                        foundJobStoreIDs.add(serviceJobStoreID)
                        totalJobs.append(jobStore.load(serviceJobStoreID))

        inner(jobGraph)
        return totalJobs

    logger.info('Traversing the job graph. This may take a couple minutes.')
    totalJobs = traverseGraph(rootJob)

    failedJobs = []
    hasChildren = []
    hasServices = []
    services = []
    currentlyRunning = []

    # A job may land in several categories; only "has children" and
    # "currently running" are mutually exclusive.
    for job in totalJobs:
        if job.logJobStoreFileID is not None:
            # A stored log file marks a job that has failed at least once.
            failedJobs.append(job)
        if job.stack:
            hasChildren.append(job)
        elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command:
            # The job has no children, hasn't failed, and has a command to run. This indicates that the job is
            # likely currently running, or at least could be run.
            currentlyRunning.append(job)
        if job.services:
            hasServices.append(job)
        if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID:
            # these attributes are only set in service jobs
            services.append(job)

    logger.info(
        'There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, '
        'and %i totally failed jobs currently in %s.',
        len(totalJobs), len(hasChildren), len(hasServices), len(services),
        len(failedJobs), config.jobStore)

    if currentlyRunning:
        logger.info('These %i jobs are currently active: %s',
                    len(currentlyRunning),
                    ' \n'.join(map(str, currentlyRunning)))

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        if failedJobs:
            # Collect the pieces and join once instead of growing a string
            # with repeated "+=" for every (potentially large) log file.
            parts = ["Outputting logs for the %i failed jobs" % len(failedJobs),
                     ": %s" % ", ".join(map(str, failedJobs))]
            for jobNode in failedJobs:
                job = jobStore.load(jobNode.jobStoreID)
                parts.append("\n=========> Failed job %s \n" % jobNode)
                with job.getLogFileHandle(jobStore) as fH:
                    parts.append(fH.read())
                parts.append("<=========\n")
            print("".join(parts))
        else:
            print('There are no failed jobs to report.', file=sys.stderr)

    if totalJobs and options.failIfNotComplete:
        # when the workflow is complete, all jobs will have been removed from job store
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module for interactive use and is not guaranteed to exist.
        sys.exit(1)
Exemplo n.º 27
0
def main():
    """Print a summary of the state of a Toil workflow.

    Takes a positional ``jobStore`` locator and the optional flags
    ``--verbose``, ``--failIfNotComplete`` and ``--version`` from the command
    line. A one-line summary of active, parent and failed jobs is written to
    stderr; with ``--verbose`` the log of every failed job is streamed through
    the logger. Exits with status 1 when ``--failIfNotComplete`` is set and
    jobs remain, and with status 0 when the root job is absent.
    """

    # ---- Command line -------------------------------------------------
    parser = getBasicOptionParser()

    jobStoreHelp = ("The location of a job store that holds the information about the "
                    "workflow whose status is to be reported on." + jobStoreLocatorHelp)
    parser.add_argument("jobStore", type=str, help=jobStoreHelp)

    parser.add_argument(
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
        help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s")

    parser.add_argument(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        default=False,
        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s")

    parser.add_argument("--version", action='version', version=version)

    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    # Invoked without any argument: show the usage text and stop.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # ---- Sanity checks ------------------------------------------------
    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None

    # ---- Inspect the workflow -----------------------------------------
    jobStore = Toil.resumeJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        # No root job left in the store: report success and stop.
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    state = ToilState(jobStore, rootJob)

    # Each entry of state.updatedJobs is a tuple whose first element is the
    # job wrapper to inspect.
    updatedWrappers = {entry[0] for entry in state.updatedJobs}
    allJobs = set(state.successorCounts.keys()) | updatedWrappers

    # Jobs with no retries left count as totally failed.
    exhausted = [candidate for candidate in allJobs
                 if candidate.remainingRetryCount == 0]

    activeCount = len(state.updatedJobs)
    parentCount = len(state.successorCounts)
    summary = ('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
               'currently in %s.' % (activeCount, parentCount, len(exhausted), options.jobStore))
    print(summary, file=sys.stderr)

    # Verbose currently means dumping the log files of the failed jobs.
    if options.verbose:
        for failed in exhausted:
            if failed.logJobStoreFileID is None:
                print('Log file for job %s is absent.' % failed.jobStoreID, file=sys.stderr)
                continue
            with failed.getLogFileHandle(jobStore) as logFileHandle:
                logStream(logFileHandle, failed.jobStoreID, logger.warn)
        if not exhausted:
            print('There are no failed jobs to report.', file=sys.stderr)

    # Anything still active or waiting on successors means "not complete".
    remaining = len(state.updatedJobs) + len(state.successorCounts)
    if remaining != 0 and options.failIfNotComplete:
        sys.exit(1)
Exemplo n.º 28
0
def main():
    """Report the state of the Toil workflow held in a job store.

    Parses the command line (a positional ``jobStore`` locator plus the
    ``--verbose``, ``--failIfNotComplete`` and ``--version`` options),
    resumes the job store, and prints a summary of active, parent and
    totally failed jobs to stderr. With ``--verbose`` the log of every
    totally failed job is streamed through the module logger as well.

    Exit behaviour:

    * exits with 0 after printing help when invoked with no arguments;
    * exits with 0 when the root job cannot be loaded, which is reported
      as the workflow having completed successfully;
    * exits with 1 when ``--failIfNotComplete`` is given and there are still
      updated jobs or jobs with outstanding successors;
    * otherwise returns normally.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help="The location of a job store that holds the information about the "
        "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument(
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s",
        default=False)

    parser.add_argument(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        help=
        "Return exit value of 1 if toil jobs not all completed. default=%(default)s",
        default=False)
    parser.add_argument("--version", action='version', version=version)

    # Show the help text when invoked without any arguments. This has to
    # happen before parsing: "jobStore" is a required positional, so an empty
    # command line would presumably be rejected with a usage error inside
    # parseBasicOptions and the help would never be printed.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    # Sanity check only; the positional argument is mandatory.
    assert options.jobStore is not None

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        # No root job is reported as a successfully completed workflow.
        print(
            'The root job of the job store is absent, the workflow completed successfully.',
            file=sys.stderr)
        sys.exit(0)

    toilState = ToilState(jobStore, rootJob)

    # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    # A job with no retries left is considered totally failed.
    failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0]

    print(
        'There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
        'currently in %s.' %
        (len(toilState.updatedJobs), len(
            toilState.successorCounts), len(failedJobs), options.jobStore),
        file=sys.stderr)

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    # logger.warning replaces the deprecated logger.warn alias.
                    logStream(logFileHandle, job.jobStoreID, logger.warning)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID,
                      file=sys.stderr)
        if not failedJobs:
            print('There are no failed jobs to report.', file=sys.stderr)

    # Any remaining updated job or parent with outstanding successors means
    # the workflow has not finished.
    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
        options.failIfNotComplete:
        sys.exit(1)