예제 #1
0
def main():
    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help=(
            "Store in which to place job management files \
              and the global accessed temporary files"
            "(If this is a file path this needs to be globally accessible "
            "by all machines running jobs).\n"
            "If the store already exists and restart is false an"
            " ExistingJobStoreException exception will be thrown."
        ),
    )
    parser.add_argument("--version", action="version", version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)

    logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore)
    ####This behaviour is now broken
    batchSystem = Toil.createBatchSystem(
        jobStore.config
    )  # This should automatically kill the existing jobs.. so we're good.
    for jobID in batchSystem.getIssuedBatchJobIDs():  # Just in case we do it again.
        batchSystem.killBatchJobs(jobID)
    logger.info("All jobs SHOULD have been killed")
예제 #2
0
def main():
    """Removes the JobStore from a toil run.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                      help=("Store in which to place job management files \
                      and the global accessed temporary files"
                      "(If this is a file path this needs to be globally accessible "
                      "by all machines running jobs).\n"
                      "If the store already exists and restart is false an"
                      " JobStoreCreationException exception will be thrown."))
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    ##########################################
    #Survey the status of the job and report.
    ##########################################
    logger.info("Checking if we have files for toil")
    try:
        jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    except JobStoreCreationException:
        logger.info("The specified JobStore does not exist, it may have already been deleted")
        sys.exit(0)

    logger.info("Attempting to delete the job store")
    jobStore.deleteJobStore()
    logger.info("Successfully deleted the job store")
예제 #3
0
def main():
    """Removes the JobStore from a toil run.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                      help=("Store in which to place job management files \
                      and the global accessed temporary files"
                      "(If this is a file path this needs to be globally accessible "
                      "by all machines running jobs).\n"
                      "If the store already exists and restart is false an"
                      " ExistingJobStoreException exception will be thrown."))
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    ##########################################
    #Survey the status of the job and report.
    ##########################################
    logger.info("Checking if we have files for toil")
    try:
        jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    except JobStoreCreationException:
        logger.info("The specified JobStore does not exist, it may have already been deleted")
        sys.exit(0)

    logger.info("Deleting the JobStore")
    jobStore.deleteJobStore()
예제 #4
0
def main():
    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help=("Store in which to place job management files \
              and the global accessed temporary files"
              "(If this is a file path this needs to be globally accessible "
              "by all machines running jobs).\n"
              "If the store already exists and restart is false an"
              " JobStoreCreationException exception will be thrown."))
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)

    logger.info(
        "Starting routine to kill running jobs in the toil workflow: %s" %
        options.jobStore)
    ####This behaviour is now broken
    batchSystem = Toil.createBatchSystem(
        jobStore.config
    )  #This should automatically kill the existing jobs.. so we're good.
    for jobID in batchSystem.getIssuedBatchJobIDs(
    ):  #Just in case we do it again.
        batchSystem.killBatchJobs(jobID)
    logger.info("All jobs SHOULD have been killed")
예제 #5
0
def main():
    """ Reports stats on the workflow, use with --stats option to toil.
    """
    parser = getBasicOptionParser()
    initializeOptions(parser)
    options = parseBasicOptions(parser)
    checkOptions(options, parser)
    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    stats = getStats(options)
    collatedStatsTag = processData(jobStore.config, stats, options)
    reportData(collatedStatsTag, options)
예제 #6
0
파일: toilStats.py 프로젝트: PureQsh/toil
def main():
    """ Reports stats on the workflow, use with --stats option to toil.
    """
    parser = getBasicOptionParser()
    initializeOptions(parser)
    options = parseBasicOptions(parser)
    checkOptions(options, parser)
    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    stats = getStats(options)
    collatedStatsTag = processData(jobStore.config, stats, options)
    reportData(collatedStatsTag, options)
예제 #7
0
    def testMultipleJobsPerWorkerStats(self):
        """
        Tests case where multiple jobs are run on 1 worker to insure that all jobs report back their data
        """
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.clean = "never"
        options.stats = True
        Job.Runner.startToil(RunTwoJobsPerWorker(), options)

        jobStore = Toil.loadOrCreateJobStore(options.jobStore)
        stats = getStats(options)
        collatedStats = processData(jobStore.config, stats, options)
        self.assertTrue(len(collatedStats.job_types) == 2, "Some jobs are not represented in the stats")
예제 #8
0
    def testMultipleJobsPerWorkerStats(self):
        """
        Tests case where multiple jobs are run on 1 worker to insure that all jobs report back their data
        """
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.clean = 'never'
        options.stats = True
        Job.Runner.startToil(RunTwoJobsPerWorker(), options)

        jobStore = Toil.loadOrCreateJobStore(options.jobStore)
        stats = getStats(options)
        collatedStats = processData(jobStore.config, stats, options)
        self.assertTrue(
            len(collatedStats.job_types) == 2,
            "Some jobs are not represented in the stats")
예제 #9
0
def getStats(options):
    """ Collect and return the stats and config data.
    """
    def aggregateStats(fileHandle,aggregateObject):
        try:
            stats = json.load(fileHandle, object_hook=Expando)
            for key in stats.keys():
                if key in aggregateObject:
                    aggregateObject[key].append(stats[key])
                else:
                    aggregateObject[key]=[stats[key]]
        except ValueError:
            logger.critical("File %s contains corrupted json. Skipping file." % fileHandle)
            pass  # The file is corrupted.

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    aggregateObject = Expando()
    callBack = partial(aggregateStats, aggregateObject=aggregateObject)
    jobStore.readStatsAndLogging(callBack, readAll=True)
    return aggregateObject
예제 #10
0
파일: worker.py 프로젝트: anukat2015/toil
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = Toil.loadOrCreateJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will randomly kill the worker process at a random time 
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        # Ideally this would be a daemon thread but that causes an intermittent (but benign)
        # exception similar to the one described here:
        # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd
        # Our exception is:
        #    Exception in thread Thread-1 (most likely raised during interpreter shutdown):
        #    <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill'
        # This attribute error is caused by the call os.kill() and apparently unavoidable with a
        # daemon
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        # Setup the caching variable now in case of an exception during loading of jobwrapper, etc
        # Flag to identify if the run is cached or not.
        FileStore = Job.FileStore if config.disableSharedCache else Job.CachedFileStore

        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            # Cleanup jobs already finished
            f = lambda jobs : filter(lambda x : len(x) > 0, map(lambda x :
                                    filter(lambda y : jobStore.exists(y[0]), x), jobs))
            jobWrapper.stack = f(jobWrapper.stack)
            jobWrapper.services = f(jobWrapper.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may 
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        if oldLogFile != None:
            jobWrapper.logJobStoreFileID = None
            jobStore.update(jobWrapper) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobWrapper.checkpoint != None:
            logger.debug("Job is a checkpoint")
            if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0 or jobWrapper.command != None:
                if jobWrapper.command != None:
                    assert jobWrapper.command == jobWrapper.checkpoint
                    logger.debug("Checkpoint job already has command set to run")
                else:
                    jobWrapper.command = jobWrapper.checkpoint

                # Reduce the retry count
                assert jobWrapper.remainingRetryCount >= 0
                jobWrapper.remainingRetryCount = max(0, jobWrapper.remainingRetryCount - 1)

                jobStore.update(jobWrapper) # Update immediately to ensure that checkpoint
                # is made before deleting any remaining successors

                if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0:
                    # If the subtree of successors is not complete restart everything
                    logger.debug("Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " %
                                 (jobWrapper.stack, jobWrapper.services))

                    # Delete everything on the stack, as these represent successors to clean
                    # up as we restart the queue
                    def recursiveDelete(jobWrapper2):
                        # Recursive walk the stack to delete all remaining jobs
                        for jobs in jobWrapper2.stack + jobWrapper2.services:
                            for jobTuple in jobs:
                                if jobStore.exists(jobTuple[0]):
                                    recursiveDelete(jobStore.load(jobTuple[0]))
                                else:
                                    logger.debug("Job %s has already been deleted", jobTuple[0])
                        if jobWrapper2 != jobWrapper:
                            logger.debug("Checkpoint is deleting old successor job: %s", jobWrapper2.jobStoreID)
                            jobStore.delete(jobWrapper2.jobStoreID)
                    recursiveDelete(jobWrapper)

                    jobWrapper.stack = [ [], [] ] # Initialise the job to mimic the state of a job
                    # that has been previously serialised but which as yet has no successors

                    jobWrapper.services = [] # Empty the services

                    # Update the jobStore to avoid doing this twice on failure and make this clean.
                    jobStore.update(jobWrapper)

            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files
                map(jobStore.deleteFile, filter(jobStore.fileExists, jobWrapper.checkpointFilesToDelete))

        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobWrapper
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command is not None:
                assert jobWrapper.command.startswith( "_toil " )
                logger.debug("Got a command to run: %s" % jobWrapper.command)
                #Load the job
                job = Job._loadJob(jobWrapper.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobWrapper.checkpoint = jobWrapper.command

                # Need to fix all this for non shared cache runs
                if config.disableSharedCache:
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)

                # Create a fileStore object for the job
                fileStore = FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)
                with job._executor(jobWrapper=jobWrapper,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobWrapper=jobWrapper, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages
                if config.disableSharedCache:
                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir

            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #If no more jobs to run or services not finished, quit
            if len(jobWrapper.stack) == 0 or len(jobWrapper.services) > 0 or jobWrapper.checkpoint != None:
                logger.debug("Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                             len(jobWrapper.stack), len(jobWrapper.services), jobWrapper.checkpoint != None)
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorsPreemptable, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break

            # Load the successor jobWrapper
            successorJobWrapper = jobStore.load(successorJobStoreID)

            # Somewhat ugly, but check if job is a checkpoint job and quit if
            # so
            if successorJobWrapper.command.startswith( "_toil " ):
                #Load the job
                successorJob = Job._loadJob(successorJobWrapper.command, jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobWrappers command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()

            #These should all match up
            assert successorJobWrapper.memory == successorMemory
            assert successorJobWrapper.cores == successorCores
            assert successorJobWrapper.predecessorsFinished == set()
            assert successorJobWrapper.predecessorNumber == 1
            assert successorJobWrapper.command != None
            assert successorJobStoreID == successorJobWrapper.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJobWrapper.command
            jobWrapper.stack += successorJobWrapper.stack
            assert jobWrapper.memory >= successorJobWrapper.memory
            assert jobWrapper.cores >= successorJobWrapper.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJobWrapper to those to be deleted
            fileStore.jobsToDelete.add(successorJobWrapper.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdOut, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobWrapper.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobWrapper.jobStoreID)
        with jobStore.updateFileStream(jobWrapper.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                    f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobWrapper)

    elif debugging:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID, text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0 and len(jobWrapper.services) == 0:
        # We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
예제 #11
0
파일: toilStatus.py 프로젝트: PureQsh/toil
def main():
    """Reports the state of the toil.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  
    
    parser = getBasicOptionParser()
    
    parser.add_argument("jobStore", type=str,
              help=("Store in which to place job management files \
              and the global accessed temporary files"
              "(If this is a file path this needs to be globally accessible "
              "by all machines running jobs).\n"
              "If the store already exists and restart is false an"
              " JobStoreCreationException exception will be thrown."))
    
    parser.add_argument("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s",
                      default=False)
    
    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                      default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    ##########################################
    #Do some checks.
    ##########################################
    
    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None

    ##########################################
    #Survey the status of the job and report.
    ##########################################  
    
    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)
    
    toilState = ToilState(jobStore, rootJob )

    # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    failedJobs = [ job for job in totalJobs if job.remainingRetryCount == 0 ]

    print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
          'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts),
                                len(failedJobs), options.jobStore), file=sys.stderr)
    
    if options.verbose: #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    logStream(logFileHandle, job.jobStoreID, logger.warn)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr)
        if len(failedJobs) == 0:
            print('There are no failed jobs to report.', file=sys.stderr)
    
    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
        options.failIfNotComplete:
        sys.exit(1)
예제 #12
0
def main():
    """Reports the state of the toil.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument(
        "jobStore",
        type=str,
        help=("Store in which to place job management files \
              and the global accessed temporary files"
              "(If this is a file path this needs to be globally accessible "
              "by all machines running jobs).\n"
              "If the store already exists and restart is false an"
              " ExistingJobStoreException exception will be thrown."))

    parser.add_argument(
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Print loads of information, particularly all the log files of \
                      jobs that failed. default=%(default)s",
        default=False)

    parser.add_argument(
        "--failIfNotComplete",
        dest="failIfNotComplete",
        action="store_true",
        help=
        "Return exit value of 1 if toil jobs not all completed. default=%(default)s",
        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print(
            'The root job of the job store is absent, the workflow completed successfully.',
            file=sys.stderr)
        sys.exit(0)

    toilState = ToilState(jobStore, rootJob)

    # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0]

    print(
        'There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
        'currently in %s.' %
        (len(toilState.updatedJobs), len(
            toilState.successorCounts), len(failedJobs), options.jobStore),
        file=sys.stderr)

    if options.verbose:  #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    logStream(logFileHandle, job.jobStoreID, logger.warn)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID,
                      file=sys.stderr)
        if len(failedJobs) == 0:
            print('There are no failed jobs to report.', file=sys.stderr)

    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
        options.failIfNotComplete:
        sys.exit(1)