def main(): """Restarts a toil workflow. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("--version", action='version', version=version) parser.add_argument("jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) options = parseBasicOptions(parser) ########################################## #Now run the toil construction/leader ########################################## setLoggingFromOptions(options) options.restart = True with setupToil(options) as (config, batchSystem, jobStore): jobStore.clean(Job._loadRootJob(jobStore)) mainLoop(config, batchSystem, jobStore, Job._loadRootJob(jobStore))
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help=( "Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown." ), ) parser.add_argument("--version", action="version", version=version) options = parseBasicOptions(parser) jobStore = Toil.loadOrCreateJobStore(options.jobStore) logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore) ####This behaviour is now broken batchSystem = Toil.createBatchSystem( jobStore.config ) # This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs(): # Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def setUpClass(cls): super(ToilTest, cls).setUpClass() cls.orig_sys_argv = sys.argv[1:] sys.argv[1:] = shlex.split(os.environ.get('TOIL_TEST_ARGS', "")) parser = getBasicOptionParser() options, args = parseSuiteTestOptions(parser) sys.argv[1:] = args
def main(): """Removes the JobStore from a toil run. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") ########################################## #Survey the status of the job and report. ########################################## logger.info("Checking if we have files for toil") try: jobStore = Toil.loadOrCreateJobStore(options.jobStore) except JobStoreCreationException: logger.info("The specified JobStore does not exist, it may have already been deleted") sys.exit(0) logger.info("Deleting the JobStore") jobStore.deleteJobStore()
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) config = parseBasicOptions(parser) cluster = Cluster(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) cluster.destroyCluster()
def main(): parser = getBasicOptionParser("usage: %prog [--jobStore] JOB_TREE_DIR [more options]", "%prog 0.1") parser.add_option("--jobStore", dest="jobStore", help="Job store path. Can also be specified as the single argument to the script.") options, args = parseBasicOptions(parser) if len(sys.argv) == 1: parser.print_help() sys.exit(0) assert len(args) <= 1 #Only toil may be specified as argument if len(args) == 1: #Allow toil directory as arg options.jobStore = args[0] logger.info("Parsed arguments") if options.jobStore == None: parser.error("Specify --jobStore") jobStore = loadJobStore(options.jobStore) logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore) ####This behaviour is now broken batchSystem = loadBatchSystem(jobStore.config) #This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs(): #Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) config = parseBasicOptions(parser) setLoggingFromOptions(config) cluster = Cluster(provisioner=config.provisioner, clusterName=config.clusterName) cluster.destroyCluster()
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--insecure", dest='insecure', action='store_true', required=False, help="Temporarily disable strict host key checking.") parser.add_argument('args', nargs=argparse.REMAINDER) config = parseBasicOptions(parser) cluster = Cluster(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) cluster.sshCluster(args=config.args, strict=not config.insecure)
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store to delete. " + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Attempting to delete the job store") jobStore = Toil.getJobStore(options.jobStore) jobStore.destroy() logger.info("Successfully deleted the job store")
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store to delete. " + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) config = Config() config.setOptions(parseBasicOptions(parser)) logger.info("Attempting to delete the job store") jobStore = Toil.getJobStore(config.jobStore) jobStore.destroy() logger.info("Successfully deleted the job store")
def main(): """ Reports stats on the workflow, use with --stats option to toil. """ parser = getBasicOptionParser() initializeOptions(parser) options = parseBasicOptions(parser) checkOptions(options, parser) jobStore = Toil.loadOrCreateJobStore(options.jobStore) stats = getStats(options) collatedStatsTag = processData(jobStore.config, stats, options) reportData(collatedStatsTag, options)
def main(): """ Reports stats on the workflow, use with --stats option to toil. """ parser = getBasicOptionParser() initializeOptions(parser) options = parseBasicOptions(parser) checkOptions(options, parser) jobStore = loadJobStore(options.jobStore) stats = getStats(options) collatedStatsTag = processData(jobStore.config, stats, options) reportData(collatedStatsTag, options)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--insecure", dest='insecure', action='store_true', required=False, help="Temporarily disable strict host key checking.") parser.add_argument('args', nargs=argparse.REMAINDER) config = parseBasicOptions(parser) cluster = clusterFactory(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) command = config.args if config.args else ['bash'] cluster.getLeader().sshAppliance(*command, strict=not config.insecure, tty=sys.stdin.isatty())
def main():
    parser = getBasicOptionParser()
    parser.add_argument("--version", action='version', version=version)
    parser.add_argument("--nodeType", dest='nodeType', required=True,
                        help="Node type for {non-|}preemptable nodes. The syntax depends on the "
                             "provisioner used. For the aws provisioner this is the name of an "
                             "EC2 instance type followed by a colon and the price in dollars to "
                             "bid for a spot instance, for example 'c3.8xlarge:0.42'.")
    parser.add_argument('-p', "--provisioner", dest='provisioner', choices=['aws'], required=True,
                        help="The provisioner for cluster auto-scaling. Only aws is currently "
                             "supported.")
    parser.add_argument("clusterName",
                        help="The name that the cluster will be identifiable by")
    parser.add_argument("--keyPairName", dest='keyPairName', required=True,
                        help="The name of the AWS key pair to include on the instance")
    config = parseBasicOptions(parser)
    setLoggingFromOptions(config)
    spotBid = None
    provisioner = None
    if config.provisioner == 'aws':
        logger.info('Using aws provisioner.')
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            raise RuntimeError('The aws extra must be installed to use this provisioner')
        provisioner = AWSProvisioner
        parsedBid = config.nodeType.split(':', 1)
        if len(config.nodeType) != len(parsedBid[0]):  # there is a bid
            spotBid = float(parsedBid[1])
            config.nodeType = parsedBid[0]
    else:
        assert False
    provisioner.launchCluster(instanceType=config.nodeType,
                              clusterName=config.clusterName,
                              keyName=config.keyPairName,
                              spotBid=spotBid)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--insecure", dest='insecure', action='store_true', required=False, help="Temporarily disable strict host key checking.") parser.add_argument("args", nargs=argparse.REMAINDER, help="Arguments to pass to" "`rsync`. Takes any arguments that rsync accepts. Specify the" " remote with a colon. For example, to upload `example.py`," " specify `toil rsync-cluster -p aws test-cluster example.py :`." "\nOr, to download a file from the remote:, `toil rsync-cluster" " -p aws test-cluster :example.py .`") config = parseBasicOptions(parser) cluster = Cluster(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) cluster.rsyncCluster(args=config.args, strict=not config.insecure)
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of the job store used by the workflow." + jobStoreLocatorHelp) parser.add_argument("--localFilePath", nargs=1, help="Location to which to copy job store files.") parser.add_argument("--fetch", nargs="+", help="List of job-store files to be copied locally." "Use either explicit names (i.e. 'data.txt'), or " "specify glob patterns (i.e. '*.txt')") parser.add_argument( "--listFilesInJobStore", help="Prints a list of the current files in the jobStore.") parser.add_argument( "--fetchEntireJobStore", help="Copy all job store files into a local directory.") parser.add_argument( "--useSymlinks", help="Creates symlink 'shortcuts' of files in the localFilePath" " instead of hardlinking or copying, where possible. If this is" " not possible, it will copy the files (shutil.copyfile()).") parser.add_argument("--version", action='version', version=version) # Load the jobStore options = parseBasicOptions(parser) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) logger.debug("Connected to job store: %s", config.jobStore) if options.fetch: # Copy only the listed files locally logger.debug("Fetching local files: %s", options.fetch) fetchJobStoreFiles(jobStore=jobStore, options=options) elif options.fetchEntireJobStore: # Copy all jobStore files locally logger.debug("Fetching all local files.") options.fetch = "*" fetchJobStoreFiles(jobStore=jobStore, options=options) if options.listFilesInJobStore: # Log filenames and create a file containing these names in cwd printContentsOfJobStore(jobStorePath=options.jobStore)
def main(): """ Reports stats on the job-tree, use with --stats option to toil. """ parser = getBasicOptionParser() initializeOptions(parser) options = parseBasicOptions(parser) checkOptions(options, parser) jobStore = loadJobStore(options.jobStore) #collatedStatsTag = cacheAvailable(options) #if collatedStatsTag is None: stats = getStats(options) collatedStatsTag = processData(jobStore.config, stats, options) reportData(collatedStatsTag, options)
def main(): """ Reports stats on the job-tree, use with --stats option to toil. """ parser = getBasicOptionParser( "usage: %prog [--jobStore] JOB_TREE_DIR [options]", "%prog 0.1") initializeOptions(parser) options, args = parseBasicOptions(parser) checkOptions(options, args, parser) jobStore = loadJobStore(options.jobStore) #collatedStatsTag = cacheAvailable(options) #if collatedStatsTag is None: stats = getStats(options) collatedStatsTag = processData(jobStore.config, stats, options) reportData(collatedStatsTag, options)
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store used by the workflow whose jobs should " "be killed." + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) jobStore = Toil.resumeJobStore(options.jobStore) logger.info("Starting routine to kill running jobs in the toil workflow: %s" % options.jobStore) ####This behaviour is now broken batchSystem = Toil.createBatchSystem(jobStore.config) #This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs(): #Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument( "args", nargs=argparse.REMAINDER, help="Arguments to pass to" "`rsync`. Takes any arguments that rsync accepts. Specify the" " remote with a colon. For example, to upload `example.py`," " specify `toil rsync-cluster -p aws test-cluster example.py :`." "\nOr, to download a file from the remote:, `toil rsync-cluster" " -p aws test-cluster :example.py .`") config = parseBasicOptions(parser) setLoggingFromOptions(config) cluster = Cluster(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) cluster.rsyncCluster(args=config.args)
def main(): """Restarts a toil workflow. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("--version", action='version', version=version) parser.add_argument( "jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) options = parseBasicOptions(parser) ########################################## #Now run the toil construction/leader ########################################## setLoggingFromOptions(options) options.restart = True with setupToil(options) as (config, batchSystem, jobStore): # Load the whole jobstore into memory in a batch logger.info("Downloading entire JobStore") jobCache = { jobWrapper.jobStoreID: jobWrapper for jobWrapper in jobStore.jobs() } logger.info("{} jobs downloaded.".format(len(jobCache))) jobStore.clean(Job._loadRootJob(jobStore), jobCache=jobCache) mainLoop(config, batchSystem, jobStore, Job._loadRootJob(jobStore), jobCache=jobCache)
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help= "The location of the job store used by the workflow whose jobs should " "be killed." + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) config = Config() config.setOptions(options) config.jobStore = config.jobStore[5:] if config.jobStore.startswith( 'file:') else config.jobStore # ':' means an aws/google jobstore; use the old (broken?) method if ':' in config.jobStore: jobStore = Toil.resumeJobStore(config.jobStore) logger.info( "Starting routine to kill running jobs in the toil workflow: %s", config.jobStore) # TODO: This behaviour is now broken src: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1 batchSystem = Toil.createBatchSystem( jobStore.config ) # Should automatically kill existing jobs, so we're good. for jobID in batchSystem.getIssuedBatchJobIDs( ): # Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed") # otherwise, kill the pid recorded in the jobstore else: pid_log = os.path.join(os.path.abspath(config.jobStore), 'pid.log') with open(pid_log, 'r') as f: pid2kill = f.read().strip() try: os.kill(int(pid2kill), signal.SIGKILL) logger.info("Toil process %s successfully terminated." % str(pid2kill)) except OSError: logger.error("Toil process %s could not be terminated." % str(pid2kill)) raise
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store to delete. " + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) config = Config() config.setOptions(parseBasicOptions(parser)) try: jobStore = Toil.getJobStore(config.jobStore) jobStore.resume() jobStore.destroy() logger.info("Successfully deleted the job store: %s" % config.jobStore) except NoSuchJobStoreException: logger.info("Failed to delete the job store: %s is non-existent" % config.jobStore) except: logger.info("Failed to delete the job store: %s" % config.jobStore) raise
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of the job store used by the workflow." + jobStoreLocatorHelp) parser.add_argument("jobID", nargs=1, help="The job store id of a job " "within the provided jobstore to run by itself.") parser.add_argument( "--printJobInfo", nargs=1, help="Return information about this job to the user" " including preceding jobs, inputs, outputs, and runtime" " from the last known run.") parser.add_argument("--version", action='version', version=version) # Parse options options = parseBasicOptions(parser) config = Config() config.setOptions(options) # Load the job store jobStore = Toil.resumeJobStore(config.jobStore) if options.printJobInfo: printContentsOfJobStore(jobStorePath=options.jobStore, nameOfJob=options.printJobInfo) # TODO: Option to print list of successor jobs # TODO: Option to run job within python debugger, allowing step through of arguments # idea would be to have option to import pdb and set breakpoint at the start of the user's code # Run the job locally jobID = options.jobID[0] logger.debug("Going to run the following job locally: %s", jobID) workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False) logger.debug("Ran the following job locally: %s", jobID)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--insecure", action='store_true', help="Temporarily disable strict host key checking.") parser.add_argument("--sshOption", dest='sshOptions', default=[], action='append', help="Pass an additional option to the SSH command.") parser.add_argument('args', nargs=argparse.REMAINDER) config = parseBasicOptions(parser) cluster = clusterFactory(provisioner=config.provisioner, clusterName=config.clusterName, zone=config.zone) command = config.args if config.args else ['bash'] cluster.getLeader().sshAppliance(*command, strict=not config.insecure, tty=sys.stdin.isatty(), sshOptions=config.sshOptions)
def main(): """Removes the JobStore from a toil run. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") ########################################## #Survey the status of the job and report. ########################################## logger.info("Checking if we have files for toil") try: jobStore = Toil.loadOrCreateJobStore(options.jobStore) except JobStoreCreationException: logger.info( "The specified JobStore does not exist, it may have already been deleted" ) sys.exit(0) logger.info("Attempting to delete the job store") jobStore.deleteJobStore() logger.info("Successfully deleted the job store")
def main(): """Restarts a toil workflow. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("--version", action='version', version=version) parser.add_argument("jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) options = parseBasicOptions(parser) ########################################## #Now run the toil construction/leader ########################################## setLoggingFromOptions(options) options.restart = True with setupToil(options) as (config, batchSystem, jobStore): # Load the whole jobstore into memory in a batch logger.info("Downloading entire JobStore") jobCache = {jobWrapper.jobStoreID: jobWrapper for jobWrapper in jobStore.jobs()} logger.info("{} jobs downloaded.".format(len(jobCache))) jobStore.clean(Job._loadRootJob(jobStore), jobCache=jobCache) mainLoop(config, batchSystem, jobStore, Job._loadRootJob(jobStore), jobCache=jobCache)
def main(): """Reports the state of a Toil workflow.""" parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--noAggStats", dest="stats", action="store_false", help="Do not print overall, aggregate status of workflow.", default=True) parser.add_argument("--printDot", action="store_true", help="Print dot formatted description of the graph. If using --jobs will " "restrict to subgraph including only those jobs. default=%(default)s", default=False) parser.add_argument("--jobs", nargs='+', help="Restrict reporting to the following jobs (allows subsetting of the report).", default=None) parser.add_argument("--printPerJobStats", action="store_true", help="Print info about each job. default=%(default)s", default=False) parser.add_argument("--printLogs", action="store_true", help="Print the log files of jobs (if they exist). default=%(default)s", default=False) parser.add_argument("--printChildren", action="store_true", help="Print children of each job. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) ########################################## # Gather the jobs to report ########################################## # Gather all jobs in the workflow in jobsToReport if options.jobs == None: rootJob = fetchRootJob(jobStore) logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.') jobsToReport = traverseJobGraph(rootJob, jobStore) # Only gather jobs specified in options.jobs else: jobsToReport = fetchUserJobs(jobStore, jobs=options.jobs) ########################################## # Report on the jobs ########################################## jobStats = report_on_jobs(jobsToReport, jobStore, options) hasChildren = jobStats['hasChildren'] readyToRun = jobStats['readyToRun'] zombies = jobStats['zombies'] hasServices = jobStats['hasServices'] services = jobStats['services'] hasLogFile = jobStats['hasLogFile'] properties = jobStats['properties'] childNumber = jobStats['childNumber'] if options.printPerJobStats: printAggregateJobStats(jobsToReport, properties, childNumber) if options.printLogs: printJobLog(jobsToReport, jobStore) if options.printChildren: printJobChildren(jobsToReport) if options.printDot: print_dot_chart(jobsToReport, jobStore_name=config.jobStore) if options.stats: print('Of the %i jobs considered, ' 'there are %i jobs with children, ' '%i jobs ready to run, ' '%i zombie jobs, ' '%i jobs with services, ' '%i services, ' 'and %i jobs with log files currently in %s.' % (len(jobsToReport), len(hasChildren), len(readyToRun), len(zombies), len(hasServices), len(services), len(hasLogFile), config.jobStore)) if len(jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1)
def main(): """Reports the state of a Toil workflow.""" parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument( "--failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument( "--noAggStats", dest="stats", action="store_false", help="Do not print overall, aggregate status of workflow.", default=True) parser.add_argument( "--printDot", action="store_true", help= "Print dot formatted description of the graph. If using --jobs will " "restrict to subgraph including only those jobs. default=%(default)s", default=False) parser.add_argument( "--jobs", nargs='+', help= "Restrict reporting to the following jobs (allows subsetting of the report).", default=None) parser.add_argument("--printPerJobStats", action="store_true", help="Print info about each job. default=%(default)s", default=False) parser.add_argument( "--printLogs", action="store_true", help="Print the log files of jobs (if they exist). default=%(default)s", default=False) parser.add_argument("--printChildren", action="store_true", help="Print children of each job. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) if len(sys.argv) == 1: parser.print_help() sys.exit(0) config = Config() config.setOptions(options) try: status = ToilStatus(config.jobStore, options.jobs) except NoSuchJobStoreException: print('No job store found.') return except JobException: # Workflow likely complete, user informed in ToilStatus() return jobStats = status.report_on_jobs() # Info to be reported. hasChildren = jobStats['hasChildren'] readyToRun = jobStats['readyToRun'] zombies = jobStats['zombies'] hasServices = jobStats['hasServices'] services = jobStats['services'] hasLogFile = jobStats['hasLogFile'] properties = jobStats['properties'] childNumber = jobStats['childNumber'] if options.printPerJobStats: status.printAggregateJobStats(properties, childNumber) if options.printLogs: status.printJobLog() if options.printChildren: status.printJobChildren() if options.printDot: status.print_dot_chart() if options.stats: print('Of the %i jobs considered, ' 'there are %i jobs with children, ' '%i jobs ready to run, ' '%i zombie jobs, ' '%i jobs with services, ' '%i services, ' 'and %i jobs with log files currently in %s.' % (len(status.jobsToReport), len(hasChildren), len(readyToRun), len(zombies), len(hasServices), len(services), len(hasLogFile), status.jobStore)) if len(status.jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser( "usage: %prog [--jobStore] JOB_TREE_DIR [options]", "%prog 0.1") parser.add_option( "--jobStore", dest="jobStore", help= "Job store path. Can also be specified as the single argument to the script.\ default=%default", default=os.path.abspath("./toil")) parser.add_option( "--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%default", default=False) parser.add_option( "--failIfNotComplete", dest="failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%default", default=False) options, args = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) assert len(args) <= 1 #Only toil may be specified as argument if len(args) == 1: #Allow toil directory as arg options.jobStore = args[0] ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.jobStore != None ########################################## #Survey the status of the job and report. ########################################## jobStore = loadJobStore(options.jobStore) try: rootJob = Job._loadRootJob(jobStore) except JobException: print "The root job of the jobStore is not present, the toil workflow has probably completed okay" sys.exit(0) toilState = ToilState(jobStore, rootJob) failedJobs = [ job for job in toilState.updatedJobs | \ set(toilState.successorCounts.keys()) \ if job.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, and \ %i totally failed jobs currently in toil workflow: %s" % \ (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), options.jobStore) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print "Log file for job %s is not present" % job.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument( "--nodeType", dest='nodeType', required=True, help="Node type for {non-|}preemptable nodes. The syntax depends on the " "provisioner used. For the aws provisioner this is the name of an " "EC2 instance type followed by a colon and the price in dollar to " "bid for a spot instance, for example 'c3.8xlarge:0.42'.") parser.add_argument( "--keyPairName", dest='keyPairName', required=True, help="The name of the AWS key pair to include on the instance") parser.add_argument( "-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its" "children. Tags are of the form: " " -t key1=value1 --tag key2=value2 " "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {" " \"Name\": clusterName," " \"Owner\": IAM username" " }. ") parser.add_argument( "--vpcSubnet", help= "VPC subnet ID to launch cluster in. Uses default subnet if not specified." "This subnet needs to have auto assign IPs turned on.") parser.add_argument( "-w", "--workers", dest='workers', default=0, type=int, help= "Specify a number of workers to launch alongside the leader when the " "cluster is created. This can be useful if running toil without " "auto-scaling but with need of more hardware support") config = parseBasicOptions(parser) tagsDict = None if config.tags is None else createTagsDict(config.tags) spotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: raise RuntimeError( 'The aws extra must be installed to use this provisioner') provisioner = AWSProvisioner() parsedBid = config.nodeType.split(':', 1) if len(config.nodeType) != len(parsedBid[0]): # there is a bid spotBid = float(parsedBid[1]) config.nodeType = parsedBid[0] else: assert False provisioner.launchCluster(instanceType=config.nodeType, keyName=config.keyPairName, clusterName=config.clusterName, workers=config.workers, spotBid=spotBid, userTags=tagsDict, zone=config.zone, vpcSubnet=config.vpcSubnet)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--nodeType", dest='nodeType', required=True, help="Node type for {non-|}preemptable nodes. The syntax depends on the " "provisioner used. For the aws provisioner this is the name of an " "EC2 instance type followed by a colon and the price in dollar to " "bid for a spot instance, for example 'c3.8xlarge:0.42'.") parser.add_argument("--keyPairName", dest='keyPairName', required=True, help="The name of the AWS key pair to include on the instance") parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its " "children. Tags are of the form:\n" " --t key1=value1 --tag key2=value2\n" "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {\n" " \"Name\": clusterName,\n" " \"Owner\": IAM username\n" " }. ") parser.add_argument("--vpcSubnet", help="VPC subnet ID to launch cluster in. Uses default subnet if not specified. " "This subnet needs to have auto assign IPs turned on.") parser.add_argument("-w", "--workers", dest='workers', default=0, type=int, help="Specify a number of workers to launch alongside the leader when the " "cluster is created. This can be useful if running toil without " "auto-scaling but with need of more hardware support") parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for the leader instance. " "This is an EBS volume.") parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for any worker instances " "created when using the -w flag. This is an EBS volume.") config = parseBasicOptions(parser) tagsDict = None if config.tags is None else createTagsDict(config.tags) spotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: raise RuntimeError('The aws extra must be installed to use this provisioner') provisioner = AWSProvisioner() parsedBid = config.nodeType.split(':', 1) if len(config.nodeType) != len(parsedBid[0]): # there is a bid spotBid = float(parsedBid[1]) config.nodeType = parsedBid[0] else: assert False provisioner.launchCluster(instanceType=config.nodeType, keyName=config.keyPairName, clusterName=config.clusterName, workers=config.workers, spotBid=spotBid, userTags=tagsDict, zone=config.zone, leaderStorage=config.leaderStorage, nodeStorage=config.nodeStorage, vpcSubnet=config.vpcSubnet)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) def traverseGraph(jobGraph): foundJobStoreIDs = set() totalJobs = [] def inner(jobGraph): if jobGraph.jobStoreID in foundJobStoreIDs: return foundJobStoreIDs.add(jobGraph.jobStoreID) totalJobs.append(jobGraph) # Traverse jobs in stack for jobs in jobGraph.stack: for successorJobStoreID in [x.jobStoreID for x in jobs]: if (successorJobStoreID not in foundJobStoreIDs and jobStore.exists(successorJobStoreID)): inner(jobStore.load(successorJobStoreID)) # Traverse service jobs for jobs in jobGraph.services: for serviceJobStoreID in [x.jobStoreID for x in jobs]: if jobStore.exists(serviceJobStoreID): assert serviceJobStoreID not in foundJobStoreIDs foundJobStoreIDs.add(serviceJobStoreID) totalJobs.append(jobStore.load(serviceJobStoreID)) inner(jobGraph) return totalJobs logger.info('Traversing the job graph. This may take a couple minutes.') totalJobs = traverseGraph(rootJob) failedJobs = [] hasChildren = [] hasServices = [] services = [] currentlyRunnning = [] for job in totalJobs: if job.logJobStoreFileID is not None: failedJobs.append(job) if job.stack: hasChildren.append(job) elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command: # The job has no children, hasn't failed, and has a command to run. This indicates that the job is # likely currently running, or at least could be run. currentlyRunnning.append(job) if job.services: hasServices.append(job) if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID: # these attributes are only set in service jobs services.append(job) logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, ' 'and %i totally failed jobs currently in %s.' 
% (len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs), config.jobStore)) if currentlyRunnning: logger.info('These %i jobs are currently active: %s', len(currentlyRunnning), ' \n'.join(map(str, currentlyRunnning))) if options.verbose: #Verbose currently means outputting the files that have failed. if failedJobs: msg = "Outputting logs for the %i failed jobs" % (len(failedJobs)) msg += ": %s" % ", ".join((str(failedJob) for failedJob in failedJobs)) for jobNode in failedJobs: job = jobStore.load(jobNode.jobStoreID) msg += "\n=========> Failed job %s \n" % jobNode with job.getLogFileHandle(jobStore) as fH: msg += fH.read() msg += "<=========\n" print(msg) else: print('There are no failed jobs to report.', file=sys.stderr) if totalJobs and options.failIfNotComplete: exit(1) # when the workflow is complete, all jobs will have been removed from job store
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument( "--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument( "--failIfNotComplete", dest="failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(options.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print( 'The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) toilState = ToilState(jobStore, rootJob) # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect totalJobs = set(toilState.successorCounts.keys()) | \ {jobTuple[0] for jobTuple in toilState.updatedJobs} failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0] print( 'There are %i active jobs, %i parent jobs with children, and %i totally failed jobs ' 'currently in %s.' % (len(toilState.updatedJobs), len( toilState.successorCounts), len(failedJobs), options.jobStore), file=sys.stderr) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr) if len(failedJobs) == 0: print('There are no failed jobs to report.', file=sys.stderr) if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown.")) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.jobStore != None ########################################## #Survey the status of the job and report. ########################################## jobStore = loadJobStore(options.jobStore) try: rootJob = Job._loadRootJob(jobStore) except JobException: print "The root job of the jobStore is not present, the toil workflow has probably completed okay" sys.exit(0) toilState = ToilState(jobStore, rootJob ) # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect totalJobs = set(toilState.successorCounts.keys()) | \ {jobTuple[0] for jobTuple in toilState.updatedJobs} failedJobs = [ job for job in totalJobs if job.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, and \ %i totally failed jobs currently in toil workflow: %s" % \ (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), options.jobStore) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print "Log file for job %s is not present" % job.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName',
                        help="On AWS, the name of the AWS key pair to include on the instance."
                             " On Google/GCE, this is the ssh key pair.")
    parser.add_argument("--owner", dest='owner',
                        help="The owner tag for all instances. If not given, the value in"
                             " --keyPairName will be used if given.")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred "
                             "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             "    \"Name\": clusterName,\n"
                             "    \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
                             "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the "
                             "leader. The syntax for each node type depends on the provisioner "
                             "used. For the aws provisioner this is the name of an EC2 instance "
                             "type followed by a colon and the price in dollars to bid for a spot "
                             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
                             "--workers argument to specify how many workers of each node type "
                             "to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to "
                             "launch alongside the leader when the cluster is created. This can be "
                             "useful if running toil without auto-scaling but with a need for more "
                             "hardware support.")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader "
                             "instance. This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker "
                             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', action='store_true',
                        default=False,
                        help="Disables sanity checking the existence of the docker image specified "
                             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
                             "autoscaling.")
    parser.add_argument('--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str,
                        help="If provided, the specified ARN is used as the instance profile for "
                             "EC2 instances. Useful for setting custom IAM profiles. If not "
                             "specified, a new IAM role is created by default with sufficient "
                             "access to perform basic cluster operations.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    checkValidNodeTypes(config.provisioner, config.nodeTypes)
    checkValidNodeTypes(config.provisioner, config.leaderNodeType)

    # Checks the validity of TOIL_APPLIANCE_SELF before proceeding.
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    # Set the owner tag (default to keyPairName if not given).
    owner = 'toil'
    if config.owner:
        owner = config.owner
    elif config.keyPairName:
        owner = config.keyPairName

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    config.zone = config.zone or getZoneFromEnv(config.provisioner)
    if not config.zone:
        raise RuntimeError('Please provide a value for --zone or set a default in the TOIL_' +
                           config.provisioner.upper() + '_ZONE environment variable.')

    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone,
                             nodeStorage=config.nodeStorage)
    cluster.launchCluster(leaderNodeType=config.leaderNodeType,
                          leaderStorage=config.leaderStorage,
                          owner=owner,
                          keyName=config.keyPairName,
                          botoPath=config.botoPath,
                          userTags=tagsDict,
                          vpcSubnet=config.vpcSubnet,
                          awsEc2ProfileArn=config.awsEc2ProfileArn)

    for nodeType, workers in zip(nodeTypes, numNodes):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=False)
    for nodeType, workers, spotBid in zip(preemptableNodeTypes, numPreemptableNodes, spotBids):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=True, spotBid=spotBid)
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName', required=True,
                        help="On AWS, the name of the AWS key pair to include on the instance."
                             " On Google/GCE, this is the ssh key pair."
                             " On Azure, this will be used as the owner tag.")
    parser.add_argument("--publicKeyFile", dest='publicKeyFile', default="~/.ssh/id_rsa.pub",
                        help="On Azure, the file"
                             " containing the key pairs (the first key pair will be used).")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred "
                             "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             "    \"Name\": clusterName,\n"
                             "    \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
                             "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the "
                             "leader. The syntax for each node type depends on the provisioner "
                             "used. For the aws provisioner this is the name of an EC2 instance "
                             "type followed by a colon and the price in dollars to bid for a spot "
                             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
                             "--workers argument to specify how many workers of each node type "
                             "to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to "
                             "launch alongside the leader when the cluster is created. This can be "
                             "useful if running toil without auto-scaling but with a need for more "
                             "hardware support.")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader "
                             "instance. This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker "
                             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', action='store_true',
                        default=False,
                        help="Disables sanity checking the existence of the docker image specified "
                             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
                             "autoscaling.")
    parser.add_argument("--azureStorageCredentials", dest='azureStorageCredentials', type=str,
                        default=credential_file_path,
                        help="The location of the file containing the Azure storage credentials. "
                             "If not specified, the default file is used with Azure provisioning. "
                             "Use 'None' to disable the transfer of credentials.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)

    # Checks the validity of TOIL_APPLIANCE_SELF before proceeding.
    checkToilApplianceSelf = applianceSelf(forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    leaderSpotBid = None
    if config.provisioner == 'aws':
        logger.info('Using aws provisioner.')
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        provisioner = AWSProvisioner()
    elif config.provisioner == 'azure':
        try:
            from toil.provisioners.azure.azureProvisioner import AzureProvisioner
        except ImportError:
            raise RuntimeError('The azure extra must be installed to use this provisioner')
        provisioner = AzureProvisioner()
    elif config.provisioner == 'gce':
        logger.info('Using a gce provisioner.')
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        provisioner = GCEProvisioner()
    else:
        assert False

    # Parse leader node type and spot bid
    parsedBid = config.leaderNodeType.split(':', 1)
    if len(config.leaderNodeType) != len(parsedBid[0]):
        leaderSpotBid = float(parsedBid[1])
        config.leaderNodeType = parsedBid[0]

    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    provisioner.launchCluster(leaderNodeType=config.leaderNodeType,
                              leaderSpotBid=leaderSpotBid,
                              nodeTypes=nodeTypes,
                              preemptableNodeTypes=preemptableNodeTypes,
                              numWorkers=numNodes,
                              numPreemptableWorkers=numPreemptableNodes,
                              keyName=config.keyPairName,
                              botoPath=config.botoPath,
                              clusterName=config.clusterName,
                              spotBids=spotBids,
                              userTags=tagsDict,
                              zone=config.zone,
                              leaderStorage=config.leaderStorage,
                              nodeStorage=config.nodeStorage,
                              vpcSubnet=config.vpcSubnet,
                              publicKeyFile=config.publicKeyFile,
                              azureStorageCredentials=config.azureStorageCredentials)
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName', required=True,
                        help="The name of the AWS or ssh key pair to include on the instance")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred to all "
                             "nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             "    \"Name\": clusterName,\n"
                             "    \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not specified. "
                             "This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the leader. "
                             "The syntax for each node type depends on the provisioner used. For the "
                             "aws provisioner this is the name of an EC2 instance type followed by a "
                             "colon and the price in dollars to bid for a spot instance, for example "
                             "'c3.8xlarge:0.42'. Must also provide the --workers argument to specify "
                             "how many workers of each node type to create")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to launch "
                             "alongside the leader when the cluster is created. This can be useful if "
                             "running toil without auto-scaling but with a need for more hardware support")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader instance. "
                             "This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker instances "
                             "created when using the -w flag. This is an EBS volume.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    leaderSpotBid = None
    if config.provisioner == 'aws':
        logger.info('Using aws provisioner.')
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        provisioner = AWSProvisioner()
    elif config.provisioner == 'gce':
        logger.info('Using a gce provisioner.')
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        provisioner = GCEProvisioner()
    else:
        assert False

    # Parse leader node type and spot bid
    parsedBid = config.leaderNodeType.split(':', 1)
    if len(config.leaderNodeType) != len(parsedBid[0]):
        leaderSpotBid = float(parsedBid[1])
        config.leaderNodeType = parsedBid[0]

    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of numbers of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    provisioner.launchCluster(leaderNodeType=config.leaderNodeType,
                              leaderSpotBid=leaderSpotBid,
                              nodeTypes=nodeTypes,
                              preemptableNodeTypes=preemptableNodeTypes,
                              numWorkers=numNodes,
                              numPreemptableWorkers=numPreemptableNodes,
                              keyName=config.keyPairName,
                              botoPath=config.botoPath,
                              clusterName=config.clusterName,
                              spotBids=spotBids,
                              userTags=tagsDict,
                              zone=config.zone,
                              leaderStorage=config.leaderStorage,
                              nodeStorage=config.nodeStorage,
                              vpcSubnet=config.vpcSubnet)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser("usage: %prog [--jobStore] JOB_TREE_DIR [options]", "%prog 0.1") parser.add_option("--jobStore", dest="jobStore", help="Job store path. Can also be specified as the single argument to the script.\ default=%default", default=os.path.abspath("./toil")) parser.add_option("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%default", default=False) parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%default", default=False) options, args = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) assert len(args) <= 1 #Only toil may be specified as argument if len(args) == 1: #Allow toil directory as arg options.jobStore = args[0] ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.jobStore != None ########################################## #Survey the status of the job and report. ########################################## jobStore = loadJobStore(options.jobStore) try: rootJob = Job._loadRootJob(jobStore) except JobException: print "The root job of the jobStore is not present, the toil workflow has probably completed okay" sys.exit(0) toilState = ToilState(jobStore, rootJob ) failedJobs = [ job for job in toilState.updatedJobs | \ set(toilState.successorCounts.keys()) \ if job.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, and \ %i totally failed jobs currently in toil workflow: %s" % \ (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), options.jobStore) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print "Log file for job %s is not present" % job.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) toilState = ToilState(jobStore, rootJob ) # The first element of the toilState.updatedJobs tuple is the jobGraph we want to inspect totalJobs = set(toilState.successorCounts.keys()) | \ {jobTuple[0] for jobTuple in toilState.updatedJobs} failedJobs = [ job for job in totalJobs if job.remainingRetryCount == 0 ] print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs ' 'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), config.jobStore), file=sys.stderr) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr) if len(failedJobs) == 0: print('There are no failed jobs to report.', file=sys.stderr) if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument( "--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument( "--failIfNotComplete", dest="failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print( 'The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) def traverseGraph(jobGraph): foundJobStoreIDs = set() totalJobs = [] def inner(jobGraph): if jobGraph.jobStoreID in foundJobStoreIDs: return foundJobStoreIDs.add(jobGraph.jobStoreID) totalJobs.append(jobGraph) # Traverse jobs in stack for jobs in jobGraph.stack: for successorJobStoreID in map(lambda x: x.jobStoreID, jobs): if (successorJobStoreID not in foundJobStoreIDs and jobStore.exists(successorJobStoreID)): inner(jobStore.load(successorJobStoreID)) # Traverse service jobs for jobs in jobGraph.services: for serviceJobStoreID in map(lambda x: x.jobStoreID, jobs): if jobStore.exists(serviceJobStoreID): assert serviceJobStoreID not in foundJobStoreIDs foundJobStoreIDs.add(serviceJobStoreID) totalJobs.append(jobStore.load(serviceJobStoreID)) inner(jobGraph) return totalJobs logger.info('Traversing the job graph. This may take a couple minutes.') totalJobs = traverseGraph(rootJob) failedJobs = [] hasChildren = [] hasServices = [] services = [] currentlyRunnning = [] for job in totalJobs: if job.logJobStoreFileID is not None: failedJobs.append(job) if job.stack: hasChildren.append(job) elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command: # The job has no children, hasn't failed, and has a command to run. This indicates that the job is # likely currently running, or at least could be run. currentlyRunnning.append(job) if job.services: hasServices.append(job) if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID: # these attributes are only set in service jobs services.append(job) logger.info( 'There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, ' 'and %i totally failed jobs currently in %s.' 
% (len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs), config.jobStore)) if currentlyRunnning: logger.info('These %i jobs are currently active: %s', len(currentlyRunnning), ' \n'.join(map(str, currentlyRunnning))) if options.verbose: #Verbose currently means outputting the files that have failed. if failedJobs: msg = "Outputting logs for the %i failed jobs" % (len(failedJobs)) msg += ": %s" % ", ".join( (str(failedJob) for failedJob in failedJobs)) for jobNode in failedJobs: job = jobStore.load(jobNode.jobStoreID) msg += "\n=========> Failed job %s \n" % jobNode with job.getLogFileHandle(jobStore) as fH: msg += fH.read() msg += "<=========\n" print(msg) else: print('There are no failed jobs to report.', file=sys.stderr) if totalJobs and options.failIfNotComplete: exit( 1 ) # when the workflow is complete, all jobs will have been removed from job store
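The traverseGraph helper above never loads the same job twice: it records each jobStoreID in foundJobStoreIDs before recursing, so a job reachable through several parents (or through both the stack and a service) is counted once. A minimal, generic sketch of that visited-set traversal is given below; collectReachable, loadNode, and successorIdsOf are illustrative names standing in for jobStore.load and the stack/services bookkeeping, not Toil APIs.

# Generic visited-set traversal: every reachable node is loaded at most once.
def collectReachable(rootId, loadNode, successorIdsOf):
    seen = set()
    nodes = []
    def visit(nodeId):
        if nodeId in seen:
            return
        seen.add(nodeId)
        node = loadNode(nodeId)
        nodes.append(node)
        for successorId in successorIdsOf(node):
            visit(successorId)
    visit(rootId)
    return nodes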
def main(): """Reports the state of the toil. """ ########################################## # Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help=( "Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " ExistingJobStoreException exception will be thrown." ), ) parser.add_argument( "--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False, ) parser.add_argument( "--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False, ) parser.add_argument("--version", action="version", version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## # Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.jobStore != None ########################################## # Survey the status of the job and report. ########################################## jobStore = loadJobStore(options.jobStore) try: rootJob = Job._loadRootJob(jobStore) except JobException: print "The root job of the jobStore is not present, the toil workflow has probably completed okay" sys.exit(0) toilState = ToilState(jobStore, rootJob) failedJobs = [ job for job in toilState.updatedJobs | set(toilState.successorCounts.keys()) if job.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, and \ %i totally failed jobs currently in toil workflow: %s" % ( len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), options.jobStore, ) if options.verbose: # Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print "Log file for job %s is not present" % job.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help=("Store in which to place job management files \ and the global accessed temporary files" "(If this is a file path this needs to be globally accessible " "by all machines running jobs).\n" "If the store already exists and restart is false an" " JobStoreCreationException exception will be thrown.")) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.loadOrCreateJobStore(options.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) toilState = ToilState(jobStore, rootJob ) # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect totalJobs = set(toilState.successorCounts.keys()) | \ {jobTuple[0] for jobTuple in toilState.updatedJobs} failedJobs = [ job for job in totalJobs if job.remainingRetryCount == 0 ] print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs ' 'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), options.jobStore), file=sys.stderr) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr) if len(failedJobs) == 0: print('There are no failed jobs to report.', file=sys.stderr) if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser( "usage: %prog [--toil] JOB_TREE_DIR [options]", "%prog 0.1") parser.add_option( "--toil", dest="toil", help= "Batchjob store path. Can also be specified as the single argument to the script.\ default=%default", default='./toil') parser.add_option( "--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%default", default=False) parser.add_option( "--failIfNotComplete", dest="failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%default", default=False) options, args = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) assert len(args) <= 1 #Only toil may be specified as argument if len(args) == 1: #Allow toil directory as arg options.toil = args[0] ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.toil != None ########################################## #Survey the status of the batchjob and report. ########################################## jobStore = loadJobStore(options.toil) config = jobStore.config toilState = jobStore.loadToilState( ) #This initialises the object toil.toilState used to track the active toil failedJobs = [ batchjob for batchjob in toilState.updatedJobs | \ set(toilState.childCounts.keys()) \ if batchjob.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, \ %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) \ currently in toil: %s" % \ (len(toilState.updatedJobs), len(toilState.childCounts), len(failedJobs), len(toilState.shellJobs), options.toil) if options.verbose: #Verbose currently means outputting the files that have failed. for batchjob in failedJobs: if batchjob.logJobStoreFileID is not None: with batchjob.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, batchjob.jobStoreID, logger.warn) else: print "Log file for batchjob %s is not present" % batchjob.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.childCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser("usage: %prog [--toil] JOB_TREE_DIR [options]", "%prog 0.1") parser.add_option("--toil", dest="toil", help="Batchjob store path. Can also be specified as the single argument to the script.\ default=%default", default='./toil') parser.add_option("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%default", default=False) parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%default", default=False) options, args = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) assert len(args) <= 1 #Only toil may be specified as argument if len(args) == 1: #Allow toil directory as arg options.toil = args[0] ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for toil") assert options.toil != None ########################################## #Survey the status of the batchjob and report. ########################################## jobStore = loadJobStore(options.toil) config = jobStore.config toilState = jobStore.loadToilState() #This initialises the object toil.toilState used to track the active toil failedJobs = [ batchjob for batchjob in toilState.updatedJobs | \ set(toilState.childCounts.keys()) \ if batchjob.remainingRetryCount == 0 ] print "There are %i active jobs, %i parent jobs with children, \ %i totally failed jobs and %i empty jobs (i.e. finished but not cleaned up) \ currently in toil: %s" % \ (len(toilState.updatedJobs), len(toilState.childCounts), len(failedJobs), len(toilState.shellJobs), options.toil) if options.verbose: #Verbose currently means outputting the files that have failed. for batchjob in failedJobs: if batchjob.logJobStoreFileID is not None: with batchjob.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, batchjob.jobStoreID, logger.warn) else: print "Log file for batchjob %s is not present" % batchjob.jobStoreID if len(failedJobs) == 0: print "There are no failed jobs to report" if (len(toilState.updatedJobs) + len(toilState.childCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)