def open(self, job):
    """
    Run *job* inside a fresh, unique per-job local temp directory.

    Generator-based context-manager body (presumably wrapped by
    ``@contextmanager`` further up — TODO confirm): everything before the
    ``yield`` is setup, the ``finally`` block is teardown.  Setup creates a
    uuid-named temp dir, reaps dead jobs, writes a job-state file, and chdirs
    into the temp dir; teardown logs actual-vs-requested disk usage, restores
    the working directory, and deletes the job-state file.
    """
    jobReqs = job.disk  # bytes of disk the job requested
    startingDir = os.getcwd()  # remembered so teardown can chdir back
    # Give this job its own uuid-named subdirectory of the shared temp dir.
    self.localTempDir = makePublicDir(os.path.join(self.localTempDir, str(uuid.uuid4())))
    self._removeDeadJobs(self.workDir)
    self.jobStateFile = self._createJobStateFile()
    freeSpace, diskSize = getFileSystemSize(self.localTempDir)
    # Warn up front if the filesystem backing the temp dir is nearly full.
    if freeSpace <= 0.1 * diskSize:
        logger.warning('Starting job %s with less than 10%% of disk space remaining.', self.jobName)
    try:
        os.chdir(self.localTempDir)
        with super().open(job):
            yield
    finally:
        # Measure what the job actually consumed and compare with its request.
        diskUsed = getDirSizeRecursively(self.localTempDir)
        logString = ("Job {jobName} used {percent:.2f}% ({humanDisk}B [{disk}B] used, "
                     "{humanRequestedDisk}B [{requestedDisk}B] requested) at the end of "
                     "its run.".format(jobName=self.jobName,
                                       percent=(float(diskUsed) / jobReqs * 100 if jobReqs > 0 else 0.0),
                                       humanDisk=bytes2human(diskUsed),
                                       disk=diskUsed,
                                       humanRequestedDisk=bytes2human(jobReqs),
                                       requestedDisk=jobReqs))
        self.logToMaster(logString, level=logging.DEBUG)
        if diskUsed > jobReqs:
            self.logToMaster("Job used more disk than requested. Consider modifying the user "
                             "script to avoid the chance of failure due to incorrectly "
                             "requested resources. " + logString, level=logging.WARNING)
        os.chdir(startingDir)
        # Finally delete the job from the worker
        os.remove(self.jobStateFile)
def issueJob(self, jobNode):
    """
    Add a job to the queue of jobs.
    """
    # Compose the worker invocation that the batch system will execute.
    jobNode.command = ' '.join((resolveEntryPoint('_toil_worker'),
                                jobNode.jobName, self.jobStoreLocator, jobNode.jobStoreID))
    # jobBatchSystemID is an int counter, incremented once per issued job.
    jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode)
    self.jobBatchSystemIDToIssuedJob[jobBatchSystemID] = jobNode
    if jobNode.preemptable:
        # Increment only after the job is in the issued-job dict, so that
        # len(jobBatchSystemIDToIssuedJob) >= preemptableJobsIssued holds.
        self.preemptableJobsIssued += 1
    # Internal CWL jobs are noisy; demote their issue message to debug level.
    if jobNode.jobName.startswith(CWL_INTERNAL_JOBS):
        emit = logger.debug
    else:
        emit = logger.info
    emit("Issued job %s with job batch system ID: "
         "%s and cores: %s, disk: %s, and memory: %s",
         jobNode, str(jobBatchSystemID), int(jobNode.cores),
         bytes2human(jobNode.disk), bytes2human(jobNode.memory))
    if self.toilMetrics:
        self.toilMetrics.logIssuedJob(jobNode)
        self.toilMetrics.logQueueSize(self.getNumberOfJobsIssued())
def toil_call_preprocess(job, options, in_seq_file, out_seq_file, name):
    """
    Toil job wrapper that runs cactus-preprocess on one genome.

    Looks up the input/output paths for *name* in the given seq files, invokes
    cactus-preprocess in the job's local temp dir, and imports the resulting
    fasta into the job store, returning its file-store ID.
    """
    work_dir = job.fileStore.getLocalTempDir()
    in_path = in_seq_file.pathMap[name]
    # NOTE(review): out_name is a bare basename, resolved relative to the
    # process cwd when read back below — presumably cactus-preprocess writes
    # it there; confirm against the tool's --outPaths handling.
    out_name = os.path.basename(out_seq_file.pathMap[name])
    cmd = ['cactus-preprocess', os.path.join(work_dir, 'js'),
           '--inPaths', in_path,
           '--outPaths', out_name,
           '--workDir', work_dir,
           '--maxCores', str(int(job.cores)),
           '--maxDisk', bytes2human(job.disk),
           '--maxMemory', bytes2human(job.memory)]
    cmd.extend(options.cactusOptions.strip().split(' '))
    cactus_call(parameters=cmd)
    return job.fileStore.writeGlobalFile(out_name)
def get_toil_resource_opts(options, task):
    """
    Build a command-line fragment of Toil resource options for *task*.

    Returns a space-separated string containing '--maxCores N' when the task
    has a core count configured, and '--maxMemory H' (human-readable) when it
    has memory configured and WDL output is not requested.  Unknown tasks
    yield an empty string.
    """
    # Resolve the per-task core/memory settings; only the matched task's
    # option attributes are touched.
    if task == 'preprocess':
        cores, mem = options.preprocessCores, options.preprocessMemory
    elif task == 'blast':
        cores, mem = options.blastCores, options.blastMemory
    elif task == 'align':
        cores, mem = options.alignCores, options.alignMemory
    elif task == 'halAppend':
        # halAppend is single-threaded but shares the align memory budget.
        cores, mem = 1, options.alignMemory
    else:
        cores, mem = None, None
    parts = []
    if cores:
        parts.append('--maxCores {}'.format(cores))
    if mem and not options.wdl:
        parts.append('--maxMemory {}'.format(bytes2human(mem)))
    return ' '.join(parts)
def bytes2humanN(s):
    """
    Format a byte count as a one-decimal human-readable string.

    Falsy inputs (None, 0, '') are passed through unchanged rather than
    formatted.
    """
    if not s:
        return s
    return bytes2human(s, fmt='%(value).1f%(symbol)s')
def toil_call_align(job, options, seq_file, project, event, cigar_name, hal_path, fa_path,
                    blast_output, dep_names, *dep_fa_ids):
    """
    Toil job wrapper that runs cactus-align for one internal node (*event*).

    Stages the seq file, the blast output files, and the dependency fastas
    into the job's local temp dir, runs cactus-align to produce a HAL file,
    then extracts the event's fasta from that HAL with hal2fasta.

    Returns a tuple (fasta file-store ID, HAL file-store ID).
    """
    work_dir = job.fileStore.getLocalTempDir()
    # serialize the seqfile so cactus-align can use it
    seq_file_path = os.path.join(work_dir, 'seqfile.txt')
    with open(seq_file_path, 'w') as sf:
        sf.write(str(seq_file))
    # download the blast output from the file store
    blast_files = []
    for blast_file_name, blast_file_id in blast_output:
        blast_files.append(os.path.join(work_dir, blast_file_name))
        job.fileStore.readGlobalFile(blast_file_id, blast_files[-1])
    # read the fasta files
    assert len(dep_names) == len(dep_fa_ids)
    fa_paths = [os.path.join(work_dir, "{}.pp.fa".format(name)) for name in dep_names]
    # Fix: use a distinct loop variable — the original reused `fa_path`,
    # clobbering the function parameter of the same name.
    for dep_fa_path, fa_id in zip(fa_paths, dep_fa_ids):
        job.fileStore.readGlobalFile(fa_id, dep_fa_path)
    # call cactus-align
    out_hal_path = os.path.join(work_dir, os.path.basename(hal_path))
    cactus_call(parameters=['cactus-align', os.path.join(work_dir, 'js'), seq_file_path] +
                blast_files + [out_hal_path,
                               '--root', event,
                               '--pathOverrides'] + fa_paths +
                ['--pathOverrideNames'] + dep_names +
                ['--workDir', work_dir,
                 '--maxCores', str(int(job.cores)),
                 '--maxDisk', bytes2human(job.disk),
                 '--maxMemory', bytes2human(job.memory)] +
                options.cactusOptions.strip().split(' '))
    out_hal_id = job.fileStore.writeGlobalFile(out_hal_path)
    # export the fasta while we're at it
    out_fa_path = os.path.join(work_dir, '{}.fa'.format(event))
    cactus_call(parameters=['hal2fasta', out_hal_path, event] +
                options.halOptions.strip().split(' '),
                outfile=out_fa_path)
    out_fa_id = job.fileStore.writeGlobalFile(out_fa_path)
    return out_fa_id, out_hal_id
def toil_call_blast(job, options, seq_file, project, event, cigar_name, dep_names, *dep_fa_ids):
    """
    Toil job wrapper that runs cactus-blast for one internal node (*event*).

    Stages the seq file and the dependency fastas into the job's local temp
    dir, runs cactus-blast, then imports every output file whose name starts
    with the cigar basename into the job store.

    Returns a list of (basename, file-store ID) pairs for those outputs.
    """
    work_dir = job.fileStore.getLocalTempDir()
    # write the seqfile to disk so cactus-blast can read it
    seq_file_path = os.path.join(work_dir, 'seqfile.txt')
    with open(seq_file_path, 'w') as seq_handle:
        seq_handle.write(str(seq_file))
    # fetch each dependency fasta from the file store
    assert len(dep_names) == len(dep_fa_ids)
    fa_paths = [os.path.join(work_dir, "{}.pp.fa".format(name)) for name in dep_names]
    for local_fa, fa_id in zip(fa_paths, dep_fa_ids):
        job.fileStore.readGlobalFile(fa_id, local_fa)
    cmd = ['cactus-blast', os.path.join(work_dir, 'js'), seq_file_path,
           os.path.join(work_dir, os.path.basename(cigar_name)),
           '--root', event,
           '--pathOverrides'] + fa_paths + ['--pathOverrideNames'] + dep_names
    cmd += ['--workDir', work_dir,
            '--maxCores', str(int(job.cores)),
            '--maxDisk', bytes2human(job.disk),
            '--maxMemory', bytes2human(job.memory)]
    cmd += options.cactusOptions.strip().split(' ')
    cactus_call(parameters=cmd)
    # scrape the output files out of the workdir
    cigar_prefix = os.path.basename(cigar_name)
    out_nameids = []
    for entry in os.listdir(work_dir):
        full_path = os.path.join(work_dir, entry)
        if os.path.isfile(full_path) and entry.startswith(cigar_prefix):
            out_nameids.append((os.path.basename(entry),
                                job.fileStore.writeGlobalFile(full_path)))
    return out_nameids