def setup(job, inputFile, N, downCheckpoints, options):
    """
    Sets up the sort.

    Logs the start of the merge sort and schedules ``down`` as a child job of
    ``job``.

    :param job: the running job; the child job is added to it.
    :param inputFile: forwarded as the file argument of ``down``.
    :param N: forwarded as the subdivision size of ``down``.
    :param downCheckpoints: forwarded to ``down``.
    :param options: forwarded to ``down`` as the ``options`` keyword.
    :return: the ``rv()`` promise of the child job (documented by the
        original author as the FileID of the sorted file).
    """
    RealtimeLogger.info("Starting the merge sort")
    # ``down`` takes (job, inputFileStoreID, N, path, downCheckpoints, options),
    # so a path label must sit between N and downCheckpoints. Without it,
    # downCheckpoints was bound to ``path`` and the real downCheckpoints
    # argument was missing.
    sort_job = job.addChildJobFn(down,
                                 inputFile, N, 'root',
                                 downCheckpoints,
                                 options=options,
                                 preemptable=True,
                                 memory=sortMemory)
    return sort_job.rv()
def __connect(self):
    """
    Open the Azure blob service connection on first use.

    Does nothing when ``self.connection`` is already set. Otherwise logs the
    account, container and prefix at debug level and stores a new
    ``BlobService`` built from ``self.account_name`` and
    ``self.account_key`` in ``self.connection``.
    """
    if self.connection is not None:
        # Already connected; nothing to do.
        return

    RealtimeLogger.debug(
        "Connecting to account {}, using "
        "container {} and prefix {}".format(self.account_name,
                                            self.container_name,
                                            self.name_prefix))

    # This is the blob service everything is stored in.
    self.connection = BlobService(account_name=self.account_name,
                                  account_key=self.account_key)
def read_input_file(self, input_path, local_path):
    """
    Fetch one blob from Azure into a local file.

    :param input_path: blob name relative to ``self.name_prefix``.
    :param local_path: where the blob's contents are written.
    """
    self.__connect()

    RealtimeLogger.debug("Loading {} from AzureIOStore".format(input_path))

    blob_name = self.name_prefix + input_path

    # The original author notes this download is synchronous, although it
    # can invoke a callback while it runs.
    self.connection.get_blob_to_path(self.container_name, blob_name,
                                     local_path)
def start_toil(job):
    """
    Root job: stage shared inputs and queue one follow-on job per table.

    Downloads ``PDB.h5`` and ``pdb_chain_taxonomy.h5`` from the
    ``molmimic-ibis`` store and writes both to the job's file store. For
    every superfamily ID in the ``Superfamilies`` table of ``PDB.h5`` it
    also copies ``<id>/<id>.observed_interactome`` from the
    ``molmimic-interfaces`` store into the file store, when that key exists.
    It then adds one ``get_inferred_structural_interactome_by_table``
    follow-on per table number (1-86, excluding 51) and a
    ``merge_inferred_interactome`` follow-on.

    :param job: the running Toil job.
    :raises AssertionError: if no observed interactome could be loaded.
    """
    # Parenthesised single-argument form prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("Starting job")
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    # Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    # Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    # Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    # Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1, 87)) - set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=["sfam_id"])
    sfams = sfams.drop_duplicates().dropna()["sfam_id"].sort_values()

    # Map superfamily ID -> file store ID of its observed interactome.
    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            # The local copy is no longer needed once it is in the file store.
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))
    # The loop previously iterated the undefined name ``table``.
    for table in tables:
        job.addFollowOnJobFn(get_inferred_structural_interactome_by_table,
                             table, pdbFileStoreID, taxFileStoreID,
                             sfamFileStoreIDs)

    # NOTE(review): the merge is attached to the same job as the per-table
    # follow-ons; confirm the scheduler runs it after them.
    job.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
def _runDeferredFunction(self, deferredFunction):
    """
    Invoke a deferred function (ours or another job's) and report failures.

    Any failure is logged through ``RealtimeLogger`` and not re-raised.

    :param deferredFunction: object whose ``invoke()`` method is called.
    """
    try:
        deferredFunction.invoke()
    except Exception as failure:
        # Surface the failure in real time when that is enabled; otherwise it
        # would only show up in the worker log.
        RealtimeLogger.error("Failed to run deferred function %s: %s",
                             repr(deferredFunction), str(failure))
    except:
        # Anything that is not an Exception subclass is reported without a
        # message.
        RealtimeLogger.error("Failed to run deferred function %s",
                             repr(deferredFunction))
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved, xg_file_id, paths):
    """
    Surject every GAM chunk in parallel and merge the results into one BAM.

    ``reads_chunk_ids`` is transposed with ``zip`` so that each surject job
    receives the i-th chunk ID of every input list. One ``run_chunk_surject``
    child job is queued per chunk, and a ``run_merge_bams`` follow-on
    combines their first return values under ``output_name``.

    Per the original author: if ``interleaved`` is true, the GAM input is
    expected to be paired-interleaved and the BAM output is paired; reads are
    surjected against ``paths`` in the XG file ``xg_file_id``.

    :return: the ``rv()`` promise of the ``run_merge_bams`` follow-on.
    """
    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # Promises for the two return values of every per-chunk job.
    surjected_bam_ids = []
    surject_times = []

    # Umbrella job so that everything queued here hangs off one child.
    umbrella = Job()
    job.addChild(umbrella)

    for index, chunk_ids in enumerate(zip(*reads_chunk_ids)):
        # Surject this group of chunk files.
        chunk_name = '{}_chunk{}'.format(output_name, index)
        surject_job = umbrella.addChildJobFn(
            run_chunk_surject, context, interleaved, xg_file_id, paths,
            chunk_ids, chunk_name,
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        surjected_bam_ids.append(surject_job.rv(0))
        surject_times.append(surject_job.rv(1))

    merge_job = umbrella.addFollowOnJobFn(
        run_merge_bams, context, output_name, surjected_bam_ids,
        cores=context.config.misc_cores,
        memory=context.config.misc_mem,
        disk=context.config.misc_disk)
    return merge_job.rv()
def run_split_fastq(job, context, fastq, fastq_i, sample_fastq_id):
    """
    Split one FASTQ file into gzipped chunks and store each chunk.

    The file ``sample_fastq_id`` is read into the job's temp directory under
    the base name of ``fastq[fastq_i]``, piped through ``split`` (with
    ``pigz`` as the per-chunk filter), and every resulting
    ``<name>-chunk.*.fq.gz`` file is written as an intermediate file.

    :return: list of the stored chunk file IDs, in sorted file-name order.
    """
    RealtimeLogger.info("Starting fastq split")
    started = timeit.default_timer()

    # Working directory for the external tool calls
    work_dir = job.fileStore.getLocalTempDir()

    # Work out the local name and whether the input is gzipped
    local_name = os.path.basename(fastq[fastq_i])
    fastq_path = os.path.join(work_dir, local_name)
    stem, extension = os.path.splitext(local_name)
    fastq_gzipped = extension == '.gz'
    fastq_name = stem
    if fastq_gzipped:
        # Strip the remaining extension (e.g. ".fq") as well
        fastq_name = os.path.splitext(fastq_name)[0]

    job.fileStore.readGlobalFile(sample_fastq_id, fastq_path)

    # Keep the chunk size even so interleaved pairs are never separated
    reads_per_chunk = context.config.reads_per_chunk
    if reads_per_chunk % 2 != 0:
        reads_per_chunk += 1

    # Each read occupies 4 lines
    lines_per_chunk = reads_per_chunk * 4

    # The splitting is delegated to the command line because Python is too
    # slow for it
    relative_fastq = os.path.basename(fastq_path)
    if fastq_gzipped:
        reader = ['gzip', '-d', '-c', relative_fastq]
    else:
        reader = ['cat', relative_fastq]
    pigz_threads = max(1, int(context.config.fq_split_cores) - 1)
    splitter = ['split', '-l', str(lines_per_chunk),
                '--filter=pigz -p {} > $FILE.fq.gz'.format(pigz_threads),
                '-', '{}-chunk.'.format(fastq_name)]
    cmd = [reader, splitter]

    context.runner.call(job, cmd, work_dir = work_dir, tool_name='pigz')

    chunk_prefix = '{}-chunk'.format(fastq_name)
    fastq_chunk_ids = [
        context.write_intermediate_file(job, os.path.join(work_dir, entry))
        for entry in sorted(os.listdir(work_dir))
        if entry.endswith('.fq.gz') and entry.startswith(chunk_prefix)]

    run_time = timeit.default_timer() - started
    RealtimeLogger.info("Split fastq into {} chunks. Process took {} seconds.".format(len(fastq_chunk_ids), run_time))

    return fastq_chunk_ids
def handle_reject(self, job, err):
    """
    React to a rejected promise: log it, remember it, and re-raise it.

    :param job: unused here.
    :param err: the rejection error; stored on ``self.err`` and raised.
    :raises: ``err`` itself, always.
    """
    message = "Promise Rejected: {}".format(err)
    Logger.error(message)
    RealtimeLogger.error(message)

    self.err = err

    # TODO: implement reject handlers. The intended behaviour is to call any
    # registered handlers and only raise (stopping the workflow) when there
    # are none.
    raise err
def download(filename):
    """
    Copy one file from ``in_store`` to ``out_store`` via a local temp file.

    The file is skipped when it already exists in ``out_store`` and
    ``options.overwrite`` is false, unless ``options.check_size`` is set and
    the sizes in the two stores differ.

    ``options``, ``in_store``, ``out_store`` and ``job`` are taken from the
    enclosing scope.

    :param filename: name of the file, identical in both stores.
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback of the original error.
    """
    try:
        if (not options.overwrite) and out_store.exists(filename):
            # File exists. But make sure its size is correct.

            if not options.check_size:
                # Skip existing file. No need to check the length.
                RealtimeLogger.info("Skipped {}".format(filename))
                return

            out_size = out_store.get_size(filename)
            in_size = in_store.get_size(filename)
            if out_size != in_size:
                # Complain about size mismatch and copy
                RealtimeLogger.warning(
                    "Redownloading {}! Size was {} and not {}!".format(
                        filename, out_size, in_size))
            else:
                # Skip existing file
                RealtimeLogger.info("Skipped {}".format(filename))
                return

        # Make a temp file
        (handle, path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
        os.close(handle)

        try:
            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)
        finally:
            # Clean up even when the transfer fails, so failed copies do not
            # pile up in the temp directory. The existence check keeps a
            # missing file from masking the original error.
            if os.path.exists(path):
                os.unlink(path)

    except:
        # Deliberately broad: put all exception text into an exception and
        # raise that.
        raise Exception("".join(
            traceback.format_exception(*sys.exc_info())))

    RealtimeLogger.info("Copied {}".format(filename))
def run_concat_files(job, context, file_ids, dest_name=None, header=None):
    """
    Utility job that concatenates files from the file store.

    The job first calls ``ensure_disk``; if that re-queues the job, its
    promise is returned immediately.

    :param file_ids: file IDs to concatenate, in order.
    :param dest_name: when given, the result is written with
        ``context.write_output_file`` under this name; otherwise it is
        written with ``context.write_intermediate_file``. (The original
        author notes that ``name`` could not be used because Toil reserves
        that keyword.)
    :param header: when given, written first, followed by a newline.
    :return: whatever the chosen ``context.write_*`` call returns, or the
        re-queue promise.
    """
    requeue_promise = ensure_disk(
        job, run_concat_files, [context, file_ids],
        {"dest_name": dest_name, "header": header},
        file_ids, factor=2)
    if requeue_promise is not None:
        # The job was re-queued with more disk to fit the inputs
        return requeue_promise

    work_dir = job.fileStore.getLocalTempDir()

    local_name = 'output.dat' if dest_name is None else dest_name
    out_name = os.path.join(work_dir, local_name)

    # Plain concatenation into a fresh file.
    # TODO: appending to the first file would save one copy. Worth it?
    with open(out_name, 'w') as out_file:
        if header is not None:
            # The header goes first
            out_file.write(header + '\n')
        for file_id in file_ids:
            with job.fileStore.readGlobalFileStream(file_id) as in_file:
                # Stream each input straight into the output
                shutil.copyfileobj(in_file, out_file)

    if dest_name is not None:
        # Named result: goes to the out store under that name
        RealtimeLogger.info(
            "Concatenated {} files into output file {} -> {}".format(
                len(file_ids), out_name, dest_name))
        return context.write_output_file(job, out_name, dest_name)

    # Unnamed result: intermediate file
    RealtimeLogger.info(
        "Concatenated {} files into intermediate file {}".format(
            len(file_ids), out_name))
    return context.write_intermediate_file(job, out_name)
def calculate_features_for_atom(self, atom, only_aa=False, only_atom=False, non_geom_features=False, use_deepsite_features=False, warn_if_buried=False):
    """
    Compute the feature vector for one atom and cache it.

    Exactly one feature set is built, chosen by the first true flag in this
    order: ``use_deepsite_features``, ``only_atom``, ``only_aa``,
    ``non_geom_features``; with no flag set the full vector of
    ``self.n_atom_features`` entries is built. The result is stored in
    ``self.atom_features[atom.serial_number - 1]``.

    :param atom: the atom to featurize; must expose ``serial_number``.
    :param warn_if_buried: when true, also return the burial indicator.
    :return: ``features``, or ``(features, is_buried)`` when
        ``warn_if_buried`` is true.
    """
    if use_deepsite_features:
        features = self.get_deepsite_features(atom)
        if warn_if_buried:
            # Was misspelled ``is_burried``, which left ``is_buried`` unbound
            # at the return below.
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif only_atom:
        features = self.get_element_type(atom)
        if warn_if_buried:
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif only_aa:
        features = self.get_residue(atom)
        if warn_if_buried:
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif non_geom_features:
        features = np.zeros(13)
        features[0:5] = self.get_element_type(atom)
        features[5:9] = self.get_charge_and_electrostatics(atom)
        features[9:13] = self.get_hydrophobicity(atom)
        is_buried = self.get_accessible_surface_area(atom)[-2]
    else:
        # Full feature vector; each helper fills a fixed slice.
        features = np.empty(self.n_atom_features)

        features[0:13] = self.get_atom_type(atom)
        features[13:18] = self.get_element_type(atom)
        features[18:19] = self.get_vdw(atom)
        features[19:26] = self.get_charge_and_electrostatics(atom)
        features[26:30] = self.get_concavity(atom)
        features[30:34] = self.get_hydrophobicity(atom)
        features[34:40] = self.get_accessible_surface_area(atom)
        features[40:61] = self.get_residue(atom)
        features[61:64] = self.get_ss(atom)
        features[64:70] = self.get_deepsite_features(atom, calc_charge=False, calc_conservation=False)
        features[70:73] = self.get_evolutionary_conservation_score(atom)

        # Index 35 is the second entry of the accessible-surface-area slice.
        is_buried = bool(features[35])

    RealtimeLogger.info("Finished atom {}".format(atom))

    self.atom_features[atom.serial_number-1] = features

    if warn_if_buried:
        return features, is_buried
    else:
        return features
def ensure_disk(job, job_fn, job_fn_args, job_fn_kwargs, file_id_list, factor=8, padding=1024 ** 3):
    """
    Re-queue the running job with more disk if its inputs need it.

    The requirement is the summed ``size`` of every file ID in
    ``file_id_list``, multiplied by ``factor``, plus ``padding`` bytes.

    :param job: the currently running job; its ``disk``, ``cores`` and
        ``memory`` attributes are read.
    :param job_fn: the function this job is running.
    :param job_fn_args: positional arguments it was called with, excluding
        the job object.
    :param job_fn_kwargs: keyword arguments it was called with.
    :param file_id_list: iterable of file IDs exposing ``size``.
    :return: ``None`` when ``job.disk`` already covers the requirement;
        otherwise the ``rv()`` promise of a child job running ``job_fn``
        again with ``disk`` set to the requirement.

    TODO: Convert to promised requirements if it is sufficiently expressive.
    """
    # Space for the inputs themselves...
    input_bytes = sum(file_id.size for file_id in file_id_list)

    # ...scaled up for intermediates and results, plus some slack.
    # TODO: Allow different factors for different file IDs.
    # Only e.g. BAM files need multiplying, not indexes.
    required_disk = input_bytes * factor + padding

    if not job.disk < required_disk:
        # The disk we have is enough
        return None

    # Otherwise run again as a child with the larger allocation
    RealtimeLogger.info("Re-queueing job with {} bytes of disk; originally had {}".format(required_disk, job.disk))
    requeued = job.addChildJobFn(job_fn, *job_fn_args,
                                 cores=job.cores, memory=job.memory,
                                 disk=required_disk, **job_fn_kwargs)
    return requeued.rv()
def run_split_gam_reads(job, context, gam_input_reads, gam_reads_file_id):
    """
    Split an input reads file in GAM format into chunks.

    The file ``gam_reads_file_id`` is read into the job's temp directory
    under the base name of ``gam_input_reads`` and split with ``vg chunk``.
    Every resulting ``gam_reads_chunk*.gam`` file is written as an
    intermediate file.

    :return: list of the stored chunk file IDs, in sorted file-name order.
    """
    RealtimeLogger.info("Starting gam split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Fetch the GAM reads to split
    gam_path = os.path.join(work_dir, os.path.basename(gam_input_reads))
    job.fileStore.readGlobalFile(gam_reads_file_id, gam_path)

    # Split up the gam into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    cmd = ['vg', 'chunk', '-a', os.path.basename(gam_path),
           '--gam-split-size', str(chunk_size),
           '--prefix', 'gam_reads_chunk']

    context.runner.call(job, cmd, work_dir=work_dir)

    gam_chunk_ids = []
    # os.listdir returns entries in arbitrary order; sort them so the chunk
    # order is deterministic, as the FASTQ and BAM splitters already do.
    for chunk_name in sorted(os.listdir(work_dir)):
        if chunk_name.endswith('.gam') and chunk_name.startswith('gam_reads_chunk'):
            gam_chunk_ids.append(
                context.write_intermediate_file(
                    job, os.path.join(work_dir, chunk_name)))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info(
        "Split gam into {} chunks. Process took {} seconds.".format(
            len(gam_chunk_ids), run_time))

    return gam_chunk_ids
def create_data_loader(job, sfam_id, preemptable=True):
    """
    Create H5 for Molmimic3dCNN to read.

    Parses ``<sfam_id>_nr.fasta`` for the superfamily, splits every sequence
    ID (minus its last two characters) into pdb / chain / sdi / domain
    fields, and writes them as the ``table`` key of ``<sfam_id>.h5`` in the
    same directory.

    Note: move this somewhere else

    :param job: the running Toil job.
    :param sfam_id: superfamily ID; converted with ``int`` for file names.
    :param preemptable: not used in the body.
    :return: ``None``. When the FASTA yields no sequences, a message is
        logged and nothing is written.
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    # Get all with keys same sfam, but do not download
    # NOTE(review): ``keys`` is not used below -- confirm whether this
    # listing is still needed.
    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id)
            if f.endswith(".pdb") and id_format.match(f)]

    # NOTE(review): PDB_PATH and dataset_name are not defined in this
    # function; presumably module-level names -- verify.
    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily",
                            str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        # The loop variable was ``s`` while the expression read ``seq.id``,
        # which raised NameError on the first record.
        pdb, chain, sdi, domain = zip(*[id_format.match(seq.id[:-2]).groups()
                                        for seq in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        # zip(*[]) yields nothing to unpack, i.e. the FASTA had no records.
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({"pdb": pdb, "chain": chain, "domNo": domain, "sdi": sdi})

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(unicode(data_loader), "table", complevel=9, complib="bzip2")
def write_output_file(self, local_path, output_path):
    """
    Upload a local file to Azure as a block blob.

    The container is created first; a conflict error from that call is
    ignored.

    :param local_path: file to upload.
    :param output_path: blob name relative to ``self.name_prefix``.
    """
    self.__connect()

    RealtimeLogger.debug("Saving {} to AzureIOStore".format(output_path))

    try:
        # Create the container if needed
        self.connection.create_container(self.container_name)
    except azure.WindowsAzureConflictError:
        # A conflict most likely means the container is already there
        pass

    blob_name = self.name_prefix + output_path

    # Synchronous upload.
    # TODO: catch no container error here, make the container, and retry
    self.connection.put_block_blob_from_path(self.container_name, blob_name,
                                             local_path)
def run_split_reads_if_needed(job, context, fastq, gam_input_reads, bam_input_reads, reads_file_ids):
    """
    Produce a list of lists of read chunk file IDs, one list per reads file.

    When ``context.config.single_reads_chunk`` is set, every file ID in
    ``reads_file_ids`` becomes its own one-element list. Otherwise a
    ``run_split_reads`` child job is queued and its promise is returned.
    """
    if context.config.single_reads_chunk:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        return [[r] for r in reads_file_ids]

    split_job = job.addChildJobFn(run_split_reads, context, fastq,
                                  gam_input_reads, bam_input_reads,
                                  reads_file_ids,
                                  cores=context.config.misc_cores,
                                  memory=context.config.misc_mem,
                                  disk=context.config.misc_disk)
    return split_job.rv()
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved, xg_file_id, paths):
    """
    Split the GAM reads (unless disabled), then surject every chunk.

    A ``run_split_reads`` child is queued for ``gam_input_reads_id`` unless
    ``context.config.single_reads_chunk`` is set, in which case the file is
    used as a single chunk. A ``run_whole_surject`` follow-on then processes
    the chunks.

    :return: the ``rv()`` promise of the ``run_whole_surject`` follow-on.
    """
    # Umbrella job so that everything queued here hangs off one child
    umbrella = Job()
    job.addChild(umbrella)

    if context.config.single_reads_chunk:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[gam_input_reads_id]]
    else:
        split_job = umbrella.addChildJobFn(run_split_reads, context, None,
                                           'aln.gam', None,
                                           [gam_input_reads_id],
                                           cores=context.config.misc_cores,
                                           memory=context.config.misc_mem,
                                           disk=context.config.misc_disk)
        reads_chunk_ids = split_job.rv()

    surject_job = umbrella.addFollowOnJobFn(run_whole_surject, context,
                                            reads_chunk_ids, output_name,
                                            interleaved, xg_file_id, paths,
                                            cores=context.config.misc_cores,
                                            memory=context.config.misc_mem,
                                            disk=context.config.misc_disk)
    return surject_job.rv()
def get_original_complexes(job, mol_sfam, int_sfam, group, int_type, work_dir=None):
    """
    Build one merged complex file per entry of ``group``.

    For each ``(key, rows)`` pair in ``group`` the first row is passed to
    ``process_interface``; the two resulting files are merged with ``prep``
    (chain labels "M" and "I").

    :param job: forwarded to ``process_interface``.
    :param mol_sfam: not used in the body.
    :param int_sfam: not used in the body.
    :param group: iterable of ``(key, rows)`` pairs where ``rows`` supports
        ``.iloc[0]``.
    :param int_type: forwarded to ``process_interface``.
    :param work_dir: working directory; defaults to the current directory.
    :return: list with one entry per group item: the merged file, or
        ``None`` when either input file was not found.
    """
    if work_dir is None:
        # ``os.cwd`` does not exist; the current directory is ``os.getcwd()``.
        work_dir = os.getcwd()

    complex_files = []
    for _, row in group:
        row = row.iloc[0]
        RealtimeLogger.info("ROW: {}".format(row))

        mol_file, mol_resi, int_file, int_resi = process_interface(
            job, row, int_type, work_dir=work_dir)

        if mol_file is None or int_file is None:
            # PDB files not found, skip
            RealtimeLogger.info(
                "Cannot download PDB {}.{}.{} bc it was not found".format(
                    row.mol_pdb, row.mol_chain, row.mol_sdi_id))
            complex_files.append(None)
            continue

        # ``prep`` is consumed as an iterator; only its first result is used.
        merged_file = next(
            prep((mol_file, "M"), (int_file, "I"), merge=True,
                 work_dir=work_dir))

        complex_files.append(merged_file)

    return complex_files
def run_split_bam_reads(job, context, bam_input_reads, bam_reads_file_id):
    """
    Split an input reads file in BAM format into chunks.

    The file ``bam_reads_file_id`` is read into the job's temp directory
    under the base name of ``bam_input_reads``. Its records are piped
    through ``split``; the per-chunk filter prepends the original header and
    re-encodes each chunk as BAM. Every resulting ``bam_reads_chunk*.bam``
    file is written as an intermediate file.

    :return: list of the stored chunk file IDs, in sorted file-name order.
    """
    RealtimeLogger.info("Starting bam split")
    started = timeit.default_timer()

    # Working directory for the external tool calls
    work_dir = job.fileStore.getLocalTempDir()

    # Fetch the BAM reads to split
    bam_path = os.path.join(work_dir, os.path.basename(bam_input_reads))
    job.fileStore.readGlobalFile(bam_reads_file_id, bam_path)

    # Keep the chunk size even so interleaved pairs are never separated
    reads_per_chunk = context.config.reads_per_chunk
    if reads_per_chunk % 2 != 0:
        reads_per_chunk += 1

    # Each read occupies 1 line
    lines_per_chunk = reads_per_chunk * 1

    bam_name = os.path.basename(bam_path)
    dump_records = ['samtools', 'view', bam_name]

    # Filter run by ``split`` for each chunk: header + records -> BAM
    header_part = '--filter=bash -c \'cat <(samtools view -H {}) <(cat -)'.format(bam_name)
    encode_part = ' | samtools view -O BAM --threads {} -'.format(max(1, int(context.config.fq_split_cores) - 1))
    chunk_filter = header_part + encode_part + ' > $FILE.bam\''

    cmd = [dump_records,
           ['split', '-l', str(lines_per_chunk), chunk_filter, '-',
            'bam_reads_chunk.']]

    context.runner.call(job, cmd, work_dir = work_dir)

    bam_chunk_ids = [
        context.write_intermediate_file(job, os.path.join(work_dir, entry))
        for entry in sorted(os.listdir(work_dir))
        if entry.endswith('.bam') and entry.startswith('bam_reads_chunk')]

    run_time = timeit.default_timer() - started
    RealtimeLogger.info("Split bam into {} chunks. Process took {} seconds.".format(len(bam_chunk_ids), run_time))

    return bam_chunk_ids
def up(job, inputFileID1, inputFileID2, options, memory=sortMemory):
    """
    Merge two files from the file store into a new file-store file.

    Both inputs are read and the output is written as UTF-8. After the merge
    the two input files are deleted from the file store.

    :param options: not used in the body.
    :param memory: not used in the body.
    :return: the file store ID of the merged output.
    """
    store = job.fileStore
    utf8_reader = codecs.getreader('utf-8')

    with store.writeGlobalFileStream() as (raw_out, outputFileStoreID):
        merged_out = codecs.getwriter('utf-8')(raw_out)

        with store.readGlobalFileStream(inputFileID1) as raw_in1:
            first_in = utf8_reader(raw_in1)
            with store.readGlobalFileStream(inputFileID2) as raw_in2:
                second_in = utf8_reader(raw_in2)

                RealtimeLogger.info("Merging %s and %s to %s"
                                    % (inputFileID1, inputFileID2, outputFileStoreID))
                merge(first_in, second_in, merged_out)

        # Remove the inputs. Per the original author these deletes only take
        # effect once the job has completed successfully.
        store.deleteGlobalFile(inputFileID1)
        store.deleteGlobalFile(inputFileID2)

        return outputFileStoreID
def copy_everything(job, options):
    """
    List the input store and queue child jobs that copy matching files.

    Every name from the recursive listing of ``options.in_store`` that
    matches ``options.pattern`` (case-sensitively) is batched into groups of
    ``options.batch_size``; one ``copy_batch`` child job is added per batch.
    """
    # Open both IO stores
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # Lazily walk the whole input store, keeping only names that match
    all_names = in_store.list_input_directory("", recursive=True)
    matching_names = (name for name in all_names
                      if fnmatch.fnmatchcase(name, options.pattern))

    queued = 0
    for queued, padded_batch in enumerate(
            group(matching_names, options.batch_size), 1):
        # Grouping pads the final batch with None; drop the padding
        batch = [name for name in padded_batch if name is not None]

        # One child job copies the whole batch
        job.addChildJobFn(copy_batch, options, batch,
                          cores=1, memory="1G", disk="10G")

        if queued % 10 == 0:
            RealtimeLogger.info("Queued {} batches...".format(queued))

    RealtimeLogger.info("Queued {} total batches".format(queued))
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    """
    Fan out inferred-interactome work for one IBIS table.

    Fetches ``IBIS_inferred_<table>.h5``, stores it in the file store, and
    reads the ``nbr_superfam_id`` column of its ``Intrac<table>`` key. A
    superfamily is skipped when its ``.inferred_interactome`` already exists
    in the output store or its ``.observed_interactome`` does not. The
    remaining superfamilies are handed to ``map_job`` with
    ``get_table_sfams``. The local copy of the table is removed at the end.
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    # Fetch the H5 for the whole table and share it through the file store
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
                              columns=["nbr_superfam_id"])
    sfams = sfams.drop_duplicates().dropna()

    # Superfamilies that are already done, or that lack observed data
    skip_sfam = set()
    for sfam in sfams["nbr_superfam_id"]:
        sfam_key = int(sfam)
        if out_store.exists("{0}/{0}.inferred_interactome".format(sfam_key)) or \
                not out_store.exists("{0}/{0}.observed_interactome".format(sfam_key)):
            skip_sfam.add(sfam)

    remaining = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = remaining["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
                pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        # Best-effort cleanup of the local copy
        pass
def main_job(job, options, sam_urls):
    """
    Root job of the Toil workflow.

    Checks that ``samtools`` can be run, then queues one ``extract_job``
    child per URL. Each child's pair of FASTQ results is wrapped in a
    ``Directory`` with entries ``fq1.fastq`` and ``fq2.fastq``, keyed by the
    base file name of the URL's path.

    :param options: must provide ``drunner``; forwarded to ``extract_job``.
    :param sam_urls: iterable of SAM/BAM/CRAM URLs.
    :return: the unwrapped result of a promise for a ``Directory`` with
        every per-sample directory mounted under its file name.
    """
    RealtimeLogger.info("Main job starting")
    RealtimeLogger.info("Temp directory location: {}".format(
        job.fileStore.getLocalTempDir()))

    # Make sure we can use samtools
    options.drunner.call(job, [["samtools", "--version"]])

    def fastqs_to_directory(fastq_pair):
        # Replaces ``lambda (fq1, fq2): ...``: tuple parameters were removed
        # in Python 3 (PEP 3113). Unpacking here keeps the exactly-two-items
        # check and works on both major versions.
        fq1, fq2 = fastq_pair
        return Directory({"fq1.fastq": fq1, "fq2.fastq": fq2})

    # We'll fill this with promises for subdirectories by sample filename
    subdir_promises = {}

    for sam_url in sam_urls:
        # Work out the base filename
        sam_filename = os.path.basename(urlparse.urlparse(sam_url).path)

        # Go download and convert the reads, and stick the FASTQs in a
        # directory
        extract = job.addChildJobFn(extract_job, options, sam_url,
                                    cores=1, memory="16G", disk="500G")
        subdir_promises[sam_filename] = ToilPromise.wrap(extract).then(
            fastqs_to_directory)

    # Mount each subdirectory under its original sam/bam/cram filename
    return ToilPromise.all(subdir_promises).then(
        lambda dirs: Directory().mount_all(dirs)).unwrap_result()
def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'):
    """
    Read a file through the file store and report where the local copy lives.

    Sleeps a random whole number of seconds in
    ``[options.minSleep, options.minSleep + 10]`` both before and after
    reading the file.

    :param number: label for this job, used only in the log message.
    :return: ``(st_dev, st_ino)`` of the local file.
    """
    shortest = options.minSleep

    # Random delay first, so other jobs get a chance to cache the file
    time.sleep(random.randint(shortest, shortest + 10))

    # A symlink is refused because it could point straight at the
    # filestore's copy even when caching is not happening
    local_file = job.fileStore.readGlobalFile(file_id, cache=True,
                                              mutable=False, symlink=False)

    # Random delay again, so other jobs get a chance to use the file
    time.sleep(random.randint(shortest, shortest + 10))

    # os.stat follows links
    file_info = os.stat(local_file)

    # Which machine is this?
    host = socket.gethostname()

    RealtimeLogger.info('Job {} on host {} sees file at device {} inode {}'.format(number, host, file_info.st_dev, file_info.st_ino))

    # The hostname is left out of the result: per the original author,
    # hostnames are unique per pod.
    return (file_info.st_dev, file_info.st_ino)
def new_function(*args, **kwargs):
    """
    Call ``original_function`` repeatedly until it succeeds.

    Delays come from ``backoff_times``; ``retries`` and ``base_delay`` in
    ``kwargs`` override the values from the enclosing scope. All arguments,
    including those two if present, are passed on to ``original_function``.

    :return: whatever ``original_function`` returns on its first success.
    :raises BackoffError: when the delays run out without a success.
    """
    # Call backoff times, overriding parameters with stuff from kwargs
    delays = backoff_times(retries=kwargs.get("retries", retries),
                           base_delay=kwargs.get("base_delay", base_delay))

    # Keep looping until it works or our iterator raises a BackoffError
    for delay in delays:
        if delay > 0:
            # We have to wait before trying again
            RealtimeLogger.error("Retry after {} seconds".format(delay))
            time.sleep(delay)
        try:
            return original_function(*args, **kwargs)
        except Exception:
            # Only ordinary errors are retried. A bare ``except`` here also
            # swallowed KeyboardInterrupt and SystemExit, so an interrupted
            # call was retried instead of stopping.
            # Report the formatted underlying exception with traceback
            RealtimeLogger.error("{} failed due to: {}".format(
                original_function.__name__,
                "".join(traceback.format_exception(*sys.exc_info()))))

    # If we get here, the function we're calling never ran through before we
    # ran out of backoff times. Give an error.
    raise BackoffError("Ran out of retries calling {}".format(
        original_function.__name__))
def read_input_file(self, input_path, local_path):
    """
    Get input from the filesystem.

    Copies ``<self.path_prefix>/<input_path>`` to ``local_path`` through a
    temporary file in the destination directory, then renames it into
    place. Any existing item at ``local_path`` is removed first. Afterwards,
    if the current user owns the source file and it is owner-writable, the
    owner write bit is cleared on the source file.

    :param input_path: path relative to ``self.path_prefix``.
    :param local_path: destination path.
    :raises AssertionError: if ``local_path`` exists and cannot be removed.
    :raises RuntimeError: if the source file does not exist.
    """
    RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
        input_path, self.path_prefix, local_path))

    if os.path.exists(local_path):
        # Try deleting the existing item if it already exists
        try:
            os.unlink(local_path)
        except OSError:
            # Don't fail here, fail complaining about the assertion, which
            # will be more informative.
            pass

    # Make sure the path is clear for copying
    assert (not os.path.exists(local_path))

    # Where is the file actually?
    real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

    if not os.path.exists(real_path):
        RealtimeLogger.error(
            "Can't find {} from FileIOStore in {}!".format(
                input_path, self.path_prefix))
        raise RuntimeError("File {} missing!".format(real_path))

    # Make a temporary file next to the destination so the final rename
    # stays within one directory
    temp_handle, temp_path = tempfile.mkstemp(
        dir=os.path.dirname(local_path))
    os.close(temp_handle)

    moved = False
    try:
        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)
        moved = True
    finally:
        # Don't leave the temp file behind when the copy or rename fails
        if not moved and os.path.exists(temp_path):
            os.unlink(temp_path)

    # Look at the file stats
    file_stats = os.stat(real_path)

    if (file_stats.st_uid == os.getuid() and
            file_stats.st_mode & stat.S_IWUSR):
        # We own the source file and can write to it: drop the owner write
        # bit so it cannot be clobbered by accident.
        # NOTE(review): the original comments refer to a symlink, but this
        # method copies -- confirm the write-protection is still wanted.
        try:
            os.chmod(real_path, file_stats.st_mode & ~stat.S_IWUSR)
        except OSError:
            # If something goes wrong here (like us not having permission to
            # change permissions), ignore it.
            pass
def extract_job(job, options, sam_url):
    """
    Name-sort the reads at a SAM/BAM/CRAM URL and queue their conversion.

    ``samtools sort -n`` writes ``sorted.bam`` into the job's temp
    directory; that file is stored in the file store and handed to a
    ``convert_job`` child.

    :return: the ``rv()`` promise of the ``convert_job`` child (documented
        by the original author as a pair of FASTQ file IDs).
    """
    # Tool paths are given relative to this directory so Docker works right
    work_dir = job.fileStore.getLocalTempDir()

    # Output name for the sorted BAM, and the prefix for sort's temp files
    sorted_bam = "sorted.bam"
    temp_prefix = sorted_bam + ".part"

    RealtimeLogger.info("Sort {} to BAM".format(sam_url))

    # Without an explicit temp prefix, samtools tries to write its temp
    # files back to the FTP.
    sort_command = ["samtools", "sort", "-n",
                    "-o", sorted_bam,
                    "-T", temp_prefix,
                    sam_url]
    options.drunner.call(job, [sort_command], work_dir=work_dir)

    # Put the sorted BAM in the file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, sorted_bam))

    # Conversion to FASTQ happens in a child job
    convert = job.addChildJobFn(convert_job, options, sam_url, bam_id,
                                cores=4, memory="16G", disk="500G")
    return convert.rv()
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
    """
    Recursive "divide" step of the sort.

    When the input file is larger than ``N`` bytes it is split at its
    midpoint, each half is handed to a child ``down`` job, and an ``up``
    follow-on merges the two results. Otherwise the file is copied, sorted
    in place, and written to the file store.

    :param inputFileStoreID: file store ID of the file to sort.
    :param N: size threshold in bytes above which the file is split.
    :param path: label for this job's position in the job hierarchy; used
        in log messages and extended with ``/0``, ``/1`` and ``/up``.
    :param downCheckpoints: forwarded to the children, both positionally
        and as their ``checkpoint`` keyword.
    :param options: must provide ``sortMemory`` and ``mergeMemory``.
    :return: the ``rv()`` promise of the ``up`` follow-on, or the file
        store ID of the sorted file.
    """
    RealtimeLogger.info("Down job starting: %s" % path)

    # Fetch the input
    inputFile = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    length = os.path.getsize(inputFile)

    def write_range(start, end):
        # Copy one byte range of the input into a fresh local temp file
        part = job.fileStore.getLocalTempFile()
        with open(part, 'w') as fH:
            fH.write(copySubRangeOfFile(inputFile, start, end))
        return part

    if length > N:
        # Too big: subdivide
        RealtimeLogger.critical("Splitting file: %s of size: %s"
                                % (inputFileStoreID, length))

        midPoint = getMidPoint(inputFile, 0, length)
        t1 = write_range(0, midPoint+1)
        t2 = write_range(midPoint+1, length)

        # Recurse on both halves. Passing the children's rv() promises to
        # the follow-on expresses the dependency without hindering
        # concurrency.
        # NOTE(review): the second child is given options.mergeMemory and
        # the ``up`` follow-on options.sortMemory -- confirm this is
        # intended.
        first_half = job.addChildJobFn(
            down, job.fileStore.writeGlobalFile(t1), N, path + '/0',
            downCheckpoints, checkpoint=downCheckpoints, options=options,
            preemptable=True, memory=options.sortMemory).rv()
        second_half = job.addChildJobFn(
            down, job.fileStore.writeGlobalFile(t2), N, path + '/1',
            downCheckpoints, checkpoint=downCheckpoints, options=options,
            preemptable=True, memory=options.mergeMemory).rv()

        # NOTE(review): ``up`` is called with a positional path label before
        # ``options`` -- verify this matches the ``up`` signature in use.
        result = job.addFollowOnJobFn(
            up, first_half, second_half, path + '/up',
            preemptable=True, options=options,
            memory=options.sortMemory).rv()
    else:
        # Small enough to sort directly
        RealtimeLogger.critical("Sorting file: %s of size: %s"
                                % (inputFileStoreID, length))

        # Sort a copy and put the copy in the file store
        sorted_copy = inputFile + '.sort'
        shutil.copyfile(inputFile, sorted_copy)
        sort(sorted_copy)
        result = job.fileStore.writeGlobalFile(sorted_copy)

    RealtimeLogger.info("Down job finished: %s" % path)
    return result
def _runMainLoop(self, rootJob):
    """
    Run the leader's main loop for the given root job.

    The loop runs inside a ``RealtimeLogger`` context whose level is
    ``self.options.logLevel`` when ``self.options.realTimeLogging`` is set,
    and ``None`` otherwise.

    :param toil.job.Job rootJob: The root job for the workflow.
    :rtype: Any
    """
    batch_system = self._batchSystem
    realtime_level = (self.options.logLevel
                      if self.options.realTimeLogging else None)

    with RealtimeLogger(batch_system, level=realtime_level):
        # FIXME: common should not import from leader
        from toil.leader import mainLoop
        return mainLoop(config=self.config,
                        batchSystem=self._batchSystem,
                        provisioner=None,
                        jobStore=self._jobStore,
                        rootJobWrapper=rootJob,
                        jobCache=self._jobCache)
def run_id_increment(job, options, graph_i, graph_id, distance):
    """
    Shift the node IDs of one graph with ``vg ids --increment``.

    Per the original author this is a separate, toil-vg-style job so that it
    can be added to toil-vg and given the correct resource requirements.

    :param graph_i: index into ``options.chroms``; names the local file.
    :param graph_id: store ID of the graph to read.
    :param distance: increment passed to ``vg ids``.
    :return: the store ID of the shifted graph.
    """
    RealtimeLogger.info("Starting graph shift...")
    started = timeit.default_timer()

    work_dir = job.fileStore.getLocalTempDir()

    # Fetch the graph
    graph_filename = os.path.join(work_dir,
                                  '{}.vg'.format(options.chroms[graph_i]))
    toil_vg.vg_common.read_from_store(job, options, graph_id, graph_filename)

    # The shifted graph is written next to the input
    output_graph_filename = graph_filename + '.shifted.vg'

    RealtimeLogger.info("Moving {} up by {} to {}".format(
        graph_filename, distance, output_graph_filename))

    with open(output_graph_filename, "w") as out_file:
        # The tool's output is captured into the output file
        command = ['vg', 'ids', '--increment', str(distance),
                   os.path.basename(graph_filename)]
        options.drunner.call(job, command, work_dir=work_dir,
                             outfile=out_file)

    # Store the result
    output_graph_id = toil_vg.vg_common.write_to_store(job, options,
                                                       output_graph_filename)

    run_time = timeit.default_timer() - started
    RealtimeLogger.info(
        "Finished graph shift. Process took {} seconds.".format(run_time))

    return output_graph_id
def run(self, fileStore):
    """
    Emit one RealtimeLogger message at info level and one at debug level.

    :param fileStore: not used in the body.
    """
    RealtimeLogger.info('This should be logged at info level')
    RealtimeLogger.debug('This should be logged at debug level')