def validate_import_bam(t, bam_path, fasta_sequences, genome):
    """Validate a BAM against the genome fasta, then import it and its .bai index into the fileStore."""
    validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
    return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
            FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]
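validate_bam_fasta_pairs is referenced but not shown in this listing. A minimal sketch of what it plausibly does, assuming pysam is available and that fasta_sequences is the set of (name, length) tuples built in hints_db below; this is an illustration, not the CAT implementation:

import pysam

def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
    # Hypothetical sketch: compare the BAM header's (name, length) pairs
    # against the fasta's. The real CAT helper may differ.
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        bam_sequences = set(zip(bam.references, bam.lengths))
    if not bam_sequences & fasta_sequences:
        raise RuntimeError('BAM {} does not match any sequence in the {} fasta.'.format(bam_path, genome))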
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta,
                        input_file_ids.annotation_gp, input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.items():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
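find_total_disk_usage is a CAT helper (tools/toilInterface.py) whose body is not shown here. A hedged sketch of the idea, assuming it walks an arbitrarily nested structure of FileIDs, sums the size attribute Toil records on each FileID, and pads the total with a buffer; the names and rounding below are illustrative:

import math

def find_total_disk_usage(file_ids, buffer='2G'):
    # Illustrative only: recursively collect FileIDs from lists/dicts,
    # sum their sizes, and round the padded total up to whole gibibytes.
    units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}

    def flatten(obj):
        if isinstance(obj, dict):
            for v in obj.values():
                yield from flatten(v)
        elif isinstance(obj, (list, tuple, set)):
            for v in obj:
                yield from flatten(v)
        elif obj is not None:
            yield obj

    total = sum(f.size for f in flatten(file_ids))
    total += int(buffer[:-1]) * units[buffer[-1]]
    return '{}G'.format(int(math.ceil(total / units['G'])))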
def hints_db(hints_args, toil_options):
    """
    Entry point for the hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)
            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    # validate_import_bam performs the BAM/fasta validation itself
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))
            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)
            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            # count actual BAM entries; len(bam_file_ids) would always be 2 (the two dtype keys)
            num_bams = len(bam_file_ids['BAM']) + len(bam_file_ids['INTRONBAM']) + len(iso_seq_file_ids)
            if num_bams > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline.'.format(hints_args.genome))
            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Convenience function that loads a fasta and its associated gdx/flat files into the fileStore.
    Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat).
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: Tuple of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    # 'file://' + an absolute path yields a valid file:/// URL; the original
    # 'file:///' prefix would double the leading slash
    fasta_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path), fasta_local_path)
    gdx_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path + '.gdx'),
                                 fasta_local_path + '.gdx')
    flat_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path + '.flat'),
                                  fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id
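On the worker side, the three IDs have to land next to each other again so pyfasta can find its indices. A hedged usage sketch, assuming a Toil job function context; load_fasta_from_filestore as written here is illustrative, not the CAT API:

import os
import pyfasta

def load_fasta_from_filestore(job, fasta_file_ids, prefix='genome'):
    # Illustrative sketch: export the fasta and its gdx/flat indices to a
    # shared prefix inside the job's temp dir, then open the trio with pyfasta.
    fasta_file_id, gdx_file_id, flat_file_id = fasta_file_ids
    work_dir = job.fileStore.getLocalTempDir()
    fasta_path = os.path.join(work_dir, prefix + '.fasta')
    job.fileStore.readGlobalFile(fasta_file_id, fasta_path)
    job.fileStore.readGlobalFile(gdx_file_id, fasta_path + '.gdx')
    job.fileStore.readGlobalFile(flat_file_id, fasta_path + '.flat')
    return pyfasta.Fasta(fasta_path)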
def writeGlobalFile(self, localFileName, cleanup=False):
    # Resolve the local path, write the file into the job store under the
    # creating job's ID, and remember the local copy so deleteLocalFile can
    # reclaim it later.
    absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
    creatorID = self.jobGraph.jobStoreID
    fileStoreID = self.jobStore.writeFile(absLocalFileName, creatorID, cleanup)
    self.localFileMap[fileStoreID].append(absLocalFileName)
    return FileID.forPath(fileStoreID, absLocalFileName)
def writeGlobalFile(self, localFileName, cleanup=False):
    absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
    creatorID = self.jobDesc.jobStoreID
    fileStoreID = self.jobStore.writeFile(absLocalFileName, creatorID, cleanup)
    if absLocalFileName.startswith(self.localTempDir):
        # Only files in the appropriate directory should become local files
        # we can delete with deleteLocalFile
        self.localFileMap[fileStoreID].append(absLocalFileName)
    return FileID.forPath(fileStoreID, absLocalFileName)
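The two writeGlobalFile variants above appear to come from different Toil versions: the older one reads job metadata from jobGraph and tracks every written file, while the newer one reads from jobDesc and only tracks files under the job's localTempDir, so deleteLocalFile is never asked to remove a file the store does not own. A small hedged sketch of that contract inside a job function:

import os

def example_job(job):
    # Files written from the job's temp dir are tracked locally, so the local
    # copy can be reclaimed with deleteLocalFile once the global copy exists.
    local_path = os.path.join(job.fileStore.getLocalTempDir(), 'payload.txt')
    with open(local_path, 'w') as fh:
        fh.write('intermediate data\n')
    file_id = job.fileStore.writeGlobalFile(local_path, cleanup=True)
    job.fileStore.deleteLocalFile(file_id)
    return file_id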
def chaining(args, toil_options):
    """entry point to this program"""
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit),
                                                          args.query_two_bit)
            # .items() rather than the Python 2-only .iteritems(), matching the other examples
            input_file_ids.target_two_bits = {genome: FileID.forPath(t.importFile('file://' + f), f)
                                              for genome, f in args.target_two_bits.items()}
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()
        for chain_file, chain_file_id in chain_file_ids.items():
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
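All of these entry points take a Toil options Namespace. A minimal driver sketch, assuming local execution; the attribute names on args mirror the usage in chaining above, but every path value is a placeholder:

import argparse
from toil.job import Job

def main():
    # Job.Runner.getDefaultOptions builds the standard Toil options Namespace
    toil_options = Job.Runner.getDefaultOptions('./chaining-jobstore')
    toil_options.logLevel = 'INFO'
    args = argparse.Namespace(hal='alignment.hal',
                              query_sizes='query.chrom.sizes',
                              query_two_bit='query.2bit',
                              target_two_bits={'target_genome': 'target.2bit'})
    chaining(args, toil_options)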
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl),
                                                   args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl,
                        input_file_ids.tm_psl, input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db),
                                                                  args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                # append the imported FileID, not the raw path, so its size counts toward disk usage
                file_ids.append(input_file_ids.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
def augustus_cgp(args, toil_options):
    """
    Main entry function for AugustusCGP toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db)
            if args.cgp_param is not None:
                input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param)
            else:
                # no parameters supplied: the pipeline trains them, which needs the reference gtf
                input_file_ids.cgp_param = None
                input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf)
            input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg)
            input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta)
                                    for genome, fasta in args.fasta_files.items()}
            du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G')
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du)
            results, stdout_file_ids, param_file_id = t.start(job)
        else:
            results, stdout_file_ids, param_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.stdout_file)
        with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp:
            for (chrom, start, chunksize), stdout_file in stdout_file_ids.items():
                outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize))
                t.exportFile(stdout_file, 'file://' + tmp)
                for l in open(tmp):
                    outf.write(l)
        for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.items():
            tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome])
            t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome])
            t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome])
            t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome])
        if args.cgp_param is None:
            t.exportFile(param_file_id, 'file://' + args.param_out_path)
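tools.fileOps.TemporaryFilePath, used above, is a CAT helper not shown in this listing. A plausible sketch under the assumption that it is a context manager yielding a scratch path and removing it afterwards; this is illustrative, not the CAT source:

import contextlib
import os
import tempfile

@contextlib.contextmanager
def TemporaryFilePath(prefix='tmp', suffix=''):
    # Reserve a name in the system temp dir, yield only the path (callers such
    # as t.exportFile create the file themselves), and clean up on exit.
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    os.close(fd)
    os.remove(path)
    try:
        yield path
    finally:
        if os.path.exists(path):
            os.remove(path)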