def align_rna(job, fastqs, univ_options, star_options):
    """
    Schedule the whole RNA alignment subgraph: run_star followed by an encapsulated
    sort_and_index_star job.

    :param list fastqs: The input fastqs for alignment
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star
    :return: Promise for the return value of the sort_and_index_star job
    :rtype: dict
    """
    star_index = star_options['index']
    resources = {
        'cores': star_options['n'],
        # Memory is 1.85x the size of the STAR index, resolved once the index is known.
        'memory': PromisedRequirement(lambda index: int(1.85 * index.size), star_index),
        'disk': PromisedRequirement(star_disk, fastqs, star_index),
    }
    aligner = job.wrapJobFn(run_star, fastqs, univ_options, star_options, **resources)
    job.addChild(aligner)

    post_process = job.wrapJobFn(sort_and_index_star, aligner.rv(), univ_options, star_options)
    post_process = post_process.encapsulate()
    aligner.addChild(post_process)
    return post_process.rv()
def maxConcurrency(job, cpuCount, filename, coresPerJob):
    """
    Schedule cpuCount follow-on calls of batchSystemTest.measureConcurrency whose core
    requirement is a PromisedRequirement, and collect their promised return values.

    :param int cpuCount: number of available cpus
    :param str filename: path to counter file
    :param int coresPerJob: number of cores assigned to each job
    :return: list with one promise per scheduled measureConcurrency follow-on
    :rtype: list
    """
    tiny = dict(cores=0.1, memory='32M', disk='1M')
    # Two small children whose return values feed the promised requirements below.
    one = job.addChildFn(getOne, **tiny)
    thirtyTwoMb = job.addChildFn(getThirtyTwoMb, **tiny)

    def schedule_measurement():
        measurement = job.addFollowOnFn(
            batchSystemTest.measureConcurrency, filename,
            cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()),
            memory=PromisedRequirement(thirtyTwoMb.rv()),
            disk='1M')
        return measurement.rv()

    return [schedule_measurement() for _ in range(cpuCount)]
def wrap_fusion(job, fastqs, star_output, univ_options, star_fusion_options,
                fusion_inspector_options):
    """
    Schedule run_fusion as an encapsulated child job unless fusion calling is disabled.

    :param tuple fastqs: RNA-Seq FASTQ Filestore IDs
    :param dict star_output: Dictionary containing STAR output files
    :param dict univ_options: universal arguments used by almost all tools
    :param dict star_fusion_options: STAR-Fusion specific parameters
    :param dict fusion_inspector_options: FusionInspector specific parameters
    :return: Promise for the run_fusion result, or None when star_fusion_options['run']
             is falsy
    :rtype: toil.fileStore.FileID|None
    """
    if star_fusion_options['run']:
        fusion_index = star_fusion_options['index']
        fusion_job = job.wrapJobFn(
            run_fusion,
            fastqs,
            star_output['rnaChimeric.out.junction'],
            univ_options,
            star_fusion_options,
            fusion_inspector_options,
            cores=star_fusion_options['n'],
            memory=PromisedRequirement(lambda index: int(1.85 * index.size), fusion_index),
            disk=PromisedRequirement(fusion_disk, fastqs, fusion_index))
        fusion_job = fusion_job.encapsulate()
        job.addChild(fusion_job)
        return fusion_job.rv()

    # The user opted out of fusion calling.
    job.fileStore.logToMaster('Skipping STAR-Fusion on %s' % univ_options['patient'])
    return None
def testConcurrencyStatic(self):
    """
    Build a static DAG whose follow-on jobs request cores through a PromisedRequirement
    and assert that the measured maximum concurrency equals cpuCount / coresPerJob.
    """
    tiny = dict(cores=0.1, memory='32M', disk='1M')
    for coresPerJob in self.allocatedCores:
        workDir = self._createTempDir('testFiles')
        counterFile = self.getCounterPath(workDir)

        root = Job()
        one = Job.wrapFn(getOne, **tiny)
        thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, **tiny)
        for provider in (one, thirtyTwoMb):
            root.addChild(provider)

        for _ in range(self.cpuCount):
            measurement = Job.wrapFn(
                batchSystemTest.measureConcurrency, counterFile,
                cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()),
                memory=PromisedRequirement(thirtyTwoMb.rv()),
                disk='1M')
            root.addFollowOn(measurement)

        Job.Runner.startToil(root, self.getOptions(workDir))
        _, maxValue = batchSystemTest.getCounters(counterFile)
        expected = old_div(self.cpuCount, coresPerJob)
        self.assertEqual(maxValue, expected)
def testPromiseRequirementRaceStatic(self):
    """
    Checks for a race condition when using promised requirements and child job functions.

    Job 'A' sleeps before returning; its child 'B' derives its disk requirement from
    the return value of 'A'.
    """
    parent = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024))
    child_disk = PromisedRequirement(lambda x: x + 1024, parent.rv())
    child = Job.wrapJobFn(logDiskUsage, 'B', disk=child_disk)
    parent.addChild(child)

    options = self.getOptions(self._createTempDir('testFiles'))
    Job.Runner.startToil(parent, options)
def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_options, split=True):
    """
    Run the SomaticSniper subgraph on the DNA bams.  Optionally split the results into
    per-chromosome vcfs.

    The subgraph runs run_somaticsniper_full and run_pileup as children of this job,
    then filter_somaticsniper as a child of both, and (when split is True) unmerge as a
    child of the filter job.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict somaticsniper_options: Options specific to SomaticSniper
    :param bool split: Should the results be split into perchrom vcfs?
    :return: Either the fsID to the genome-level vcf or a dict of results from running
             SomaticSniper on every chromosome
             perchrom_somaticsniper:
                 |- 'chr1': fsID
                 |- 'chr2' fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: toil.fileStore.FileID|dict
    """
    # Get a list of chromosomes to handle
    if somaticsniper_options['chromosomes']:
        chromosomes = somaticsniper_options['chromosomes']
    else:
        chromosomes = sample_chromosomes(job, somaticsniper_options['genome_fai'])

    snipe = job.wrapJobFn(run_somaticsniper_full, tumor_bam, normal_bam, univ_options,
                          somaticsniper_options,
                          disk=PromisedRequirement(sniper_disk,
                                                   tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                   normal_bam['normal_dna_fix_pg_sorted.bam'],
                                                   somaticsniper_options['genome_fasta']),
                          memory='6G')
    pileup = job.wrapJobFn(run_pileup, tumor_bam, univ_options, somaticsniper_options,
                           disk=PromisedRequirement(pileup_disk,
                                                    tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                    somaticsniper_options['genome_fasta']),
                           memory='6G')
    filtersnipes = job.wrapJobFn(filter_somaticsniper, tumor_bam, snipe.rv(), pileup.rv(),
                                 univ_options, somaticsniper_options,
                                 disk=PromisedRequirement(sniper_filter_disk,
                                                          tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                          somaticsniper_options['genome_fasta']),
                                 memory='6G')

    # The filter needs both the raw calls and the pileup, so it is a child of both.
    job.addChild(snipe)
    job.addChild(pileup)
    snipe.addChild(filtersnipes)
    pileup.addChild(filtersnipes)

    if not split:
        return filtersnipes.rv()

    unmerge_snipes = job.wrapJobFn(unmerge, filtersnipes.rv(), 'somaticsniper', chromosomes,
                                   somaticsniper_options, univ_options)
    filtersnipes.addChild(unmerge_snipes)
    return unmerge_snipes.rv()
def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
    """
    Spawn a MuSE call job, followed by a MuSE sump job, for each chromosome on the DNA bams.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict muse_options: Options specific to MuSE
    :return: Dict mapping every chromosome to the promised result of its sump job
             perchrom_muse:
                 |- 'chr1': fsID
                 |- 'chr2' fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: dict
    """
    # Use the user-supplied chromosome list when present, else derive it from the fai.
    chromosomes = muse_options['chromosomes']
    if not chromosomes:
        chromosomes = sample_chromosomes(job, muse_options['genome_fai'])

    def schedule(chromosome):
        """Add the call job and its sump child for one chromosome; return the sump promise."""
        call_disk = PromisedRequirement(muse_disk,
                                        tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                        normal_bam['normal_dna_fix_pg_sorted.bam'],
                                        muse_options['genome_fasta'])
        call_job = job.addChildJobFn(run_muse_perchrom, tumor_bam, normal_bam, univ_options,
                                     muse_options, chromosome, disk=call_disk, memory='6G')
        sump_disk = PromisedRequirement(muse_sump_disk, muse_options['dbsnp_vcf'])
        sump_job = call_job.addChildJobFn(run_muse_sump_perchrom, call_job.rv(), univ_options,
                                          muse_options, chromosome, disk=sump_disk,
                                          memory='6G')
        return sump_job.rv()

    perchrom_muse = defaultdict()
    for chrom in chromosomes:
        perchrom_muse[chrom] = schedule(chrom)
    return perchrom_muse
def download_and_process_tar(job, config):
    """
    Download a tarball containing fastq(s), then process it in a dependent job.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: Promise for the return value of process_sample
    :rtype: tuple(str, str)
    """
    # CI runs use a small fixed disk allotment instead of the configured sample size.
    if config.ci_test:
        download_disk = '2G'
    else:
        download_disk = config.max_sample_size

    fetch = job.wrapJobFn(download_url_job, config.url, s3_key_path=config.ssec,
                          disk=download_disk)
    job.addChild(fetch)

    # Processing is allotted ten times the size of the downloaded tarball.
    process_disk = PromisedRequirement(lambda tarball: tarball.size * 10, fetch.rv())
    processed = job.wrapJobFn(process_sample, config, input_tar=fetch.rv(), disk=process_disk)
    fetch.addChild(processed)
    return processed.rv()
def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split=True):
    """
    Run the strelka subgraph on the DNA bams.  Optionally split the results into
    per-chromosome vcfs.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict strelka_options: Options specific to strelka
    :param bool split: Should the results be split into perchrom vcfs?
    :return: Either the fsID to the genome-level vcf or a dict of results from running
             strelka on every chromosome
             perchrom_strelka:
                 |- 'chr1':
                 |   |-'snvs': fsID
                 |   +-'indels': fsID
                 |- 'chr2':
                 |   |-'snvs': fsID
                 |   +-'indels': fsID
                 |-...
                 |
                 +- 'chrM':
                     |-'snvs': fsID
                     +-'indels': fsID
    :rtype: toil.fileStore.FileID|dict
    """
    # Prefer the user-supplied chromosome list; otherwise derive it from the fai.
    chromosomes = (strelka_options['chromosomes'] or
                   sample_chromosomes(job, strelka_options['genome_fai']))
    # Cores are capped at both the chromosome count and the configured maximum.
    num_cores = min(len(chromosomes), univ_options['max_cores'])

    caller_disk = PromisedRequirement(strelka_disk,
                                      tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                      normal_bam['normal_dna_fix_pg_sorted.bam'],
                                      strelka_options['genome_fasta'])
    caller = job.wrapJobFn(run_strelka_full, tumor_bam, normal_bam, univ_options,
                           strelka_options, disk=caller_disk, memory='6G', cores=num_cores)
    job.addChild(caller)

    if not split:
        return caller.rv()

    splitter = job.wrapJobFn(wrap_unmerge, caller.rv(), chromosomes, strelka_options,
                             univ_options)
    splitter = splitter.encapsulate()
    caller.addChild(splitter)
    return splitter.rv()
def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
    """
    Spawn a mutect job for each chromosome on the DNA bams.

    :param dict tumor_bam: Dict of tumor bam + bai; 'tumor_dna_fix_pg_sorted.bam' is read
                           here to size the disk requirement
    :param dict normal_bam: Dict of normal bam + bai; 'normal_dna_fix_pg_sorted.bam' is
                            read here to size the disk requirement
    :param dict univ_options: Dict of universal arguments used by almost all tools
    :param dict mutect_options: Dict of parameters specific to mutect.  The keys read
                                here are 'genome_fai', 'genome_fasta', 'dbsnp_vcf' and
                                'cosmic_vcf'
    :return: Dict mapping each chromosome to the promised result of run_mutect_perchrom
    :rtype: dict
    """
    # Get a list of chromosomes to handle
    chromosomes = sample_chromosomes(job, mutect_options['genome_fai'])

    def schedule(chromosome):
        """Add the per-chromosome mutect child job and return its promise."""
        disk_needed = PromisedRequirement(mutect_disk,
                                          tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                          normal_bam['normal_dna_fix_pg_sorted.bam'],
                                          mutect_options['genome_fasta'],
                                          mutect_options['dbsnp_vcf'],
                                          mutect_options['cosmic_vcf'])
        child = job.addChildJobFn(run_mutect_perchrom, tumor_bam, normal_bam, univ_options,
                                  mutect_options, chromosome, memory='6G', disk=disk_needed)
        return child.rv()

    perchrom_mutect = defaultdict()
    for chrom in chromosomes:
        perchrom_mutect[chrom] = schedule(chrom)
    return perchrom_mutect
def parentJob(job):
    """
    Stage this source file with stageFn as a child job, then run analysisJob as a
    follow-on whose disk requirement is promised by the staging job.

    :param job: The job this function runs as; the child and follow-on are attached to it
    """
    source_url = "file://" + os.path.realpath(__file__)
    downloadJob = Job.wrapJobFn(stageFn, source_url, cores=0.1, memory='32M', disk='1M')
    job.addChild(downloadJob)

    # Return value 0 of the staging job is handed over as fileStoreID, return value 1 as disk.
    analysis_disk = PromisedRequirement(downloadJob.rv(1))
    analysis = Job.wrapJobFn(analysisJob, fileStoreID=downloadJob.rv(0), disk=analysis_disk)
    job.addFollowOn(analysis)
def annotate_vcfs(job, vcfs, config):
    """
    Runs Oncotator for a group of VCF files. Each sample is annotated individually.

    For every sample a run_oncotator child job is added, followed by an output_file_job
    child that writes the annotated VCF to <config.output_dir>/<uuid>.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param dict vcfs: Dictionary of VCF FileStoreIDs {Sample identifier: FileStoreID}
    :param Namespace config: Input parameters and shared FileStoreIDs
        Requires the following config attributes:
        config.oncotator_db         FileStoreID to Oncotator database
        config.suffix               Suffix added to output filename
        config.output_dir           URL or local path to output directory
        config.ssec                 Path to key file for SSE-C encryption
        config.cores                Number of cores for each job
        config.xmx                  Java heap size in bytes
    """
    job.fileStore.logToMaster(
        'Running Oncotator on the following samples:\n%s' % '\n'.join(vcfs))
    # dict.items() exists on both Python 2 and 3, whereas iteritems() is Python 2 only.
    for uuid, vcf_id in vcfs.items():
        # The Oncotator disk requirement depends on the input VCF, the Oncotator database
        # and the output VCF. The annotated VCF will be significantly larger than the input VCF.
        onco_disk = PromisedRequirement(lambda vcf, db: 3 * vcf.size + db.size,
                                        vcf_id,
                                        config.oncotator_db)

        annotated_vcf = job.addChildJobFn(run_oncotator,
                                          vcf_id,
                                          config.oncotator_db,
                                          disk=onco_disk,
                                          cores=config.cores,
                                          memory=config.xmx)

        output_dir = os.path.join(config.output_dir, uuid)
        filename = '{}.oncotator{}.vcf'.format(uuid, config.suffix)
        # The upload job needs room for exactly the annotated VCF.
        annotated_vcf.addChildJobFn(output_file_job,
                                    filename,
                                    annotated_vcf.rv(),
                                    output_dir,
                                    s3_key_path=config.ssec,
                                    disk=PromisedRequirement(lambda x: x.size,
                                                             annotated_vcf.rv()))
def sort_and_index_star(job, star_bams, univ_options, star_options):
    """
    Sort and then index the genomic bam generated by run_star.  A wrapper is needed
    because run_star returns a dict holding more than one bam.

    Note that star_options['samtools']['n'] is overwritten with star_options['n'].

    :param dict star_bams: The bams from run_star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star
    :return: Dict containing input bam and the generated index (.bam.bai)
             output_files:
                |- 'rna_transcriptome.bam': fsID
                |- 'rna_genome': promised return value of index_bamfile
                +- 'rnaChimeric.out.junction': fsID
    :rtype: dict
    """
    samtools_options = star_options['samtools']
    # samtools runs with the same core count that was given to star.
    samtools_options['n'] = star_options['n']

    sorter = job.wrapJobFn(sort_bamfile, star_bams['rnaAligned.out.bam'], 'rna', univ_options,
                           samtools_options=samtools_options,
                           disk=PromisedRequirement(sort_disk,
                                                    star_bams['rnaAligned.out.bam']))
    job.addChild(sorter)

    indexer = job.wrapJobFn(index_bamfile, sorter.rv(), 'rna', univ_options,
                            samtools_options=samtools_options,
                            sample_info='genome_sorted',
                            disk=PromisedRequirement(index_disk, sorter.rv()))
    sorter.addChild(indexer)

    output_files = {}
    output_files['rna_genome'] = indexer.rv()
    output_files['rna_transcriptome.bam'] = star_bams['rnaAligned.toTranscriptome.out.bam']
    output_files['rnaChimeric.out.junction'] = star_bams['rnaChimeric.out.junction']
    return output_files
def align_rna(job, fastqs, univ_options, star_options):
    """
    Convenience function that schedules the entire rna alignment subgraph: run_star
    followed by index_star.

    :param fastqs: The input fastqs, forwarded to run_star
    :param dict univ_options: Universal options, forwarded to both jobs
    :param dict star_options: Options for star; 'n' and 'tool_index' are read here
    :return: Promise for the return value of the index_star job
    """
    tool_index = star_options['tool_index']

    aligner = job.wrapJobFn(
        run_star, fastqs, univ_options, star_options,
        cores=star_options['n'],
        # Memory is 1.85x the size of the star index.
        memory=PromisedRequirement(lambda idx: int(1.85 * idx.size), tool_index),
        disk=PromisedRequirement(star_disk, fastqs, tool_index))
    job.addChild(aligner)

    # The indexing step is given the same disk estimate as the alignment itself.
    indexer = job.wrapJobFn(index_star, aligner.rv(), univ_options,
                            disk=PromisedRequirement(star_disk, fastqs, tool_index))
    aligner.addChild(indexer)
    return indexer.rv()
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    Convenience function that schedules the entire dna alignment subgraph as a linear
    chain: run_bwa -> bam_conversion -> fix_bam_header -> add_readgroups -> index_bamfile.

    :param fastqs: The input fastqs, forwarded to run_bwa
    :param sample_type: Sample description, forwarded to every job in the chain
    :param dict univ_options: Universal options, forwarded to every job in the chain
    :param dict bwa_options: Options for bwa; 'tool_index' is read here
    :return: Promise for the return value of the index_bamfile job
    """
    bwa_disk_req = PromisedRequirement(bwa_disk, fastqs, bwa_options['tool_index'])
    bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options,
                        disk=bwa_disk_req)

    sam2bam_disk_req = PromisedRequirement(sam2bam_disk, bwa.rv())
    sam2bam = job.wrapJobFn(bam_conversion, bwa.rv(), sample_type, univ_options,
                            disk=sam2bam_disk_req)

    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    reheader_disk_req = PromisedRequirement(sam2bam_disk, bwa.rv())
    reheader = job.wrapJobFn(fix_bam_header, sam2bam.rv(), sample_type, univ_options,
                             disk=reheader_disk_req)

    regroup_disk_req = PromisedRequirement(regroup_disk, reheader.rv())
    regroup = job.wrapJobFn(add_readgroups, reheader.rv(), sample_type, univ_options,
                            disk=regroup_disk_req)

    index_disk_req = PromisedRequirement(index_disk, regroup.rv())
    index = job.wrapJobFn(index_bamfile, regroup.rv(), sample_type, univ_options,
                          disk=index_disk_req)

    # Each step is the child of the previous one.
    steps = [job, bwa, sam2bam, reheader, regroup, index]
    for parent, child in zip(steps, steps[1:]):
        parent.addChild(child)
    return index.rv()
def index_star(job, star_bams, univ_options):
    """
    Wrapper around index_bamfile for the coordinate-sorted star bam.  It is required
    because run_star returns a dict of 2 bams.

    :param dict star_bams: The bams from run_star; modified in place
    :param dict univ_options: Universal options, forwarded to index_bamfile
    :return: star_bams with 'rnaAligned.sortedByCoord.out.bam' replaced by the promised
             return value of the index_bamfile job
    :rtype: dict
    """
    sorted_key = 'rnaAligned.sortedByCoord.out.bam'
    indexer = job.wrapJobFn(index_bamfile, star_bams[sorted_key], 'rna', univ_options,
                            disk=PromisedRequirement(index_disk, star_bams[sorted_key]))
    job.addChild(indexer)
    star_bams[sorted_key] = indexer.rv()
    return star_bams
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    Schedule the entire dna alignment subgraph as a linear chain of jobs:
    run_bwa -> bam_conversion -> fix_bam_header -> add_readgroups -> index_bamfile.

    :param list fastqs: The input fastqs for alignment
    :param str sample_type: Description of the sample to inject into the filename
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :return: Dict containing output bam and bai
             output_files:
                 |- '<sample_type>_fix_pg_sorted.bam': fsID
                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
    :rtype: dict
    """
    align = job.wrapJobFn(
        run_bwa, fastqs, sample_type, univ_options, bwa_options,
        disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
        cores=bwa_options['n'])
    job.addChild(align)

    convert = job.wrapJobFn(
        bam_conversion, align.rv(), sample_type, univ_options, bwa_options['samtools'],
        disk=PromisedRequirement(sam2bam_disk, align.rv()))
    align.addChild(convert)

    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    fix_header = job.wrapJobFn(
        fix_bam_header, convert.rv(), sample_type, univ_options, bwa_options['samtools'],
        disk=PromisedRequirement(sam2bam_disk, align.rv()))
    convert.addChild(fix_header)

    readgroups = job.wrapJobFn(
        add_readgroups, fix_header.rv(), sample_type, univ_options, bwa_options['picard'],
        disk=PromisedRequirement(regroup_disk, fix_header.rv()))
    fix_header.addChild(readgroups)

    indexer = job.wrapJobFn(
        index_bamfile, readgroups.rv(), sample_type, univ_options, bwa_options['samtools'],
        sample_info='fix_pg_sorted',
        disk=PromisedRequirement(index_disk, readgroups.rv()))
    readgroups.addChild(indexer)

    return indexer.rv()
def testPromisesWithJobStoreFileObjects(self):
    """
    Check whether FileID objects are being pickled properly when used as return values
    of functions.  Then ensure that lambdas of promised FileID objects can be used to
    describe the requirements of a subsequent job.  This type of operation will be used
    commonly in Toil scripts.

    :return: None
    """
    first_size, second_size = 1024, 512
    first_writer = Job.wrapJobFn(_writer, first_size)
    second_writer = Job.wrapJobFn(_writer, second_size)

    # The follower's disk requirement is the combined size of both promised files.
    combined_disk = PromisedRequirement(lambda x, y: x.size + y.size,
                                        first_writer.rv(), second_writer.rv())
    follower = Job.wrapJobFn(_follower, first_size + second_size, disk=combined_disk)

    first_writer.addChild(second_writer)
    second_writer.addChild(follower)

    options = self.getOptions(self._createTempDir('testFiles'))
    Job.Runner.startToil(first_writer, options)
def wrap_rsem(job, star_bams, univ_options, rsem_options):
    """
    Add run_rsem as a child job, using the transcriptome bam from run_star as input.

    :param dict star_bams: dict of results from star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict rsem_options: Options specific to rsem
    :return: Dict of gene- and isoform-level expression calls
             output_files:
                 |- 'rsem.genes.results': fsID
                 +- 'rsem.isoforms.results': fsID
    :rtype: dict
    """
    transcriptome_bam = star_bams['rna_transcriptome.bam']
    cores_needed = rsem_options['n']
    # rsem_disk receives the whole star_bams dict together with the rsem index.
    disk_needed = PromisedRequirement(rsem_disk, star_bams, rsem_options['index'])
    rsem_job = job.addChildJobFn(run_rsem, transcriptome_bam, univ_options, rsem_options,
                                 cores=cores_needed, disk=disk_needed)
    return rsem_job.rv()
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    Schedule the entire dna alignment subgraph as a linear chain of jobs:
    run_bwa -> bam_conversion -> fix_bam_header -> add_readgroups -> mark_duplicates
    -> index_bamfile.

    :param list fastqs: The input fastqs for alignment
    :param str sample_type: Description of the sample to inject into the filename
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :return: Dict containing output bam and bai
             output_files:
                 |- '<sample_type>_fix_pg_sorted.bam': fsID
                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
    :rtype: dict
    """
    # The mkdup and regroup steps use picard, which allots heap space using the java_Xmx
    # key in the univ_options dictionary.  This should be reflected in the job allotment.
    # Since we want all these jobs to occur on the same node, we need to give them all the
    # same memory requirement.
    def on_same_node(func, *args, **requirements):
        """Wrap func as a job that requests the shared java_Xmx memory."""
        return job.wrapJobFn(func, *args, memory=univ_options['java_Xmx'], **requirements)

    bwa = on_same_node(run_bwa, fastqs, sample_type, univ_options, bwa_options,
                       disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
                       cores=bwa_options['n'])
    sam2bam = on_same_node(bam_conversion, bwa.rv(), sample_type, univ_options,
                           bwa_options['samtools'],
                           disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    reheader = on_same_node(fix_bam_header, sam2bam.rv(), sample_type, univ_options,
                            bwa_options['samtools'],
                            disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    regroup = on_same_node(add_readgroups, reheader.rv(), sample_type, univ_options,
                           bwa_options['picard'],
                           disk=PromisedRequirement(regroup_disk, reheader.rv()))
    mkdup = on_same_node(mark_duplicates, regroup.rv(), sample_type, univ_options,
                         bwa_options['picard'],
                         disk=PromisedRequirement(mkdup_disk, regroup.rv()))
    index = on_same_node(index_bamfile, mkdup.rv(), sample_type, univ_options,
                         bwa_options['samtools'], sample_info='fix_pg_sorted',
                         disk=PromisedRequirement(index_disk, mkdup.rv()))

    # Each step is the child of the previous one.
    chain = [job, bwa, sam2bam, reheader, regroup, mkdup, index]
    for parent, child in zip(chain, chain[1:]):
        parent.addChild(child)
    return index.rv()
def get_patient_bams(job, patient_dict, sample_type, univ_options, bwa_options, mutect_options):
    """
    Convenience function to return the bam and its index in the correct format for a
    sample type.  When the patient dict carries no index, an index_bamfile child job is
    scheduled and its promised result is used instead.

    :param dict patient_dict: dict of patient info
    :param str sample_type: 'tumor_rna', 'tumor_dna', 'normal_dna'
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :param dict mutect_options: Options specific to mutect (not read by this function)
    :return: formatted dict of bam and bai
    :rtype: dict
    """
    is_dna = 'dna' in sample_type
    sample_info = 'fix_pg_sorted' if is_dna else 'genome_sorted'
    prefix = (sample_type if is_dna else 'rna') + '_' + sample_info

    bam_key = sample_type + '_bam'
    bai_key = sample_type + '_bai'

    if bam_key in patient_dict['gdc_inputs']:
        # GDC inputs store the bam and its index together under the bam key.
        bams = {prefix + '.bam': patient_dict[bam_key][0],
                prefix + '.bam.bai': patient_dict[bam_key][1]}
    elif bai_key in patient_dict:
        bams = {prefix + '.bam': patient_dict[bam_key],
                prefix + '.bam.bai': patient_dict[bai_key]}
    else:
        # No index was provided, so schedule a job that generates one.
        from protect.alignment.dna import index_bamfile, index_disk
        if sample_type == 'tumor_rna':
            index_sample_type = 'rna'
        else:
            index_sample_type = sample_type
        indexer = job.wrapJobFn(index_bamfile, patient_dict[bam_key], index_sample_type,
                                univ_options, bwa_options['samtools'],
                                sample_info=sample_info, export=False,
                                disk=PromisedRequirement(index_disk, patient_dict[bam_key]))
        job.addChild(indexer)
        bams = indexer.rv()

    if sample_type != 'tumor_rna':
        return bams
    return {'rna_genome': bams,
            'rna_transcriptome.bam': patient_dict['tumor_rna_transcriptome_bam']}
def download_and_process_fastqs(job, config):
    """
    Download the sample's fastqs in an encapsulated job, then process them in a
    dependent job.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: Promise for the return value of process_sample
    :rtype: tuple(str, str)
    """
    # CI runs use a small fixed disk allotment instead of the configured sample size.
    if config.ci_test:
        sample_disk = '2G'
    else:
        sample_disk = config.max_sample_size

    fetch = job.wrapJobFn(multiple_fastq_dowloading, config, sample_disk=sample_disk)
    fetch = fetch.encapsulate()
    job.addChild(fetch)

    # Processing is allotted five times the combined size of the downloaded fastqs.
    process_disk = PromisedRequirement(lambda xs: sum(x.size for x in xs) * 5, fetch.rv())
    processed = job.wrapJobFn(process_sample, config, fastq_ids=fetch.rv(),
                              disk=process_disk)
    fetch.addChild(processed)
    return processed.rv()
def hard_filter_pipeline(job, uuid, vcf_id, config):
    """
    Runs GATK Hard Filtering on a Genomic VCF file and uploads the results.

    Job graph scheduled below this job:
        select SNPs   -> apply SNP filter   --+
                                              +-> combine SNP and INDEL VCFs -> write output
        select INDELs -> apply INDEL filter --+

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: Unique sample identifier
    :param str vcf_id: VCF FileStoreID
    :param Namespace config: Pipeline configuration options and shared files
        Requires the following config attributes:
        config.genome_fasta             FilesStoreID for reference genome fasta file
        config.genome_fai               FilesStoreID for reference genome fasta index file
        config.genome_dict              FilesStoreID for reference genome sequence dictionary file
        config.snp_filter_name          Name of SNP filter for VCF header
        config.snp_filter_expression    SNP JEXL filter expression
        config.indel_filter_name        Name of INDEL filter for VCF header
        config.indel_filter_expression  INDEL JEXL filter expression
        config.xmx                      Java heap size in bytes
        config.suffix                   Suffix added to output filename
        config.output_dir               URL or local path to output directory
        config.ssec                     Path to key file for SSE-C encryption
    :return: Promise for the return value of the gatk_combine_variants job
    """
    job.fileStore.logToMaster('Running Hard Filter on {}'.format(uuid))

    # Get the total size of the genome reference
    genome_ref_size = (config.genome_fasta.size +
                       config.genome_fai.size +
                       config.genome_dict.size)
    reference = (config.genome_fasta, config.genome_fai, config.genome_dict)

    def vcf_and_reference(vcf, ref_size):
        """Disk for a step that reads one VCF plus the reference and writes a VCF no
        larger than its input."""
        return 2 * vcf.size + ref_size

    # The SelectVariants disk requirement depends on the input VCF, the genome reference
    # files, and the output VCF.  The requirement is identical for SNPs and INDELs, so
    # one object is shared by both select jobs.
    select_variants_disk = PromisedRequirement(vcf_and_reference, vcf_id, genome_ref_size)

    branches = (('SNP', config.snp_filter_name, config.snp_filter_expression),
                ('INDEL', config.indel_filter_name, config.indel_filter_expression))
    filtered = {}
    for variant_type, filter_name, filter_expression in branches:
        select = job.wrapJobFn(gatk_select_variants, variant_type, vcf_id, *reference,
                               memory=config.xmx, disk=select_variants_disk)
        # The VariantFiltration disk requirement depends on the selected VCF, the genome
        # reference files, and the output VCF.
        filter_disk = PromisedRequirement(vcf_and_reference, select.rv(), genome_ref_size)
        variant_filter = job.wrapJobFn(gatk_variant_filtration, select.rv(), filter_name,
                                       filter_expression, *reference,
                                       memory=config.xmx, disk=filter_disk)
        job.addChild(select)
        select.addChild(variant_filter)
        filtered[variant_type] = variant_filter

    snp_filter = filtered['SNP']
    indel_filter = filtered['INDEL']

    # The CombineVariants disk requirement depends on the SNP and INDEL input VCFs and the
    # genome reference files.
    combine_vcfs_disk = PromisedRequirement(
        lambda vcf1, vcf2, ref_size: 2 * (vcf1.size + vcf2.size) + ref_size,
        indel_filter.rv(), snp_filter.rv(), genome_ref_size)
    combine_vcfs = job.wrapJobFn(gatk_combine_variants,
                                 {'SNPs': snp_filter.rv(), 'INDELs': indel_filter.rv()},
                                 *reference,
                                 merge_option='UNSORTED',  # Merges variants from a single sample
                                 memory=config.xmx,
                                 disk=combine_vcfs_disk)
    snp_filter.addChild(combine_vcfs)
    indel_filter.addChild(combine_vcfs)

    # Output the hard filtered VCF
    output_dir = os.path.join(config.output_dir, uuid)
    output_filename = '%s.hard_filter%s.vcf' % (uuid, config.suffix)
    upload = job.wrapJobFn(output_file_job, output_filename, combine_vcfs.rv(), output_dir,
                           s3_key_path=config.ssec,
                           disk=PromisedRequirement(lambda x: x.size, combine_vcfs.rv()))
    combine_vcfs.addChild(upload)
    return combine_vcfs.rv()
def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split=True):
    """
    Run strelka on the DNA bams as a single child job, optionally followed by an
    encapsulated wrap_unmerge job that post-processes the result.

    :param dict tumor_bam: Dict of tumor bam + bai; 'tumor_dna_fix_pg_sorted.bam' is read
                           here to size the disk requirement
    :param dict normal_bam: Dict of normal bam + bai; 'normal_dna_fix_pg_sorted.bam' is
                            read here to size the disk requirement
    :param dict univ_options: Dict of universal arguments used by almost all tools;
                              'max_cores' is read here
    :param dict strelka_options: Dict of parameters specific to strelka; 'genome_fai' and
                                 'genome_fasta' are read here
    :param bool split: Whether to schedule the wrap_unmerge job after strelka
    :return: Promise for the wrap_unmerge result when split is True, otherwise promise
             for the run_strelka_full result
    """
    chromosomes = sample_chromosomes(job, strelka_options['genome_fai'])
    # Cores are capped at both the chromosome count and the configured maximum.
    num_cores = min(len(chromosomes), univ_options['max_cores'])

    caller_disk = PromisedRequirement(strelka_disk,
                                      tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                      normal_bam['normal_dna_fix_pg_sorted.bam'],
                                      strelka_options['genome_fasta'])
    caller = job.wrapJobFn(run_strelka_full, tumor_bam, normal_bam, univ_options,
                           strelka_options, disk=caller_disk, memory='6G', cores=num_cores)
    job.addChild(caller)

    result = caller
    if split:
        result = job.wrapJobFn(wrap_unmerge, caller.rv(), strelka_options,
                               univ_options).encapsulate()
        caller.addChild(result)
    return result.rv()
def vqsr_pipeline(job, uuid, vcf_id, config):
    """
    Wire up GATK Variant Quality Score Recalibration (VQSR) for one VCF.

    0: Start                          0 --> 1 --> 3 --> 4 --> 5
    1: Recalibrate SNPs               |                 ^
    2: Recalibrate INDELS             +---> 2 ----------+
    3: Apply SNP Recalibration
    4: Apply INDEL Recalibration
    5: Write VCF to output directory

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: unique sample identifier
    :param str vcf_id: VCF FileStoreID
    :param Namespace config: Pipeline configuration options and shared files
        Requires the following config attributes:
        config.genome_fasta             FilesStoreID for reference genome fasta file
        config.genome_fai               FilesStoreID for reference genome fasta index file
        config.genome_dict              FilesStoreID for reference genome sequence dictionary file
        config.cores                    Number of cores for each job
        config.xmx                      Java heap size in bytes
        config.suffix                   Suffix for output filename
        config.output_dir               URL or local path to output directory
        config.ssec                     Path to key file for SSE-C encryption
        config.unsafe_mode              Forwarded as `unsafe_mode` to every GATK job

        SNP VQSR attributes:
        config.snp_filter_annotations   List of GATK variant annotations
        config.hapmap                   FileStoreID for HapMap resource file
        config.omni                     FileStoreID for Omni resource file
        config.dbsnp                    FileStoreID for dbSNP resource file
        config.g1k_snp                  FileStoreID for 1000G SNP resource file

        INDEL VQSR attributes:
        config.indel_filter_annotations List of GATK variant annotations
        config.dbsnp                    FileStoreID for dbSNP resource file
        config.mills                    FileStoreID for Mills resource file
    :return: SNP and INDEL VQSR VCF FileStoreID (promised return value of the INDEL
             ApplyRecalibration job)
    :rtype: str
    """
    def recalibrator_disk(in_vcf, ref_size, resource_size):
        # VariantRecalibrator needs the input VCF, the resource files, the reference files and
        # its outputs (recalibration table, tranche file, plots).  The outputs together are
        # smaller than the input VCF, hence the factor of two on the VCF size.
        return 2 * in_vcf.size + ref_size + resource_size

    def apply_disk(in_vcf, recal, tranche, ref_size):
        # ApplyRecalibration needs the input VCF, the recalibration table, the tranche file,
        # the reference files and the output VCF.  The step labels variants as filtered, so the
        # output is estimated to be about 10% larger than the input.
        return int(2.1 * in_vcf.size + recal.size + tranche.size + ref_size)

    # Combined size of the reference fasta, its index and its sequence dictionary
    reference_bytes = (config.genome_fasta.size
                       + config.genome_fai.size
                       + config.genome_dict.size)

    # Build the SNP recalibration model
    snp_resource_bytes = (config.hapmap.size + config.omni.size
                          + config.dbsnp.size + config.g1k_snp.size)
    snp_model_disk = PromisedRequirement(recalibrator_disk, vcf_id, reference_bytes,
                                         snp_resource_bytes)
    snp_model = job.wrapJobFn(gatk_variant_recalibrator,
                              'SNP',
                              vcf_id,
                              config.genome_fasta,
                              config.genome_fai,
                              config.genome_dict,
                              get_short_annotations(config.snp_filter_annotations),
                              hapmap=config.hapmap,
                              omni=config.omni,
                              phase=config.g1k_snp,
                              dbsnp=config.dbsnp,
                              unsafe_mode=config.unsafe_mode,
                              disk=snp_model_disk,
                              cores=config.cores,
                              memory=config.xmx)

    # Build the INDEL recalibration model
    indel_resource_bytes = config.mills.size + config.dbsnp.size
    indel_model_disk = PromisedRequirement(recalibrator_disk, vcf_id, reference_bytes,
                                           indel_resource_bytes)
    indel_model = job.wrapJobFn(gatk_variant_recalibrator,
                                'INDEL',
                                vcf_id,
                                config.genome_fasta,
                                config.genome_fai,
                                config.genome_dict,
                                get_short_annotations(config.indel_filter_annotations),
                                dbsnp=config.dbsnp,
                                mills=config.mills,
                                unsafe_mode=config.unsafe_mode,
                                disk=indel_model_disk,
                                cores=config.cores,
                                memory=config.xmx)

    # Apply the SNP model to the input VCF
    snp_apply_disk = PromisedRequirement(apply_disk, vcf_id, snp_model.rv(0), snp_model.rv(1),
                                         reference_bytes)
    snp_applied = job.wrapJobFn(gatk_apply_variant_recalibration,
                                'SNP',
                                vcf_id,
                                snp_model.rv(0),
                                snp_model.rv(1),
                                config.genome_fasta,
                                config.genome_fai,
                                config.genome_dict,
                                unsafe_mode=config.unsafe_mode,
                                disk=snp_apply_disk,
                                cores=config.cores,
                                memory=config.xmx)

    # Apply the INDEL model on top of the SNP-recalibrated VCF
    indel_apply_disk = PromisedRequirement(apply_disk, vcf_id, indel_model.rv(0),
                                           indel_model.rv(1), reference_bytes)
    indel_applied = job.wrapJobFn(gatk_apply_variant_recalibration,
                                  'INDEL',
                                  snp_applied.rv(),
                                  indel_model.rv(0),
                                  indel_model.rv(1),
                                  config.genome_fasta,
                                  config.genome_fai,
                                  config.genome_dict,
                                  unsafe_mode=config.unsafe_mode,
                                  disk=indel_apply_disk,
                                  cores=config.cores,
                                  memory=config.xmx)

    # Both models are built right after the root; the INDEL application waits for its own
    # model and for the SNP-recalibrated VCF.
    job.addChild(snp_model)
    job.addChild(indel_model)
    snp_model.addChild(snp_applied)
    indel_model.addChild(indel_applied)
    snp_applied.addChild(indel_applied)

    # Write the recalibrated VCF into a per-sample output directory
    destination = os.path.join(config.output_dir, uuid)
    vcf_name = '%s.vqsr%s.vcf' % (uuid, config.suffix)
    write_vcf = job.wrapJobFn(output_file_job,
                              vcf_name,
                              indel_applied.rv(),
                              destination,
                              s3_key_path=config.ssec,
                              disk=PromisedRequirement(lambda vcf: vcf.size,
                                                       indel_applied.rv()))
    indel_applied.addChild(write_vcf)
    return indel_applied.rv()
def launch_protect(job, patient_data, univ_options, tool_options):
    """
    The launchpad for ProTECT. The DAG for ProTECT can be viewed in Flowchart.txt.

    This function only wires up the job graph beneath `job`; it performs no analysis itself.
    Which subgraphs are created depends on the keys present in `patient_data`:

    * '<sample>_fastq_1' / '<sample>_bam' for each of tumor_dna, normal_dna and tumor_rna decide
      whether a sample is fetched as fastqs or as a pre-made bam.
    * 'hla_haplotype_files' skips PHLAT haplotyping in favour of the provided haplotype.
    * 'mutation_vcf' skips DNA alignment and mutation calling in favour of the provided vcf.
    * 'filter_for_OxoG' decides whether the tumor DNA bam is kept around for transgene.

    Side effects on the arguments: 'patient' and 'tumor_type' are written into `univ_options`,
    an 'n' (cpu share) entry is written into every dict in `tool_options`, and when RNA fastqs
    are aligned here the STAR index is copied into the 'star_fusion' and 'fusion_inspector'
    options.

    :param dict patient_data: Dict of information regarding the input sequences for the patient
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict tool_options: Options for the various tools
    :return: None
    """
    # Add Patient id to univ_options as it is passed to every major node in the DAG and can be
    # used as a prefix for the logfile.
    univ_options['patient'] = patient_data['patient_id']
    univ_options['tumor_type'] = patient_data['tumor_type']
    # Ascertain number of cpus to use per job
    for tool in tool_options:
        tool_options[tool]['n'] = ascertain_cpu_share(univ_options['max_cores'])
    # Define the various nodes in the DAG
    # Need a logfile and a way to send it around
    sample_prep = job.wrapJobFn(prepare_samples, patient_data, univ_options, disk='40G')
    job.addChild(sample_prep)
    # Define the fastq deletion step
    fastq_deletion_1 = job.wrapJobFn(delete_fastqs, sample_prep.rv(), disk='100M', memory='100M')
    sample_prep.addChild(fastq_deletion_1)
    # Get all the input files
    haplotype_patient = get_mutations = None
    fastq_files = defaultdict(lambda: None)
    bam_files = defaultdict(lambda: None)
    delete_bam_files = defaultdict(lambda: None)
    phlat_files = defaultdict(lambda: None)
    for sample_type in 'tumor_dna', 'normal_dna', 'tumor_rna':
        if sample_type + '_fastq_1' in patient_data:
            fastq_files[sample_type] = job.wrapJobFn(get_patient_fastqs, sample_prep.rv(),
                                                     sample_type, disk='10M')
            sample_prep.addChild(fastq_files[sample_type])
            fastq_files[sample_type].addChild(fastq_deletion_1)
        elif sample_type + '_bam' in patient_data:
            bam_files[sample_type] = job.wrapJobFn(get_patient_bams, sample_prep.rv(), sample_type,
                                                   univ_options, tool_options['bwa'],
                                                   tool_options['mutect'],
                                                   disk='10M').encapsulate()
            sample_prep.addChild(bam_files[sample_type])

    # define the haplotyping subgraph of the DAG
    if 'hla_haplotype_files' in patient_data:
        haplotype_patient = job.wrapJobFn(get_patient_mhc_haplotype, sample_prep.rv())
        sample_prep.addChild(haplotype_patient)
    else:
        # PHLAT needs fastqs for all three samples.  fastq_files is a defaultdict, so a sample
        # without fastqs simply has no entry and would never show up in .values(); check each
        # sample explicitly (using .get so that no placeholder entries are inserted).
        assert all(fastq_files.get(required) is not None
                   for required in ('tumor_dna', 'normal_dna', 'tumor_rna')), \
            'Fastqs are required for every sample when hla_haplotype_files is not provided.'
        # We are guaranteed to have fastqs here
        for sample_type in 'tumor_dna', 'normal_dna', 'tumor_rna':
            phlat_files[sample_type] = job.wrapJobFn(
                run_phlat, fastq_files[sample_type].rv(), sample_type, univ_options,
                tool_options['phlat'], cores=tool_options['phlat']['n'],
                disk=PromisedRequirement(phlat_disk, fastq_files[sample_type].rv()))
            fastq_files[sample_type].addChild(phlat_files[sample_type])
            phlat_files[sample_type].addChild(fastq_deletion_1)
        haplotype_patient = job.wrapJobFn(merge_phlat_calls,
                                          phlat_files['tumor_dna'].rv(),
                                          phlat_files['normal_dna'].rv(),
                                          phlat_files['tumor_rna'].rv(),
                                          univ_options, disk='100M', memory='100M', cores=1)
        phlat_files['tumor_dna'].addChild(haplotype_patient)
        phlat_files['normal_dna'].addChild(haplotype_patient)
        phlat_files['tumor_rna'].addChild(haplotype_patient)

    # Define the RNA-Seq Alignment subgraph if needed
    if bam_files['tumor_rna'] is None:
        assert fastq_files['tumor_rna'] is not None
        cutadapt = job.wrapJobFn(run_cutadapt, fastq_files['tumor_rna'].rv(), univ_options,
                                 tool_options['cutadapt'], cores=1,
                                 disk=PromisedRequirement(cutadapt_disk,
                                                          fastq_files['tumor_rna'].rv()))
        bam_files['tumor_rna'] = job.wrapJobFn(align_rna, cutadapt.rv(), univ_options,
                                               tool_options['star'], cores=1,
                                               disk='100M').encapsulate()
        fastq_deletion_2 = job.wrapJobFn(delete_fastqs, {'cutadapted_rnas': cutadapt.rv()},
                                         disk='100M', memory='100M')
        fastq_files['tumor_rna'].addChild(cutadapt)
        cutadapt.addChild(fastq_deletion_1)
        cutadapt.addChild(fastq_deletion_2)
        cutadapt.addChild(bam_files['tumor_rna'])
        bam_files['tumor_rna'].addChild(fastq_deletion_2)
        # Define the fusion calling node. The fusion tools reuse the STAR index entry.
        tool_options['star_fusion']['index'] = tool_options['star']['index']
        tool_options['fusion_inspector']['index'] = tool_options['star']['index']
        fusions = job.wrapJobFn(wrap_fusion,
                                cutadapt.rv(),
                                bam_files['tumor_rna'].rv(),
                                univ_options,
                                tool_options['star_fusion'],
                                tool_options['fusion_inspector'],
                                disk='100M', memory='100M', cores=1).encapsulate()
        bam_files['tumor_rna'].addChild(fusions)
        fusions.addChild(fastq_deletion_1)
        fusions.addChild(fastq_deletion_2)
    else:
        if tool_options['star_fusion']['run'] is True:
            # The two literals are concatenated, so the first one must end in a space.
            job.fileStore.logToMaster('Input RNA bams were provided for sample %s. Fusion detection '
                                      'can only be run with input fastqs.'
                                      % univ_options['patient'])
        # Must be bound on this path too: it is tested with `if fusions:` further down.
        fusions = None

    # Define the Expression estimation node
    rsem = job.wrapJobFn(wrap_rsem, bam_files['tumor_rna'].rv(), univ_options,
                         tool_options['rsem'], cores=1, disk='100M').encapsulate()
    bam_files['tumor_rna'].addChild(rsem)
    # Define the bam deletion node
    delete_bam_files['tumor_rna'] = job.wrapJobFn(delete_bams,
                                                  bam_files['tumor_rna'].rv(),
                                                  univ_options['patient'],
                                                  disk='100M', memory='100M')
    bam_files['tumor_rna'].addChild(delete_bam_files['tumor_rna'])
    rsem.addChild(delete_bam_files['tumor_rna'])
    if fusions:
        fusions.addChild(delete_bam_files['tumor_rna'])
    # Define the reporting leaves
    if phlat_files['tumor_rna'] is not None:
        mhc_pathway_assessment = job.wrapJobFn(run_mhc_gene_assessment, rsem.rv(),
                                               phlat_files['tumor_rna'].rv(), univ_options,
                                               tool_options['reports'], disk='100M',
                                               memory='100M', cores=1)
        rsem.addChild(mhc_pathway_assessment)
        phlat_files['tumor_rna'].addChild(mhc_pathway_assessment)
    else:
        # No PHLAT call on the RNA is available (haplotype was provided by the user)
        mhc_pathway_assessment = job.wrapJobFn(run_mhc_gene_assessment, rsem.rv(), None,
                                               univ_options, tool_options['reports'],
                                               disk='100M', memory='100M', cores=1)
        rsem.addChild(mhc_pathway_assessment)
    itx_resistance_assessment = job.wrapJobFn(run_itx_resistance_assessment, rsem.rv(),
                                              univ_options, tool_options['reports'],
                                              disk='100M', memory='100M', cores=1)
    rsem.addChild(itx_resistance_assessment)
    car_t_validity_assessment = job.wrapJobFn(run_car_t_validity_assessment, rsem.rv(),
                                              univ_options, tool_options['reports'],
                                              disk='100M', memory='100M', cores=1)
    rsem.addChild(car_t_validity_assessment)
    # Define the DNA-Seq alignment and mutation calling subgraphs if necessary
    if 'mutation_vcf' in patient_data:
        get_mutations = job.wrapJobFn(get_patient_vcf, sample_prep.rv())
        sample_prep.addChild(get_mutations)
    else:
        assert (None, None) not in zip(fastq_files.values(), bam_files.values())
        for sample_type in 'tumor_dna', 'normal_dna':
            if bam_files[sample_type] is None:
                assert fastq_files[sample_type] is not None
                bam_files[sample_type] = job.wrapJobFn(align_dna, fastq_files[sample_type].rv(),
                                                       sample_type, univ_options,
                                                       tool_options['bwa'], cores=1,
                                                       disk='100M').encapsulate()
                fastq_files[sample_type].addChild(bam_files[sample_type])
                bam_files[sample_type].addChild(fastq_deletion_1)
            else:
                # We already have the bam ready to go
                pass
            delete_bam_files[sample_type] = job.wrapJobFn(delete_bams,
                                                          bam_files[sample_type].rv(),
                                                          univ_options['patient'],
                                                          disk='100M', memory='100M')
            bam_files[sample_type].addChild(delete_bam_files[sample_type])
        # Time to call mutations
        mutations = {
            'radia': job.wrapJobFn(run_radia, bam_files['tumor_rna'].rv(),
                                   bam_files['tumor_dna'].rv(), bam_files['normal_dna'].rv(),
                                   univ_options, tool_options['radia'],
                                   disk='100M').encapsulate(),
            'mutect': job.wrapJobFn(run_mutect, bam_files['tumor_dna'].rv(),
                                    bam_files['normal_dna'].rv(), univ_options,
                                    tool_options['mutect'], disk='100M').encapsulate(),
            'muse': job.wrapJobFn(run_muse, bam_files['tumor_dna'].rv(),
                                  bam_files['normal_dna'].rv(), univ_options,
                                  tool_options['muse']).encapsulate(),
            'somaticsniper': job.wrapJobFn(run_somaticsniper, bam_files['tumor_dna'].rv(),
                                           bam_files['normal_dna'].rv(), univ_options,
                                           tool_options['somaticsniper']).encapsulate(),
            'strelka': job.wrapJobFn(run_strelka, bam_files['tumor_dna'].rv(),
                                     bam_files['normal_dna'].rv(), univ_options,
                                     tool_options['strelka']).encapsulate(),
            'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
                                    bam_files['normal_dna'].rv(), univ_options, 'indel_options',
                                    disk='100M', memory='100M', cores=1)}
        # Every caller needs both DNA bams; only radia additionally needs the RNA bam
        for sample_type in 'tumor_dna', 'normal_dna':
            for caller in mutations:
                bam_files[sample_type].addChild(mutations[caller])
        bam_files['tumor_rna'].addChild(mutations['radia'])
        get_mutations = job.wrapJobFn(run_mutation_aggregator,
                                      {caller: cjob.rv() for caller, cjob in mutations.items()},
                                      univ_options, disk='100M', memory='100M',
                                      cores=1).encapsulate()
        for caller in mutations:
            mutations[caller].addChild(get_mutations)
        # We don't need the normal dna bam any more
        get_mutations.addChild(delete_bam_files['normal_dna'])
        # We may need the tumor one depending on OxoG
        if not patient_data['filter_for_OxoG']:
            get_mutations.addChild(delete_bam_files['tumor_dna'])
    # The rest of the subgraph should be unchanged
    snpeff = job.wrapJobFn(run_snpeff, get_mutations.rv(), univ_options, tool_options['snpeff'],
                           disk=PromisedRequirement(snpeff_disk,
                                                    tool_options['snpeff']['index']))
    get_mutations.addChild(snpeff)
    tumor_dna_bam = bam_files['tumor_dna'].rv() if patient_data['filter_for_OxoG'] else None
    fusion_calls = fusions.rv() if fusions else None
    transgene = job.wrapJobFn(run_transgene, snpeff.rv(), bam_files['tumor_rna'].rv(),
                              univ_options, tool_options['transgene'],
                              disk=PromisedRequirement(transgene_disk,
                                                       bam_files['tumor_rna'].rv()),
                              memory='100M', cores=1, tumor_dna_bam=tumor_dna_bam,
                              fusion_calls=fusion_calls)
    snpeff.addChild(transgene)
    bam_files['tumor_rna'].addChild(transgene)
    transgene.addChild(delete_bam_files['tumor_rna'])
    if patient_data['filter_for_OxoG']:
        # The tumor DNA bam is only deleted once transgene has consumed it
        bam_files['tumor_dna'].addChild(transgene)
        transgene.addChild(delete_bam_files['tumor_dna'])
    if fusions:
        fusions.addChild(transgene)
    spawn_mhc = job.wrapJobFn(spawn_antigen_predictors, transgene.rv(), haplotype_patient.rv(),
                              univ_options, (tool_options['mhci'], tool_options['mhcii']),
                              disk='100M', memory='100M', cores=1).encapsulate()
    haplotype_patient.addChild(spawn_mhc)
    transgene.addChild(spawn_mhc)
    merge_mhc = job.wrapJobFn(merge_mhc_peptide_calls, spawn_mhc.rv(), transgene.rv(),
                              univ_options, disk='100M', memory='100M', cores=1)
    spawn_mhc.addFollowOn(merge_mhc)
    transgene.addChild(merge_mhc)
    rankboost = job.wrapJobFn(wrap_rankboost, rsem.rv(), merge_mhc.rv(), transgene.rv(),
                              univ_options, tool_options['rankboost'], disk='100M',
                              memory='100M', cores=1)
    rsem.addChild(rankboost)
    merge_mhc.addChild(rankboost)
    transgene.addChild(rankboost)
    report_success = job.wrapJobFn(email_report, univ_options)
    rankboost.addChild(report_success)
    return None
def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
    """
    Schedule RADIA, followed by its filtering step, for every chromosome of the bam trio.

    For each chromosome a `run_radia_perchrom` job is added as a child of `job`, and a
    `run_filter_radia` job is added as a child of that job.

    :param dict rna_bam: Dict of bam and bai for tumor RNA-Seq. It can be one of two formats
           rna_bam:    # Just the genomic bam and bai
                |- 'rna_genome_sorted.bam': fsID
                +- 'rna_genome_sorted.bam.bai': fsID
           OR
           rna_bam:    # The output from run_star
               |- 'rna_transcriptome.bam': fsID
               |- 'rna_genome':     # Only this part will be used
                       |- 'rna_genome_sorted.bam': fsID
                       +- 'rna_genome_sorted.bam.bai': fsID
    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict radia_options: Options specific to RADIA.  If 'chromosomes' is empty, the
           chromosomes are obtained from `sample_chromosomes` using 'genome_fai'.
    :return: Dict mapping each chromosome to the promised result of its filter job
             perchrom_radia:
                 |- 'chr1': fsID
                 |- 'chr2': fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: dict
    :raises RuntimeError: If rna_bam is in neither of the two accepted formats
    """
    # Reduce rna_bam to the flat {bam, bai} layout, or reject it
    rna_keys = rna_bam.keys()
    if 'rna_genome' in rna_keys:
        rna_bam = rna_bam['rna_genome']
    elif set(rna_keys) != {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}:
        raise RuntimeError('An improperly formatted dict was passed to rna_bam.')

    bams = {'tumor_rna': rna_bam['rna_genome_sorted.bam'],
            'tumor_rnai': rna_bam['rna_genome_sorted.bam.bai'],
            'tumor_dna': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
            'tumor_dnai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
            'normal_dna': normal_bam['normal_dna_fix_pg_sorted.bam'],
            'normal_dnai': normal_bam['normal_dna_fix_pg_sorted.bam.bai']}

    # Use the user's chromosome list if one was given, else derive it from the fasta index
    chromosomes = (radia_options['chromosomes']
                   or sample_chromosomes(job, radia_options['genome_fai']))

    def disk_request():
        # The calling and filtering jobs each get their own, identically built requirement
        return PromisedRequirement(radia_disk,
                                   tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                   normal_bam['normal_dna_fix_pg_sorted.bam'],
                                   rna_bam['rna_genome_sorted.bam'],
                                   radia_options['genome_fasta'])

    perchrom_radia = defaultdict()
    for chrom in chromosomes:
        call_job = job.addChildJobFn(run_radia_perchrom, bams, univ_options, radia_options,
                                     chrom, memory='6G', disk=disk_request())
        filter_job = call_job.addChildJobFn(run_filter_radia, bams, call_job.rv(), univ_options,
                                            radia_options, chrom, memory='6G',
                                            disk=disk_request())
        perchrom_radia[chrom] = filter_job.rv()
    job.fileStore.logToMaster('Ran spawn_radia on %s successfully' % univ_options['patient'])
    return perchrom_radia
def workflow(job, sample, config):
    """
    Wire up the job graph that processes a single sample.

    The input is downloaded and processed according to its file type, after which the tools
    enabled in the configuration (FastQC, Kallisto, Hera, STAR + RSEM) are attached beneath the
    input job.  Follow-on jobs delete the input files and consolidate the collected outputs.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list(str, str, str, str) sample: Sample information - filetype, paired/unpaired, UUID,
           and URL
    :param Expando config: Dict-like object containing workflow options as attributes.  Read
           here: maxCores, ci_test, max_sample_size (bam input only), fastqc, kallisto_index,
           hera_index, star_index, rsem_ref, wiggle and save_bam.
    """
    # Work on a copy of config so that sample-specific information stays with this sample
    config = config.copy()
    config.file_type, config.paired, config.uuid, config.url = sample
    config.paired = (config.paired == 'paired')
    config.cores = min(config.maxCores, multiprocessing.cpu_count())

    # Download and process the input based on file type
    # `inputs` will return the FileStoreID(s) of the R1 / R2 fastq
    if config.file_type == 'bam':
        sample_size = '2G' if config.ci_test else config.max_sample_size
        inputs = job.wrapJobFn(download_and_process_bam, config,
                               disk=human2bytes(sample_size) * 5)
    elif config.file_type == 'tar':
        inputs = job.wrapJobFn(download_and_process_tar, config).encapsulate()
    else:
        # Only the first URL is inspected to decide whether the fastqs are gzipped
        first_url = config.url.split(',')[0]
        config.gz = True if first_url.endswith('gz') else None
        inputs = job.wrapJobFn(download_and_process_fastqs, config).encapsulate()

    # The input job is the first child of the root job
    job.addChild(inputs)

    # Preliminary disk request: size of the inputs plus 2G of headroom
    base_disk = PromisedRequirement(
        lambda ids: sum(fid.size for fid in ids if fid) + human2bytes('2G'), inputs.rv())
    # Core optimization: machines with 32 or more cores are limited to 16 for these tools
    tool_cores = config.cores
    if tool_cores >= 32:
        tool_cores = min(16, tool_cores)
    results = {}

    # DAG wiring for remainder of workflow
    # FASTQC
    if config.fastqc:
        fastqc = job.wrapJobFn(run_fastqc, r1_id=inputs.rv(0), r2_id=inputs.rv(1),
                               cores=2, disk=base_disk)
        inputs.addChild(fastqc)
        results['QC/fastQC'] = fastqc.rv()

    # Kallisto
    if config.kallisto_index:
        kallisto = job.wrapJobFn(run_kallisto, r1_id=inputs.rv(0), r2_id=inputs.rv(1),
                                 kallisto_index_url=config.kallisto_index,
                                 cores=tool_cores, disk=base_disk)
        inputs.addChild(kallisto)
        results['Kallisto'] = kallisto.rv()

    # Hera
    if config.hera_index:
        hera = job.wrapJobFn(run_hera, r1_id=inputs.rv(0), r2_id=inputs.rv(1),
                             hera_index_url=config.hera_index,
                             cores=config.cores, disk=base_disk)
        inputs.addChild(hera)
        results['Hera'] = hera.rv()

    # STAR and RSEM
    if config.star_index and config.rsem_ref:
        if config.ci_test:
            align_disk, align_mem = '2G', '2G'
        else:
            align_disk = PromisedRequirement(
                lambda ids: sum(fid.size for fid in ids if fid) + human2bytes('25G'),
                inputs.rv())
            align_mem = '40G'
        # STAR returns: transcriptome_id, star_id, aligned_id, wiggle_id
        presorted = bool(config.wiggle)
        star = job.wrapJobFn(run_star, inputs.rv(0), inputs.rv(1),
                             star_index_url=config.star_index,
                             wiggle=config.wiggle,
                             sort=presorted,
                             save_aligned_bam=config.save_bam,
                             cores=config.cores,
                             memory=align_mem,
                             disk=align_disk)
        inputs.addChild(star)
        results['QC/STAR'] = star.rv(1)
        # Handle optional files user can save
        if config.save_bam:
            star.addChildJobFn(sort_and_save_bam, config, bam_id=star.rv(2),
                               skip_sort=presorted)
        if config.wiggle:
            star.addChildJobFn(save_wiggle, config, wiggle_id=star.rv(3))
        # RSEM returns: gene_id, isoform_id
        rsem = job.wrapJobFn(run_rsem, bam_id=star.rv(0),
                             rsem_ref_url=config.rsem_ref,
                             paired=config.paired,
                             cores=tool_cores,
                             disk=PromisedRequirement(
                                 lambda bam: bam.size + human2bytes('2G'), star.rv(0)))
        star.addChild(rsem)
        # RSEM postprocess returns: rsem_id, rsem_hugo_id
        rsem_postprocess = job.wrapJobFn(run_rsem_gene_mapping,
                                         rsem_gene_id=rsem.rv(0),
                                         rsem_isoform_id=rsem.rv(1))
        rsem.addChild(rsem_postprocess)
        results['RSEM'] = rsem_postprocess.rv(0)
        results['RSEM/Hugo'] = rsem_postprocess.rv(1)
        # Cleanup of intermediates once their consumers have been scheduled
        star.addFollowOnJobFn(cleanup_ids, ids_to_delete=[star.rv(2), star.rv(3)])
        rsem.addChildJobFn(cleanup_ids, ids_to_delete=[star.rv(0)])
        rsem_postprocess.addChildJobFn(cleanup_ids, ids_to_delete=[rsem.rv(0), rsem.rv(1)])

    # Cleanup and Consolidate
    job.addFollowOnJobFn(cleanup_ids, [inputs.rv(0), inputs.rv(1)])
    job.addFollowOnJobFn(consolidate_output, config, results)
def pipeline_launchpad(job, fastqs, univ_options, tool_options):
    """
    The precision immuno pipeline begins at this module. The DAG can be viewed in Flowchart.txt

    All job nodes are created first and the edges between them are then declared in a single
    static block at the end of the function.  No analysis is run here.

    Side effects on the arguments: univ_options['patient'] is set from fastqs['patient_id'], and
    an 'n' (cpu share) entry is written into the 'star', 'bwa', 'phlat' and 'rsem' tool options.

    :param job job: job
    :param dict fastqs: Dict of lists of fastq files; 'patient_id' is read here
    :param univ_options: Universal Options; 'max_cores' is read here
    :param tool_options: Options for the various tools
    :return: None
    """
    # Add Patient id to univ_options as it is passed to every major node in the DAG and can be
    # used as a prefix for the logfile.
    univ_options['patient'] = fastqs['patient_id']
    # Ascertain number of cpus to use per job
    tool_options['star']['n'] = tool_options['bwa']['n'] = tool_options['phlat']['n'] = \
        tool_options['rsem']['n'] = ascertain_cpu_share(univ_options['max_cores'])
    # Define the various nodes in the DAG
    # Need a logfile and a way to send it around
    sample_prep = job.wrapJobFn(prepare_samples, fastqs, univ_options, disk='40G')
    tumor_dna_fqs = job.wrapJobFn(get_fqs, sample_prep.rv(), 'tumor_dna', disk='10M')
    normal_dna_fqs = job.wrapJobFn(get_fqs, sample_prep.rv(), 'normal_dna', disk='10M')
    tumor_rna_fqs = job.wrapJobFn(get_fqs, sample_prep.rv(), 'tumor_rna', disk='10M')
    cutadapt = job.wrapJobFn(run_cutadapt, tumor_rna_fqs.rv(), univ_options,
                             tool_options['cutadapt'], cores=1,
                             disk=PromisedRequirement(cutadapt_disk, tumor_rna_fqs.rv()))
    star = job.wrapJobFn(align_rna, cutadapt.rv(), univ_options, tool_options['star'],
                         cores=1, disk='100M').encapsulate()
    bwa_tumor = job.wrapJobFn(align_dna, tumor_dna_fqs.rv(), 'tumor_dna', univ_options,
                              tool_options['bwa'], cores=1, disk='100M').encapsulate()
    bwa_normal = job.wrapJobFn(align_dna, normal_dna_fqs.rv(), 'normal_dna', univ_options,
                               tool_options['bwa'], cores=1, disk='100M').encapsulate()
    phlat_tumor_dna = job.wrapJobFn(run_phlat, tumor_dna_fqs.rv(), 'tumor_dna', univ_options,
                                    tool_options['phlat'], cores=tool_options['phlat']['n'],
                                    disk=PromisedRequirement(
                                        phlat_disk, tumor_dna_fqs.rv()))
    phlat_normal_dna = job.wrapJobFn(run_phlat, normal_dna_fqs.rv(), 'normal_dna', univ_options,
                                     tool_options['phlat'], cores=tool_options['phlat']['n'],
                                     disk=PromisedRequirement(
                                         phlat_disk, normal_dna_fqs.rv()))
    phlat_tumor_rna = job.wrapJobFn(run_phlat, tumor_rna_fqs.rv(), 'tumor_rna', univ_options,
                                    tool_options['phlat'], cores=tool_options['phlat']['n'],
                                    disk=PromisedRequirement(
                                        phlat_disk, tumor_rna_fqs.rv()))
    # Two deletion nodes: one for the fastqs returned by sample_prep, one for the cutadapt output
    fastq_deletion_1 = job.wrapJobFn(delete_fastqs, sample_prep.rv(), disk='100M', memory='100M')
    fastq_deletion_2 = job.wrapJobFn(delete_fastqs, {'cutadapted_rnas': cutadapt.rv()},
                                     disk='100M', memory='100M')
    rsem = job.wrapJobFn(wrap_rsem, star.rv(), univ_options, tool_options['rsem'],
                         cores=tool_options['rsem']['n'], disk='100M').encapsulate()
    mhc_pathway_assessment = job.wrapJobFn(
        run_mhc_gene_assessment, rsem.rv(), phlat_tumor_rna.rv(), univ_options,
        tool_options['mhc_pathway_assessment'], disk='100M', memory='100M', cores=1)
    # NOTE(review): the string 'fusion_options' (not a dict from tool_options) is passed as the
    # options argument here -- presumably a placeholder; confirm against run_fusion_caller.
    fusions = job.wrapJobFn(run_fusion_caller, star.rv(), univ_options, 'fusion_options',
                            disk='100M', memory='100M', cores=1)
    # All of the mutation callers below share the same tool_options['mut_callers'] entry
    radia = job.wrapJobFn(run_radia, star.rv(), bwa_tumor.rv(), bwa_normal.rv(), univ_options,
                          tool_options['mut_callers'], disk='100M').encapsulate()
    mutect = job.wrapJobFn(run_mutect, bwa_tumor.rv(), bwa_normal.rv(), univ_options,
                           tool_options['mut_callers'], disk='100M').encapsulate()
    muse = job.wrapJobFn(run_muse, bwa_tumor.rv(), bwa_normal.rv(), univ_options,
                         tool_options['mut_callers']).encapsulate()
    somaticsniper = job.wrapJobFn(run_somaticsniper, bwa_tumor.rv(), bwa_normal.rv(),
                                  univ_options, tool_options['mut_callers']).encapsulate()
    strelka = job.wrapJobFn(run_strelka, bwa_tumor.rv(), bwa_normal.rv(), univ_options,
                            tool_options['mut_callers']).encapsulate()
    # NOTE(review): as with fusions, the literal string 'indel_options' is passed here -- confirm.
    indels = job.wrapJobFn(run_indel_caller, bwa_tumor.rv(), bwa_normal.rv(), univ_options,
                           'indel_options', disk='100M', memory='100M', cores=1)
    merge_mutations = job.wrapJobFn(run_mutation_aggregator, {
        'fusions': fusions.rv(),
        'radia': radia.rv(),
        'mutect': mutect.rv(),
        'strelka': strelka.rv(),
        'indels': indels.rv(),
        'muse': muse.rv(),
        'somaticsniper': somaticsniper.rv()
    }, univ_options, disk='100M', memory='100M', cores=1).encapsulate()
    snpeff = job.wrapJobFn(run_snpeff, merge_mutations.rv(), univ_options,
                           tool_options['snpeff'],
                           disk=PromisedRequirement(
                               snpeff_disk, tool_options['snpeff']['tool_index']))
    transgene = job.wrapJobFn(run_transgene, snpeff.rv(), star.rv(), univ_options,
                              tool_options['transgene'], disk='100M', memory='100M', cores=1)
    merge_phlat = job.wrapJobFn(merge_phlat_calls, phlat_tumor_dna.rv(), phlat_normal_dna.rv(),
                                phlat_tumor_rna.rv(), univ_options, disk='100M', memory='100M',
                                cores=1)
    spawn_mhc = job.wrapJobFn(spawn_antigen_predictors, transgene.rv(), merge_phlat.rv(),
                              univ_options, (tool_options['mhci'], tool_options['mhcii']),
                              disk='100M', memory='100M', cores=1).encapsulate()
    merge_mhc = job.wrapJobFn(merge_mhc_peptide_calls, spawn_mhc.rv(), transgene.rv(),
                              univ_options, disk='100M', memory='100M', cores=1)
    rank_boost = job.wrapJobFn(wrap_rankboost, rsem.rv(), merge_mhc.rv(), transgene.rv(),
                               univ_options, tool_options['rank_boost'], disk='100M',
                               memory='100M', cores=1)
    # Define the DAG in a static form
    # NOTE(review): the "Edge a->b" labels below presumably refer to node numbers in
    # Flowchart.txt; several of them are inconsistent with each other, so treat them as hints
    # rather than as an authoritative numbering.
    job.addChild(sample_prep)  # Edge 0->1
    # A. The first step is running the alignments and the MHC haplotypers
    sample_prep.addChild(tumor_dna_fqs)  # Edge 1->2
    sample_prep.addChild(normal_dna_fqs)  # Edge 1->2
    sample_prep.addChild(tumor_rna_fqs)  # Edge 1->2
    tumor_rna_fqs.addChild(cutadapt)  # Edge 1->2
    tumor_dna_fqs.addChild(bwa_tumor)  # Edge 1->3
    normal_dna_fqs.addChild(bwa_normal)  # Edge 1->4
    tumor_dna_fqs.addChild(phlat_tumor_dna)  # Edge 1->5
    normal_dna_fqs.addChild(phlat_normal_dna)  # Edge 1->6
    tumor_rna_fqs.addChild(phlat_tumor_rna)  # Edge 1->7
    # B. cutadapt will be followed by star
    cutadapt.addChild(star)  # Edge 2->9
    # Ci.  gene expression and fusion detection follow star alignment
    star.addChild(rsem)  # Edge 9->10
    star.addChild(fusions)  # Edge 9->11
    # Cii.  Radia depends on all 3 alignments
    star.addChild(radia)  # Edge 9->12
    bwa_tumor.addChild(radia)  # Edge 3->12
    bwa_normal.addChild(radia)  # Edge 4->12
    # Ciii. mutect and indel calling depends on dna to have been aligned
    bwa_tumor.addChild(mutect)  # Edge 3->13
    bwa_normal.addChild(mutect)  # Edge 4->13
    bwa_tumor.addChild(muse)  # Edge 3->13
    bwa_normal.addChild(muse)  # Edge 4->13
    bwa_tumor.addChild(somaticsniper)  # Edge 3->13
    bwa_normal.addChild(somaticsniper)  # Edge 4->13
    bwa_tumor.addChild(strelka)  # Edge 3->13
    bwa_normal.addChild(strelka)  # Edge 4->13
    bwa_tumor.addChild(indels)  # Edge 3->14
    bwa_normal.addChild(indels)  # Edge 4->14
    # D. MHC haplotypes will be merged once all 3 samples have been PHLAT-ed
    phlat_tumor_dna.addChild(merge_phlat)  # Edge 5->15
    phlat_normal_dna.addChild(merge_phlat)  # Edge 6->15
    phlat_tumor_rna.addChild(merge_phlat)  # Edge 7->15
    # E. Delete the fastqs from the job store since all alignments are complete
    sample_prep.addChild(fastq_deletion_1)  # Edge 1->8
    cutadapt.addChild(fastq_deletion_1)  # Edge 2->8
    bwa_normal.addChild(fastq_deletion_1)  # Edge 3->8
    bwa_tumor.addChild(fastq_deletion_1)  # Edge 4->8
    phlat_normal_dna.addChild(fastq_deletion_1)  # Edge 5->8
    phlat_tumor_dna.addChild(fastq_deletion_1)  # Edge 6->8
    phlat_tumor_rna.addChild(fastq_deletion_1)  # Edge 7->8
    # The cutadapt output is only deleted once star has consumed it
    star.addChild(fastq_deletion_2)
    # F. Mutation calls need to be merged before they can be used
    # G. All mutations get aggregated when they have finished running
    fusions.addChild(merge_mutations)  # Edge 11->18
    radia.addChild(merge_mutations)  # Edge 16->18
    mutect.addChild(merge_mutations)  # Edge 17->18
    muse.addChild(merge_mutations)  # Edge 17->18
    somaticsniper.addChild(merge_mutations)  # Edge 17->18
    strelka.addChild(merge_mutations)  # Edge 17->18
    indels.addChild(merge_mutations)  # Edge 14->18
    # H. Aggregated mutations will be translated to protein space
    merge_mutations.addChild(snpeff)  # Edge 18->19
    # I. snpeffed mutations will be converted into peptides.
    # Transgene also accepts the RNA-seq bam and bai so that it can be rna-aware
    snpeff.addChild(transgene)  # Edge 19->20
    star.addChild(transgene)
    # J. Merged haplotypes and peptides will be converted into jobs and submitted for mhc:peptide
    # binding prediction
    merge_phlat.addChild(spawn_mhc)  # Edge 15->21
    transgene.addChild(spawn_mhc)  # Edge 20->21
    # K. The results from all the predictions will be merged. This is a follow-on job because
    # spawn_mhc will spawn an undetermined number of children.
    spawn_mhc.addFollowOn(merge_mhc)  # Edges 21->XX->22 and 21->YY->22
    # L. Finally, the merged mhc along with the gene expression will be used for rank boosting
    rsem.addChild(rank_boost)  # Edge 10->23
    merge_mhc.addChild(rank_boost)  # Edge 22->23
    # M. Assess the status of the MHC genes in the patient
    phlat_tumor_rna.addChild(mhc_pathway_assessment)  # Edge 7->24
    rsem.addChild(mhc_pathway_assessment)  # Edge 10->24
    return None
def run_gatk_preprocessing(job, bam, bai, ref, ref_dict, fai, g1k, mills, dbsnp, unsafe=False):
    """
    Wire up the GATK preprocessing chain for one BAM and return its promised outputs.

    Five jobs are created and linked so that each one is the child of the previous one,
    with the first being a child of ``job``:

    0: Mark duplicates                        (picard_mark_duplicates)
    1: Create INDEL realignment intervals     (run_realigner_target_creator)
    2: Realign INDELs                         (run_indel_realignment)
    3: Recalibrate base quality scores        (run_base_recalibration)
    4: Apply base score recalibration         (apply_bqsr_recalibration)

    Every step's disk request is a PromisedRequirement, so it is computed from the sizes
    of the files the step consumes once those are known. Steps 1 and 2 are given a single
    core; the others inherit ``job.cores``. All steps inherit ``job.memory``.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param bam: FileStoreID for BAM file (its ``size`` feeds the disk estimate)
    :param bai: FileStoreID for BAM index file (its ``size`` feeds the disk estimate)
    :param ref: FileStoreID for reference genome fasta file (``size`` is read here)
    :param ref_dict: FileStoreID for reference sequence dictionary file (``size`` is read here)
    :param fai: FileStoreID for reference fasta index file (``size`` is read here)
    :param g1k: FileStoreID for 1000 Genomes VCF file (``size`` is read here)
    :param mills: FileStoreID for Mills VCF file (``size`` is read here)
    :param dbsnp: FileStoreID for dbSNP VCF file (``size`` is read here)
    :param bool unsafe: If True, runs GATK tools in UNSAFE mode:
                        "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: Return values 0 and 1 of the final recalibration job, i.e. the FileStoreIDs
             for the BAM and BAI files
    :rtype: tuple(str, str)
    """
    job_cores = job.cores
    job_memory = job.memory

    # Aggregate sizes of the static inputs; these are plain numbers, not promises.
    #   genome_ref_size: fasta + sequence dictionary + fasta index
    #   indel_ref_size:  genome references + the two known-INDEL resources
    #   bqsr_ref_size:   genome references + the two BQSR known-sites databases
    genome_ref_size = ref.size + ref_dict.size + fai.size
    indel_ref_size = mills.size + g1k.size + genome_ref_size
    bqsr_ref_size = dbsnp.size + mills.size + genome_ref_size

    # Step 0 - MarkDuplicates. Disk covers the input BAM/BAI and an output BAM/BAI pair that
    # is assumed to be about as large as the input.
    mark_dups_disk = PromisedRequirement(
        lambda in_bam, in_bai: 2 * (in_bam.size + in_bai.size),
        bam,
        bai)
    mark_dups = job.wrapJobFn(picard_mark_duplicates,
                              bam,
                              bai,
                              cores=job_cores,
                              disk=mark_dups_disk,
                              memory=job_memory)

    # Step 1 - RealignerTargetCreator. Disk covers the deduplicated BAM/BAI, the reference and
    # INDEL resource files, and the output intervals file. The intervals file is smaller than
    # the references, so the reference size is counted a second time as its upper bound.
    target_creator_disk = PromisedRequirement(
        lambda in_bam, in_bai, refs: in_bam.size + in_bai.size + 2 * refs,
        mark_dups.rv(0),
        mark_dups.rv(1),
        indel_ref_size)
    target_creator = job.wrapJobFn(run_realigner_target_creator,
                                   mark_dups.rv(0),
                                   mark_dups.rv(1),
                                   ref, ref_dict, fai,
                                   g1k, mills,
                                   unsafe=unsafe,
                                   cores=1,  # RealignerTargetCreator is single threaded
                                   disk=target_creator_disk,
                                   memory=job_memory)

    # Step 2 - IndelRealigner. Disk covers the deduplicated BAM/BAI, the intervals file, the
    # reference and INDEL resource files, and an output BAM/BAI pair assumed to be about the
    # same size as the input pair.
    realigner_disk = PromisedRequirement(
        lambda in_bam, in_bai, targets, refs:
        2 * (in_bam.size + in_bai.size) + targets.size + refs,
        mark_dups.rv(0),
        mark_dups.rv(1),
        target_creator.rv(),
        indel_ref_size)
    realigner = job.wrapJobFn(run_indel_realignment,
                              target_creator.rv(),
                              mark_dups.rv(0),
                              mark_dups.rv(1),
                              ref, ref_dict, fai,
                              g1k, mills,
                              unsafe=unsafe,
                              cores=1,  # IndelRealigner is single threaded
                              disk=realigner_disk,
                              memory=job_memory)

    # Step 3 - BaseRecalibrator. Disk covers the realigned BAM/BAI, the reference and BQSR
    # database files, and the output recalibration table. The table is smaller than the
    # references, so the reference size is counted a second time as its upper bound.
    base_recal_disk = PromisedRequirement(
        lambda in_bam, in_bai, refs: in_bam.size + in_bai.size + 2 * refs,
        realigner.rv(0),
        realigner.rv(1),
        bqsr_ref_size)
    base_recal = job.wrapJobFn(run_base_recalibration,
                               realigner.rv(0),
                               realigner.rv(1),
                               ref, ref_dict, fai,
                               dbsnp, mills,
                               unsafe=unsafe,
                               cores=job_cores,
                               disk=base_recal_disk,
                               memory=job_memory)

    # Step 4 - PrintReads. Disk covers the realigned BAM/BAI, the recalibration table, the
    # genome reference files, and an output BAM/BAI pair assumed to be about the same size
    # as the input pair.
    apply_bqsr_disk = PromisedRequirement(
        lambda in_bam, in_bai, table, refs:
        2 * (in_bam.size + in_bai.size) + table.size + refs,
        realigner.rv(0),
        realigner.rv(1),
        base_recal.rv(),
        genome_ref_size)
    apply_bqsr = job.wrapJobFn(apply_bqsr_recalibration,
                               base_recal.rv(),
                               realigner.rv(0),
                               realigner.rv(1),
                               ref, ref_dict, fai,
                               unsafe=unsafe,
                               cores=job_cores,
                               disk=apply_bqsr_disk,
                               memory=job_memory)

    # Link the steps into a linear chain: each job becomes the child of the one before it.
    chain = (job, mark_dups, target_creator, realigner, base_recal, apply_bqsr)
    for parent, child in zip(chain[:-1], chain[1:]):
        parent.addChild(child)

    return apply_bqsr.rv(0), apply_bqsr.rv(1)