def add_addbamstats(self, bam_source):
     """Register the ``vc_gatk_addbamstats`` step and expose its result.

     The step runs ``AddBamStatsGermline_0_1_0`` on the output of the
     existing ``vc_gatk_uncompress`` step (cast to ``Vcf``), using the
     workflow's ``reference`` input. Its ``out`` is then published as
     ``out_variants_bamstats``.

     Args:
         bam_source: value wired into the tool's ``bam`` input.
     """
     # Depends on a step named "vc_gatk_uncompress" already being registered.
     gatk_vcf = self.vc_gatk_uncompress.out.as_type(Vcf)
     bamstats_tool = AddBamStatsGermline_0_1_0(
         bam=bam_source, vcf=gatk_vcf, reference=self.reference
     )
     self.step("vc_gatk_addbamstats", bamstats_tool)

     # Publish under variants/gatk_bamstats.
     export_options = dict(
         source=self.vc_gatk_addbamstats.out,
         output_folder="variants",
         output_name="gatk_bamstats",
     )
     self.output("out_variants_bamstats", **export_options)
    def add_combine_variants(self, bam_source):
        """Combine the per-caller VCFs, sort the result and add BAM statistics.

        Registers, in order, the steps ``combine_variants``,
        ``combined_compress``, ``combined_sort``, ``combined_uncompress`` and
        ``combined_addbamstats``, then publishes the last step's ``out`` as
        the ``out_variants`` output.

        Args:
            bam_source: value wired into the ``bam`` input of
                ``combined_addbamstats``.
        """
        # NOTE: this depends on the exact names of steps registered earlier
        # (vc_gatk_uncompress, vc_strelka, vc_vardict_uncompress_for_combine).
        caller_vcfs = [
            self.vc_gatk_uncompress.out.as_type(Vcf),
            self.vc_strelka.out,
            self.vc_vardict_uncompress_for_combine.out.as_type(Vcf),
        ]
        combiner = CombineVariants_0_0_8(
            vcfs=caller_vcfs,
            type="germline",
            columns=["AC", "AN", "AF", "AD", "DP", "GT"],
        )
        self.step("combine_variants", combiner)

        # Compress -> sort -> uncompress the combined VCF.
        compressor = BGZipLatest(file=self.combine_variants.out)
        self.step("combined_compress", compressor)

        compressed_vcf = self.combined_compress.out.as_type(CompressedVcf)
        self.step("combined_sort", BcfToolsSort_1_9(vcf=compressed_vcf))

        uncompressor = UncompressArchive(file=self.combined_sort.out)
        self.step("combined_uncompress", uncompressor)

        # Add BAM statistics to the sorted, uncompressed VCF.
        sorted_vcf = self.combined_uncompress.out.as_type(Vcf)
        bamstats_tool = AddBamStatsGermline_0_1_0(
            bam=bam_source, vcf=sorted_vcf, reference=self.reference
        )
        self.step("combined_addbamstats", bamstats_tool)

        export_options = dict(
            source=self.combined_addbamstats.out,
            output_folder="variants",
            output_name="combined",
            doc="Combined variants from all 3 callers",
        )
        self.output("out_variants", **export_options)
Exemplo n.º 3
0
 def add_addbamstats(self, bam_source):
     """Register the ``vc_gatk_addbamstats`` step and expose its result.

     The step runs ``AddBamStatsGermline_0_1_0`` on the output of the
     existing ``vc_gatk_uncompress`` step (cast to ``Vcf``), using the
     workflow's ``reference`` input. Its ``out`` is published as
     ``out_variants_bamstats`` in the ``variants`` folder, with an output
     name formatted from the ``sample_name`` input.

     Args:
         bam_source: value wired into the tool's ``bam`` input.
     """
     # Depends on a step named "vc_gatk_uncompress" already being registered.
     gatk_vcf = self.vc_gatk_uncompress.out.as_type(Vcf)
     bamstats_tool = AddBamStatsGermline_0_1_0(
         bam=bam_source, vcf=gatk_vcf, reference=self.reference
     )
     self.step("vc_gatk_addbamstats", bamstats_tool)

     # The output file name is derived from the sample name.
     name_from_sample = StringFormatter(
         "{sample_name}", sample_name=self.sample_name
     )
     self.output(
         "out_variants_bamstats",
         source=self.vc_gatk_addbamstats.out,
         output_folder=["variants"],
         output_name=name_from_sample,
         doc="Final vcf from GATK",
     )
    def constructor(self):
        """Declare this workflow's inputs, steps and outputs.

        Steps are registered in dependency order, because each one reads the
        outputs of earlier steps through ``self.<step_name>``:

        * ``fastqc`` -> ``getfastqc_adapters`` -> ``align_and_sort`` ->
          ``merge_and_mark``
        * ``annotate_doc``, ``performance_summary``, ``gridss`` and ``bqsr``
          all read ``merge_and_mark.out``
        * ``bqsr`` -> ``haplotype_caller`` -> ``splitnormalisevcf`` ->
          ``addbamstats``
        """
        # ---- Inputs -------------------------------------------------------
        input_specs = [
            ("sample_name", String),
            ("fastqs", Array(FastqGzPair)),
            ("reference", FastaWithDict),
            ("region_bed", Bed),
            ("region_bed_extended", Bed),
            ("region_bed_annotated", Bed),
            ("genecoverage_bed", Bed),
            ("genome_file", TextFile),
            ("black_list", Bed(optional=True)),
            ("snps_dbsnp", VcfTabix),
            ("snps_1000gp", VcfTabix),
            ("known_indels", VcfTabix),
            ("mills_indels", VcfTabix),
        ]
        for input_id, input_type in input_specs:
            self.input(input_id, input_type)

        # ---- FastQC, scattered over the reads -----------------------------
        qc_tool = FastQC_0_11_5(reads=self.fastqs, threads=4)
        self.step("fastqc", qc_tool, scatter="reads")

        # Parse the overrepresented sequences out of each FastQC data file.
        adapter_parser = ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile)
        self.step("getfastqc_adapters", adapter_parser, scatter="fastqc_datafiles")

        # ---- Align each fastq pair and produce a sorted, indexed BAM ------
        aligner = BwaAligner(
            fastq=self.fastqs,
            reference=self.reference,
            sample_name=self.sample_name,
            sortsam_tmpDir=".",
            cutadapt_adapter=self.getfastqc_adapters,
            cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
        )
        self.step(
            "align_and_sort",
            aligner,
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )

        # ---- Merge into a single BAM and mark duplicates ------------------
        merger = MergeAndMarkBams_4_1_3(
            bams=self.align_and_sort.out, sampleName=self.sample_name
        )
        self.step("merge_and_mark", merger)

        # ---- Performance: depth of coverage -------------------------------
        coverage_tool = AnnotateDepthOfCoverage_0_1_0(
            bam=self.merge_and_mark.out,
            bed=self.region_bed_annotated,
            reference=self.reference,
            sample_name=self.sample_name,
        )
        self.step("annotate_doc", coverage_tool)

        # ---- Performance: summary -----------------------------------------
        summary_tool = PerformanceSummaryTargeted_0_1_0(
            bam=self.merge_and_mark.out,
            region_bed=self.region_bed,
            genecoverage_bed=self.genecoverage_bed,
            sample_name=self.sample_name,
            genome_file=self.genome_file,
        )
        self.step("performance_summary", summary_tool)

        # ---- GRIDSS -------------------------------------------------------
        gridss_tool = Gridss_2_6_2(
            bams=self.merge_and_mark.out,
            reference=self.reference,
            blacklist=self.black_list,
            tmpdir=".",
        )
        self.step("gridss", gridss_tool)
        # Placeholder: a post-GRIDSS R script step is not wired in yet.
        # self.step("gridss_post_r", )

        # ---- GATK base quality score recalibration ------------------------
        bqsr_workflow = GATKBaseRecalBQSRWorkflow_4_1_3(
            bam=self.merge_and_mark.out,
            intervals=self.region_bed_extended,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        )
        self.step("bqsr", bqsr_workflow)

        # ---- HaplotypeCaller on the recalibrated BAM ----------------------
        caller = Gatk4HaplotypeCaller_4_1_3(
            inputRead=self.bqsr.out,
            intervals=self.region_bed_extended,
            reference=self.reference,
            dbsnp=self.snps_dbsnp,
            pairHmmImplementation="LOGLESS_CACHING",
        )
        self.step("haplotype_caller", caller)

        normaliser = SplitMultiAlleleNormaliseVcf(
            compressedVcf=self.haplotype_caller.out, reference=self.reference
        )
        self.step("splitnormalisevcf", normaliser)

        # BAM stats are computed against the merged BAM, not the BQSR output.
        bamstats_tool = AddBamStatsGermline_0_1_0(
            bam=self.merge_and_mark.out,
            vcf=self.splitnormalisevcf.out,
            reference=self.reference,
        )
        self.step("addbamstats", bamstats_tool)

        # ---- Outputs: (output id, source, destination folder) -------------
        exports = [
            ("fastq_qc", self.fastqc.out, "QC"),
            ("markdups_bam", self.merge_and_mark.out, "BAM"),
            ("doc", self.annotate_doc.out, "PERFORMANCE"),
            ("summary", self.performance_summary.out, "PERFORMANCE"),
            ("gene_summary", self.performance_summary.geneFileOut, "PERFORMANCE"),
            ("region_summary", self.performance_summary.regionFileOut, "PERFORMANCE"),
            ("gridss_vcf", self.gridss.out, "SV"),
            ("gridss_bam", self.gridss.assembly, "SV"),
            ("hap_vcf", self.haplotype_caller.out, "VCF"),
            ("hap_bam", self.haplotype_caller.bam, "VCF"),
            ("normalise_vcf", self.addbamstats.out, "VCF"),
        ]
        for output_id, output_source, folder in exports:
            self.output(output_id, source=output_source, output_folder=folder)
Exemplo n.º 5
0
    def constructor(self):
        """Declare this workflow's inputs, steps and outputs.

        Steps are registered in dependency order: ``base_recalibrator`` ->
        ``apply_bqsr`` -> ``haplotype_caller`` -> ``splitnormalisevcf`` ->
        ``addbamstats``. Three outputs are published: ``variants`` and
        ``out_bam`` from ``haplotype_caller``, and ``out`` from
        ``addbamstats``.
        """
        # ---- Inputs -------------------------------------------------------
        self.input("bam", BamBai)

        intervals_doc = (
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec"
        )
        self.input("intervals", Bed(optional=True), doc=intervals_doc)

        remaining_inputs = (
            ("reference", FastaWithDict),
            ("snps_dbsnp", VcfTabix),
            ("snps_1000gp", VcfTabix),
            ("known_indels", VcfTabix),
            ("mills_indels", VcfTabix),
        )
        for input_id, input_type in remaining_inputs:
            self.input(input_id, input_type)

        # Disabled: splitting the BAM by interval before recalibration.
        # self.step(
        #     "split_bam",
        #     gatk4.Gatk4SplitReads_4_0(bam=self.bam, intervals=self.intervals),
        # )

        # ---- Base quality score recalibration -----------------------------
        known_sites = [
            self.snps_dbsnp,
            self.snps_1000gp,
            self.known_indels,
            self.mills_indels,
        ]
        recalibrator = gatk4.Gatk4BaseRecalibrator_4_0(
            bam=self.bam,
            intervals=self.intervals,
            reference=self.reference,
            knownSites=known_sites,
        )
        self.step("base_recalibrator", recalibrator)

        bqsr_applier = gatk4.Gatk4ApplyBqsr_4_0(
            bam=self.bam,
            intervals=self.intervals,
            recalFile=self.base_recalibrator.out,
            reference=self.reference,
        )
        self.step("apply_bqsr", bqsr_applier)

        # ---- Variant calling ----------------------------------------------
        # NOTE(review): the step node itself is passed here rather than
        # ``self.apply_bqsr.out`` -- presumably the framework resolves the
        # step's single output; confirm before changing.
        caller = gatk4.Gatk4HaplotypeCaller_4_0(
            inputRead=self.apply_bqsr,
            intervals=self.intervals,
            reference=self.reference,
            dbsnp=self.snps_dbsnp,
            pairHmmImplementation="LOGLESS_CACHING",
        )
        self.step("haplotype_caller", caller)

        normaliser = SplitMultiAlleleNormaliseVcf(
            compressedVcf=self.haplotype_caller.out, reference=self.reference
        )
        self.step("splitnormalisevcf", normaliser)

        # BAM stats are computed against the original input BAM.
        bamstats_tool = AddBamStatsGermline_0_1_0(
            bam=self.bam,
            vcf=self.splitnormalisevcf.out,
            reference=self.reference,
        )
        self.step("addbamstats", bamstats_tool)

        # ---- Outputs ------------------------------------------------------
        exports = (
            ("variants", self.haplotype_caller.out),
            ("out_bam", self.haplotype_caller.bam),
            ("out", self.addbamstats.out),
        )
        for output_id, output_source in exports:
            self.output(output_id, source=output_source)
    def constructor(self):
        """Declare this workflow's inputs, steps and outputs.

        Steps are registered in dependency order, because each one reads the
        outputs of earlier steps through ``self.<step_name>``:

        * ``fastqc`` -> ``getfastqc_adapters`` -> ``align_and_sort`` ->
          ``merge_and_mark``
        * ``annotate_doc``, ``performance_summary``, ``gridss`` and ``bqsr``
          all read ``merge_and_mark.out``
        * ``mutect2`` and ``haplotype_caller`` both read ``bqsr.out``
        * ``splitnormalisevcf`` + ``mutect2`` -> ``combinevariants`` ->
          ``compressvcf`` -> ``sortvcf`` -> ``uncompressvcf`` ->
          ``addbamstats`` -> ``compressvcf2`` -> ``tabixvcf`` ->
          ``calculate_variant_length`` -> ``filter_variants_1[_failed]``

        Several downstream steps (normalise_vcfs, filter_variants_2,
        convert_to_tsv, index_with_igvtools) are currently commented out. The
        inputs ``seqrun``, ``panel_name``, ``vcfcols``, ``mutalyzer_server``
        and ``pathos_db`` are referenced only from that disabled code, or not
        at all, within this method.
        """

        # Inputs
        self.input("sample_name", String)
        self.input("fastqs", Array(FastqGzPair))
        self.input("seqrun", String, doc="SeqRun Name (for Vcf2Tsv)")
        self.input("reference", FastaWithDict)
        self.input("region_bed", Bed)
        self.input("region_bed_extended", Bed)
        self.input("region_bed_annotated", Bed)
        self.input("genecoverage_bed", Bed)
        self.input("genome_file", TextFile)
        self.input("panel_name", String)
        self.input("vcfcols", TextFile)
        self.input("black_list", Bed(optional=True))
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("mutalyzer_server", String)
        self.input("pathos_db", String)
        self.input("maxRecordsInRam", Int)
        # tumor only (both are consumed by the mutect2 step below)
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # fastqc, scattered over the "reads" input
        self.step(
            "fastqc", FastQC_0_11_5(reads=self.fastqs, threads=4), scatter="reads"
        )
        # get the overrepresented sequences from each fastqc data file
        self.step(
            "getfastqc_adapters",
            ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile,),
            scatter="fastqc_datafiles",
        )
        # align and generate sorted index bam; scattered over the fastq pairs
        # together with both adapter inputs
        self.step(
            "align_and_sort",
            BwaAligner(
                fastq=self.fastqs,
                reference=self.reference,
                sample_name=self.sample_name,
                sortsam_tmpDir=".",
                cutadapt_adapter=self.getfastqc_adapters,
                cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
            ),
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )
        # merge into one bam and markdups
        self.step(
            "merge_and_mark",
            MergeAndMarkBams_4_1_3(
                bams=self.align_and_sort.out,
                sampleName=self.sample_name,
                maxRecordsInRam=self.maxRecordsInRam,
            ),
        )
        # performance: depth of coverage
        self.step(
            "annotate_doc",
            AnnotateDepthOfCoverage_0_1_0(
                bam=self.merge_and_mark.out,
                bed=self.region_bed_annotated,
                reference=self.reference,
                sample_name=self.sample_name,
            ),
        )

        # performance: summary
        self.step(
            "performance_summary",
            PerformanceSummaryTargeted_0_1_0(
                bam=self.merge_and_mark.out,
                region_bed=self.region_bed,
                genecoverage_bed=self.genecoverage_bed,
                sample_name=self.sample_name,
                genome_file=self.genome_file,
            ),
        )
        # gridss
        self.step(
            "gridss",
            Gridss_2_6_2(
                bams=self.merge_and_mark.out,
                reference=self.reference,
                blacklist=self.black_list,
                tmpdir=".",
            ),
        )
        # post gridss r for tumor only + tumor only mode (disabled)
        # self.step("gridss_post_r", GRIDSSProcessOutput(inp=self.gridss.out))
        # gatk bqsr bam
        self.step(
            "bqsr",
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=self.merge_and_mark.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
        )
        # mutect2 (tumor-only caller) on the recalibrated bam
        self.step(
            "mutect2",
            GatkSomaticVariantCallerTumorOnlyTargeted(
                bam=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                gnomad=self.gnomad,
                panel_of_normals=self.panel_of_normals,
            ),
        )
        # haplotypecaller
        # TODO (original author's note, left unfinished): "take base recal
        # away from the ..."
        self.step(
            "haplotype_caller",
            Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out, reference=self.reference
            ),
        )
        # combine variants from haplotypecaller (normalised) and mutect2
        self.step(
            "combinevariants",
            CombineVariants_0_0_8(
                vcfs=[self.splitnormalisevcf.out, self.mutect2.out],
                type="germline",
                columns=["AD", "DP", "AF", "GT"],
            ),
        )
        # compress -> sort -> uncompress the combined vcf
        self.step("compressvcf", BGZip_1_9(file=self.combinevariants.out))
        self.step("sortvcf", BcfToolsSort_1_9(vcf=self.compressvcf.out))
        self.step("uncompressvcf", UncompressArchive(file=self.sortvcf.out))
        # addbamstats (against the merged bam, not the bqsr output)
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(
                bam=self.merge_and_mark.out,
                vcf=self.uncompressvcf.out,
                reference=self.reference,
            ),
        )
        # Molpath specific processes
        self.step("compressvcf2", BGZip_1_9(file=self.addbamstats.out))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf2.out))
        self.step(
            "calculate_variant_length",
            VcfLength_1_0_1(vcf=self.tabixvcf.out),
            doc="Add the length column for the output of AddBamStats",
        )

        # This input is declared mid-body; the value returned by self.input is
        # passed straight to both filter steps as their info_filter.
        filter_for_variants = self.input("filter_for_vcfs", str, default="length > 150")
        # The same filter expression is applied twice: once as-is
        # ("..._failed") and once with invert=True.
        # NOTE(review): presumably "_failed" collects the variants matching
        # the filter and the inverted step keeps the rest -- confirm against
        # VcfFilter's semantics. Neither step is exposed as an output below.
        self.step(
            "filter_variants_1_failed",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out, info_filter=filter_for_variants
            ),
        )
        self.step(
            "filter_variants_1",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out,
                info_filter=filter_for_variants,
                invert=True,  # -v param
            ),
        )

        # Jiaan: copied over from the FRCP; the block comment below can be
        # removed (i.e. the steps re-enabled) when ready.
        # # This one is the in-house molpath step
        # self.step(
        #     "normalise_vcfs",
        #     NormaliseVcf_1_5_4(
        #         pathos_version=self.pathos_db,
        #         mutalyzer=self.mutalyzer_server,  # mutalyzer="https://vmpr-res-mutalyzer1.unix.petermac.org.au",
        #         rdb=self.pathos_db,  # rdb="pa_uat",
        #         inp=self.filter_variants_1.out,
        #     ),
        # )

        # # repeat remove 150bp variants (workaround for normalise_vcf bug)
        # self.step(
        #     "filter_variants_2_failed",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out, info_filter=filter_for_variants
        #     ),
        # )
        # self.step(
        #     "filter_variants_2",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out,
        #         info_filter=filter_for_variants,
        #         invert=True,  # -v param
        #     ),
        # )

        # self.step(
        #     "convert_to_tsv",
        #     Vcf2Tsv_1_5_4(
        #         pathos_version=self.pathos_db,
        #         inp=self.filter_variants_2.out,
        #         sample=self.sample_name,
        #         columns=self.vcfcols,
        #         seqrun=self.seqrun,
        #     ),
        # )

        # self.step(
        #     "index_with_igvtools", IgvIndexFeature_2_5_3(inp=self.filter_variants_2.out)
        # )

        # output: QC and alignment
        self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")

        self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

        # output: performance reports
        self.output(
            "doc_out", source=self.annotate_doc.out, output_folder="PERFORMANCE"
        )
        self.output(
            "summary", source=self.performance_summary.out, output_folder="PERFORMANCE"
        )
        self.output(
            "gene_summary",
            source=self.performance_summary.geneFileOut,
            output_folder="PERFORMANCE",
        )
        self.output(
            "region_summary",
            source=self.performance_summary.regionFileOut,
            output_folder="PERFORMANCE",
        )

        # output: gridss results
        self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
        self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

        # output: haplotypecaller and mutect2 results
        self.output(
            "haplotypecaller_vcf",
            source=self.haplotype_caller.out,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_bam",
            source=self.haplotype_caller.bam,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_norm",
            source=self.splitnormalisevcf.out,
            output_folder="VCF",
        )
        self.output("mutect2_vcf", source=self.mutect2.variants, output_folder="VCF")
        self.output("mutect2_bam", source=self.mutect2.out_bam, output_folder="VCF")
        self.output("mutect2_norm", source=self.mutect2.out, output_folder="VCF")
        # The only output here declared without an output_folder.
        self.output("addbamstats_vcf", source=self.addbamstats.out)