def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
    """Register the GATK somatic variant-calling steps on this workflow.

    Steps are added in this order: ``generate_gatk_intervals`` (only when no
    step of that name is registered yet), ``bqsr_normal``, ``bqsr_tumor``,
    ``vc_gatk``, ``vc_gatk_merge``, ``vc_gatk_compressvcf``,
    ``vc_gatk_sort_combined`` and ``vc_gatk_uncompressvcf``.
    No workflow outputs are declared here.

    :param normal_bam_source: source connected to the ``bam`` input of ``bqsr_normal``
    :param tumor_bam_source: source connected to the ``bam`` input of ``bqsr_tumor``
    """
    # Reuse the interval-generation step if an earlier call already added it.
    if "generate_gatk_intervals" not in self.step_nodes:
        interval_step = self.step(
            "generate_gatk_intervals",
            GenerateIntervalsByChromosome(reference=self.reference),
            # Only run when the ``gatk_intervals`` input is null.
            when=self.gatk_intervals.is_null(),
        )
    else:
        interval_step = self.generate_gatk_intervals

    # The user-supplied intervals are listed first, the generated ones second.
    intervals = FirstOperator([self.gatk_intervals, interval_step.out_regions])

    # Same recalibration workflow for both samples, scattered over the intervals.
    for label, bam_source in (
        ("normal", normal_bam_source),
        ("tumor", tumor_bam_source),
    ):
        self.step(
            "bqsr_" + label,
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=bam_source,
                reference=self.reference,
                intervals=intervals,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
            scatter="intervals",
        )

    somatic_caller = GatkSomaticVariantCaller_4_1_3(
        normal_bam=self.bqsr_normal.out,
        tumor_bam=self.bqsr_tumor.out,
        normal_name=self.normal_name,
        intervals=intervals,
        reference=self.reference,
        gnomad=self.gnomad,
        panel_of_normals=self.panel_of_normals,
    )
    self.step(
        "vc_gatk",
        somatic_caller,
        scatter=["intervals", "normal_bam", "tumor_bam"],
    )

    # Gather the per-interval VCFs, then compress -> sort -> uncompress.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step("vc_gatk_compressvcf", BGZipLatest(file=self.vc_gatk_merge.out))
    compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompressvcf",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )
def add_vardict_variantcaller(self, normal_bam_source, tumor_bam_source):
    """Register the VarDict somatic variant-calling steps and their outputs.

    Adds the header-line generation step, the VarDict caller scattered over
    ``intervals``, a gather step and a compress -> sort -> uncompress chain,
    then declares the merged and per-interval VCF outputs.

    :param normal_bam_source: source connected to the caller's ``normal_bam`` input
    :param tumor_bam_source: source connected to the caller's ``tumor_bam`` input
    """
    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictSomaticVariantCaller(
            normal_bam=normal_bam_source,
            tumor_bam=tumor_bam_source,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            header_lines=self.generate_vardict_headerlines.out,
            intervals=self.vardict_intervals,
            reference=self.reference,
            allele_freq_threshold=self.allele_freq_threshold,
            minMappingQual=self.minMappingQual,
            filter=self.filter,
        ),
        scatter="intervals",
    )

    # Gather the per-interval VCFs, then compress -> sort -> uncompress.
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
    self.step(
        "vc_vardict_compress_for_sort",
        BGZipLatest(file=self.vc_vardict_merge.out),
    )
    self.step(
        "vc_vardict_sort_combined",
        BcfToolsSort_1_9(
            vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
        ),
    )
    self.step(
        "vc_vardict_uncompress_for_combine",
        UncompressArchive(file=self.vc_vardict_sort_combined.out),
    )

    self.output(
        "out_variants_vardict",
        source=self.vc_vardict_sort_combined.out,
        output_folder=["vcf"],
        output_name=StringFormatter(
            "{tumor_name}--{normal_name}_vardict",
            tumor_name=self.tumor_name,
            normal_name=self.normal_name,
        ),
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "out_variants_vardict_split",
        source=self.vc_vardict.out,
        output_folder=["vcf", "VardictByInterval"],
        # This output comes from the ``vc_vardict`` step, so the doc names
        # VarDict (it previously said "GATK").
        doc="Unmerged variants from the VarDict caller (by interval)",
    )
def add_vardict_variantcaller(self, bam_source):
    """Register the VarDict germline inputs, steps and outputs.

    Declares the ``allele_freq_threshold``, ``minMappingQual`` and ``filter``
    inputs, adds the header-line generation step, the VarDict caller scattered
    over ``intervals``, a gather step and a compress -> sort -> uncompress
    chain, then declares the merged and per-interval VCF outputs.

    :param bam_source: source connected to the caller's ``bam`` input
    """
    # The default is passed by keyword (as the other workflows in this file
    # do); the original statement also ended in a stray comma that turned it
    # into a discarded one-element tuple.
    self.input("allele_freq_threshold", Float, default=0.05)
    self.input("minMappingQual", Int(optional=True))
    self.input("filter", String(optional=True))

    # Vardict
    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictGermlineVariantCaller(
            bam=bam_source,
            reference=self.reference,
            intervals=self.vardict_intervals,
            sample_name=self.sample_name,
            allele_freq_threshold=self.allele_freq_threshold,
            header_lines=self.generate_vardict_headerlines.out,
            minMappingQual=self.minMappingQual,
            filter=self.filter,
        ),
        scatter="intervals",
    )

    # Gather the per-interval VCFs, then compress -> sort -> uncompress.
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
    self.step(
        "vc_vardict_compress_for_sort",
        BGZipLatest(file=self.vc_vardict_merge.out.as_type(Vcf)),
    )
    self.step(
        "vc_vardict_sort_combined",
        BcfToolsSort_1_9(
            vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
        ),
    )
    self.step(
        "vc_vardict_uncompress_for_combine",
        UncompressArchive(file=self.vc_vardict_sort_combined.out),
    )

    self.output(
        "out_variants_vardict",
        source=self.vc_vardict_sort_combined.out,
        output_folder=["variants"],
        output_name="vardict",
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "out_variants_vardict_split",
        source=self.vc_vardict.out,
        output_folder=["variants", "vardict"],
        doc="Unmerged variants from the VarDict caller (by interval)",
    )
def constructor(self):
    """Declare the inputs, steps and outputs of the somatic multi-caller workflow.

    Both samples go through ``self.process_subpipeline``; the resulting BAMs
    feed the GATK, Strelka, GRIDSS and VarDict callers. The GATK, Strelka and
    VarDict VCFs are then combined.
    """
    # Text shared by several input declarations below.
    known_sites_doc = "From the GATK resource bundle, passed to BaseRecalibrator as ``known_sites``"
    hg38_browser = "HG38: https://console.cloud.google.com/storage/browser/genomics-public-data/references/hg38/v0/\n\n"
    hg38_file_prefix = "File: gs://genomics-public-data/references/hg38/v0/"
    uncompressed_warning = "(WARNING: The file available from the genomics-public-data resource on Google Cloud Storage is NOT compressed and indexed. This will need to be completed prior to starting the pipeline.\n\n"

    # --- Inputs -----------------------------------------------------------
    self.input(
        "normal_inputs",
        Array(FastqGzPair),
        doc=InputDocumentation(
            "An array of NORMAL FastqGz pairs. These are aligned separately and merged to create higher depth coverages from multiple sets of reads",
            quality=InputQualityType.user,
            example='["normal_R1.fastq.gz", "normal_R2.fastq.gz"]',
        ),
    )
    self.input(
        "tumor_inputs",
        Array(FastqGzPair),
        doc=InputDocumentation(
            "An array of TUMOR FastqGz pairs. These are aligned separately and merged to create higher depth coverages from multiple sets of reads",
            quality=InputQualityType.user,
            example='["tumor_R1.fastq.gz", "tumor_R2.fastq.gz"]',
        ),
    )
    self.input(
        "normal_name",
        String(),
        doc=InputDocumentation(
            "Sample name for the NORMAL sample from which to generate the readGroupHeaderLine for BwaMem",
            quality=InputQualityType.user,
            example="NA24385_normal",
        ),
    )
    self.input(
        "tumor_name",
        String(),
        doc=InputDocumentation(
            "Sample name for the TUMOR sample from which to generate the readGroupHeaderLine for BwaMem",
            quality=InputQualityType.user,
            example="NA24385_tumor",
        ),
    )
    self.input(
        "cutadapt_adapters",
        File(optional=True),
        doc=InputDocumentation(
            "Specifies a containment list for cutadapt, which contains a list of sequences to determine valid overrepresented sequences from "
            "the FastQC report to trim with Cuatadapt. The file must contain sets of named adapters in the form: "
            "``name[tab]sequence``. Lines prefixed with a hash will be ignored.",
            quality=InputQualityType.static,
            example="https://github.com/csf-ngs/fastqc/blob/master/Contaminants/contaminant_list.txt",
        ),
    )
    self.input(
        "gatk_intervals",
        Array(Bed),
        doc=InputDocumentation(
            "List of intervals over which to split the GATK variant calling",
            quality=InputQualityType.static,
            example="BRCA1.bed",
        ),
    )
    self.input(
        "gridss_blacklist",
        Bed,
        doc=InputDocumentation(
            "BED file containing regions to ignore.",
            quality=InputQualityType.static,
            example="https://github.com/PapenfussLab/gridss#blacklist",
        ),
    )
    self.input(
        "vardict_intervals",
        Array(Bed),
        doc=InputDocumentation(
            "List of intervals over which to split the VarDict variant calling",
            quality=InputQualityType.static,
            example="BRCA1.bed",
        ),
    )
    self.input(
        "strelka_intervals",
        BedTabix,
        doc=InputDocumentation(
            "An interval for which to restrict the analysis to.",
            quality=InputQualityType.static,
            example="BRCA1.bed.gz",
        ),
    )
    self.input(
        "allele_freq_threshold",
        Float,
        default=0.05,
        doc=InputDocumentation(
            "The threshold for VarDict's allele frequency, default: 0.05 or 5%",
            quality=InputQualityType.configuration,
            example=None,
        ),
    )
    self.input(
        "reference",
        FastaWithDict,
        doc=InputDocumentation(
            """\
The reference genome from which to align the reads. This requires a number indexes (can be generated \
with the 'IndexFasta' pipeline This pipeline has been tested using the HG38 reference set.

This pipeline expects the assembly references to be as they appear in the GCP example:

- (".fai", ".amb", ".ann", ".bwt", ".pac", ".sa", "^.dict").""",
            quality=InputQualityType.static,
            example=hg38_browser + hg38_file_prefix + "Homo_sapiens_assembly38.fasta",
        ),
    )
    # The four known-sites inputs differ only in identifier and example file.
    for input_id, note, filename in (
        ("snps_dbsnp", uncompressed_warning, "Homo_sapiens_assembly38.dbsnp138.vcf.gz"),
        ("snps_1000gp", "", "1000G_phase1.snps.high_confidence.hg38.vcf.gz"),
        ("known_indels", "", "Homo_sapiens_assembly38.known_indels.vcf.gz"),
        ("mills_indels", "", "Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"),
    ):
        self.input(
            input_id,
            VcfTabix,
            doc=InputDocumentation(
                known_sites_doc,
                quality=InputQualityType.static,
                example=hg38_browser + note + hg38_file_prefix + filename,
            ),
        )

    # --- Per-sample processing --------------------------------------------
    self.step(
        "normal",
        self.process_subpipeline(
            reads=self.normal_inputs,
            sample_name=self.normal_name,
            reference=self.reference,
            cutadapt_adapters=self.cutadapt_adapters,
        ),
    )
    self.step(
        "tumor",
        self.process_subpipeline(
            reads=self.tumor_inputs,
            sample_name=self.tumor_name,
            reference=self.reference,
            cutadapt_adapters=self.cutadapt_adapters,
        ),
    )

    # --- Variant callers --------------------------------------------------
    self.step(
        "vc_gatk",
        GatkSomaticVariantCaller_4_1_3(
            # Each BAM goes to its matching input (these two were previously
            # swapped while the sample names were not).
            normal_bam=self.normal.out,
            tumor_bam=self.tumor.out,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            intervals=self.gatk_intervals,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        ),
        scatter="intervals",
    )
    # Reference the ``out`` output explicitly, as ``vc_vardict_merge`` does.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))

    self.step(
        "vc_strelka",
        IlluminaSomaticVariantCaller(
            normal_bam=self.normal.out,
            tumor_bam=self.tumor.out,
            intervals=self.strelka_intervals,
            reference=self.reference,
        ),
    )
    self.step(
        "vc_gridss",
        Gridss_2_6_2(
            bams=[self.normal.out, self.tumor.out],
            reference=self.reference,
            blacklist=self.gridss_blacklist,
        ),
    )

    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictSomaticVariantCaller(
            # Same normal/tumor swap corrected here.
            normal_bam=self.normal.out,
            tumor_bam=self.tumor.out,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            header_lines=self.generate_vardict_headerlines.out,
            intervals=self.vardict_intervals,
            reference=self.reference,
            allele_freq_threshold=self.allele_freq_threshold,
        ),
        scatter="intervals",
    )
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))

    # --- Combine ----------------------------------------------------------
    self.step(
        "combine_variants",
        CombineVariants_0_0_4(
            normal=self.normal_name,
            tumor=self.tumor_name,
            vcfs=[
                self.vc_gatk_merge.out,
                self.vc_strelka.out,
                self.vc_vardict_merge.out,
            ],
            type="somatic",
            columns=["AD", "DP", "GT"],
        ),
    )
    # NOTE(review): no output below reads from this step; the "variants"
    # output uses the unsorted ``combine_variants.vcf``. Confirm intent.
    self.step("sortCombined", BcfToolsSort_1_9(vcf=self.combine_variants.vcf))

    # Outputs
    self.output(
        "normal_report",
        source=self.normal.reports,
        output_folder="reports",
        doc="A zip file of the NORMAL FastQC quality reports.",
    )
    self.output(
        "tumor_report",
        source=self.tumor.reports,
        output_folder="reports",
        doc="A zip file of the TUMOR FastQC quality reports.",
    )
    self.output(
        "normal_bam",
        source=self.normal.out,
        output_folder="bams",
        output_name=self.normal_name,
        doc="Aligned and indexed NORMAL bam",
    )
    self.output(
        "tumor_bam",
        source=self.tumor.out,
        output_folder="bams",
        output_name=self.tumor_name,
        doc="Aligned and indexed TUMOR bam",
    )
    self.output(
        "gridss_assembly",
        source=self.vc_gridss.assembly,
        output_folder="bams",
        doc="Assembly returned by GRIDSS",
    )
    self.output(
        "variants_gatk",
        source=self.vc_gatk_merge.out,
        output_folder="variants",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "variants_strelka",
        source=self.vc_strelka.out,
        output_folder="variants",
        doc="Variants from the Strelka variant caller",
    )
    self.output(
        "variants_vardict",
        source=self.vc_vardict_merge.out,
        output_folder="variants",
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "variants_gridss",
        source=self.vc_gridss.out,
        output_folder="variants",
        doc="Variants from the GRIDSS variant caller",
    )
    self.output(
        "variants",
        source=self.combine_variants.vcf,
        output_folder="variants",
        doc="Combined variants from all 3 callers",
    )
def constructor(self):
    """Declare a somatic multi-caller workflow that starts from aligned BAMs.

    The ``normal`` and ``tumor`` BAM inputs are fed directly to the GATK,
    Strelka, GRIDSS and VarDict callers; the GATK, Strelka and VarDict VCFs
    are then combined.
    """
    # --- Inputs -----------------------------------------------------------
    self.input("normal", BamBai)
    self.input("tumor", BamBai)
    self.input("normal_name", String(), value="NA24385_normal")
    self.input("tumor_name", String(), value="NA24385_tumour")

    self.input("gridss_blacklist", Bed)
    self.input("gatk_intervals", Array(Bed))
    self.input("vardict_intervals", Array(Bed))
    self.input("strelka_intervals", BedTabix(optional=True))
    self.input("vardict_header_lines", File)
    self.input("allele_freq_threshold", Float, default=0.05)

    self.input("reference", FastaWithDict)
    self.input("snps_dbsnp", VcfTabix)
    self.input("snps_1000gp", VcfTabix)
    self.input("known_indels", VcfTabix)
    self.input("mills_indels", VcfTabix)

    # --- Variant callers --------------------------------------------------
    self.step(
        "vc_gatk",
        GatkSomaticVariantCaller_4_1_3(
            # Each BAM goes to its matching input (these two were previously
            # swapped while the sample names were not).
            normal_bam=self.normal,
            tumor_bam=self.tumor,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            intervals=self.gatk_intervals,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        ),
        scatter="intervals",
    )
    # Reference the ``out`` output explicitly, as ``vc_vardict_merge`` does.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))

    self.step(
        "vc_strelka",
        IlluminaSomaticVariantCaller(
            normal_bam=self.normal,
            tumor_bam=self.tumor,
            intervals=self.strelka_intervals,
            reference=self.reference,
        ),
    )
    self.step(
        "vc_gridss",
        Gridss_2_6_3(
            bams=[self.normal, self.tumor],
            reference=self.reference,
            blacklist=self.gridss_blacklist,
        ),
    )
    self.step(
        "vc_vardict",
        VardictSomaticVariantCaller(
            # Same normal/tumor swap corrected here.
            normal_bam=self.normal,
            tumor_bam=self.tumor,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            header_lines=self.vardict_header_lines,
            intervals=self.vardict_intervals,
            reference=self.reference,
            allele_freq_threshold=self.allele_freq_threshold,
        ),
        scatter="intervals",
    )
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))

    # --- Combine ----------------------------------------------------------
    self.step(
        "combine_variants",
        CombineVariants_0_0_4(
            normal=self.normal_name,
            tumor=self.tumor_name,
            vcfs=[
                self.vc_gatk_merge.out,
                self.vc_strelka.out,
                self.vc_vardict_merge.out,
            ],
            type="somatic",
            columns=["AD", "DP", "GT"],
        ),
    )
    # NOTE(review): no output below reads from this step; "variants_combined"
    # uses the unsorted ``combine_variants.vcf``. Confirm intent.
    self.step("sortCombined", BcfToolsSort_1_9(vcf=self.combine_variants.vcf))

    # Outputs
    # The assembly output reads the ``assembly`` port; it previously
    # duplicated ``variants_gridss`` by reading ``out``.
    # NOTE(review): assumes Gridss_2_6_3 exposes ``assembly`` - confirm.
    self.output(
        "gridss_assembly",
        source=self.vc_gridss.assembly,
        output_folder="bams",
    )
    self.output(
        "variants_gatk",
        source=self.vc_gatk_merge.out,
        output_folder="variants",
    )
    self.output(
        "variants_strelka",
        source=self.vc_strelka.out,
        output_folder="variants",
    )
    self.output(
        "variants_vardict",
        source=self.vc_vardict_merge.out,
        output_folder="variants",
    )
    self.output(
        "variants_gridss",
        source=self.vc_gridss.out,
        output_folder="variants",
    )
    self.output(
        "variants_combined",
        source=self.combine_variants.vcf,
        output_folder="variants",
    )
def add_gatk_variantcaller(self, bam_source):
    """Register the GATK germline variant-calling steps and their outputs.

    Adds interval generation, a scattered BQSR workflow, the germline caller,
    a gather step and a compress -> sort -> uncompress chain, then declares
    the merged and per-interval VCF outputs.

    :param bam_source: source connected to the ``bam`` input of the ``bqsr`` step
    """
    # VARIANT CALLERS
    user_intervals = self.gatk_intervals
    interval_step = self.step(
        "generate_gatk_intervals",
        GenerateIntervalsByChromosome(reference=self.reference),
        # Only run when the ``gatk_intervals`` input is null.
        when=self.gatk_intervals.is_null(),
    )
    # The user-supplied intervals are listed first, the generated ones second.
    intervals = FirstOperator([user_intervals, interval_step.out_regions])

    # GATK
    recalibration = GATKBaseRecalBQSRWorkflow_4_1_3(
        bam=bam_source,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
        intervals=intervals,
    )
    self.step(
        "bqsr",
        recalibration,
        scatter="intervals",
        doc="Perform base quality score recalibration",
    )

    germline_caller = GatkGermlineVariantCaller_4_1_3(
        bam=self.bqsr.out,
        intervals=intervals,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
    )
    self.step("vc_gatk", germline_caller, scatter=["intervals", "bam"])

    # Gather the per-interval VCFs, then compress -> sort -> uncompress.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step("vc_gatk_compressvcf", BGZipLatest(file=self.vc_gatk_merge.out))
    compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompress",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )

    self.output(
        "out_variants_gatk",
        source=self.vc_gatk_sort_combined.out,
        output_folder="variants",
        output_name="gatk",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "out_variants_gatk_split",
        source=self.vc_gatk.out,
        output_folder=["variants", "gatk"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
def constructor(self):
    """Declare the inputs, steps and outputs of the GATK-only somatic workflow.

    Both samples go through ``self.process_subpipeline``; the resulting BAMs
    feed the GATK somatic caller (scattered over ``intervals``), whose VCFs
    are gathered and sorted.
    """
    # Text shared by several input declarations below.
    known_sites_doc = "From the GATK resource bundle, passed to BaseRecalibrator as ``known_sites``"
    hg38_browser = "HG38: https://console.cloud.google.com/storage/browser/genomics-public-data/references/hg38/v0/\n\n"
    hg38_file_prefix = "File: gs://genomics-public-data/references/hg38/v0/"
    uncompressed_warning = "(WARNING: The file available from the genomics-public-data resource on Google Cloud Storage is NOT compressed and indexed. This will need to be completed prior to starting the pipeline.\n\n"
    samples = ("normal", "tumor")

    # --- Inputs -----------------------------------------------------------
    for sample in samples:
        self.input(
            sample + "_inputs",
            Array(FastqGzPair),
            doc=InputDocumentation(
                "An array of "
                + sample.upper()
                + " FastqGz pairs. These are aligned separately and merged to create higher depth coverages from multiple sets of reads",
                quality=InputQualityType.user,
                example='["{0}_R1.fastq.gz", "{0}_R2.fastq.gz"]'.format(sample),
            ),
        )
    for sample in samples:
        self.input(
            sample + "_name",
            String(),
            doc=InputDocumentation(
                "Sample name for the "
                + sample.upper()
                + " sample from which to generate the readGroupHeaderLine for BwaMem",
                quality=InputQualityType.user,
                example="NA24385_" + sample,
            ),
        )

    adapters_doc = InputDocumentation(
        "Specifies a containment list for cutadapt, which contains a list of sequences to determine valid overrepresented sequences from "
        "the FastQC report to trim with Cuatadapt. The file must contain sets of named adapters in the form: "
        "``name[tab]sequence``. Lines prefixed with a hash will be ignored.",
        quality=InputQualityType.static,
        example="https://github.com/csf-ngs/fastqc/blob/master/Contaminants/contaminant_list.txt",
    )
    self.input("cutadapt_adapters", File(optional=True), doc=adapters_doc)

    intervals_doc = InputDocumentation(
        "List of intervals over which to split the GATK variant calling",
        quality=InputQualityType.static,
        example="BRCA1.bed",
    )
    self.input("gatk_intervals", Array(Bed), doc=intervals_doc)

    reference_doc = InputDocumentation(
        """\
The reference genome from which to align the reads. This requires a number indexes (can be generated \
with the 'IndexFasta' pipeline This pipeline has been tested using the HG38 reference set.

This pipeline expects the assembly references to be as they appear in the GCP example:

- (".fai", ".amb", ".ann", ".bwt", ".pac", ".sa", "^.dict").""",
        quality=InputQualityType.static,
        example=hg38_browser + hg38_file_prefix + "Homo_sapiens_assembly38.fasta",
    )
    self.input("reference", FastaWithDict, doc=reference_doc)

    # The four known-sites inputs differ only in identifier and example file.
    for input_id, note, filename in (
        ("snps_dbsnp", uncompressed_warning, "Homo_sapiens_assembly38.dbsnp138.vcf.gz"),
        ("snps_1000gp", "", "1000G_phase1.snps.high_confidence.hg38.vcf.gz"),
        ("known_indels", "", "Homo_sapiens_assembly38.known_indels.vcf.gz"),
        ("mills_indels", "", "Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"),
    ):
        self.input(
            input_id,
            VcfTabix,
            doc=InputDocumentation(
                known_sites_doc,
                quality=InputQualityType.static,
                example=hg38_browser + note + hg38_file_prefix + filename,
            ),
        )

    # --- Per-sample processing (tumor is registered before normal) --------
    for step_id, reads, sample_name in (
        ("tumor", self.tumor_inputs, self.tumor_name),
        ("normal", self.normal_inputs, self.normal_name),
    ):
        self.step(
            step_id,
            self.process_subpipeline(
                reads=reads,
                sample_name=sample_name,
                reference=self.reference,
                cutadapt_adapters=self.cutadapt_adapters,
            ),
        )

    # --- Variant calling --------------------------------------------------
    somatic_caller = GatkSomaticVariantCaller_4_1_3(
        normal_bam=self.normal.out,
        tumor_bam=self.tumor.out,
        normal_name=self.normal_name,
        tumor_name=self.tumor_name,
        intervals=self.gatk_intervals,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
    )
    self.step("vc_gatk", somatic_caller, scatter="intervals")
    # The step node itself (not ``.out``) is passed as the gather source.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk))
    self.step("sorted", BcfToolsSort_1_9(vcf=self.vc_gatk_merge.out))

    # Outputs
    self.output(
        "normal_bam",
        source=self.normal.out,
        output_folder="bams",
        output_name=self.normal_name,
    )
    self.output(
        "tumor_bam",
        source=self.tumor.out,
        output_folder="bams",
        output_name=self.tumor_name,
    )
    self.output(
        "normal_report",
        source=self.normal.reports,
        output_folder="reports",
    )
    self.output(
        "tumor_report",
        source=self.tumor.reports,
        output_folder="reports",
    )
    self.output(
        "variants",
        source=self.sorted.out,
        output_folder="variants",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "variants_split",
        source=self.vc_gatk.out,
        output_folder=["variants", "byInterval"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
    """Register the GATK somatic variant-calling steps and their outputs.

    Reimplemented here because the individual steps are needed for the
    combine stage.

    :param normal_bam_source: source connected to the ``bam`` input of ``bqsr_normal``
    :param tumor_bam_source: source connected to the ``bam`` input of ``bqsr_tumor``
    """
    # Reuse the interval-generation step if an earlier call already added it.
    if "generate_gatk_intervals" not in self.step_nodes:
        interval_step = self.step(
            "generate_gatk_intervals",
            GenerateIntervalsByChromosome(reference=self.reference),
            # Only run when the ``gatk_intervals`` input is null.
            when=self.gatk_intervals.is_null(),
        )
    else:
        interval_step = self.generate_gatk_intervals

    # The user-supplied intervals are listed first, the generated ones second.
    intervals = FirstOperator([self.gatk_intervals, interval_step.out_regions])

    # Same recalibration workflow for both samples, scattered over the intervals.
    for label, bam_source in (
        ("normal", normal_bam_source),
        ("tumor", tumor_bam_source),
    ):
        self.step(
            "bqsr_" + label,
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=bam_source,
                reference=self.reference,
                intervals=intervals,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
            scatter="intervals",
        )

    somatic_caller = GatkSomaticVariantCaller_4_1_3(
        normal_bam=self.bqsr_normal.out,
        tumor_bam=self.bqsr_tumor.out,
        normal_name=self.normal_name,
        intervals=intervals,
        reference=self.reference,
        gnomad=self.gnomad,
        panel_of_normals=self.panel_of_normals,
    )
    self.step(
        "vc_gatk",
        somatic_caller,
        scatter=["intervals", "normal_bam", "tumor_bam"],
    )

    # Gather the per-interval VCFs, then compress -> sort -> uncompress.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step("vc_gatk_compress_for_sort", BGZipLatest(file=self.vc_gatk_merge.out))
    compressed_vcf = self.vc_gatk_compress_for_sort.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompressvcf",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )

    # VCF
    merged_name = StringFormatter(
        "{tumor_name}--{normal_name}_gatk",
        tumor_name=self.tumor_name,
        normal_name=self.normal_name,
    )
    self.output(
        "out_variants_gatk",
        source=self.vc_gatk_sort_combined.out,
        output_folder=["vcf"],
        output_name=merged_name,
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "out_variants_split",
        source=self.vc_gatk.out,
        output_folder=["vcf", "GATKByInterval"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
def constructor(self):
    """Declare the inputs, steps and outputs of the germline variants-only workflow.

    The ``bam`` input is fed directly to the GATK, Strelka and VarDict
    germline callers; their VCFs are combined and sorted.
    """
    known_sites_doc = "From the GATK resource bundle, passed to BaseRecalibrator as ``known_sites``"

    # --- Inputs -----------------------------------------------------------
    self.input(
        "sample_name",
        String,
        doc="Sample name from which to generate the readGroupHeaderLine for BwaMem",
    )
    self.input(
        "bam",
        BamBai,
        # The previous doc described an array of FastqGz pairs, copied from a
        # different workflow; this input is a single indexed BAM.
        doc="Aligned and indexed BAM file on which to call variants",
    )
    self.input(
        "reference",
        FastaWithDict,
        doc="The reference genome from which to align the reads. This requires a number indexes (can be generated with the 'IndexFasta' pipeline. This pipeline has been tested with the hg38 reference genome.",
    )
    # NOTE(review): not referenced by any step in this constructor.
    self.input(
        "cutadapt_adapters",
        File(optional=True),
        doc="Specifies a file which contains a list of sequences to determine valid overrepresented sequences from the FastQC report to trim with Cuatadapt. The file must contain sets of named adapters in the form: ``name[tab]sequence``. Lines prefixed with a hash will be ignored.",
    )
    self.input(
        "gatk_intervals",
        Array(Bed),
        doc="List of intervals over which to split the GATK variant calling",
    )
    self.input(
        "vardict_intervals",
        Array(Bed),
        doc="List of intervals over which to split the VarDict variant calling",
    )
    self.input(
        "strelka_intervals",
        BedTabix,
        doc="An interval for which to restrict the analysis to. Recommended HG38 interval: ",
    )
    self.input(
        "header_lines",
        File(optional=True),
        doc="Header lines passed to BCFTools annotate as ``--header-lines``.",
    )
    self.input(
        "allele_freq_threshold",
        Float,
        default=0.05,
        doc="The threshold for VarDict's allele frequency, default: 0.05 or 5%",
    )
    for known_sites_input in (
        "snps_dbsnp",
        "snps_1000gp",
        "known_indels",
        "mills_indels",
    ):
        self.input(known_sites_input, VcfTabix, doc=known_sites_doc)

    # VARIANT CALLERS

    # GATK
    self.step(
        "vc_gatk",
        GatkGermlineVariantCaller_4_1_3(
            bam=self.bam,
            intervals=self.gatk_intervals,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        ),
        scatter="intervals",
    )
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))

    # Strelka
    self.step(
        "vc_strelka",
        IlluminaGermlineVariantCaller(
            bam=self.bam,
            reference=self.reference,
            intervals=self.strelka_intervals,
        ),
    )

    # Vardict
    self.step(
        "vc_vardict",
        VardictGermlineVariantCaller(
            bam=self.bam,
            reference=self.reference,
            intervals=self.vardict_intervals,
            sample_name=self.sample_name,
            allele_freq_threshold=self.allele_freq_threshold,
            header_lines=self.header_lines,
        ),
        scatter="intervals",
    )
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))

    # NOTE: GRIDSS is not part of this workflow; neither a GRIDSS input nor a
    # GRIDSS step is declared, and its VCF is not combined below.

    # Combine
    self.step(
        "combine_variants",
        CombineVariants_0_0_4(
            vcfs=[
                self.vc_gatk_merge.out,
                self.vc_strelka.out,
                self.vc_vardict_merge.out,
            ],
            type="germline",
            columns=["AC", "AN", "AF", "AD", "DP", "GT"],
        ),
    )
    self.step("sort_combined", BcfToolsSort_1_9(vcf=self.combine_variants.vcf))

    # Outputs
    self.output(
        "variants_combined",
        source=self.sort_combined.out,
        output_folder="variants",
        doc="Combined variants from all 3 callers",
    )
    self.output(
        "variants_gatk",
        source=self.vc_gatk_merge.out,
        output_folder="variants",
        output_name="gatk",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "variants_vardict",
        source=self.vc_vardict_merge.out,
        output_folder=["variants"],
        output_name="vardict",
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "variants_strelka",
        source=self.vc_strelka.out,
        output_folder="variants",
        output_name="strelka",
        doc="Variants from the Strelka variant caller",
    )
    self.output(
        "variants_gatk_split",
        source=self.vc_gatk.out,
        output_folder=["variants", "gatk"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
    self.output(
        "variants_vardict_split",
        source=self.vc_vardict.out,
        # Per-caller sub-folder, mirroring ["variants", "gatk"] above
        # (previously the duplicated ["variants", "variants"]).
        output_folder=["variants", "vardict"],
        doc="Unmerged variants from the VarDict caller (by interval)",
    )
def constructor(self):
    """Declare the inputs, steps and outputs of this workflow.

    The method registers, in order:

    * the user/static/configuration inputs (sample name, FASTQ pairs,
      reference, adapter list, per-caller intervals, known-sites VCFs);
    * the pre-processing steps ``fastqc`` -> ``getfastqc_adapters`` ->
      ``align_and_sort`` -> ``merge_and_mark``;
    * the three variant-calling branches (``vc_gatk``, ``vc_strelka``,
      ``vc_vardict``), with the scattered GATK and VarDict results gathered
      by ``vc_gatk_merge`` / ``vc_vardict_merge``;
    * ``combine_variants`` followed by ``sort_combined``;
    * the workflow outputs (reports, bam, combined and per-caller variants).

    Nothing is returned; the workflow is built through ``self.input``,
    ``self.step`` and ``self.output``.
    """
    # Shared prefixes of the HG38 example locations quoted in the input docs.
    hg38_browser = "HG38: https://console.cloud.google.com/storage/browser/genomics-public-data/references/hg38/v0/\n\n"
    hg38_file = "File: gs://genomics-public-data/references/hg38/v0/"

    # ------------------------------------------------------------------
    # INPUTS
    # ------------------------------------------------------------------
    self.input(
        "sample_name",
        String,
        doc=InputDocumentation(
            "Sample name from which to generate the readGroupHeaderLine for BwaMem",
            quality=InputQualityType.user,
            example="NA12878",
        ),
    )
    self.input(
        "fastqs",
        Array(FastqGzPair),
        doc=InputDocumentation(
            "An array of FastqGz pairs. These are aligned separately and merged "
            "to create higher depth coverages from multiple sets of reads",
            quality=InputQualityType.user,
            example="[[BRCA1_R1.fastq.gz, BRCA1_R2.fastq.gz]]",
        ),
    )
    self.input(
        "reference",
        FastaWithDict,
        doc=InputDocumentation(
            "The reference genome from which to align the reads. This requires a number indexes (can be generated "
            "with the 'IndexFasta' pipeline This pipeline has been tested using the HG38 reference set.\n"
            "\n"
            "This pipeline expects the assembly references to be as they appear in the GCP example:\n"
            "\n"
            '- (".fai", ".amb", ".ann", ".bwt", ".pac", ".sa", "^.dict").',
            quality=InputQualityType.static,
            example=hg38_browser + hg38_file + "Homo_sapiens_assembly38.fasta",
        ),
    )
    self.input(
        "cutadapt_adapters",
        File(optional=True),
        doc=InputDocumentation(
            "Specifies a containment list for cutadapt, which contains a list of sequences to determine valid overrepresented sequences from "
            "the FastQC report to trim with Cuatadapt. The file must contain sets of named adapters in the form: "
            "``name[tab]sequence``. Lines prefixed with a hash will be ignored.",
            quality=InputQualityType.static,
            example="https://github.com/csf-ngs/fastqc/blob/master/Contaminants/contaminant_list.txt",
        ),
    )
    self.input(
        "gatk_intervals",
        Array(Bed),
        doc=InputDocumentation(
            "List of intervals over which to split the GATK variant calling",
            quality=InputQualityType.static,
            example="BRCA1.bed",
        ),
    )
    self.input(
        "vardict_intervals",
        Array(Bed),
        doc=InputDocumentation(
            "List of intervals over which to split the VarDict variant calling",
            quality=InputQualityType.static,
            example="BRCA1.bed",
        ),
    )
    self.input(
        "strelka_intervals",
        BedTabix,
        doc=InputDocumentation(
            "An interval for which to restrict the analysis to.",
            quality=InputQualityType.static,
            example="BRCA1.bed.gz",
        ),
    )
    self.input(
        "allele_freq_threshold",
        Float,
        default=0.05,
        doc=InputDocumentation(
            "The threshold for VarDict's allele frequency, default: 0.05 or 5%",
            quality=InputQualityType.configuration,
            example=None,
        ),
    )
    # self.input("gridssBlacklist", Bed)

    # The four GATK known-sites resources share type, description and
    # quality; only the identifier and the example location differ.
    known_sites_doc = "From the GATK resource bundle, passed to BaseRecalibrator as ``known_sites``"
    known_sites_inputs = [
        (
            "snps_dbsnp",
            hg38_browser
            + "(WARNING: The file available from the genomics-public-data resource on Google Cloud Storage is NOT compressed and indexed. This will need to be completed prior to starting the pipeline.\n\n"
            + hg38_file
            + "Homo_sapiens_assembly38.dbsnp138.vcf.gz",
        ),
        (
            "snps_1000gp",
            hg38_browser + hg38_file + "1000G_phase1.snps.high_confidence.hg38.vcf.gz",
        ),
        (
            "known_indels",
            hg38_browser + hg38_file + "Homo_sapiens_assembly38.known_indels.vcf.gz",
        ),
        (
            "mills_indels",
            hg38_browser + hg38_file + "Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
        ),
    ]
    for identifier, example in known_sites_inputs:
        self.input(
            identifier,
            VcfTabix,
            doc=InputDocumentation(
                known_sites_doc,
                quality=InputQualityType.static,
                example=example,
            ),
        )

    # ------------------------------------------------------------------
    # STEPS
    # ------------------------------------------------------------------
    # FastQC runs once per FASTQ pair.
    self.step("fastqc", FastQC_0_11_5(reads=self.fastqs), scatter="reads")
    self.step(
        "getfastqc_adapters",
        ParseFastqcAdaptors(
            fastqc_datafiles=self.fastqc.datafile,
            cutadapt_adaptors_lookup=self.cutadapt_adapters,
        ),
        scatter="fastqc_datafiles",
    )
    # The FASTQ pairs and the adapters parsed for them are scattered
    # together; the same adapter source feeds both cutadapt options.
    self.step(
        "align_and_sort",
        BwaAligner(
            fastq=self.fastqs,
            reference=self.reference,
            sample_name=self.sample_name,
            sortsam_tmpDir="./tmp",
            cutadapt_adapter=self.getfastqc_adapters,
            cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
        ),
        scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
    )
    self.step(
        "merge_and_mark",
        MergeAndMarkBams_4_1_3(
            bams=self.align_and_sort.out,
            sampleName=self.sample_name,
        ),
    )

    # VARIANT CALLERS

    # GATK (scattered over gatk_intervals, then gathered)
    self.step(
        "vc_gatk",
        GatkGermlineVariantCaller_4_1_3(
            bam=self.merge_and_mark.out,
            intervals=self.gatk_intervals,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        ),
        scatter="intervals",
    )
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))

    # Strelka (single run, not scattered)
    self.step(
        "vc_strelka",
        IlluminaGermlineVariantCaller(
            bam=self.merge_and_mark.out,
            reference=self.reference,
            intervals=self.strelka_intervals,
        ),
    )

    # Vardict (scattered over vardict_intervals, then gathered)
    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictGermlineVariantCaller(
            bam=self.merge_and_mark.out,
            reference=self.reference,
            intervals=self.vardict_intervals,
            sample_name=self.sample_name,
            allele_freq_threshold=self.allele_freq_threshold,
            header_lines=self.generate_vardict_headerlines.out,
        ),
        scatter="intervals",
    )
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))

    # GRIDSS (currently disabled)
    # self.step(
    #     "vc_gridss",
    #     GridssGermlineVariantCaller(
    #         bam=self.merge_and_mark.out,
    #         reference=self.reference,
    #         blacklist=self.gridssBlacklist,
    #     ),
    # )

    # Combine
    self.step(
        "combine_variants",
        CombineVariants_0_0_4(
            vcfs=[
                self.vc_gatk_merge.out,
                self.vc_strelka.out,
                self.vc_vardict_merge.out,
                # self.vc_gridss.out,
            ],
            type="germline",
            columns=["AC", "AN", "AF", "AD", "DP", "GT"],
        ),
    )
    self.step("sort_combined", BcfToolsSort_1_9(vcf=self.combine_variants.vcf))

    # ------------------------------------------------------------------
    # OUTPUTS
    # ------------------------------------------------------------------
    self.output(
        "reports",
        source=self.fastqc.out,
        output_folder="reports",
        doc="A zip file of the FastQC quality report.",
    )
    self.output(
        "bam",
        source=self.merge_and_mark.out,
        output_folder="bams",
        doc="Aligned and indexed bam.",
        output_name=self.sample_name,
    )
    self.output(
        "variants",
        source=self.sort_combined.out,
        output_folder="variants",
        output_name=self.sample_name,
        doc="Combined variants from all 3 callers",
    )
    self.output(
        "variants_gatk",
        source=self.vc_gatk_merge.out,
        output_folder="variants",
        output_name="gatk",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "variants_vardict",
        source=self.vc_vardict_merge.out,
        output_folder=["variants"],
        output_name="vardict",
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "variants_strelka",
        source=self.vc_strelka.out,
        output_folder="variants",
        output_name="strelka",
        doc="Variants from the Strelka variant caller",
    )
    self.output(
        "variants_gatk_split",
        source=self.vc_gatk.out,
        output_folder=["variants", "gatk"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
    # Per-interval VarDict files get their own sub-folder, mirroring the
    # "gatk" sub-folder above (previously the duplicated name "variants").
    self.output(
        "variants_vardict_split",
        source=self.vc_vardict.out,
        output_folder=["variants", "vardict"],
        doc="Unmerged variants from the VarDict caller (by interval)",
    )