def GATKpreprocessing(infile, outfile):
    '''Prepare a BAM for variant calling in three chained steps.

    1. PipelineExome.GATKReadGroups adds read groups (library, platform
       and platform unit come from PARAMS["readgroup_*"]).
    2. PipelineExome.GATKIndelRealign realigns, using
       PARAMS["gatk_threads"].
    3. PipelineExome.GATKBaseRecal recalibrates using
       PARAMS["gatk_dbsnp"] and PARAMS["gatk_solid_options"].

    Arguments
    ---------
    infile : string
        Input alignment file, passed to the read-group step.
    outfile : string
        Final output path. The intermediate file names are derived from
        it by replacing ".bqsr" with ".readgroups.bqsr" and
        ".realign.bqsr".
    '''
    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # Intermediate products of steps 1 and 2.
    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])
    # Each intermediate is zapped only once the step consuming it is done.
    IOTools.zapFile(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zapFile(outfile2)
def runMutectReverse(infiles, outfile):
    '''Use control as tumour and vice versa to estimate false positive rate.

    Arguments
    ---------
    infiles : sequence
        Pair of (infile, normal_panel). The tumour file name is derived
        from ``infile`` by replacing PARAMS["sample_control"] with
        PARAMS["sample_tumour"].
    outfile : string
        Output VCF; its ".mutect.reverse.snp.vcf" suffix is snipped off
        to build the call-stats and log file names.
    '''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    # Side outputs share the prefix of the reverse-call VCF.
    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
         PARAMS["mutect_cosmic"],
         PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"],
         PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"],
         PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # Roles are swapped on purpose: the unmodified input goes first and
    # the derived tumour file goes last.
    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile_tumour)
def defineEBioStudies(outfile):
    '''Identify the eBio studies relevant to the configured cancer types.

    The cancer types are read from PARAMS["annotation_ebio_cancer_types"]
    (set in pipeline.ini) and handed, together with ``outfile``, to
    PipelineExome.defineEBioStudies, which is called with submit=False.
    '''
    ebio_cancer_types = PARAMS["annotation_ebio_cancer_types"]

    PipelineExome.defineEBioStudies(
        ebio_cancer_types,
        outfile,
        submit=False)
def runMutectOnDownsampled(infiles, outfile):
    '''Call somatic SNPs with MuTect on the downsampled BAMs.

    Arguments
    ---------
    infiles : sequence
        Pair of (infile, normal_panel). The tumour file name is obtained
        by replacing PARAMS["sample_control"] with
        PARAMS["sample_tumour"] in ``infile``.
    outfile : string
        Output VCF; its "_normal_mutect.vcf" suffix is snipped off to
        name the call-stats and log files.
    '''
    control_path, panel_of_normals = infiles
    tumour_path = control_path.replace(PARAMS["sample_control"],
                                       PARAMS["sample_tumour"])

    prefix = P.snip(outfile, "_normal_mutect.vcf")
    stats_path = prefix + "_call_stats.out"
    log_path = prefix + ".log"

    # MuTect settings, read in a fixed order from the configuration.
    cosmic_ref = PARAMS["mutect_cosmic"]
    dbsnp_ref = PARAMS["gatk_dbsnp"]
    base_quality = PARAMS["mutect_quality"]
    alt_qual_cap = PARAMS["mutect_max_alt_qual"]
    alt_cap = PARAMS["mutect_max_alt"]
    fraction_cap = PARAMS["mutect_max_fraction"]
    lod_threshold = PARAMS["mutect_LOD"]

    reference = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # The derived tumour file goes first, the unmodified input last.
    PipelineExome.mutectSNPCaller(
        tumour_path, outfile, log_path, reference,
        cosmic_ref, dbsnp_ref, stats_path,
        PARAMS['mutect_memory'], PARAMS['mutect_threads'],
        base_quality, alt_qual_cap, alt_cap, fraction_cap, lod_threshold,
        panel_of_normals, control_path)
def runMutectReverse(infiles, outfile):
    '''Use control as tumour and vice versa to estimate false positive rate.

    Arguments
    ---------
    infiles : sequence
        Pair of (infile, normal_panel). The tumour file name is derived
        from ``infile`` by replacing PARAMS["sample_control"] with
        PARAMS["sample_tumour"].
    outfile : string
        Output VCF; its ".mutect.reverse.snp.vcf" suffix is snipped off
        to build the call-stats and log file names.
    '''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(PARAMS["sample_control"],
                                   PARAMS["sample_tumour"])

    # Side outputs share the prefix of the reverse-call VCF.
    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (PARAMS["mutect_cosmic"],
                                 PARAMS["gatk_dbsnp"],
                                 PARAMS["mutect_quality"],
                                 PARAMS["mutect_max_alt_qual"],
                                 PARAMS["mutect_max_alt"],
                                 PARAMS["mutect_max_fraction"],
                                 PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # Roles are swapped on purpose: the unmodified input goes first and
    # the derived tumour file goes last.
    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile_tumour)
def extractEBioinfo(infiles, outfile):
    '''Count previously reported mutations for the mutated genes.

    For the genes mutated in the annotated VCFs, look up the number of
    mutations identified in previous studies (the eBio ids).

    Arguments
    ---------
    infiles : sequence
        First element is the eBio ids input; all remaining elements are
        the annotated VCFs.
    outfile : string
        Output path passed to PipelineExome.extractEBioinfo, which is
        called with submit=False.
    '''
    study_ids, annotated_vcfs = infiles[0], infiles[1:]

    PipelineExome.extractEBioinfo(
        study_ids, annotated_vcfs, outfile, submit=False)
def mutationalSignature(infiles, outfiles):
    '''Compile the mutational signature using the configured filters.

    Reads the four "filter_*" thresholds and PARAMS["mutect_tumour"],
    then delegates to PipelineExome.compileMutationalSignature with
    submit=True.
    '''
    (tumour_alt_min,
     tumour_alt_freq_min,
     normal_depth_min,
     normal_alt_freq_max,
     tumour_label) = (
         PARAMS["filter_minimum_tumor_allele"],
         PARAMS["filter_minimum_tumor_allele_frequency"],
         PARAMS["filter_minimum_normal_depth"],
         PARAMS["filter_maximum_normal_allele_frequency"],
         PARAMS["mutect_tumour"])

    # Note the argument order expected by the helper: the tumour allele
    # frequency threshold comes after the two normal-sample thresholds.
    PipelineExome.compileMutationalSignature(
        infiles, outfiles,
        tumour_alt_min, normal_depth_min,
        normal_alt_freq_max, tumour_alt_freq_min,
        tumour_label, submit=True)
def indelCaller(infile, outfile):
    '''Call somatic indels using Strelka.

    Arguments
    ---------
    infile : string
        Input file; the tumour file name is obtained by replacing the
        literal "Control" with PARAMS["mutect_tumour"].
    outfile : string
        Output path. Its first two "/"-separated components form the
        output directory handed to Strelka.
    '''
    tumour_path = infile.replace("Control", PARAMS["mutect_tumour"])

    leading_parts = outfile.split("/")[:2]
    strelka_dir = "/".join(leading_parts)

    reference = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])
    # This variant uses a fixed configuration file name.
    strelka_config = "config.ini"

    PipelineExome.strelkaINDELCaller(
        infile, tumour_path, outfile, reference, strelka_config,
        strelka_dir,
        PARAMS['strelka_memory'], PARAMS['strelka_threads'])
def indelCaller(infile, outfile):
    '''Call somatic indels using Strelka.

    Arguments
    ---------
    infile : string
        Input file; the tumour file name is obtained by replacing
        PARAMS["sample_control"] with PARAMS["sample_tumour"].
    outfile : string
        Output path. Its first two "/"-separated components form the
        output directory handed to Strelka.
    '''
    tumour_path = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    leading_parts = outfile.split("/")[:2]
    strelka_dir = "/".join(leading_parts)

    reference = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # The Strelka configuration file is taken from PARAMS.
    PipelineExome.strelkaINDELCaller(
        infile, tumour_path, outfile, reference,
        PARAMS['strelka_config'], strelka_dir,
        PARAMS['strelka_memory'], PARAMS['strelka_threads'])
def filterMutect(infile, outfile):
    '''Filter MuTect SNP calls using allele frequencies.

    The log file sits next to ``outfile`` (".vcf" replaced by ".log").
    The thresholds come from the "filter_*" entries of PARAMS and the
    sample labels from PARAMS["sample_control"] and
    PARAMS["sample_tumour"].
    '''
    log_path = outfile.replace(".vcf", ".log")

    (tumour_alt_min,
     tumour_alt_freq_min,
     normal_depth_min,
     normal_alt_freq_max,
     ratio_min) = (
         PARAMS["filter_minimum_tumor_allele"],
         PARAMS["filter_minimum_tumor_allele_frequency"],
         PARAMS["filter_minimum_normal_depth"],
         PARAMS["filter_maximum_normal_allele_frequency"],
         PARAMS["filter_minimum_ratio"])

    # The helper takes the tumour allele frequency threshold after the
    # two normal-sample thresholds.
    PipelineExome.filterMutect(
        infile, outfile, log_path,
        PARAMS["sample_control"], PARAMS["sample_tumour"],
        tumour_alt_min, normal_depth_min, normal_alt_freq_max,
        tumour_alt_freq_min, ratio_min)
def filterMutect(infile, outfile):
    '''Apply allele-frequency filters to the MuTect SNP calls.

    Arguments
    ---------
    infile : string
        Input file handed to PipelineExome.filterMutect.
    outfile : string
        Filtered output; the log file name is this path with ".vcf"
        replaced by ".log".
    '''
    filter_log = outfile.replace(".vcf", ".log")

    # Thresholds from the "filter_*" section of the configuration.
    t_alt_floor = PARAMS["filter_minimum_tumor_allele"]
    t_alt_freq_floor = PARAMS["filter_minimum_tumor_allele_frequency"]
    n_depth_floor = PARAMS["filter_minimum_normal_depth"]
    n_alt_freq_ceiling = PARAMS["filter_maximum_normal_allele_frequency"]
    ratio_floor = PARAMS["filter_minimum_ratio"]

    PipelineExome.filterMutect(infile,
                               outfile,
                               filter_log,
                               PARAMS["sample_control"],
                               PARAMS["sample_tumour"],
                               t_alt_floor,
                               n_depth_floor,
                               n_alt_freq_ceiling,
                               t_alt_freq_floor,
                               ratio_floor)
def callControlVariants(infile, outfile):
    '''Run MuTect to call SNPs in the control sample.

    The call-stats and log file names are built from ``outfile`` with
    its "_normal_mutect.vcf" suffix snipped off.
    PipelineExome.mutectSNPCaller is invoked with artifact=True.
    '''
    stem = P.snip(outfile, "_normal_mutect.vcf")
    stats_path = stem + "_call_stats.out"
    log_path = stem + ".log"

    cosmic_ref = PARAMS["mutect_cosmic"]
    dbsnp_ref = PARAMS["gatk_dbsnp"]

    reference = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.mutectSNPCaller(
        infile, outfile, log_path, reference,
        cosmic_ref, dbsnp_ref, stats_path,
        PARAMS['mutect_memory'], PARAMS['mutect_threads'],
        artifact=True)
def realignMatchedSample(infile, outfile):
    '''Repeat the realignment on the merged control/tumour BAM.

    Realigning the merged file should help avoid problems with
    sample-specific realignments. The input is zapped once the
    realignment call has returned.
    '''
    index_dir = PARAMS["bwa_index_dir"]
    genome_name = PARAMS["genome"]
    reference = "%s/%s.fa" % (index_dir, genome_name)

    # NOTE(review): no thread count is passed here, unlike other callers
    # of GATKIndelRealign - confirm the helper has a default for it.
    PipelineExome.GATKIndelRealign(infile, outfile, reference)

    IOTools.zapFile(infile)
def runMutectOnDownsampled(infiles, outfile):
    '''Run MuTect somatic SNP calling on the downsampled BAMs.

    Arguments
    ---------
    infiles : sequence
        Pair of (infile, normal_panel). Replacing
        PARAMS["sample_control"] with PARAMS["sample_tumour"] in
        ``infile`` gives the tumour file name.
    outfile : string
        Output VCF; the call-stats and log names reuse its prefix
        (the part before "_normal_mutect.vcf").
    '''
    source_path, normals = infiles
    tumour_path = source_path.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    stem = P.snip(outfile, "_normal_mutect.vcf")
    stats_file = stem + "_call_stats.out"
    log_file = stem + ".log"

    (cosmic_ref, dbsnp_ref, min_quality, alt_qual_limit,
     alt_limit, fraction_limit, lod_cutoff) = (
         PARAMS["mutect_cosmic"],
         PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"],
         PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"],
         PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    index_dir = PARAMS["bwa_index_dir"]
    genome_name = PARAMS["genome"]
    reference = "%s/%s.fa" % (index_dir, genome_name)

    # The derived tumour file is the first argument; the unmodified
    # input is passed last.
    PipelineExome.mutectSNPCaller(tumour_path, outfile, log_file,
                                  reference, cosmic_ref, dbsnp_ref,
                                  stats_file,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  min_quality, alt_qual_limit,
                                  alt_limit, fraction_limit, lod_cutoff,
                                  normals, source_path)
def intersectHeatmap(infiles, outfile):
    '''Build the intersection heatmap for the input files.

    Thin wrapper around PipelineExome.intersectionHeatmap.

    NOTE(review): the previous docstring read "intersect DE test_ids
    across the different quantifiers"; confirm that wording applies to
    this pipeline.
    '''
    PipelineExome.intersectionHeatmap(
        infiles,
        outfile)
def mutationalSignature(infiles, outfiles):
    '''Compile the mutational signature for the input files.

    Thin wrapper: both arguments are forwarded unchanged to
    PipelineExome.compileMutationalSignature.
    '''
    PipelineExome.compileMutationalSignature(infiles, outfiles)
def summariseFiltering(infile, outfile):
    '''Summarise filtering from the MuTect call-stats file.

    The call-stats file name is derived from ``infile`` by replacing
    ".mutect.snp.vcf" with "_call_stats.out"; it is then parsed by
    PipelineExome.parseMutectCallStats (called with submit=True).
    '''
    call_stats_path = infile.replace(".mutect.snp.vcf", "_call_stats.out")

    PipelineExome.parseMutectCallStats(
        call_stats_path, outfile, submit=True)
def mutationalSignature(infiles, outfiles):
    '''Delegate mutational signature compilation to PipelineExome.

    ``infiles`` and ``outfiles`` are passed through as-is to
    PipelineExome.compileMutationalSignature.
    '''
    PipelineExome.compileMutationalSignature(
        infiles,
        outfiles)