def picard_calculate_hs_metrics(self):
    """
    Compute on target percent of hybridisation based capture.
    """

    jobs = []

    created_interval_lists = []
    for sample in self.samples:
        coverage_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])
        if coverage_bed:
            interval_list = re.sub(r"\.[^.]+$", ".interval_list", coverage_bed)

            # Create the interval list only once per distinct coverage BED,
            # even when several samples share the same capture design
            if interval_list not in created_interval_lists:
                job = tools.bed2interval_list(None, coverage_bed, interval_list)
                job.name = "interval_list." + os.path.basename(coverage_bed)
                jobs.append(job)
                created_interval_lists.append(interval_list)

            input_file_prefix = os.path.join("alignment", sample.name, sample.name + ".matefixed.sorted.")
            job = picard.calculate_hs_metrics(input_file_prefix + "bam", input_file_prefix + "onTarget.tsv", interval_list)
            job.name = "picard_calculate_hs_metrics." + sample.name
            job.samples = [sample]
            jobs.append(job)
    return jobs
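# Illustrative sketch (not part of the pipeline, standard library only): how the
# interval_list path is derived above. re.sub(r"\.[^.]+$", ...) swaps the last
# file extension, so every distinct capture BED yields exactly one
# ".interval_list" sibling, which is why created_interval_lists can deduplicate
# the bed2interval_list jobs by path. The example path is hypothetical.
import re

assert re.sub(r"\.[^.]+$", ".interval_list", "targets/capture.bed") == "targets/capture.interval_list"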
def metrics(self):
    """
    Compute metrics and generate coverage tracks per sample.

    Multiple metrics are computed at this stage:
    number of raw reads, number of filtered reads, number of aligned reads, number of duplicate reads,
    median, mean and standard deviation of insert sizes of reads after alignment,
    and whole genome or targeted percentage of bases covered at X reads
    (%_bases_above_50 means the % of exon bases which have at least 50 reads).
    A TDF (.tdf) coverage track is also generated at this step
    for easy visualization of coverage in the IGV browser.
    """

    jobs = []
    for sample in self.samples:
        input_file_prefix = os.path.join("alignment", sample.name, sample.name + ".matefixed.sorted.")
        input = input_file_prefix + "bam"

        job = picard.collect_multiple_metrics(input, input_file_prefix + "all.metrics")
        job.name = "picard_collect_multiple_metrics." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome or target coverage with BVATools
        job = bvatools.depth_of_coverage(
            input,
            input_file_prefix + "coverage.tsv",
            bvatools.resolve_readset_coverage_bed(sample.readsets[0]),
            other_options=config.param('bvatools_depth_of_coverage', 'other_options', required=False)
        )
        job.name = "bvatools_depth_of_coverage." + sample.name
        job.samples = [sample]
        jobs.append(job)

        job = igvtools.compute_tdf(input, input + ".tdf")
        job.name = "igvtools_compute_tdf." + sample.name
        job.samples = [sample]
        jobs.append(job)
    return jobs
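# Minimal sketch (hypothetical classes, not the framework's actual API): each
# step returns a flat list of job objects, and the scheduler is assumed to
# chain jobs by matching one job's output files to another's input files, so
# the metrics jobs above implicitly depend on the alignment step that produced
# the ".matefixed.sorted.bam".
class SketchJob(object):
    def __init__(self, name, inputs, outputs):
        self.name, self.inputs, self.outputs = name, inputs, outputs

def depends_on(downstream, upstream):
    # A job depends on another if it consumes any file the other produces.
    return bool(set(downstream.inputs) & set(upstream.outputs))

align = SketchJob("align.S1", ["S1.fastq"], ["alignment/S1/S1.matefixed.sorted.bam"])
multiple_metrics = SketchJob("picard_collect_multiple_metrics.S1",
                             ["alignment/S1/S1.matefixed.sorted.bam"],
                             ["alignment/S1/S1.matefixed.sorted.all.metrics"])
assert depends_on(multiple_metrics, align)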
def call_variants(self):
    """
    VarScan caller for insertions and deletions.
    """

    jobs = []

    nb_jobs = config.param('varscan', 'nb_jobs', type='posint')
    if nb_jobs > 50:
        log.warning("Number of VarScan jobs is > 50. This is usually too many: anything beyond 20 can be problematic.")

    variants_directory = os.path.join("variants")
    varscan_directory = os.path.join(variants_directory, "rawVarScan")

    beds = []
    for idx in range(nb_jobs):
        beds.append(os.path.join(varscan_directory, 'chrs.' + str(idx) + '.bed'))

    genome_dictionary = config.param('DEFAULT', 'genome_dictionary', type='filepath')

    if nb_jobs > 1:
        bedJob = tools.dict2beds(genome_dictionary, beds)
        jobs.append(concat_jobs([Job(command="mkdir -p " + varscan_directory), bedJob], name="varscan.genome.beds"))

    bams = []
    sampleNamesFile = 'varscan_samples.tsv'
    sampleNames = open(sampleNamesFile, 'w')
    for sample in self.samples:
        alignment_directory = os.path.join("alignment", sample.name)
        input = os.path.join(alignment_directory, sample.name + ".matefixed.sorted.bam")
        bams.append(input)
        sampleNames.write("%s\n" % sample.name)
        # The last sample's readset BED is reused as the mpileup region file below
        bedfile = bvatools.resolve_readset_coverage_bed(sample.readsets[0])
    sampleNames.close()

    if nb_jobs == 1:
        job = concat_jobs([
            Job(command="mkdir -p " + varscan_directory, samples=self.samples),
            pipe_jobs([
                samtools.mpileup(bams, None, config.param('varscan', 'mpileup_other_options'), regionFile=bedfile),
                varscan.mpileupcns(None, None, sampleNamesFile, config.param('varscan', 'other_options')),
                htslib.bgzip_tabix_vcf(None, os.path.join(variants_directory, "allSamples.vcf.gz"))
            ])
        ], name="varscan.single")
        jobs.append(job)

    else:
        output_vcfs = []
        for idx in range(nb_jobs):
            output_vcf = os.path.join(varscan_directory, "allSamples." + str(idx) + ".vcf.gz")
            varScanJob = pipe_jobs([
                samtools.mpileup(bams, None, config.param('varscan', 'mpileup_other_options'), regionFile=beds[idx]),
                varscan.mpileupcns(None, None, sampleNamesFile, config.param('varscan', 'other_options')),
                htslib.bgzip_tabix_vcf(None, output_vcf)
            ], name="varscan." + str(idx))
            varScanJob.samples = self.samples
            output_vcfs.append(output_vcf)
            jobs.append(varScanJob)

        job = gatk.cat_variants(output_vcfs, os.path.join(variants_directory, "allSamples.vcf.gz"))
        job.name = "gatk_cat_varscan"
        job.samples = self.samples
        jobs.append(job)
    return jobs
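# Illustrative sketch (hypothetical data, standard library only) of the
# scatter/gather pattern above: tools.dict2beds is assumed to split the genome
# into nb_jobs BED slices of roughly equal total length, each VarScan job
# pileups one slice, and gatk.cat_variants concatenates the per-slice VCFs.
def split_contigs(contig_sizes, nb_jobs):
    """Greedily pack contigs into nb_jobs bins of roughly equal total size."""
    bins = [[] for _ in range(nb_jobs)]
    totals = [0] * nb_jobs
    for name, size in sorted(contig_sizes.items(), key=lambda kv: -kv[1]):
        i = totals.index(min(totals))  # put the next-largest contig in the lightest bin
        bins[i].append(name)
        totals[i] += size
    return bins

# Sizes in Mb, hypothetical subset of a genome dictionary:
print(split_contigs({"chr1": 249, "chr2": 243, "chr3": 198, "chrM": 0.02}, 2))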
def ihec_sample_metrics_report(self):
    """
    Retrieve the computed metrics which fit the IHEC standards
    and build a tsv report table for IHEC.
    """

    jobs = []

    target_bed = bvatools.resolve_readset_coverage_bed(self.samples[0].readsets[0])
    metrics_all_file = os.path.join("metrics", "IHEC.sampleMetrics.stats")
    report_metrics_file = os.path.join("report", "IHEC.sampleMetricsTable.tsv")
    if target_bed:
        report_file = os.path.join("report", "MethylSeq.ihec_sample_metrics_targeted_report.md")
    else:
        report_file = os.path.join("report", "MethylSeq.ihec_sample_metrics_report.md")

    # Create the list of input files to handle job dependencies
    inputs = []
    sample_list = []
    counter = 0
    for sample in self.samples:
        sample_list.append(sample.name)
        metrics_file = os.path.join("ihec_metrics", sample.name + ".read_stats.txt")

        # Trim log files
        for readset in sample.readsets:
            inputs.append(os.path.join("trim", sample.name, readset.name + ".trim.log"))

        # Aligned pre-deduplicated bam files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.bam"))

        # Deduplicated bam files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.bam"))

        # Coverage summary files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.all.coverage.sample_summary"))

        # Filtered reads count files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.filtered_reads.counts.txt"))

        # GC bias files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.GCBias_all.txt"))

        # Bismark alignment files
        for readset in sample.readsets:
            inputs.append(os.path.join("alignment", sample.name, readset.name, readset.name + ".sorted_noRG_bismark_bt2_PE_report.txt"))

        # CpG coverage files
        inputs.append(os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.median_CpG_coverage.txt"))

        # pUC19 methylation files
        inputs.append(os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.profile.pUC19.txt"))

        # Lambda conversion rate files
        [lambda_conv_file] = self.select_input_files([
            [os.path.join("methylation_call", sample.name, sample.name + ".sorted.dedup.profile.lambda.conversion.rate.tsv")],
            [os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.profile.lambda.conversion.rate.tsv")]
        ])
        inputs.append(lambda_conv_file)

        # CG stat files
        [cgstats_file] = self.select_input_files([
            [os.path.join("methylation_call", sample.name, sample.name + ".sorted.dedup.profile.cgstats.txt")],
            [os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.profile.cgstats.txt")]
        ])
        inputs.append(cgstats_file)

        # Flagstat file if in targeted context
        if target_bed:
            inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.ontarget.bam.flagstat"))

        jobs.append(
            concat_jobs([
                Job(command="mkdir -p ihec_metrics metrics"),
                tools.methylseq_ihec_metrics_report(sample.name, inputs, metrics_file, metrics_all_file, target_bed, counter),
            ], name=sample.name + ".ihec_sample_metrics_report")
        )
        counter += 1

    jobs.append(
        concat_jobs([
            Job(command="mkdir -p metrics"),
            Job(
                [metrics_all_file],
                [report_file],
                [['ihec_sample_metrics_report', 'module_pandoc']],
                command="""\
mkdir -p report && \\
cp {metrics_all_file} {report_metrics_file} && \\
metrics_table_md=`sed 's/\t/|/g' {report_metrics_file}`
pandoc \\
  {report_template_dir}/{basename_report_file} \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable sequence_alignment_table="$metrics_table_md" \\
  --to markdown \\
  > {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    metrics_all_file=metrics_all_file,
                    basename_report_file=os.path.basename(report_file),
                    report_metrics_file=report_metrics_file,
                    report_file=report_file
                ),
                report_files=[report_file]
            )
        ], name="ihec_sample_metrics_report")
    )
    return jobs
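# Sketch of what the embedded shell command does (illustrative, standard
# library only): `sed 's/\t/|/g'` turns the tab-separated metrics table into
# pipe-separated rows, which pandoc then renders as a markdown table through
# the sequence_alignment_table template variable. The sample row is hypothetical.
row = "sample1\t12000000\t98.2\t0.85"
print(row.replace("\t", "|"))  # -> sample1|12000000|98.2|0.85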
def all_sample_metrics_report(self):
    """
    Retrieve all the computed metrics (alignment metrics as well as methylation metrics)
    to build a tsv report table.
    """

    jobs = []

    target_bed = bvatools.resolve_readset_coverage_bed(self.samples[0].readsets[0])
    metrics_file = os.path.join("metrics", "sampleMetrics.stats")
    report_metrics_file = os.path.join("report", "sampleMetricsTable.tsv")
    if target_bed:
        report_file = os.path.join("report", "MethylSeq.all_sample_metrics_targeted_report.md")
    else:
        report_file = os.path.join("report", "MethylSeq.all_sample_metrics_report.md")

    # Create the list of input files to handle job dependencies
    inputs = []
    sample_list = []
    for sample in self.samples:
        sample_list.append(sample.name)

        # Trim log files
        for readset in sample.readsets:
            inputs.append(os.path.join("trim", sample.name, readset.name + ".trim.log"))

        # Aligned pre-deduplicated bam files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.bam"))

        # Deduplicated bam files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.bam"))

        # Coverage summary files
        inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.all.coverage.sample_summary"))

        # Lambda conversion rate files
        [lambda_conv_file] = self.select_input_files([
            [os.path.join("methylation_call", sample.name, sample.name + ".sorted.dedup.profile.lambda.conversion.rate.tsv")],
            [os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.profile.lambda.conversion.rate.tsv")]
        ])
        inputs.append(lambda_conv_file)

        # CG stat files
        [cgstats_file] = self.select_input_files([
            [os.path.join("methylation_call", sample.name, sample.name + ".sorted.dedup.profile.cgstats.txt")],
            [os.path.join("methylation_call", sample.name, sample.name + ".readset_sorted.dedup.profile.cgstats.txt")]
        ])
        inputs.append(cgstats_file)

        # Flagstat file if in targeted context
        if target_bed:
            inputs.append(os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.ontarget.bam.flagstat"))

    jobs.append(
        concat_jobs([
            Job(command="mkdir -p metrics"),
            tools.methylseq_metrics_report(sample_list, inputs, metrics_file, target_bed),
            Job(
                [metrics_file],
                [report_file],
                [['all_sample_metrics_report', 'module_pandoc']],
                command="""\
mkdir -p report && \\
cp {metrics_file} {report_metrics_file} && \\
metrics_table_md=`sed 's/\t/|/g' {report_metrics_file}`
pandoc \\
  {report_template_dir}/{basename_report_file} \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable sequence_alignment_table="$metrics_table_md" \\
  --to markdown \\
  > {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    metrics_file=metrics_file,
                    basename_report_file=os.path.basename(report_file),
                    report_metrics_file=report_metrics_file,
                    report_file=report_file
                ),
                report_files=[report_file]
            )
        ], name="all_sample_metrics_report")
    )
    return jobs
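# Sketch of the assumed behaviour of self.select_input_files (hypothetical
# re-implementation, standard library only): given ordered candidate groups,
# return the first group whose files all exist on disk, so steps transparently
# prefer e.g. ".sorted.dedup." outputs over ".readset_sorted.dedup." ones.
import os

def select_input_files_sketch(candidate_groups):
    for group in candidate_groups:
        if all(os.path.isfile(f) for f in group):
            return group
    raise ValueError("no candidate input group exists: %s" % candidate_groups)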
def methylation_profile(self):
    """
    Generation of a CpG methylation profile by combining both forward and reverse strand Cs.
    Also generates all the methylation metrics:
    CpG stats, pUC19 CpG stats, lambda conversion rate, median CpG coverage, GC bias.
    """

    jobs = []
    for sample in self.samples:
        methyl_directory = os.path.join("methylation_call", sample.name)
        candidate_input_files = [[os.path.join(methyl_directory, sample.name + ".sorted.dedup.CpG_report.txt.gz")]]
        candidate_input_files.append([os.path.join(methyl_directory, sample.name + ".readset_sorted.dedup.CpG_report.txt.gz")])
        [cpg_input_file] = self.select_input_files(candidate_input_files)
        cpg_profile = re.sub(".CpG_report.txt.gz", ".CpG_profile.strand.combined.csv", cpg_input_file)

        # Generate CpG methylation profile
        job = tools.bismark_combine(cpg_input_file, cpg_profile)
        job.name = "methylation_profile." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Generate stats for lambda, pUC19 and regular CpGs
        cg_stats_output = re.sub(".CpG_report.txt.gz", ".profile.cgstats.txt", cpg_input_file)
        lambda_stats_output = re.sub(".CpG_report.txt.gz", ".profile.lambda.conversion.rate.tsv", cpg_input_file)
        puc19_stats_output = re.sub(".CpG_report.txt.gz", ".profile.pUC19.txt", cpg_input_file)
        job = tools.cpg_stats(cpg_profile, cg_stats_output, lambda_stats_output, puc19_stats_output)
        job.name = "CpG_stats." + sample.name
        job.samples = [sample]
        jobs.append(job)

        target_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])
        if target_bed:
            # Create targeted combined file
            target_cpg_profile = re.sub("combined", "combined.on_target", cpg_profile)
            job = bedtools.intersect(cpg_profile, target_cpg_profile, target_bed, include_header=True)
            job.name = "extract_target_CpG_profile." + sample.name
            job.samples = [sample]
            jobs.append(job)
            cpg_profile = target_cpg_profile

        # Calculate median & mean CpG coverage
        median_CpG_coverage = re.sub(".CpG_report.txt.gz", ".median_CpG_coverage.txt", cpg_input_file)
        job = tools.cpg_cov_stats(cpg_profile, median_CpG_coverage)
        job.name = "median_CpG_coverage." + sample.name
        job.samples = [sample]
        if target_bed:
            job.removable_files = [target_cpg_profile]
        jobs.append(job)

    return jobs
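# Illustrative sketch (hypothetical counts): "combining both forward and
# reverse strand Cs" means summing the methylated/unmethylated calls of the
# two symmetric Cs of a CpG before computing the methylation level, which is
# what tools.bismark_combine is assumed to do per CpG site.
fwd = {"meth": 7, "unmeth": 3}   # C on the + strand
rev = {"meth": 5, "unmeth": 5}   # paired C on the - strand
meth = fwd["meth"] + rev["meth"]
total = meth + fwd["unmeth"] + rev["unmeth"]
print("combined methylation level: %.2f" % (meth / float(total)))  # 0.60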
def metrics(self):
    """
    Compute metrics and generate coverage tracks per sample.

    Multiple metrics are computed at this stage:
    number of raw reads, number of filtered reads, number of aligned reads, number of duplicate reads,
    median, mean and standard deviation of insert sizes of reads after alignment,
    and whole genome or targeted percentage of bases covered at X reads
    (%_bases_above_50 means the % of exon bases which have at least 50 reads).
    A TDF (.tdf) coverage track is also generated at this step
    for easy visualization of coverage in the IGV browser.
    """

    # Check the library status
    library, bam = {}, {}
    for readset in self.readsets:
        if readset.sample not in library:
            library[readset.sample] = "SINGLE_END"
        if readset.run_type == "PAIRED_END":
            library[readset.sample] = "PAIRED_END"
        if readset.sample not in bam:
            bam[readset.sample] = ""
        if readset.bam:
            bam[readset.sample] = readset.bam

    jobs = []
    created_interval_lists = []
    for sample in self.samples:
        file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.")
        coverage_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])

        candidate_input_files = [[file_prefix + "bam"]]
        if bam[sample]:
            candidate_input_files.append([bam[sample]])
        [input] = self.select_input_files(candidate_input_files)

        job = picard.collect_multiple_metrics(input, re.sub("bam", "all.metrics", input), library_type=library[sample])
        job.name = "picard_collect_multiple_metrics." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome coverage with GATK
        job = gatk.depth_of_coverage(input, re.sub("bam", "all.coverage", input), coverage_bed)
        job.name = "gatk_depth_of_coverage.genome." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome or target coverage with BVATools
        job = bvatools.depth_of_coverage(
            input,
            re.sub("bam", "coverage.tsv", input),
            coverage_bed,
            other_options=config.param('bvatools_depth_of_coverage', 'other_options', required=False)
        )
        job.name = "bvatools_depth_of_coverage." + sample.name
        job.samples = [sample]
        jobs.append(job)

        if coverage_bed:
            # Get on-target reads (if on-target context is detected)
            ontarget_bam = re.sub("bam", "ontarget.bam", input)
            flagstat_output = re.sub("bam", "bam.flagstat", ontarget_bam)
            job = concat_jobs([
                bedtools.intersect(input, ontarget_bam, coverage_bed),
                samtools.flagstat(ontarget_bam, flagstat_output)
            ])
            job.name = "ontarget_reads." + sample.name
            job.removable_files = [ontarget_bam]
            job.samples = [sample]
            jobs.append(job)

            # Compute on target percent of hybridisation based capture
            interval_list = re.sub(r"\.[^.]+$", ".interval_list", coverage_bed)
            if interval_list not in created_interval_lists:
                job = tools.bed2interval_list(None, coverage_bed, interval_list)
                job.name = "interval_list." + os.path.basename(coverage_bed)
                jobs.append(job)
                created_interval_lists.append(interval_list)

            file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.")
            job = picard.calculate_hs_metrics(file_prefix + "bam", file_prefix + "onTarget.tsv", interval_list)
            job.name = "picard_calculate_hs_metrics." + sample.name
            job.samples = [sample]
            jobs.append(job)

        # Calculate the number of reads with higher mapping quality than the threshold passed in the ini file
        job = concat_jobs([
            samtools.view(
                input,
                re.sub(".bam", ".filtered_reads.counts.txt", input),
                "-c " + config.param('mapping_quality_filter', 'quality_threshold')
            )
        ])
        job.name = "mapping_quality_filter." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Calculate GC bias
        # For captured analysis
        #if coverage_bed:
        #    target_input = re.sub(".bam", ".targeted.bam", input)
        #    job = concat_jobs([
        #        bedtools.intersect(input, target_input, coverage_bed),
        #        bedtools.coverage(target_input, re.sub(".bam", ".gc_cov.1M.txt", target_input)),
        #        metrics.gc_bias(
        #            re.sub(".bam", ".gc_cov.1M.txt", target_input),
        #            re.sub(".bam", ".GCBias_all.txt", target_input)
        #        )
        #    ])
        # Or for whole genome analysis
        #else:
        gc_content_file = re.sub(".bam", ".gc_cov.1M.txt", input)
        job = bedtools.coverage(input, gc_content_file, coverage_bed)
        if coverage_bed:
            gc_content_on_target_file = re.sub(".bam", ".gc_cov.1M.on_target.txt", input)
            gc_content_target_job = bedtools.intersect(gc_content_file, gc_content_on_target_file, coverage_bed)
            gc_content_file = gc_content_on_target_file
            job = concat_jobs([job, gc_content_target_job])
        job = concat_jobs([
            job,
            metrics.gc_bias(gc_content_file, re.sub(".bam", ".GCBias_all.txt", input))
        ])
        job.name = "GC_bias." + sample.name
        job.samples = [sample]
        jobs.append(job)

        job = igvtools.compute_tdf(input, input + ".tdf")
        job.name = "igvtools_compute_tdf." + sample.name
        job.samples = [sample]
        jobs.append(job)

    return jobs
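# Sketch (hypothetical records): the mapping_quality_filter step simply counts
# reads whose MAPQ clears the configured threshold, i.e. the equivalent of
# `samtools view -c -q <threshold> in.bam`.
mapqs = [0, 7, 23, 42, 11, 60]
threshold = 10
print(sum(1 for q in mapqs if q >= threshold))  # 4 reads pass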