def extract_sclip(self):
    """
    Build one soft-clip extraction job per sample.

    For every sample in `self.samples`, a single concatenated job named
    `extract_sclip_<sample name>` is assembled from these steps, in order:

    1. create the `sclip/<sample>` directory when it does not exist yet;
    2. the job returned by `self.get_job_max_insert_size(sample)`;
    3. `bvatools.extract_sclip` on `alignment/<sample>/<sample>.sorted.dup.bam`,
       passing the shell variable `$maxInsertSize`
       (presumably exported by step 2 -- confirm against that helper);
    4. `samtools.index` on `<prefix>.sc.bam` and `<prefix>.scOthers.bam`;
    5. `igvtools.compute_tdf` from `<prefix>.sc.bam` to `<prefix>.sc.tdf`.

    Returns the list of concatenated jobs, one per sample.
    """
    def build_job(sample):
        sample_name = sample.name
        bam_prefix = os.path.join("alignment", sample_name, sample_name + ".")
        out_dir = os.path.join("sclip", sample_name)
        out_prefix = os.path.join("sclip", sample_name, sample_name)
        sc_bam = out_prefix + ".sc.bam"

        steps = [
            Job(command="if [ ! -d " + out_dir + " ]; then mkdir -p " + out_dir + "; fi"),
            self.get_job_max_insert_size(sample),
            bvatools.extract_sclip(bam_prefix + "sorted.dup.bam", out_prefix, "$maxInsertSize"),
            samtools.index(sc_bam),
            samtools.index(out_prefix + ".scOthers.bam"),
            igvtools.compute_tdf(sc_bam, out_prefix + ".sc.tdf"),
        ]
        return concat_jobs(steps, name="extract_sclip_" + sample_name)

    return [build_job(sample) for sample in self.samples]
def metrics(self):
    """
    Create the per-sample metrics and coverage-track jobs.

    For each sample in `self.samples`, three independent jobs are built from
    `alignment/<sample>/<sample>.sorted.dup.bam`:

    * `picard.collect_multiple_metrics`, output prefix `...sorted.dup.all.metrics`;
    * `gatk.depth_of_coverage`, output prefix `...sorted.dup.all.coverage`;
    * `igvtools.compute_tdf`, output `<bam>.tdf`.

    Returns the flat list of jobs, in that order for each sample.
    """
    jobs = []
    for sample in self.samples:
        sample_name = sample.name
        prefix = os.path.join("alignment", sample_name, sample_name + ".sorted.dup.")
        bam_path = prefix + "bam"

        multiple_metrics_job = picard.collect_multiple_metrics(bam_path, prefix + "all.metrics")
        multiple_metrics_job.name = "picard_collect_multiple_metrics." + sample_name
        jobs.append(multiple_metrics_job)

        # Genome-wide depth of coverage.
        coverage_job = gatk.depth_of_coverage(bam_path, prefix + "all.coverage")
        coverage_job.name = "gatk_depth_of_coverage.genome." + sample_name
        jobs.append(coverage_job)

        # Coverage track written next to the BAM.
        tdf_job = igvtools.compute_tdf(bam_path, bam_path + ".tdf")
        tdf_job.name = "igvtools_compute_tdf." + sample_name
        jobs.append(tdf_job)

    return jobs
def metrics(self):
    """
    Compute alignment metrics and generate a coverage track for every sample.

    Metrics reported at this stage: number of raw, filtered, aligned and
    duplicate reads; median, mean and standard deviation of insert sizes of
    reads after alignment; and the percentage of bases covered at X reads,
    whole genome or targeted (%_bases_above_50 means the % of exons bases
    which have at least 50 reads).
    A TDF (.tdf) coverage track is also generated at this step
    for easy visualization of coverage in the IGV browser.

    Per sample, three jobs are created from
    `alignment/<sample>/<sample>.matefixed.sorted.bam`, each tagged with
    `job.samples = [sample]`:
    `picard.collect_multiple_metrics`, `bvatools.depth_of_coverage`
    (restricted to the coverage BED resolved from the sample's first readset)
    and `igvtools.compute_tdf`.

    Returns the flat list of jobs.
    """
    jobs = []

    def enqueue(job, name_prefix, sample):
        # Name the job after its sample, record the sample, then queue it.
        job.name = name_prefix + sample.name
        job.samples = [sample]
        jobs.append(job)

    for sample in self.samples:
        bam_prefix = os.path.join("alignment", sample.name, sample.name + ".matefixed.sorted.")
        bam_path = bam_prefix + "bam"

        enqueue(
            picard.collect_multiple_metrics(bam_path, bam_prefix + "all.metrics"),
            "picard_collect_multiple_metrics.",
            sample,
        )

        # Genome or target coverage with BVATools.
        enqueue(
            bvatools.depth_of_coverage(
                bam_path,
                bam_prefix + "coverage.tsv",
                bvatools.resolve_readset_coverage_bed(sample.readsets[0]),
                other_options=config.param('bvatools_depth_of_coverage', 'other_options', required=False),
            ),
            "bvatools_depth_of_coverage.",
            sample,
        )

        enqueue(
            igvtools.compute_tdf(bam_path, bam_path + ".tdf"),
            "igvtools_compute_tdf.",
            sample,
        )

    return jobs
def metrics(self):
    """
    Compute alignment metrics and generate a coverage track for every sample.

    Metrics reported at this stage: number of raw, filtered, aligned and
    duplicate reads; median, mean and standard deviation of insert sizes of
    reads after alignment; and the percentage of bases covered at X reads,
    whole genome or targeted (%_bases_above_50 means the % of exons bases
    which have at least 50 reads).
    A TDF (.tdf) coverage track is also generated at this step
    for easy visualization of coverage in the IGV browser.

    Per sample, three jobs are created from
    `alignment/<sample>/<sample>.matefixed.sorted.bam`:
    `picard.collect_multiple_metrics`, `bvatools.depth_of_coverage`
    (restricted to the coverage BED resolved from the sample's first readset)
    and `igvtools.compute_tdf`.

    Returns the flat list of jobs.
    """
    def jobs_for(sample):
        name = sample.name
        bam_prefix = os.path.join("alignment", name, name + ".matefixed.sorted.")
        bam_path = bam_prefix + "bam"

        metrics_job = picard.collect_multiple_metrics(bam_path, bam_prefix + "all.metrics")
        metrics_job.name = "picard_collect_multiple_metrics." + name

        # Genome or target coverage with BVATools.
        target_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])
        extra_options = config.param('bvatools_depth_of_coverage', 'other_options', required=False)
        coverage_job = bvatools.depth_of_coverage(
            bam_path,
            bam_prefix + "coverage.tsv",
            target_bed,
            other_options=extra_options,
        )
        coverage_job.name = "bvatools_depth_of_coverage." + name

        tdf_job = igvtools.compute_tdf(bam_path, bam_path + ".tdf")
        tdf_job.name = "igvtools_compute_tdf." + name

        return [metrics_job, coverage_job, tdf_job]

    jobs = []
    for sample in self.samples:
        jobs.extend(jobs_for(sample))
    return jobs
def metrics(self):
    """
    Compute metrics and generate coverage tracks per sample.

    Metrics reported at this stage: number of raw, filtered, aligned and
    duplicate reads; median, mean and standard deviation of insert sizes of
    reads after alignment; and the percentage of bases covered at X reads,
    whole genome or targeted (%_bases_above_50 means the % of exons bases
    which have at least 50 reads).
    A TDF (.tdf) coverage track is also generated at this step
    for easy visualization of coverage in the IGV browser.

    Jobs created for each sample, in order:

    * `picard.collect_multiple_metrics` (library type derived from readsets);
    * `gatk.depth_of_coverage` and `bvatools.depth_of_coverage`;
    * when a coverage BED is resolved for the sample: on-target reads
      (`bedtools.intersect` + `samtools.flagstat`), a shared
      `tools.bed2interval_list` job (created once per distinct BED) and
      `picard.calculate_hs_metrics`;
    * a `samtools.view -c` count filtered by the configured mapping quality;
    * GC bias (`bedtools.coverage`, optionally intersected with the BED,
      then `metrics.gc_bias`);
    * `igvtools.compute_tdf`.

    Returns the flat list of jobs. Raises KeyError if a sample in
    `self.samples` has no entry in `self.readsets`.
    """
    def swap_bam_ext(path, new_ext):
        # Replace only the trailing ".bam" extension. The pattern is anchored
        # and the dot escaped so that "bam" occurring elsewhere in the path
        # (sample or directory names) is never rewritten.
        return re.sub(r"\.bam$", new_ext, path)

    # Library status and optional pre-existing BAM, keyed by readset.sample.
    # A sample is PAIRED_END as soon as one of its readsets is; the BAM kept
    # is the last non-empty one seen.
    library, bam = {}, {}
    for readset in self.readsets:
        library.setdefault(readset.sample, "SINGLE_END")
        if readset.run_type == "PAIRED_END":
            library[readset.sample] = "PAIRED_END"
        bam.setdefault(readset.sample, "")
        if readset.bam:
            bam[readset.sample] = readset.bam

    jobs = []
    created_interval_lists = set()
    for sample in self.samples:
        file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.")
        coverage_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])

        # Prefer the pipeline's own dedup BAM; fall back to the readset BAM.
        candidate_input_files = [[file_prefix + "bam"]]
        if bam[sample]:
            candidate_input_files.append([bam[sample]])
        [input_bam] = self.select_input_files(candidate_input_files)

        job = picard.collect_multiple_metrics(
            input_bam,
            swap_bam_ext(input_bam, ".all.metrics"),
            library_type=library[sample]
        )
        job.name = "picard_collect_multiple_metrics." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome coverage with GATK
        job = gatk.depth_of_coverage(input_bam, swap_bam_ext(input_bam, ".all.coverage"), coverage_bed)
        job.name = "gatk_depth_of_coverage.genome." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome or target coverage with BVATools
        job = bvatools.depth_of_coverage(
            input_bam,
            swap_bam_ext(input_bam, ".coverage.tsv"),
            coverage_bed,
            other_options=config.param('bvatools_depth_of_coverage', 'other_options', required=False)
        )
        job.name = "bvatools_depth_of_coverage." + sample.name
        job.samples = [sample]
        jobs.append(job)

        if coverage_bed:
            # Get on-target reads (if on-target context is detected)
            ontarget_bam = swap_bam_ext(input_bam, ".ontarget.bam")
            flagstat_output = ontarget_bam + ".flagstat"
            job = concat_jobs([
                bedtools.intersect(input_bam, ontarget_bam, coverage_bed),
                samtools.flagstat(ontarget_bam, flagstat_output)
            ])
            job.name = "ontarget_reads." + sample.name
            job.removable_files = [ontarget_bam]
            job.samples = [sample]
            jobs.append(job)

            # Compute on target percent of hybridisation based capture.
            # The interval list is shared by every sample using the same BED,
            # so its conversion job is only emitted once.
            interval_list = re.sub(r"\.[^.]+$", ".interval_list", coverage_bed)
            if interval_list not in created_interval_lists:
                job = tools.bed2interval_list(None, coverage_bed, interval_list)
                job.name = "interval_list." + os.path.basename(coverage_bed)
                jobs.append(job)
                created_interval_lists.add(interval_list)

            # NOTE(review): this step reads the pipeline's dedup BAM rather
            # than the selected input BAM, as in the original -- confirm intent.
            job = picard.calculate_hs_metrics(file_prefix + "bam", file_prefix + "onTarget.tsv", interval_list)
            job.name = "picard_calculate_hs_metrics." + sample.name
            job.samples = [sample]
            jobs.append(job)

        # Calculate the number of reads with higher mapping quality than the threshold passed in the ini file
        job = concat_jobs([
            samtools.view(
                input_bam,
                swap_bam_ext(input_bam, ".filtered_reads.counts.txt"),
                "-c " + config.param('mapping_quality_filter', 'quality_threshold')
            )
        ])
        job.name = "mapping_quality_filter." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Calculate GC bias; when a coverage BED is present the coverage file
        # is first restricted to the targets.
        gc_content_file = swap_bam_ext(input_bam, ".gc_cov.1M.txt")
        job = bedtools.coverage(input_bam, gc_content_file, coverage_bed)
        if coverage_bed:
            gc_content_on_target_file = swap_bam_ext(input_bam, ".gc_cov.1M.on_target.txt")
            gc_content_target_job = bedtools.intersect(
                gc_content_file,
                gc_content_on_target_file,
                coverage_bed
            )
            gc_content_file = gc_content_on_target_file
            job = concat_jobs([job, gc_content_target_job])
        job = concat_jobs([
            job,
            metrics.gc_bias(gc_content_file, swap_bam_ext(input_bam, ".GCBias_all.txt"))
        ])
        job.name = "GC_bias." + sample.name
        job.samples = [sample]
        jobs.append(job)

        job = igvtools.compute_tdf(input_bam, input_bam + ".tdf")
        job.name = "igvtools_compute_tdf." + sample.name
        job.samples = [sample]
        jobs.append(job)

    return jobs