def bam_hard_clip(self):
    """
    Generate a hard-clipped version of the BAM for the Tuxedo suite, which does not support
    the official SAM soft-clipping feature.
    """
    jobs = []
    for sample in self.samples:
        alignment_input = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.bam")
        alignment_output = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
        job = pipe_jobs([
            samtools.view(
                alignment_input,
                None,
                "-h"
            ),
            Job(
                [None],
                [alignment_output],
                # awk to transform soft clips into hard clips for the Tuxedo suite
                command="""\
awk 'BEGIN {{OFS="\\t"}} {{if (substr($1,1,1)=="@") {{print;next}}; split($6,C,/[0-9]*/); split($6,L,/[SMDIN]/); if (C[2]=="S") {{$10=substr($10,L[1]+1); $11=substr($11,L[1]+1)}}; if (C[length(C)]=="S") {{L1=length($10)-L[length(L)-1]; $10=substr($10,1,L1); $11=substr($11,1,L1); }}; gsub(/[0-9]*S/,"",$6); print}}'""".format()
            ),
            samtools.view(
                "-",
                alignment_output,
                "-hbS"
            ),
        ])
        job.name = "tuxedo_hard_clip." + sample.name
        jobs.append(job)
    return jobs
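# The awk one-liner above is dense. The standalone sketch below (a hypothetical
# helper, not part of the pipeline) reproduces its logic on a single SAM record:
# trim the soft-clipped bases from SEQ/QUAL and drop the S operations from the
# CIGAR string, assuming tab-separated fields as emitted by `samtools view -h`.
import re

def strip_soft_clips(sam_line):
    """Remove leading/trailing soft clips from one SAM alignment line."""
    if sam_line.startswith("@"):            # header lines pass through untouched
        return sam_line
    fields = sam_line.rstrip("\n").split("\t")
    cigar, seq, qual = fields[5], fields[9], fields[10]
    ops = re.findall(r"(\d+)([MIDNSHP=X])", cigar)
    if ops and ops[0][1] == "S":            # leading soft clip: trim the start
        n = int(ops[0][0])
        seq, qual = seq[n:], qual[n:]
    if len(ops) > 1 and ops[-1][1] == "S":  # trailing soft clip: trim the end
        n = int(ops[-1][0])
        seq, qual = seq[:-n], qual[:-n]
    fields[5] = re.sub(r"\d+S", "", cigar)  # drop S operations from the CIGAR
    fields[9], fields[10] = seq, qual
    return "\t".join(fields)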
def extract_bam_unmap(self):
    """
    Extract orphan reads (both mates unmapped) and one-end-anchored (OEA) reads
    from the soft-clip BAMs, then sort each output by read name.
    """
    jobs = []
    for sample in self.samples:
        sclip_file_prefix = os.path.join("sclip", sample.name, sample.name + ".")
        extract_directory = os.path.join("extract", sample.name)
        extract_file_prefix = os.path.join("extract", sample.name, sample.name + ".")

        # mkdir -p is a no-op when the directory already exists, so no guard is needed
        jobMkdir = Job(command="mkdir -p " + extract_directory)

        ## extract orphan reads
        job = concat_jobs([
            jobMkdir,
            concat_jobs([
                samtools.view(sclip_file_prefix + "scOthers.bam", extract_file_prefix + "ORPHAN.bam", "-b -h -f 12 -F 256"),
                samtools.sort(extract_file_prefix + "ORPHAN.bam", extract_file_prefix + "ORPHAN.sName", True)
            ])
        ], name="extract_bam_ORPHAN_" + sample.name)
        jobs.append(job)

        ## extract OEA reads close to soft clips
        job = concat_jobs([
            jobMkdir,
            concat_jobs([
                samtools.view(sclip_file_prefix + "sc.bam", extract_file_prefix + "OEAUNMAP.1.bam", "-b -h -f 68 -F 264"),
                samtools.sort(extract_file_prefix + "OEAUNMAP.1.bam", extract_file_prefix + "OEAUNMAP.1.sName", True)
            ])
        ], name="extract_bam_OEAUNMAP1_" + sample.name)
        jobs.append(job)

        job = concat_jobs([
            jobMkdir,
            concat_jobs([
                samtools.view(sclip_file_prefix + "sc.bam", extract_file_prefix + "OEAUNMAP.2.bam", "-b -h -f 132 -F 264"),
                samtools.sort(extract_file_prefix + "OEAUNMAP.2.bam", extract_file_prefix + "OEAUNMAP.2.sName", True)
            ])
        ], name="extract_bam_OEAUNMAP2_" + sample.name)
        jobs.append(job)

        job = concat_jobs([
            jobMkdir,
            concat_jobs([
                samtools.view(sclip_file_prefix + "sc.bam", extract_file_prefix + "OEAMAP.bam", "-b -h -f 8 -F 1284"),
                samtools.sort(extract_file_prefix + "OEAMAP.bam", extract_file_prefix + "OEAMAP.sName", True)
            ])
        ], name="extract_bam_OEAMAP_" + sample.name)
        jobs.append(job)
    return jobs
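# The -f/-F values above are SAM bitwise flags, which are easy to misread. This
# minimal sketch (illustrative only, not used by the pipeline) decodes the
# combinations that extract_bam_unmap passes to samtools view.
SAM_FLAGS = {
    1: "paired", 4: "read unmapped", 8: "mate unmapped",
    16: "read reverse strand", 32: "mate reverse strand",
    64: "first in pair", 128: "second in pair",
    256: "secondary alignment", 1024: "PCR/optical duplicate",
}

def decode_flags(value):
    """Return the flag names set in a samtools -f/-F bitmask."""
    return [name for bit, name in SAM_FLAGS.items() if value & bit]

# ORPHAN:     -f 12          -> ['read unmapped', 'mate unmapped']
# OEAUNMAP.1: -f 68  -F 264  -> unmapped first mates, excluding mate-unmapped
#                               and secondary records (264 = 8 + 256)
# OEAUNMAP.2: -f 132 -F 264  -> unmapped second mates, same exclusions
# OEAMAP:     -f 8   -F 1284 -> the mapped mate of an OEA pair: itself mapped,
#                               primary, non-duplicate (1284 = 4 + 256 + 1024)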
def raw_counts(self):
    """
    Count reads in features using [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html).
    """
    jobs = []
    for sample in self.samples:
        alignment_file_prefix = os.path.join("alignment", sample.name, sample.name)
        input_bam = alignment_file_prefix + ".QueryNameSorted.bam"

        # Count reads
        output_count = os.path.join("raw_counts", sample.name + ".readcounts.csv")
        stranded = "no" if config.param('DEFAULT', 'strand_info') == "fr-unstranded" else "reverse"
        job = concat_jobs([
            Job(command="mkdir -p raw_counts"),
            pipe_jobs([
                samtools.view(
                    input_bam,
                    options="-F 4"
                ),
                htseq.htseq_count(
                    "-",
                    config.param('htseq_count', 'gtf', type='filepath'),
                    output_count,
                    config.param('htseq_count', 'options'),
                    stranded
                )
            ])
        ], name="htseq_count." + sample.name)
        jobs.append(job)
    return jobs
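# htseq-count's --stranded option must match the library protocol; the ternary
# above collapses every non-"fr-unstranded" protocol to "reverse" (dUTP-style
# libraries). A hypothetical, more explicit mapping for the common TopHat-style
# library types would look like this:
def htseq_stranded_option(strand_info):
    """Map a TopHat-style library type to htseq-count's --stranded value."""
    mapping = {
        "fr-unstranded": "no",        # standard Illumina, unstranded
        "fr-firststrand": "reverse",  # dUTP, NSR, NNSR protocols
        "fr-secondstrand": "yes",     # ligation, standard SOLiD
    }
    return mapping.get(strand_info, "reverse")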
def samtools_view_filter(self):
    """
    Filter unique reads by mapping quality using [Samtools](http://www.htslib.org/).
    """
    jobs = []
    for readset in self.readsets:
        readset_bam_prefix = os.path.join("alignment", readset.sample.name, readset.name, readset.name + ".sorted.")
        readset_bam = readset_bam_prefix + "bam"
        filtered_readset_bam = readset_bam_prefix + "filtered.bam"

        job = samtools.view(
            readset_bam,
            filtered_readset_bam,
            "-b -F4 -q " + str(config.param('samtools_view_filter', 'min_mapq', type='int'))
        )
        job.name = "samtools_view_filter." + readset.name
        jobs.append(job)

    report_file = os.path.join("report", "ChipSeq.samtools_view_filter.md")
    jobs.append(
        Job(
            [os.path.join("alignment", readset.sample.name, readset.name, readset.name + ".sorted.filtered.bam") for readset in self.readsets],
            [report_file],
            [['samtools_view_filter', 'module_pandoc']],
            command="""\
mkdir -p report && \\
pandoc --to=markdown \\
--template {report_template_dir}/{basename_report_file} \\
--variable min_mapq="{min_mapq}" \\
{report_template_dir}/{basename_report_file} \\
> {report_file}""".format(
                min_mapq=config.param('samtools_view_filter', 'min_mapq', type='int'),
                report_template_dir=self.report_template_dir,
                basename_report_file=os.path.basename(report_file),
                report_file=report_file
            ),
            report_files=[report_file],
            name="samtools_view_filter_report"
        )
    )
    return jobs
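# For reference, each filtering job boils down to roughly this samtools command
# (paths and the min_mapq value of 20 are illustrative, not pipeline defaults):
#
#   samtools view -b -F4 -q 20 \
#     alignment/sampleA/readset1/readset1.sorted.bam \
#     > alignment/sampleA/readset1/readset1.sorted.filtered.bam
#
# -F4 drops unmapped reads; -q 20 drops alignments with MAPQ below 20, which
# removes most multi-mapping reads since aligners assign those low scores.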
def metrics(self):
    """
    Compute metrics and generate coverage tracks per sample. Multiple metrics are computed at this stage:
    number of raw reads, number of filtered reads, number of aligned reads, number of duplicate reads,
    median, mean and standard deviation of insert sizes after alignment, and the whole-genome or targeted
    percentage of bases covered at X reads (%_bases_above_50 means the percentage of exon bases covered
    by at least 50 reads). A TDF (.tdf) coverage track is also generated at this step for easy
    visualization of coverage in the IGV browser.
    """
    # check the library status
    library, bam = {}, {}
    for readset in self.readsets:
        if readset.sample not in library:
            library[readset.sample] = "SINGLE_END"
        if readset.run_type == "PAIRED_END":
            library[readset.sample] = "PAIRED_END"
        if readset.sample not in bam:
            bam[readset.sample] = ""
        if readset.bam:
            bam[readset.sample] = readset.bam

    jobs = []
    created_interval_lists = []
    for sample in self.samples:
        file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.")
        coverage_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])

        candidate_input_files = [[file_prefix + "bam"]]
        if bam[sample]:
            candidate_input_files.append([bam[sample]])
        [input] = self.select_input_files(candidate_input_files)

        job = picard.collect_multiple_metrics(input, re.sub("bam", "all.metrics", input), library_type=library[sample])
        job.name = "picard_collect_multiple_metrics." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome coverage with GATK
        job = gatk.depth_of_coverage(input, re.sub("bam", "all.coverage", input), coverage_bed)
        job.name = "gatk_depth_of_coverage.genome." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Compute genome or target coverage with BVATools
        job = bvatools.depth_of_coverage(
            input,
            re.sub("bam", "coverage.tsv", input),
            coverage_bed,
            other_options=config.param('bvatools_depth_of_coverage', 'other_options', required=False)
        )
        job.name = "bvatools_depth_of_coverage." + sample.name
        job.samples = [sample]
        jobs.append(job)

        if coverage_bed:
            # Get on-target reads (if an on-target context is detected)
            ontarget_bam = re.sub("bam", "ontarget.bam", input)
            flagstat_output = re.sub("bam", "bam.flagstat", ontarget_bam)
            job = concat_jobs([
                bedtools.intersect(input, ontarget_bam, coverage_bed),
                samtools.flagstat(ontarget_bam, flagstat_output)
            ])
            job.name = "ontarget_reads." + sample.name
            job.removable_files = [ontarget_bam]
            job.samples = [sample]
            jobs.append(job)

            # Compute on-target percent of hybridisation-based capture
            interval_list = re.sub(r"\.[^.]+$", ".interval_list", coverage_bed)
            if interval_list not in created_interval_lists:
                job = tools.bed2interval_list(None, coverage_bed, interval_list)
                job.name = "interval_list." + os.path.basename(coverage_bed)
                jobs.append(job)
                created_interval_lists.append(interval_list)

            file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.dedup.")
            job = picard.calculate_hs_metrics(file_prefix + "bam", file_prefix + "onTarget.tsv", interval_list)
            job.name = "picard_calculate_hs_metrics." + sample.name
            job.samples = [sample]
            jobs.append(job)
        # Count the reads with mapping quality above the threshold set in the ini file
        job = concat_jobs([
            samtools.view(
                input,
                re.sub(".bam", ".filtered_reads.counts.txt", input),
                "-c " + config.param('mapping_quality_filter', 'quality_threshold')
            )
        ])
        job.name = "mapping_quality_filter." + sample.name
        job.samples = [sample]
        jobs.append(job)

        # Calculate GC bias
        # For captured analysis:
        # if coverage_bed:
        #     target_input = re.sub(".bam", ".targeted.bam", input)
        #     job = concat_jobs([
        #         bedtools.intersect(input, target_input, coverage_bed),
        #         bedtools.coverage(target_input, re.sub(".bam", ".gc_cov.1M.txt", target_input)),
        #         metrics.gc_bias(
        #             re.sub(".bam", ".gc_cov.1M.txt", target_input),
        #             re.sub(".bam", ".GCBias_all.txt", target_input)
        #         )
        #     ])
        # Or for whole-genome analysis:
        # else:
        gc_content_file = re.sub(".bam", ".gc_cov.1M.txt", input)
        job = bedtools.coverage(input, gc_content_file, coverage_bed)
        if coverage_bed:
            gc_content_on_target_file = re.sub(".bam", ".gc_cov.1M.on_target.txt", input)
            gc_content_target_job = bedtools.intersect(gc_content_file, gc_content_on_target_file, coverage_bed)
            gc_content_file = gc_content_on_target_file
            job = concat_jobs([job, gc_content_target_job])
        job = concat_jobs([
            job,
            metrics.gc_bias(gc_content_file, re.sub(".bam", ".GCBias_all.txt", input))
        ])
        job.name = "GC_bias." + sample.name
        job.samples = [sample]
        jobs.append(job)

        job = igvtools.compute_tdf(input, input + ".tdf")
        job.name = "igvtools_compute_tdf." + sample.name
        job.samples = [sample]
        jobs.append(job)

    return jobs
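# The "%_bases_above_X" metrics mentioned in the docstring are simple summaries
# of per-base depth. A minimal standalone illustration (not pipeline code),
# assuming a list of per-base depths as GATK/BVATools would report them:
def percent_bases_above(depths, threshold):
    """Percentage of positions covered by at least `threshold` reads."""
    if not depths:
        return 0.0
    return 100.0 * sum(1 for d in depths if d >= threshold) / len(depths)

# e.g. percent_bases_above([0, 12, 55, 80, 49], 50) -> 40.0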
def wiggle(self):
    """
    Generate wiggle tracks suitable for multiple browsers.
    """
    jobs = []

    ## check the library status
    library = {}
    for readset in self.readsets:
        if readset.sample not in library:
            library[readset.sample] = "PAIRED_END"
        if readset.run_type == "SINGLE_END":
            library[readset.sample] = "SINGLE_END"

    for sample in self.samples:
        bam_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.")
        input_bam = bam_file_prefix + "bam"
        bed_graph_prefix = os.path.join("tracks", sample.name, sample.name)
        big_wig_prefix = os.path.join("tracks", "bigWig", sample.name)

        if (config.param('DEFAULT', 'strand_info') != 'fr-unstranded') and library[sample] == "PAIRED_END":
            input_bam_f1 = bam_file_prefix + "tmp1.forward.bam"
            input_bam_f2 = bam_file_prefix + "tmp2.forward.bam"
            input_bam_r1 = bam_file_prefix + "tmp1.reverse.bam"
            input_bam_r2 = bam_file_prefix + "tmp2.reverse.bam"
            output_bam_f = bam_file_prefix + "forward.bam"
            output_bam_r = bam_file_prefix + "reverse.bam"

            bam_f_job = concat_jobs([
                samtools.view(input_bam, input_bam_f1, "-bh -F 256 -f 81"),
                samtools.view(input_bam, input_bam_f2, "-bh -F 256 -f 161"),
                picard.merge_sam_files([input_bam_f1, input_bam_f2], output_bam_f),
                Job(command="rm " + input_bam_f1 + " " + input_bam_f2)
            ], name="wiggle." + sample.name + ".forward_strandspec")
            # Remove temporary-then-deleted files from job output files, otherwise the job is never up to date
            bam_f_job.output_files.remove(input_bam_f1)
            bam_f_job.output_files.remove(input_bam_f2)

            bam_r_job = concat_jobs([
                Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig")),
                samtools.view(input_bam, input_bam_r1, "-bh -F 256 -f 97"),
                samtools.view(input_bam, input_bam_r2, "-bh -F 256 -f 145"),
                picard.merge_sam_files([input_bam_r1, input_bam_r2], output_bam_r),
                Job(command="rm " + input_bam_r1 + " " + input_bam_r2)
            ], name="wiggle." + sample.name + ".reverse_strandspec")
            # Remove temporary-then-deleted files from job output files, otherwise the job is never up to date
            bam_r_job.output_files.remove(input_bam_r1)
            bam_r_job.output_files.remove(input_bam_r2)

            jobs.extend([bam_f_job, bam_r_job])

            outputs = [
                [bed_graph_prefix + ".forward.bedGraph", big_wig_prefix + ".forward.bw"],
                [bed_graph_prefix + ".reverse.bedGraph", big_wig_prefix + ".reverse.bw"],
            ]
        else:
            outputs = [[bed_graph_prefix + ".bedGraph", big_wig_prefix + ".bw"]]

        for bed_graph_output, big_wig_output in outputs:
            job = concat_jobs([
                Job(
                    command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig"),
                    removable_files=["tracks"]
                ),
                bedtools.graph(input_bam, bed_graph_output, big_wig_output, library[sample])
            ], name="wiggle." + re.sub(".bedGraph", "", os.path.basename(bed_graph_output)))
            jobs.append(job)
    return jobs
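# Decoding the strand-splitting flags used above (per the SAM flag definitions):
# with a dUTP-style paired-end protocol, a fragment's transcribed strand is
# recovered by combining read order and orientation, so each strand needs two
# flag combinations that are then merged with Picard.
#   -f 81  = paired + read reverse + first in pair   (1 + 16 + 64)
#   -f 161 = paired + mate reverse + second in pair  (1 + 32 + 128)
#   -f 97  = paired + mate reverse + first in pair   (1 + 32 + 64)
#   -f 145 = paired + read reverse + second in pair  (1 + 16 + 128)
# -F 256 additionally drops secondary alignments before the merge.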
def extract_fastq_oea_sclip(self):
    """
    Convert the extracted OEA and soft-clip reads to FASTQ, and equalize the OEA mate files
    after mapping-quality filtering.
    """
    jobs = []
    for sample in self.samples:
        extract_directory = os.path.join("extract", sample.name)
        extract_file_prefix = os.path.join("extract", sample.name, sample.name + ".")
        sclip_file_prefix = os.path.join("sclip", sample.name, sample.name + ".")

        jobMkdir = Job(command="mkdir -p " + extract_directory)

        ## create FASTQ of OEA reads close to soft clips
        job = pipe_jobs([
            samtools.view(extract_file_prefix + "OEAUNMAP.1.sName.bam"),
            self.get_job_sam_to_fastq(extract_file_prefix + "OEAUNMAP.1.fastq.gz")
        ], name="extract_fastq_OEA1_" + sample.name)
        jobs.append(job)

        job = pipe_jobs([
            samtools.view(extract_file_prefix + "OEAUNMAP.2.sName.bam"),
            self.get_job_sam_to_fastq(extract_file_prefix + "OEAUNMAP.2.fastq.gz")
        ], name="extract_fastq_OEA2_" + sample.name)
        jobs.append(job)

        job = pipe_jobs([
            samtools.view(extract_file_prefix + "OEAMAP.bam", options="-f 64 -q " + config.param('DEFAULT', 'min_mapping_quality')),
            self.get_job_sam_to_fastq(extract_file_prefix + "OEAMAP.1.fastq.gz")
        ], name="extract_fastq_OEAMAP1_" + sample.name)
        jobs.append(job)

        job = pipe_jobs([
            samtools.view(extract_file_prefix + "OEAMAP.bam", options="-f 128 -q " + config.param('DEFAULT', 'min_mapping_quality')),
            self.get_job_sam_to_fastq(extract_file_prefix + "OEAMAP.2.fastq.gz")
        ], name="extract_fastq_OEAMAP2_" + sample.name)
        jobs.append(job)

        ## equalize the FASTQ files between OEAMAP and OEAUNMAP (the mapq filter above may drop reads)
        job = concat_jobs([
            tools.py_equalFastqFile(extract_file_prefix + "OEAMAP.2.fastq.gz", extract_file_prefix + "OEAUNMAP.1.fastq.gz", extract_file_prefix + "OEAUNMAP.1.equal.fastq.gz"),
            Job(command="rm " + extract_file_prefix + "OEAUNMAP.1.fastq.gz")
        ], name="equal_fastq_OEA1_" + sample.name)
        jobs.append(job)

        job = concat_jobs([
            tools.py_equalFastqFile(extract_file_prefix + "OEAMAP.1.fastq.gz", extract_file_prefix + "OEAUNMAP.2.fastq.gz", extract_file_prefix + "OEAUNMAP.2.equal.fastq.gz"),
            Job(command="rm " + extract_file_prefix + "OEAUNMAP.2.fastq.gz")
        ], name="equal_fastq_OEA2_" + sample.name)
        jobs.append(job)

        ## create soft-clip FASTQ files
        jobFastq = Job(
            input_files=[sclip_file_prefix + "scSequences.txt"],
            output_files=[extract_file_prefix + "sclip.1.fastq.gz"],
            command="awk 'NR>1 {if ($3==\"+\") { print \"@\"$4; print $5 ;print \"+\"; print $6}}' " + sclip_file_prefix + "scSequences.txt " + "| gzip -c > " + extract_file_prefix + "sclip.1.fastq.gz"
        )
        job = concat_jobs([
            jobMkdir,
            jobFastq
        ], name="fastq_sclip1_" + sample.name)
        jobs.append(job)

        jobFastq = Job(
            input_files=[sclip_file_prefix + "scSequences.txt"],
            output_files=[extract_file_prefix + "sclip.2.fastq.gz"],
            command="awk 'NR>1 {if ($3==\"-\") { print \"@\"$4; print $5 ;print \"+\"; print $6}}' " + sclip_file_prefix + "scSequences.txt " + "| gzip -c > " + extract_file_prefix + "sclip.2.fastq.gz"
        )
        job = concat_jobs([
            jobMkdir,
            jobFastq
        ], name="fastq_sclip2_" + sample.name)
        jobs.append(job)
    return jobs
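# The two awk one-liners above emit FASTQ records from the scSequences.txt
# table (strand in $3, read name in $4, sequence in $5, quality in $6). A
# hypothetical pure-Python equivalent for one strand, shown for readability
# only (assumes Python 3 for gzip text mode and whitespace-delimited columns,
# matching awk's default field splitting):
import gzip

def sclip_table_to_fastq(table_path, fastq_gz_path, strand="+"):
    """Write soft-clip sequences on `strand` from scSequences.txt as FASTQ."""
    with open(table_path) as table, gzip.open(fastq_gz_path, "wt") as fastq:
        next(table)                      # skip the header line (awk: NR>1)
        for line in table:
            fields = line.split()
            if fields[2] == strand:      # $3: strand of the soft clip
                name, seq, qual = fields[3], fields[4], fields[5]
                fastq.write("@%s\n%s\n+\n%s\n" % (name, seq, qual))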