def estimate_ribosomal_rna(self):
    """
    Use bwa mem to align reads on the rRNA reference fasta and count the number of reads mapped.

    For every readset, the readset BAM
    (`alignment/<sample>/<readset>/Aligned.sortedByCoord.out.bam`) is piped through
    `bvatools.bam2fq`, aligned with [BWA](http://bio-bwa.sourceforge.net/) (algorithm: bwa mem)
    against the fasta configured as `ribosomal_fasta` in the `bwa_mem_rRNA` section, and the
    result is sorted by coordinate using [Picard](http://broadinstitute.github.io/picard/).
    `tools.py_rrnaBAMcount` is then run on the sorted BAM with the GTF configured as `gtf`
    in the same section.

    This step takes as input files: readset BAM files.

    Returns the list of jobs, one per readset, named `bwa_mem_rRNA.<readset name>`.
    """
    rrna_jobs = []

    for readset in self.readsets:
        sample_name = readset.sample.name
        source_bam = os.path.join("alignment", sample_name, readset.name, "Aligned.sortedByCoord.out.bam")
        metrics_dir = os.path.join("metrics", sample_name, readset.name)
        rrna_bam = os.path.join(metrics_dir, readset.name + "rRNA.bam")
        rrna_stats = os.path.join(metrics_dir, readset.name + "rRNA.stats.tsv")

        make_dirs = Job(command="mkdir -p " + os.path.dirname(source_bam) + " " + metrics_dir)
        to_fastq = bvatools.bam2fq(source_bam)

        # Assemble the @RG header piece by piece; LB, PU and CN are only
        # emitted when the corresponding value is set.
        rg_fields = ["'@RG", "\tID:" + readset.name, "\tSM:" + sample_name]
        if readset.library:
            rg_fields.append("\tLB:" + readset.library)
        if readset.run and readset.lane:
            rg_fields.append("\tPU:run" + readset.run + "_" + readset.lane)
        if config.param('bwa_mem_rRNA', 'sequencing_center', required=False):
            rg_fields.append("\tCN:" + config.param('bwa_mem_rRNA', 'sequencing_center'))
        rg_fields.extend(["\tPL:Illumina", "'"])

        align = bwa.mem(
            "/dev/stdin",
            None,
            read_group="".join(rg_fields),
            ref=config.param('bwa_mem_rRNA', 'ribosomal_fasta'),
            ini_section='bwa_mem_rRNA'
        )
        sort = picard.sort_sam(
            "/dev/stdin",
            rrna_bam,
            "coordinate",
            ini_section='picard_sort_sam_rrna'
        )
        count = tools.py_rrnaBAMcount(
            bam=rrna_bam,
            gtf=config.param('bwa_mem_rRNA', 'gtf'),
            output=rrna_stats,
            typ="transcript"
        )

        job = concat_jobs(
            [make_dirs, pipe_jobs([to_fastq, align, sort]), count],
            name="bwa_mem_rRNA." + readset.name
        )
        # The rRNA BAM is only an intermediate for the counting step.
        job.removable_files = [rrna_bam]
        rrna_jobs.append(job)

    return rrna_jobs
def bwa_mem_picard_sort_sam(self):
    """
    Align the trimmed reads of every readset with bwa mem and sort the output by coordinate
    with Picard.

    Input fastq files are looked up under `trim/<sample>/<readset>.trim.*` (pair1/pair2 for
    PAIRED_END readsets, single for SINGLE_END readsets). The sorted BAM is written to
    `alignment/<sample>/<readset>.sorted.bam`.

    When a sample has exactly one readset, symlinks `alignment/<sample>/<sample>.sorted.bam`
    and the matching `.bai` are created, pointing at the readset BAM and its index.

    Returns the list of jobs, one per readset, named `bwa_mem_picard_sort_sam.<readset name>`.

    Raises Exception if a readset run type is neither PAIRED_END nor SINGLE_END.
    """
    jobs = []
    for readset in self.readsets:
        trim_file_prefix = os.path.join("trim", readset.sample.name, readset.name + ".trim.")
        alignment_directory = os.path.join("alignment", readset.sample.name)
        readset_bam = os.path.join(alignment_directory, readset.name + ".sorted.bam")

        if readset.run_type == "PAIRED_END":
            fastq1 = trim_file_prefix + "pair1.fastq.gz"
            fastq2 = trim_file_prefix + "pair2.fastq.gz"
        elif readset.run_type == "SINGLE_END":
            fastq1 = trim_file_prefix + "single.fastq.gz"
            fastq2 = None
        else:
            raise Exception("Error: run type \"" + readset.run_type +
                "\" is invalid for readset \"" + readset.name + "\" (should be PAIRED_END or SINGLE_END)!")

        # LB and PU are only emitted when the readset carries the values, so an
        # unset library/run/lane no longer fails on str + None concatenation.
        read_group = "'@RG" + \
            "\tID:" + readset.name + \
            "\tSM:" + readset.sample.name + \
            ("\tLB:" + readset.library if readset.library else "") + \
            ("\tPU:run" + readset.run + "_" + readset.lane if readset.run and readset.lane else "") + \
            "\tCN:" + config.param('bwa_mem', 'sequencing_center') + \
            "\tPL:Illumina" + \
            "'"

        job = concat_jobs([
            Job(command="mkdir -p " + alignment_directory),
            pipe_jobs([
                bwa.mem(
                    fastq1,
                    fastq2,
                    read_group=read_group
                ),
                picard.sort_sam(
                    "/dev/stdin",
                    readset_bam,
                    "coordinate"
                )
            ])
        ], name="bwa_mem_picard_sort_sam." + readset.name)

        # If this readset is unique for this sample, further BAM merging is not necessary.
        # Thus, create a sample BAM symlink to the readset BAM, along with its index.
        if len(readset.sample.readsets) == 1:
            # Raw strings keep "\." a regex escape rather than an invalid string escape.
            readset_index = re.sub(r"\.bam$", ".bai", readset_bam)
            sample_bam = os.path.join(alignment_directory, readset.sample.name + ".sorted.bam")
            sample_index = re.sub(r"\.bam$", ".bai", sample_bam)

            job = concat_jobs([
                job,
                Job([readset_bam], [sample_bam], command="ln -s -f " + os.path.relpath(readset_bam, os.path.dirname(sample_bam)) + " " + sample_bam),
                Job([readset_bam], [sample_index], command="ln -s -f " + os.path.relpath(readset_index, os.path.dirname(sample_index)) + " " + sample_index)
            ], name=job.name)

        jobs.append(job)
    return jobs
def picard_sort_sam(self):
    """
    The alignment file is reordered (QueryName) using [Picard](http://broadinstitute.github.io/picard/).

    For each sample, `alignment/<sample>/<sample>.sorted.bam` is sorted by query name into
    `alignment/<sample>/<sample>.QueryNameSorted.bam`.
    The QueryName-sorted bam files will be used to determine raw read counts.

    Returns the list of jobs, one per sample, named `picard_sort_sam.<sample name>`.
    """
    def queryname_sort_job(sample):
        # Input and output share the same per-sample path prefix.
        prefix = os.path.join("alignment", sample.name, sample.name)
        sort_job = picard.sort_sam(
            prefix + ".sorted.bam",
            prefix + ".QueryNameSorted.bam",
            "queryname"
        )
        sort_job.name = "picard_sort_sam." + sample.name
        return sort_job

    return [queryname_sort_job(sample) for sample in self.samples]
def get_alignment_job(self, readset):
    """
    Build the alignment job for one readset: bwa mem piped into a Picard coordinate sort.

    The sorted BAM is written to `readset.bam + ".bam"`; its parent directory is created
    first. The reference passed to bwa mem is `readset.aligner_reference_index` and the
    read group comes from `RunProcessingAligner.get_rg_tag(readset, 'bwa_mem')`.

    Returns a single job named
    `bwa_mem_picard_sort_sam.<readset name>.<run>.<lane>`.
    """
    sorted_bam = readset.bam + ".bam"

    make_output_dir = Job(command="mkdir -p " + os.path.dirname(sorted_bam))
    align = bwa.mem(
        readset.fastq1,
        readset.fastq2,
        read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem'),
        ref=readset.aligner_reference_index
    )
    sort = picard.sort_sam("/dev/stdin", sorted_bam, "coordinate")
    align_and_sort = pipe_jobs([align, sort])

    job_name = ".".join(["bwa_mem_picard_sort_sam", readset.name, readset.run, readset.lane])
    return concat_jobs([make_output_dir, align_and_sort], name=job_name)
def _estimate_ribosomal_rna(readset): """ Use bwa mem to align reads on the rRNA reference fasta and count the number of read mapped The filtered reads are aligned to a reference fasta file of ribosomal sequence. The alignment is done per sequencing readset. The alignment software used is [BWA](http://bio-bwa.sourceforge.net/) with algorithm: bwa mem. BWA output BAM files are then sorted by coordinate using [Picard](http://broadinstitute.github.io/picard/). """ jobs = [] if len(readset.annotation_files) > 1 and os.path.isfile(readset.annotation_files[0]) and os.path.isfile( readset.annotation_files[1]): readset_bam = readset.bam + ".bam" readset_metrics_bam = readset.bam + ".rRNA.bam" job = concat_jobs([ pipe_jobs([ bvatools.bam2fq( readset_bam ), bwa.mem( "/dev/stdin", None, read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem_rRNA'), ref=readset.annotation_files[1], ini_section='bwa_mem_rRNA' ), picard.sort_sam( "/dev/stdin", readset_metrics_bam, "coordinate", ini_section='picard_sort_sam_rrna' ) ]), tools.py_rrnaBAMcount( bam=readset_metrics_bam, gtf=readset.annotation_files[0], output=os.path.join(readset.bam + ".metrics.rRNA.tsv"), typ="transcript")], name="bwa_mem_rRNA." + readset.name + ".rRNA" + "." + readset.run + "." + readset.lane) job.removable_files = [readset_metrics_bam] jobs.append(job) return jobs
def _estimate_ribosomal_rna(readset): """ Use bwa mem to align reads on the rRNA reference fasta and count the number of read mapped The filtered reads are aligned to a reference fasta file of ribosomal sequence. The alignment is done per sequencing readset. The alignment software used is [BWA](http://bio-bwa.sourceforge.net/) with algorithm: bwa mem. BWA output BAM files are then sorted by coordinate using [Picard](http://broadinstitute.github.io/picard/). """ jobs = [] if len(readset.annotation_files) > 1 and os.path.isfile( readset.annotation_files[0]) and os.path.isfile( readset.annotation_files[1]): readset_bam = readset.bam + ".bam" readset_metrics_bam = readset.bam + ".rRNA.bam" job = concat_jobs([ pipe_jobs([ bvatools.bam2fq(readset_bam), bwa.mem("/dev/stdin", None, read_group=RunProcessingAligner.get_rg_tag( readset, 'bwa_mem_rRNA'), ref=readset.annotation_files[1], ini_section='bwa_mem_rRNA'), picard.sort_sam("/dev/stdin", readset_metrics_bam, "coordinate", ini_section='picard_sort_sam_rrna') ]), tools.py_rrnaBAMcount( bam=readset_metrics_bam, gtf=readset.annotation_files[0], output=os.path.join(readset.bam + ".metrics.rRNA.tsv"), typ="transcript") ], name="bwa_mem_rRNA." + readset.name + ".rRNA" + "." + readset.run + "." + readset.lane) job.removable_files = [readset_metrics_bam] jobs.append(job) return jobs
def get_alignment_jobs(self, readset):
    """
    Build the alignment jobs for one readset: bwa mem piped into a Picard coordinate sort.

    The sorted BAM is written to `readset.bam + ".bam"`; its parent directory is created
    first. The reference passed to bwa mem is `readset.aligner_reference_index` and the
    read group comes from `RunProcessingAligner.get_rg_tag(readset, 'bwa_mem')`.

    Returns a one-element list with the job named
    `bwa_mem_picard_sort_sam.<readset name>_<run>_<lane>`.
    """
    sorted_bam = readset.bam + ".bam"

    make_output_dir = Job(command="mkdir -p " + os.path.dirname(sorted_bam))
    align = bwa.mem(
        readset.fastq1,
        readset.fastq2,
        read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem'),
        ref=readset.aligner_reference_index
    )
    sort = picard.sort_sam(
        "/dev/stdin",
        sorted_bam,
        "coordinate"
    )
    align_and_sort = pipe_jobs([align, sort])

    readset_tag = "_".join([readset.name, readset.run, readset.lane])
    alignment_job = concat_jobs(
        [make_output_dir, align_and_sort],
        name="bwa_mem_picard_sort_sam." + readset_tag
    )
    return [alignment_job]
def map_on_scaffolds(self):
    """
    Map the extracted read categories of every sample back onto its Ray scaffolds.

    For each sample, five independent jobs are created (orphan pairs, OEA mate 1 and 2,
    soft-clipped mate 1 and 2). Each job makes sure the coverage directory
    `scaffolds/<sample>/ray/ray<kmer>/cov` exists, aligns the fastq file(s) with bwa mem
    against `scaffolds/<sample>/ray/ray<kmer>/Scaffolds.fasta` and sorts the output by
    coordinate with Picard into the coverage directory. `<kmer>` is the `kmer` parameter of
    the `ray` config section.

    Returns the list of jobs, five per sample, in the order listed above.
    """
    # One row per job:
    # (job name tag, read-group ID/PU tag, input fastq suffixes, output BAM name).
    # Only the orphan reads are aligned as a pair; the other categories pass a single fastq.
    read_categories = [
        ("ORPHAN", "orphan", ("ORPHAN.1.fastq.gz", "ORPHAN.2.fastq.gz"), "ORPHAN.bam"),
        ("OEA1", "scoea1", ("OEAUNMAP.1.equal.fastq.gz",), "OEAUNMAP.1.bam"),
        ("OEA2", "scoea2", ("OEAUNMAP.2.equal.fastq.gz",), "OEAUNMAP.2.bam"),
        ("sclip1", "sclip1", ("sclip.1.fastq.gz",), "sclip.1.bam"),
        ("sclip2", "sclip2", ("sclip.2.fastq.gz",), "sclip.2.bam"),
    ]

    jobs = []
    for sample in self.samples:
        ray_directory = os.path.join("scaffolds", sample.name, "ray", "ray" + config.param('ray', 'kmer'))
        cov_directory = os.path.join(ray_directory, "cov")
        scaffolds_file = os.path.join(ray_directory, "Scaffolds.fasta")
        extract_file_prefix = os.path.join("extract", sample.name, sample.name + ".")
        mkdir_command = "if [ ! -d " + cov_directory + " ]; then mkdir -p " + cov_directory + "; fi"

        for job_tag, rg_tag, fastq_suffixes, bam_name in read_categories:
            fastqs = [extract_file_prefix + suffix for suffix in fastq_suffixes]
            read_group = "'@RG" + \
                "\tID:" + sample.name + "_ray_" + rg_tag + \
                "\tSM:" + sample.name + \
                "\tLB:" + sample.name + \
                "\tPU:" + rg_tag + \
                "\tCN:" + config.param('bwa_mem', 'sequencing_center') + \
                "\tPL:Illumina" + \
                "'"

            job = concat_jobs([
                Job(command=mkdir_command),
                pipe_jobs([
                    # Single-fastq categories leave bwa.mem's second input at its default.
                    bwa.mem(
                        *fastqs,
                        read_group=read_group,
                        ref=scaffolds_file
                    ),
                    picard.sort_sam(
                        "/dev/stdin",
                        os.path.join(cov_directory, bam_name),
                        "coordinate"
                    )
                ])
            ], name="bwa_mem_picard_sort_sam_" + job_tag + "_" + sample.name)
            jobs.append(job)

    return jobs