def _distribution_output (self):
    """
    Write 2 csv tables: read distribution per reference and per sequence
    """
    output = "{}{}_Reference_distribution.csv".format(self.result_dir, self.outprefix)
    with open(output, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        # Table for all references
        writer.writerow(["Ref name", "length", "nread", "RPKB"])
        for ref in Reference.getInstances():
            writer.writerow([ref.name, len(ref), ref.nread, float(ref.nread)/len(ref)*1000])
        # Add a line for garbage reads, excluding the secondary alignments
        nread = sum([seq.nread for seq in self.garbage_read[1:]])
        writer.writerow(["Unmapped_and_LowMapq", "NA", nread, "NA"])

    output = "{}{}_Sequence_distribution.csv".format(self.result_dir, self.outprefix)
    with open(output, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        # Table decomposing sequences per reference
        writer.writerow(["Seq name", "length", "nread", "RPKB"])
        for ref in Reference.getInstances():
            for seq in ref.seq_dict.values():
                writer.writerow([seq.name, len(seq), seq.nread, float(seq.nread)/len(seq)*1000])
        # Add a line for each garbage read category, including the secondary alignments
        for seq in self.garbage_read:
            writer.writerow([seq.name, "NA", seq.nread, "NA"])
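# Illustrative sketch (not part of the pipeline): RPKB, as computed in the
# tables above, is reads per kilobase of reference, i.e. nread / length * 1000.
# The values below are hypothetical, for demonstration only.
def _example_rpkb():
    nread, length = 1500, 250000         # hypothetical read count and reference length
    rpkb = float(nread) / length * 1000  # = 6.0 reads per kilobase
    print("RPKB = {}".format(rpkb))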
def _iterative_masker (self):
    #### TODO The function directly manipulates the Reference field = change that
    """
    Mask reference homologies iteratively, starting with the last reference, which is
    masked by all the others, then the penultimate, masked by all the others except
    the last, and so forth until only 1 reference remains
    """
    # Iterate over indexes in Reference.Instances, starting from the last one down to the 2nd one
    for i in range(Reference.countInstances()-1, 0, -1):

        # Extract subject and query_list from the reference list
        subject = Reference.Instances[i]
        query_list = Reference.Instances[0:i]
        print("\n# PROCESSING REFERENCE {} #\n".format(subject.name))

        # Perform a blast of the query list against the subject
        hit_list = Blastn.align(
            query_list=[ref.ref_fasta for ref in query_list],
            subject_fasta=subject.ref_fasta,
            align_opt=self.blastn_opt,
            num_threads=self.blastn_threads,
            db_opt=self.mkblastdb_opt,
            db_outdir=self.db_dir,
            db_outname=subject.name)

        # Mask hits in the subject fasta if there are hits in hit_list
        subject.ref_fasta = mask(
            subject_fasta=subject.ref_fasta,
            hit_list=hit_list,
            ref_outdir=self.ref_dir,
            ref_outname="masked_{}.fa".format(subject.name),
            compress_ouput=False)
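# Illustrative sketch (assumed names, not part of the pipeline): the masking
# order implemented above, for four references A, B, C, D. The last reference
# is masked by all the preceding ones, then the penultimate, and so on; the
# first reference is never masked.
def _example_masking_order():
    refs = ["A", "B", "C", "D"]
    for i in range(len(refs)-1, 0, -1):
        subject, queries = refs[i], refs[0:i]
        print("{} masked by {}".format(subject, ", ".join(queries)))
    # Prints: D masked by A, B, C / C masked by A, B / B masked by A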
def _make_report (self):
    """
    Write a text report summarizing the parameters used for the run
    """
    output = "{}{}_parameters.txt".format(self.result_dir, self.outprefix)
    with open(output, 'wb') as outfile:
        # Reference options
        outfile.write("################## REFERENCES ##################\n\n")
        outfile.write(Reference.reprInstances())
        if self.ref_masking:
            outfile.write("Reference homologies were masked with RefMasker\n")
            outfile.write("blastn options : {}\n".format(self.blastn_opt))
            outfile.write("makeblastdb options : {}\n".format(self.mkblastdb_opt))
        else:
            outfile.write("No reference homology masking done\n")

        # Fastq options
        outfile.write("\n################## FASTQ FILES ##################\n\n")
        outfile.write("R1 : {}\n".format(self.R1))
        outfile.write("R2 : {}\n\n".format(self.R2))
        if self.quality_filtering or self.adapter_trimming:
            outfile.write(repr(self.fFilter)+"\n")
            if self.quality_filtering:
                outfile.write(repr(self.qFilter)+"\n")
            if self.adapter_trimming:
                outfile.write(repr(self.ssw_aligner)+"\n")
                outfile.write(repr(self.trimmer)+"\n")
        else:
            outfile.write("\nNo fastq filtering done\n")

        # bwa alignment options
        outfile.write("\n################## BWA ALIGNMENT ##################\n\n")
        outfile.write("index file : {}\n".format(self.bwa_index))
        outfile.write("bwa index options : {}\n".format(self.bwa_index_opt))
        outfile.write("bwa mem options : {}\n".format(self.bwa_mem_opt))
        outfile.write("bwa threads : {}\n".format(self.bwa_threads))

        # Output options
        outfile.write("\n################## OUTPUT ##################\n\n")
        outfile.write("Minimal MAPQ score : {}\n".format(self.min_mapq))
        outfile.write("Write garbage reads to sam : {}\n".format(str(self.unmapped_sam)))
        outfile.write("Write garbage reads to bam : {}\n".format(str(self.unmapped_bam)))
        outfile.write("Minimal depth for coverage output : {}\n".format(self.cov_min_depth))
        outfile.write("Minimal depth for variant output : {}\n".format(self.var_min_depth))
        outfile.write("Minimal variant frequency : {}\n".format(self.var_min_freq))
def _sam_spliter (self):
    """
    Parse the bwa output sam file and assign each read to its reference or to a garbage category
    """
    with pysam.Samfile(self.sam, "r") as samfile:
        self.bam_header = samfile.header

        # Give the header of the sam file to all Reference.Instances to keep the same order
        # of references in the sorted bam files
        Reference.set_global("bam_header", self.bam_header)

        # Create Sequence objects to collect secondary, unmapped and low quality reads
        Secondary = Sequence(name='Secondary', length=0)
        Unmapped = Sequence(name='Unmapped', length=0)
        LowMapq = Sequence(name='LowMapq', length=0)
        self.garbage_read = [Secondary, Unmapped, LowMapq]

        for read in samfile:
            # Always remove secondary alignments
            if read.is_secondary:
                Secondary.add_read(read)
            # Filter unmapped reads
            elif read.tid == -1:
                Unmapped.add_read(read)
            # Filter low MAPQ reads
            elif read.mapq < self.min_mapq:
                LowMapq.add_read(read)
            # Filter short alignments ##### FOR FUTURE: CREATE A SEPARATE CATEGORY
            elif len(read.query_alignment_sequence) < self.min_size:
                Unmapped.add_read(read)
            # Finally, if all is fine, attribute the read to a Reference
            else:
                Reference.addRead(samfile.getrname(read.tid), read)

    # Remove the original sam file, which is no longer needed
    remove(self.sam)
    self.sam = None
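# Illustrative sketch (standalone, assumed file name and thresholds): the same
# triage cascade as _sam_spliter, written against the current pysam API
# (AlignmentFile, is_unmapped, mapping_quality), counting reads per category
# instead of dispatching them to Sequence objects.
def _example_read_triage(sam_path="example.sam", min_mapq=30, min_size=25):
    import pysam
    counts = {"secondary": 0, "unmapped": 0, "low_mapq": 0, "short": 0, "kept": 0}
    with pysam.AlignmentFile(sam_path, "r") as sam:
        for read in sam:
            if read.is_secondary:
                counts["secondary"] += 1
            elif read.is_unmapped:
                counts["unmapped"] += 1
            elif read.mapping_quality < min_mapq:
                counts["low_mapq"] += 1
            elif len(read.query_alignment_sequence or "") < min_size:
                counts["short"] += 1
            else:
                counts["kept"] += 1
    return counts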
def _extract_ref (self, expand=True):
    """
    Import fasta references and their associated flags into Reference objects. If required
    for downstream operations, expand gzipped files to avoid multiple
    compression/decompression cycles during execution
    """
    for ref in self.raw_ref_list:

        # Expand the fasta file if needed
        if expand:
            ref_fasta = expand_file(infile=ref['fasta'], outdir=self.ref_dir)
        else:
            ref_fasta = ref['fasta']

        # Create a Reference object
        Ref = Reference(
            name=ref['name'],
            ref_fasta=ref_fasta,
            bam_maker=Bam.BamMaker(
                make_bam='bam' in ref['output'],
                make_sam='sam' in ref['output']),
            cov_maker=Coverage.CoverageMaker(
                min_depth=self.cov_min_depth,
                make_bedgraph='bedgraph' in ref['output'],
                make_bed='bed' in ref['output'],
                make_covgraph='covgraph' in ref['output']),
            var_maker=Variant.VariantMaker(
                min_depth=self.var_min_depth,
                min_freq=self.var_min_freq,
                make_freqvar='variant' in ref['output']))

        ## Test if all seq in ref are longer than 3000 for compatibility with bwa
        #for seq in Ref.seq_dict.values():
            #if seq.length < 3000:
                #import_and_pad (

        print(repr(Ref))
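# Illustrative sketch (assumed behaviour): expand_file is expected to
# decompress a gzipped fasta into outdir and return the path of the expanded
# copy, or return the original path untouched when the file is already plain
# text. A minimal stand-in could look like this.
def _example_expand_file(infile, outdir):
    import gzip, shutil
    from os import path
    if not infile.endswith(".gz"):
        return infile
    outfile = path.join(outdir, path.basename(infile)[:-3])
    with gzip.open(infile, "rb") as src, open(outfile, "wb") as dst:
        shutil.copyfileobj(src, dst)
    return outfile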
def __call__(self):
    """
    Launch the complete analysis pipeline:
    * Reference importation/parsing
    * Optional step of reference masking to remove homologies between reference sequences
    * Optional step of fastq quality filtering/adapter trimming
    * Optional step of reference indexing for bwa from the merged references
    * Short read alignment with bwa mem
    * Splitting of the sam to attribute reads to each original reference (or unmapped)
    * Output of per-reference bam, sam, bedgraph, bed, covgraph and variant call
    * Output of a distribution table and graph
    """
    stime = time()
    self.outdir = mkdir(path.abspath(self.outdir))

    print("\n##### PARSE REFERENCES #####\n")
    # Create a CV_Reference.Reference object for each reference, easily accessible through
    # Reference class methods
    if self.ref_masking or not self.bwa_index:
        self.ref_dir = mkdir(path.join(self.outdir, "references/"))
        self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
        self._extract_ref(expand=True)
    else:
        self.ref_dir = ""
        self.index_dir = ""
        self._extract_ref(expand=False)

    # Reference masking
    if self.ref_masking:
        print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
        self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
        self._iterative_masker()
        # Erase the existing index value if reference masking was performed
        self.bwa_index = None

    # Fastq filtering
    if self.quality_filtering or self.adapter_trimming:
        print("\n##### FASTQ FILTERING #####\n")
        self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
        self.R1, self.R2 = self._fastq_filter()

    # BWA alignment
    print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
    # An index will be generated if no index was provided
    self.result_dir = mkdir(path.join(self.outdir, "results/"))
    self.sam = Mem.align(
        self.R1, self.R2,
        index=self.bwa_index,
        ref=Reference.allFasta(),
        align_opt=self.bwa_mem_opt,
        index_opt=self.bwa_index_opt,
        aligner=self.bwa_aligner,
        align_threads=self.bwa_threads,
        indexer=self.bwa_indexer,
        align_outdir=self.result_dir,
        index_outdir=self.index_dir,
        align_outname=self.outprefix+".sam",
        index_outname=self.outprefix+".idx")

    print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
    # Split the output sam file according to each reference
    self._sam_spliter()

    print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
    # Deal with the garbage read categories
    self._garbage_output()
    # Ask references to generate the output they were configured to
    Reference.mk_output_global(self.result_dir+self.outprefix)
    # Create distribution tables and a parameter report
    self._distribution_output()
    self._make_report()

    print("\n##### DONE #####\n")
    print("Total execution time = {}s".format(round(time()-stime, 2)))
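# Illustrative usage note: the enclosing class name and its constructor
# arguments are not shown in this excerpt, so the names below are assumptions
# for demonstration only. The pipeline object is configured once, then invoked
# through __call__ as defined above, e.g.:
#   pipeline = Pipeline(outdir="./out/", R1="sample_R1.fastq.gz", R2="sample_R2.fastq.gz", ...)
#   pipeline()  # runs parsing, masking, filtering, alignment, splitting and reporting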