示例#1
0
    def _distribution_output(self):
        """
        Write two tab-separated tables describing how the reads were distributed.

        * ``<result_dir><outprefix>_Reference_distribution.csv``: one row per
          Reference instance, followed by a single row pooling every entry of
          self.garbage_read except the first one (the original comment describes
          this as excluding the secondary alignments).
        * ``<result_dir><outprefix>_Sequence_distribution.csv``: one row per
          sequence of each Reference, followed by one row per entry of
          self.garbage_read.

        The last column of each regular row is nread / length * 1000.
        """
        def per_kilobase(count, size):
            # Same arithmetic order as the inline float(count)/size*1000 expression
            return float(count) / size * 1000

        # Table 1: one line per reference
        ref_table = "{}{}_Reference_distribution.csv".format(self.result_dir, self.outprefix)
        with open(ref_table, 'wb') as handle:
            table = csv.writer(handle, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            table.writerow(["Ref name","length","nread","RPKB"])
            table.writerows(
                [ref.name, len(ref), ref.nread, per_kilobase(ref.nread, len(ref))]
                for ref in Reference.getInstances())
            # Pooled garbage line: every garbage entry but the first one
            pooled = sum(entry.nread for entry in self.garbage_read[1:])
            table.writerow(["Unmaped_and LowMapq","NA",pooled,"NA"])

        # Table 2: one line per sequence of each reference
        seq_table = "{}{}_Sequence_distribution.csv".format(self.result_dir, self.outprefix)
        with open(seq_table, 'wb') as handle:
            table = csv.writer(handle, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            table.writerow(["Seq name","length","nread","RPKB"])
            table.writerows(
                [seq.name, len(seq), seq.nread, per_kilobase(seq.nread, len(seq))]
                for ref in Reference.getInstances()
                for seq in ref.seq_dict.values())
            # Here every garbage entry gets its own line, the first one included
            table.writerows(
                [entry.name, "NA", entry.nread, "NA"]
                for entry in self.garbage_read)
示例#2
0
    def _iterative_masker(self):  # TODO: this method rewrites Reference.ref_fasta in place; change that
        """
        Iteratively mask the homologies shared between references.

        Walks Reference.Instances from the last entry down to the second one.
        At each step the current reference (the subject) is blasted against all
        the references placed before it, and the hits are masked in the subject
        fasta. The first reference is never used as a subject, so it is left
        unmasked.

        The ref_fasta attribute of each processed Reference is replaced by the
        value returned by mask().
        """
        n_refs = Reference.countInstances()

        # Indexes n_refs-1 .. 1, i.e. last reference first, first reference skipped
        for idx in reversed(range(1, n_refs)):
            target = Reference.Instances[idx]
            earlier_refs = Reference.Instances[:idx]
            print("\n# PROCESSING REFERENCE {} #\n".format(target.name))

            # Blast every earlier reference against the current subject
            query_fastas = [ref.ref_fasta for ref in earlier_refs]
            blast_hits = Blastn.align(
                query_list=query_fastas,
                subject_fasta=target.ref_fasta,
                align_opt=self.blastn_opt,
                num_threads=self.blastn_threads,
                db_opt=self.mkblastdb_opt,
                db_outdir=self.db_dir,
                db_outname=target.name)

            # Replace the subject fasta by whatever mask() returns for these hits
            masked_name = "masked_{}.fa".format(target.name)
            target.ref_fasta = mask(
                subject_fasta=target.ref_fasta,
                hit_list=blast_hits,
                ref_outdir=self.ref_dir,
                ref_outname=masked_name,
                compress_ouput=False)
示例#3
0
    def _iterative_masker(self):  # TODO: avoid mutating the Reference ref_fasta field directly
        """
        Mask homologous regions between references, one reference at a time.

        The references are processed from the last one to the second one: each
        is used as the blast subject while all the references stored before it
        in Reference.Instances are used as queries. The hits found are then
        handed to mask() and the result replaces the subject's ref_fasta.
        Nothing is done when fewer than two references exist.
        """
        subject_index = Reference.countInstances() - 1

        # Count down until only the first reference remains
        while subject_index > 0:
            subject_ref = Reference.Instances[subject_index]
            queries = Reference.Instances[0:subject_index]
            print("\n# PROCESSING REFERENCE {} #\n".format(subject_ref.name))

            # Arguments of the blast of all queries against the subject
            blast_kwargs = dict(
                query_list=[query.ref_fasta for query in queries],
                subject_fasta=subject_ref.ref_fasta,
                align_opt=self.blastn_opt,
                num_threads=self.blastn_threads,
                db_opt=self.mkblastdb_opt,
                db_outdir=self.db_dir,
                db_outname=subject_ref.name)
            hits = Blastn.align(**blast_kwargs)

            # Mask the hits in the subject fasta and keep the returned value
            subject_ref.ref_fasta = mask(
                subject_fasta=subject_ref.ref_fasta,
                hit_list=hits,
                ref_outdir=self.ref_dir,
                ref_outname="masked_{}.fa".format(subject_ref.name),
                compress_ouput=False)

            subject_index -= 1
示例#4
0
    def _make_report(self):
        """
        Write the run parameters to ``<result_dir><outprefix>_parameters.txt``.

        The report has four sections: references (with the masking options when
        masking was requested), fastq files (with the repr of the filtering
        objects that were enabled), bwa alignment options and output options.
        """
        report_path = "{}{}_parameters.txt".format(self.result_dir, self.outprefix)
        with open(report_path, 'wb') as report:
            chunks = []
            add = chunks.append

            # References section
            add("################## REFERENCES ##################\n\n")
            add(Reference.reprInstances())
            if not self.ref_masking:
                add("No Reference homologies masking done\n")
            else:
                add("Reference homologies were masked with RefMasker\n")
                add("blastn options : {}\n".format(self.blastn_opt))
                add("makeblastdb options : {}\n".format(self.mkblastdb_opt))

            # Fastq section
            add("\n################## FASTQ FILES ##################\n\n")
            add("R1 : {}\n".format(self.R1))
            add("R2 : {}\n\n".format(self.R2))
            filtering_done = self.quality_filtering or self.adapter_trimming
            if not filtering_done:
                add("\nNo Fastq Filtering done\n")
            else:
                add(repr(self.fFilter) + "\n")
                if self.quality_filtering:
                    add(repr(self.qFilter) + "\n")
                if self.adapter_trimming:
                    add(repr(self.ssw_aligner) + "\n")
                    add(repr(self.trimmer) + "\n")

            # bwa alignment section
            add("\n################## BWA ALIGNMENT ##################\n\n")
            add("index file : {}\n".format(self.bwa_index))
            add("bwa index options: {}\n".format(self.bwa_index_opt))
            add("bwa mem option: {}\n".format(self.bwa_mem_opt))
            add("bwa threads : {}\n".format(self.bwa_threads))

            # Output section
            add("\n################## OUTPUT ##################\n\n")
            add("Minimal MAPQ score : {}\n".format(self.min_mapq))
            add("Write garbage reads to sam: {}\n".format(str(self.unmapped_sam)))
            add("Write garbage reads to bam: {}\n".format(str(self.unmapped_bam)))
            add("Minimal depth for Coverage output : {}\n".format(self.cov_min_depth))
            add("Minimal depth for Variant output : {}\n".format(self.var_min_depth))
            add("Minimal Variant frequency : {}\n".format(self.var_min_freq))

            # Flush every collected chunk in order
            report.writelines(chunks)
示例#5
0
    def _sam_spliter(self):
        """
        Dispatch every read of self.sam to a Reference or to a garbage category.

        The sam header is stored in self.bam_header and shared with the
        Reference class. Three Sequence objects (Secondary, Unmapped, LowMapq)
        are created and stored, in that order, in self.garbage_read. Reads are
        tested in this order: secondary alignment, unmapped (tid == -1),
        mapq below self.min_mapq, aligned part shorter than self.min_size
        (counted with the unmapped reads). Any other read is given to
        Reference.addRead under its reference name.

        The sam file is deleted afterwards and self.sam is reset to None.
        """
        with pysam.Samfile(self.sam, "r") as sam_in:
            self.bam_header = sam_in.header

            # Share the header with the Reference class (per the original note:
            # to keep the same reference order in sorted bam files)
            Reference.set_global("bam_header", self.bam_header)

            # Containers for the reads that are not attributed to a reference
            secondary_bin = Sequence(name='Secondary', length=0)
            unmapped_bin = Sequence(name='Unmapped', length=0)
            lowmapq_bin = Sequence(name='LowMapq', length=0)
            self.garbage_read = [secondary_bin, unmapped_bin, lowmapq_bin]

            for aln in sam_in:
                # Secondary alignments are always set aside
                if aln.is_secondary:
                    secondary_bin.add_read(aln)
                    continue
                # Unmapped reads
                if aln.tid == -1:
                    unmapped_bin.add_read(aln)
                    continue
                # Reads under the MAPQ threshold
                if aln.mapq < self.min_mapq:
                    lowmapq_bin.add_read(aln)
                    continue
                # Too short aligned part: counted as unmapped for now
                # (a dedicated category is planned)
                if len(aln.query_alignment_sequence) < self.min_size:
                    unmapped_bin.add_read(aln)
                    continue
                # Everything passed: attribute the read to its reference
                Reference.addRead(sam_in.getrname(aln.tid), aln)

        # The original sam file is no longer needed
        remove(self.sam)
        self.sam = None
示例#6
0
    def _sam_spliter(self):
        """
        Split the reads of self.sam between the References and garbage bins.

        Stores the sam header in self.bam_header and passes it to
        Reference.set_global, then fills self.garbage_read with three Sequence
        objects in the order [Secondary, Unmapped, LowMapq]. Each read goes to
        the first matching bin (secondary, unmapped, low MAPQ, short alignment
        counted as unmapped) or, when none matches, to Reference.addRead.

        Finally the sam file is removed from disk and self.sam is set to None.
        """
        with pysam.Samfile(self.sam, "r") as samfile:
            header = samfile.header
            self.bam_header = header

            # Every Reference receives the same header (per the original note:
            # to respect the same reference order in sorted bam files)
            Reference.set_global("bam_header", self.bam_header)

            # Garbage bins, created in the order they are stored
            secondary = Sequence(name='Secondary', length=0)
            unmapped = Sequence(name='Unmapped', length=0)
            low_mapq = Sequence(name='LowMapq', length=0)
            self.garbage_read = [secondary, unmapped, low_mapq]

            def garbage_bin_for(read):
                # Return the bin a read must be discarded into, or None if it is valid
                if read.is_secondary:
                    return secondary
                if read.tid == -1:
                    return unmapped
                if read.mapq < self.min_mapq:
                    return low_mapq
                # Short alignments share the unmapped bin until they get their own
                if len(read.query_alignment_sequence) < self.min_size:
                    return unmapped
                return None

            for read in samfile:
                destination = garbage_bin_for(read)
                # Identity test on purpose: a bin object may evaluate as false
                if destination is None:
                    Reference.addRead(samfile.getrname(read.tid), read)
                else:
                    destination.add_read(read)

        # Delete the sam file, which is not needed anymore
        remove(self.sam)
        self.sam = None
示例#7
0
 def _make_report(self):
     """
     Dump the parameters of the run in ``<result_dir><outprefix>_parameters.txt``.

     Sections are written in this order: references (and masking options if
     masking was requested), fastq files (and the repr of each enabled
     filtering object), bwa alignment options, output options.
     """
     target = "{}{}_parameters.txt".format(self.result_dir, self.outprefix)
     with open(target, 'wb') as handle:
         emit = handle.write

         # References options
         emit("################## REFERENCES ##################\n\n")
         emit(Reference.reprInstances())
         if self.ref_masking:
             masking_lines = (
                 "Reference homologies were masked with RefMasker\n",
                 "blastn options : {}\n".format(self.blastn_opt),
                 "makeblastdb options : {}\n".format(self.mkblastdb_opt))
             for line in masking_lines:
                 emit(line)
         else:
             emit("No Reference homologies masking done\n")

         # Fastq options
         emit("\n################## FASTQ FILES ##################\n\n")
         emit("R1 : {}\n".format(self.R1))
         emit("R2 : {}\n\n".format(self.R2))
         if not (self.quality_filtering or self.adapter_trimming):
             emit("\nNo Fastq Filtering done\n")
         else:
             emit(repr(self.fFilter)+"\n")
             if self.quality_filtering:
                 emit(repr(self.qFilter)+"\n")
             if self.adapter_trimming:
                 for component in (self.ssw_aligner, self.trimmer):
                     emit(repr(component)+"\n")

         # bwa alignment options
         emit("\n################## BWA ALIGNMENT ##################\n\n")
         bwa_fields = (
             ("index file : {}\n", self.bwa_index),
             ("bwa index options: {}\n", self.bwa_index_opt),
             ("bwa mem option: {}\n", self.bwa_mem_opt),
             ("bwa threads : {}\n", self.bwa_threads))
         for template, value in bwa_fields:
             emit(template.format(value))

         # Output options
         emit("\n################## OUTPUT ##################\n\n")
         output_fields = (
             ("Minimal MAPQ score : {}\n", self.min_mapq),
             ("Write garbage reads to sam: {}\n", str(self.unmapped_sam)),
             ("Write garbage reads to bam: {}\n", str(self.unmapped_bam)),
             ("Minimal depth for Coverage output : {}\n", self.cov_min_depth),
             ("Minimal depth for Variant output : {}\n", self.var_min_depth),
             ("Minimal Variant frequency : {}\n", self.var_min_freq))
         for template, value in output_fields:
             emit(template.format(value))
示例#8
0
    def _extract_ref(self, expand=True):
        """
        Create a Reference object for every entry of self.raw_ref_list.

        expand: when True, each fasta is first passed through expand_file with
        self.ref_dir as output directory (per the original documentation, to
        expand gzipped files once instead of decompressing them repeatedly in
        later steps); when False the fasta path is used as given.

        Each entry is read through its 'name', 'fasta' and 'output' keys. The
        'output' value decides which outputs (bam, sam, bedgraph, bed, covgraph,
        variant) the makers attached to the Reference are asked to produce.
        The repr of every created Reference is printed.
        """
        for entry in self.raw_ref_list:
            # Expand the fasta only when requested
            fasta = entry['fasta']
            if expand:
                fasta = expand_file(infile=fasta, outdir=self.ref_dir)

            ref_name = entry['name']
            wanted = entry['output']

            # Makers configured from the outputs requested for this reference
            bam_maker = Bam.BamMaker(
                make_bam='bam' in wanted,
                make_sam='sam' in wanted)
            cov_maker = Coverage.CoverageMaker(
                min_depth=self.cov_min_depth,
                make_bedgraph='bedgraph' in wanted,
                make_bed='bed' in wanted,
                make_covgraph='covgraph' in wanted)
            var_maker = Variant.VariantMaker(
                min_depth=self.var_min_depth,
                min_freq=self.var_min_freq,
                make_freqvar='variant' in wanted)

            reference = Reference(
                name=ref_name,
                ref_fasta=fasta,
                bam_maker=bam_maker,
                cov_maker=cov_maker,
                var_maker=var_maker)

            # TODO: test that every sequence of the reference is longer than 3000
            # for compatibility with bwa, and pad the shorter ones (never implemented)

            print(repr(reference))
示例#9
0
    def __call__(self):
        """
        Launch the complete analysis pipeline:

        * Reference import/parsing
        * Optional reference masking to remove homologies between reference sequences
        * Optional Fastq quality filtering / adapter trimming
        * Optional reference indexing for bwa from the merged references
        * Short read alignment with bwa mem
        * Splitting of the sam to attribute reads to each original reference (or unmapped)
        * Output per reference: bam, sam, bedgraph, bed, covgraph, variant call
        * Output of the distribution tables and of the parameter report

        Attributes set along the way: outdir, ref_dir, index_dir, result_dir and
        sam, plus db_dir when masking is requested and fastq_dir, R1, R2 when
        fastq filtering or trimming is requested. When masking is requested
        bwa_index is reset to None.
        """
        stime = time()
        self.outdir = mkdir(path.abspath(self.outdir))

        print("\n##### PARSE REFERENCES #####\n")
        # Create a Reference object for each reference, easily accessible through
        # the Reference class methods

        if self.ref_masking or not self.bwa_index:
            # References will be rewritten and/or indexed: working directories needed
            self.ref_dir = mkdir(path.join(self.outdir, "references/"))
            self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
            self._extract_ref(expand=True)
        else:
            self.ref_dir = ""
            self.index_dir = ""
            self._extract_ref(expand=False)

        # Reference Masking
        if self.ref_masking:
            print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
            self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
            self._iterative_masker()
            # Erase the existing index value if masking was performed. This must be
            # done on the instance attribute: assigning a local name would leave the
            # index of the unmasked references in use for the alignment below.
            self.bwa_index = None

        # Fastq Filtering
        if self.quality_filtering or self.adapter_trimming:
            print("\n##### FASTQ FILTERING #####\n")
            self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
            self.R1, self.R2 = self._fastq_filter()

        # BWA alignment
        print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
        # An index will be generated if no index was provided
        self.result_dir = mkdir(path.join(self.outdir, "results/"))

        self.sam = Mem.align(self.R1,
                             self.R2,
                             index=self.bwa_index,
                             ref=Reference.allFasta(),
                             align_opt=self.bwa_mem_opt,
                             index_opt=self.bwa_index_opt,
                             aligner=self.bwa_aligner,
                             align_threads=self.bwa_threads,
                             indexer=self.bwa_indexer,
                             align_outdir=self.result_dir,
                             index_outdir=self.index_dir,
                             align_outname=self.outprefix + ".sam",
                             index_outname=self.outprefix + ".idx")

        print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
        # Split the output sam file according to each reference
        self._sam_spliter()

        print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
        # Deal with the garbage reads
        self._garbage_output()
        # Ask the references to generate the outputs they were configured for
        Reference.mk_output_global(self.result_dir + self.outprefix)
        # Create the distribution tables and the parameter report
        self._distribution_output()
        self._make_report()

        print("\n##### DONE #####\n")
        print("Total execution time = {}s".format(round(time() - stime, 2)))
示例#10
0
    def __call__(self):
        """
        Run the whole pipeline, step by step:

        * Reference import/parsing
        * Optional reference masking to remove homologies between reference sequences
        * Optional Fastq quality filtering / adapter trimming
        * Optional reference indexing for bwa from the merged references
        * Short read alignment with bwa mem
        * Splitting of the sam to attribute reads to each original reference (or unmapped)
        * Output per reference: bam, sam, bedgraph, bed, covgraph, variant call
        * Output of the distribution tables and of the parameter report

        The method sets outdir, ref_dir, index_dir, result_dir and sam on the
        instance; db_dir is set only when masking is requested, and fastq_dir,
        R1 and R2 only when fastq filtering or trimming is requested. When
        masking is requested bwa_index is reset to None.
        """
        stime = time()
        self.outdir = mkdir(path.abspath(self.outdir))

        print("\n##### PARSE REFERENCES #####\n")
        # Create a Reference object for each reference, easily accessible through
        # the Reference class methods

        needs_ref_workdirs = self.ref_masking or not self.bwa_index
        if needs_ref_workdirs:
            self.ref_dir = mkdir(path.join(self.outdir, "references/"))
            self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
            self._extract_ref(expand=True)
        else:
            self.ref_dir = ""
            self.index_dir = ""
            self._extract_ref(expand=False)

        # Reference Masking
        if self.ref_masking:
            print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
            self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
            self._iterative_masker()
            # Erase the existing index value since the references were modified.
            # The instance attribute has to be reset: a bare local assignment
            # would be discarded and the index of the unmasked references would
            # still be handed to the aligner below.
            self.bwa_index = None

        # Fastq Filtering
        if self.quality_filtering or self.adapter_trimming:
            print("\n##### FASTQ FILTERING #####\n")
            self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
            self.R1, self.R2 = self._fastq_filter()

        # BWA alignment
        print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
        # An index will be generated if no index was provided
        self.result_dir = mkdir(path.join(self.outdir, "results/"))

        self.sam = Mem.align(
            self.R1, self.R2,
            index=self.bwa_index,
            ref=Reference.allFasta(),
            align_opt=self.bwa_mem_opt,
            index_opt=self.bwa_index_opt,
            aligner=self.bwa_aligner,
            align_threads=self.bwa_threads,
            indexer=self.bwa_indexer,
            align_outdir=self.result_dir,
            index_outdir=self.index_dir,
            align_outname=self.outprefix+".sam",
            index_outname=self.outprefix+".idx")

        print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
        # Split the output sam file according to each reference
        self._sam_spliter()

        print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
        # Deal with the garbage reads
        self._garbage_output()
        # Ask the references to generate the outputs they were configured for
        Reference.mk_output_global(self.result_dir+self.outprefix)
        # Create the distribution tables and the parameter report
        self._distribution_output()
        self._make_report()

        print("\n##### DONE #####\n")
        print("Total execution time = {}s".format(round(time()-stime, 2)))