Exemplo n.º 1
0
    def __call__(self):
        """
        Launch the complete analysis pipeline.

        Steps, in order:

        * Reference importation/parsing
        * Optional reference masking to remove homologies between reference sequences
        * Optional fastq quality filtering / adapter trimming
        * Optional reference indexing for bwa from the merged references
        * Short read alignment with bwa mem
        * Splitting of the sam file to attribute reads to each original reference (or unmapped)
        * Output per reference: bam, sam, bedgraph, bed, covgraph, variant call
        * Output of the distribution table and graph

        Sets self.outdir, self.ref_dir, self.index_dir, self.result_dir and self.sam, plus
        self.db_dir / self.fastq_dir when the matching optional step runs. Prints a banner
        before each step and the total execution time at the end.
        """
        stime = time()
        self.outdir = mkdir(path.abspath(self.outdir))

        print("\n##### PARSE REFERENCES #####\n")
        # Create CV_Reference.Reference object for each reference easily accessible through
        # Reference class methods

        # Reference and index folders are only needed when references will be rewritten
        # (masking) or when an index has to be built.
        if self.ref_masking or not self.bwa_index:
            self.ref_dir = mkdir(path.join(self.outdir, "references/"))
            self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
            self._extract_ref(expand=True)
        else:
            self.ref_dir = ""
            self.index_dir = ""
            self._extract_ref(expand=False)

        # Reference Masking
        if self.ref_masking:
            print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
            self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
            self._iterative_masker()
            # Discard any user-supplied index once masking has run, so that a new index is
            # generated for the alignment below. This has to be set on self: the alignment
            # step reads self.bwa_index, a local variable would be silently ignored.
            self.bwa_index = None

        # Fastq Filtering
        if self.quality_filtering or self.adapter_trimming:
            print("\n##### FASTQ FILTERING #####\n")
            self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
            self.R1, self.R2 = self._fastq_filter()

        # BWA alignment
        print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
        # An index will be generated if no index was provided
        self.result_dir = mkdir(path.join(self.outdir, "results/"))

        self.sam = Mem.align(self.R1,
                             self.R2,
                             index=self.bwa_index,
                             ref=Reference.allFasta(),
                             align_opt=self.bwa_mem_opt,
                             index_opt=self.bwa_index_opt,
                             aligner=self.bwa_aligner,
                             align_threads=self.bwa_threads,
                             indexer=self.bwa_indexer,
                             align_outdir=self.result_dir,
                             index_outdir=self.index_dir,
                             align_outname=self.outprefix + ".sam",
                             index_outname=self.outprefix + ".idx")

        print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
        # Split the output sam file according to each reference
        self._sam_spliter()

        print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
        # Deal with garbage read dictionary
        self._garbage_output()
        # Ask references to generate the output they were configured to
        Reference.mk_output_global(self.result_dir + self.outprefix)
        # Create a distribution table
        self._distribution_output()
        self._make_report()

        print("\n##### DONE #####\n")
        print("Total execution time = {}s".format(round(time() - stime, 2)))
Exemplo n.º 2
0
def align(R1,
          R2='',
          index='',
          ref='',
          aligner="bwa mem",
          align_opt="",
          align_threads=1,
          align_outdir="./bwa_align/",
          align_outname="out.sam",
          indexer="bwa index",
          index_opt="",
          index_outdir="./bwa_index/",
          index_outname="out"):
    """
    Align single or paired fastq reads against a bwa index and return the resulting sam file.

    The index given through `index` is used when it can be validated. When no index is given,
    or when an error is raised while validating it, a new index is built from `ref` instead.
    With a valid existing index, `ref` and the index options are not required.
    @param R1 Path to the file containing fastq sequences (can be gzipped)
    @param R2 Optional path to the file containing paired fastq sequences (can be gzipped)
    @param index Index files basename if available
    @param ref Path of the fasta file containing the reference sequence (can be gzipped).
    This parameter can also be a list of fasta files (gzipped or not); in this case all
    references will be merged into a single fasta reference
    @param aligner Path to the bwa mem executable. Not required if bwa is in your path
    @param align_opt Bwa mem command line options as a string
    @param align_threads Thread count passed through to the Aligner object
    @param align_outdir Directory where to store the sam file
    @param align_outname Name of the output sam file
    @param indexer Path to the bwa index executable. Not required if bwa is in your path
    @param index_opt Bwa index command line options as a string
    @param index_outdir Directory where to store the index files
    @param index_outname Basename of the index file
    @return Path of the output sam file
    """
    # First choice: reuse the index supplied by the caller
    try:
        if index:
            print("Existing index provided")
            bwa_idx = ExistingIndex(index)
        else:
            raise Exception("No index provided")

    # Fallback: nothing supplied, or the supplied index could not be validated
    except Exception as err:
        print(err)

        # Without a reference there is nothing to build an index from
        if not ref:
            raise Exception(
                "Invalid or no fasta file provided. Cannot create an index")

        print("Generating index...")
        mkdir(index_outdir)
        bwa_idx = NewIndex(ref, path.join(index_outdir, index_outname),
                           index_opt, indexer)

    # Wrap the index in an Aligner and make sure the output folder exists
    bwa_mem = Aligner(bwa_idx, align_opt, aligner, align_threads)
    mkdir(align_outdir)

    # Run the alignment of R1 (and R2) and hand back whatever the Aligner returns
    sam_path = path.join(align_outdir, align_outname)
    return bwa_mem.align(R1, R2, sam_path)
Exemplo n.º 3
0
    def __call__(self):
        """
        Launch the complete analysis pipeline.

        Steps, in order:

        * Reference importation/parsing
        * Optional reference masking to remove homologies between reference sequences
        * Optional fastq quality filtering / adapter trimming
        * Optional reference indexing for bwa from the merged references
        * Short read alignment with bwa mem
        * Splitting of the sam file to attribute reads to each original reference (or unmapped)
        * Output per reference: bam, sam, bedgraph, bed, covgraph, variant call
        * Output of the distribution table and graph

        Sets self.outdir, self.ref_dir, self.index_dir, self.result_dir and self.sam, plus
        self.db_dir / self.fastq_dir when the matching optional step runs. Prints a banner
        before each step and the total execution time at the end.
        """
        stime = time()
        self.outdir = mkdir(path.abspath(self.outdir))

        print("\n##### PARSE REFERENCES #####\n")
        # Create CV_Reference.Reference object for each reference easily accessible through
        # Reference class methods

        # Reference and index folders are only needed when references will be rewritten
        # (masking) or when an index has to be built.
        if self.ref_masking or not self.bwa_index:
            self.ref_dir = mkdir(path.join(self.outdir, "references/"))
            self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
            self._extract_ref(expand=True)
        else:
            self.ref_dir = ""
            self.index_dir = ""
            self._extract_ref(expand=False)

        # Reference Masking
        if self.ref_masking:
            print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
            self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
            self._iterative_masker()
            # Discard any user-supplied index once masking has run, so that a new index is
            # generated for the alignment below. This has to be set on self: the alignment
            # step reads self.bwa_index, a local variable would be silently ignored.
            self.bwa_index = None

        # Fastq Filtering
        if self.quality_filtering or self.adapter_trimming:
            print("\n##### FASTQ FILTERING #####\n")
            self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
            self.R1, self.R2 = self._fastq_filter()

        # BWA alignment
        print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
        # An index will be generated if no index was provided
        self.result_dir = mkdir(path.join(self.outdir, "results/"))

        self.sam = Mem.align(
            self.R1, self.R2,
            index=self.bwa_index,
            ref=Reference.allFasta(),
            align_opt=self.bwa_mem_opt,
            index_opt=self.bwa_index_opt,
            aligner=self.bwa_aligner,
            align_threads=self.bwa_threads,
            indexer=self.bwa_indexer,
            align_outdir=self.result_dir,
            index_outdir=self.index_dir,
            align_outname=self.outprefix + ".sam",
            index_outname=self.outprefix + ".idx")

        print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
        # Split the output sam file according to each reference
        self._sam_spliter()

        print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
        # Deal with garbage read dictionary
        self._garbage_output()
        # Ask references to generate the output they were configured to
        Reference.mk_output_global(self.result_dir + self.outprefix)
        # Create a distribution table
        self._distribution_output()
        self._make_report()

        print("\n##### DONE #####\n")
        print("Total execution time = {}s".format(round(time() - stime, 2)))
Exemplo n.º 4
0
def align(query_list,
          subject_db=None,
          subject_fasta=None,
          aligner="blastn",
          align_opt="",
          num_threads=1,
          db_maker="makeblastdb",
          db_opt="",
          db_outdir="./blast_db/",
          db_outname="out"):
    """
    Blast every query file against a subject database and collect all the hits.

    * Reuse the database given through `subject_db` when it can be validated, otherwise build
    a new one from `subject_fasta`.
    * Blast each entry of `query_list` in turn against that database and concatenate the hits.
    @param query_list List of paths indicating fasta files containing query sequences (can be
    gzipped). Fasta can contain multiple sequences.
    @param subject_db Basename of file from a blast database created by "makeblastdb" if available
    @param subject_fasta Reference fasta file. Required if no subject_db is given (can be gzipped)
    @param aligner Path to the blastn executable. Not required if blast+ is in your path
    @param align_opt Blastn command line options as a string
    @param num_threads Thread count passed through to the Aligner object
    @param db_maker Path to the makeblastdb executable. Not required if blast+ is in your path
    @param db_opt makeblastdb command line options as a string
    @param db_outdir Directory where to store the database files
    @param db_outname Basename of the database files
    @return A list of BlastHit objects
    """
    # First choice: reuse the database supplied by the caller
    try:
        if subject_db:
            print("Existing database provided")
            blast_db = ExistingDB(subject_db)
        else:
            raise Exception("No Blast database was provided")

    # Fallback: nothing supplied, or the supplied database could not be validated
    except Exception as err:
        print(err)

        # A readable fasta file is mandatory to build a database
        fasta_usable = subject_fasta and path.isfile(subject_fasta)
        if not fasta_usable:
            raise Exception("Invalid or no fasta file provided. Cannot create a database")

        print("Generate a database...")
        mkdir(db_outdir)
        blast_db = NewDB(
            ref_path=subject_fasta,
            db_path=path.join(db_outdir, db_outname),
            makeblastdb_opt=db_opt,
            makeblastdb=db_maker)

    # Wrap the database in an Aligner, then flatten the hits of every query into one list
    blastn = Aligner(blast_db, align_opt, aligner, num_threads)
    return [hit for query in query_list for hit in blastn.align(query)]
Exemplo n.º 5
0
def mask(subject_fasta,
         hit_list,
         ref_outdir="./references/",
         ref_outname="masked_ref.fa",
         compress_ouput=True):
    """
    Mask the regions of a reference fasta covered by hits and write the result to a new file.

    Every record of subject_fasta is copied to the output; for records whose id matches the
    s_id of at least one hit, each position in range(s_start, s_end) is replaced by 'n'.
    @param subject_fasta Fasta sequence of the subject to edit (can be gzipped, detected from
    a name ending in "gz")
    @param hit_list List of hit objects. Hits need at least 3 fields named s_id, s_start and s_end
    corresponding to the name of the sequence matched, and the hit start/end (0 based, end
    excluded).
    @param ref_outdir Directory where the masked reference will be created
    @param ref_outname Name of the masked reference (".gz" is appended when compressed)
    @param compress_ouput If true the output will be gzipped
    @return Path of the masked fasta file, or subject_fasta unchanged when hit_list is empty
    or when its first element lacks one of the required fields.
    """

    # Only the first hit is probed for the required s_id, s_start and s_end fields
    try:
        first_hit = hit_list[0]
        for field in ("s_id", "s_start", "s_end"):
            getattr(first_hit, field)

    except IndexError:
        print("No hit found, The subject fasta file will not be edited")
        return subject_fasta
    except AttributeError:
        print(
            "The list provided does not contain suitable hit object, The subject fasta file will not be edited"
        )
        return subject_fasta

    # Initialize output folder
    mkdir(ref_outdir)

    # Pick the openers for the input and output fasta files
    # NOTE(review): gzip.open(..., "r"/"w") gives binary handles on Python 3 whereas text is
    # read and written below; this matches the original behavior -- confirm the target
    # interpreter before changing the modes.
    in_open = gzip.open if subject_fasta[-2:].lower() == "gz" else open
    if compress_ouput:
        ref_path = path.join(ref_outdir, ref_outname + ".gz")
        out_open = gzip.open
    else:
        ref_path = path.join(ref_outdir, ref_outname)
        out_open = open

    # Group the hits per subject id once, so that each record only visits its own hits
    # instead of rescanning the whole hit list
    hits_by_id = {}
    for hit in hit_list:
        hits_by_id.setdefault(hit.s_id, []).append(hit)

    # Iterate over record in the subject fasta file
    print("Masking hit positions and writting a new reference for {} ".format(
        ref_outname))
    n_modified = 0
    start_time = time()

    # The with statement guarantees both handles are closed even if parsing or writing fails
    with in_open(subject_fasta, "r") as in_handle, \
            out_open(ref_path, "w") as out_handle:
        for record in SeqIO.parse(in_handle, "fasta"):
            # Progress Marker
            stdout.write("*")
            stdout.flush()

            record_hits = hits_by_id.get(record.id)
            if record_hits:
                n_modified += 1
                # Casting Seq type to MutableSeq Type to allow in place editing
                # NOTE(review): tomutable() is deprecated in newer Biopython releases --
                # confirm the pinned version.
                record.seq = record.seq.tomutable()

                # Replace every base between the start and end coordinates of each hit
                for hit in record_hits:
                    for position in range(hit.s_start, hit.s_end):
                        record.seq[position] = 'n'

            # Finally write the sequence modified or not
            out_handle.write(record.format("fasta"))

    print("")
    # Report the number of edited sequences and the elapsed time rounded to 2 decimals
    print("{} sequence(s) from {} modified in {}s".format(
        n_modified, ref_outname, round(time() - start_time, 2)))

    return ref_path
Exemplo n.º 6
0
def mask(subject_fasta,
         hit_list,
         ref_outdir="./references/",
         ref_outname="masked_ref.fa",
         compress_ouput=True):
    """
    Mask the regions of a reference fasta covered by hits and write the result to a new file.

    Every record of subject_fasta is copied to the output; for records whose id matches the
    s_id of at least one hit, each position in range(s_start, s_end) is replaced by 'n'.
    @param subject_fasta Fasta sequence of the subject to edit (can be gzipped, detected from
    a name ending in "gz")
    @param hit_list List of hit objects. Hits need at least 3 fields named s_id, s_start and s_end
    corresponding to the name of the sequence matched, and the hit start/end (0 based, end
    excluded).
    @param ref_outdir Directory where the masked reference will be created
    @param ref_outname Name of the masked reference (".gz" is appended when compressed)
    @param compress_ouput If true the output will be gzipped
    @return Path of the masked fasta file, or subject_fasta unchanged when hit_list is empty
    or when its first element lacks one of the required fields.
    """

    # Only the first hit is probed for the required s_id, s_start and s_end fields
    try:
        first_hit = hit_list[0]
        for field in ("s_id", "s_start", "s_end"):
            getattr(first_hit, field)

    except IndexError:
        print("No hit found, The subject fasta file will not be edited")
        return subject_fasta
    except AttributeError:
        print("The list provided does not contain suitable hit object, The subject fasta file will not be edited")
        return subject_fasta

    # Initialize output folder
    mkdir(ref_outdir)

    # Pick the openers for the input and output fasta files
    # NOTE(review): gzip.open(..., "r"/"w") gives binary handles on Python 3 whereas text is
    # read and written below; this matches the original behavior -- confirm the target
    # interpreter before changing the modes.
    in_open = gzip.open if subject_fasta[-2:].lower() == "gz" else open
    if compress_ouput:
        ref_path = path.join(ref_outdir, ref_outname + ".gz")
        out_open = gzip.open
    else:
        ref_path = path.join(ref_outdir, ref_outname)
        out_open = open

    # Group the hits per subject id once, so that each record only visits its own hits
    # instead of rescanning the whole hit list
    hits_by_id = {}
    for hit in hit_list:
        hits_by_id.setdefault(hit.s_id, []).append(hit)

    # Iterate over record in the subject fasta file
    print("Masking hit positions and writting a new reference for {} ".format(ref_outname))
    n_modified = 0
    start_time = time()

    # The with statement guarantees both handles are closed even if parsing or writing fails
    with in_open(subject_fasta, "r") as in_handle, \
            out_open(ref_path, "w") as out_handle:
        for record in SeqIO.parse(in_handle, "fasta"):
            # Progress Marker
            stdout.write("*")
            stdout.flush()

            record_hits = hits_by_id.get(record.id)
            if record_hits:
                n_modified += 1
                # Casting Seq type to MutableSeq Type to allow in place editing
                # NOTE(review): tomutable() is deprecated in newer Biopython releases --
                # confirm the pinned version.
                record.seq = record.seq.tomutable()

                # Replace every base between the start and end coordinates of each hit
                for hit in record_hits:
                    for position in range(hit.s_start, hit.s_end):
                        record.seq[position] = 'n'

            # Finally write the sequence modified or not
            out_handle.write(record.format("fasta"))

    print("")
    # Report the number of edited sequences and the elapsed time rounded to 2 decimals
    print("{} sequence(s) from {} modified in {}s".format(
        n_modified, ref_outname, round(time() - start_time, 2)))

    return ref_path
Exemplo n.º 7
0
def align(query_list,
          subject_db=None,
          subject_fasta=None,
          aligner="blastn",
          align_opt="",
          num_threads=1,
          db_maker="makeblastdb",
          db_opt="",
          db_outdir="./blast_db/",
          db_outname="out"):
    """
    Blast every query file against a subject database and collect all the hits.

    * Reuse the database given through `subject_db` when it can be validated, otherwise build
    a new one from `subject_fasta`.
    * Blast each entry of `query_list` in turn against that database and concatenate the hits.
    @param query_list List of paths indicating fasta files containing query sequences (can be
    gzipped). Fasta can contain multiple sequences.
    @param subject_db Basename of file from a blast database created by "makeblastdb" if available
    @param subject_fasta Reference fasta file. Required if no subject_db is given (can be gzipped)
    @param aligner Path to the blastn executable. Not required if blast+ is in your path
    @param align_opt Blastn command line options as a string
    @param num_threads Thread count passed through to the Aligner object
    @param db_maker Path to the makeblastdb executable. Not required if blast+ is in your path
    @param db_opt makeblastdb command line options as a string
    @param db_outdir Directory where to store the database files
    @param db_outname Basename of the database files
    @return A list of BlastHit objects
    """
    # First choice: reuse the database supplied by the caller
    try:
        if subject_db:
            print("Existing database provided")
            blast_db = ExistingDB(subject_db)
        else:
            raise Exception("No Blast database was provided")

    # Fallback: nothing supplied, or the supplied database could not be validated
    except Exception as err:
        print(err)

        # A readable fasta file is mandatory to build a database
        fasta_usable = subject_fasta and path.isfile(subject_fasta)
        if not fasta_usable:
            raise Exception(
                "Invalid or no fasta file provided. Cannot create a database")

        print("Generate a database...")
        mkdir(db_outdir)
        new_db_path = path.join(db_outdir, db_outname)
        blast_db = NewDB(ref_path=subject_fasta,
                         db_path=new_db_path,
                         makeblastdb_opt=db_opt,
                         makeblastdb=db_maker)

    # Wrap the database in an Aligner, then flatten the hits of every query into one list
    blastn = Aligner(blast_db, align_opt, aligner, num_threads)
    return [hit for query in query_list for hit in blastn.align(query)]
Exemplo n.º 8
0
Arquivo: Mem.py Projeto: a-slide/pyDNA
def align(R1,
          R2='',
          index='',
          ref='',
          aligner="bwa mem",
          align_opt="",
          align_threads=1,
          align_outdir="./bwa_align/",
          align_outname="out.sam",
          indexer="bwa index",
          index_opt="",
          index_outdir="./bwa_index/",
          index_outname="out"):
    """
    Align single or paired fastq reads against a bwa index and return the resulting sam file.

    The index given through `index` is used when it can be validated. When no index is given,
    or when an error is raised while validating it, a new index is built from `ref` instead.
    With a valid existing index, `ref` and the index options are not required.
    @param R1 Path to the file containing fastq sequences (can be gzipped)
    @param R2 Optional path to the file containing paired fastq sequences (can be gzipped)
    @param index Index files basename if available
    @param ref Path of the fasta file containing the reference sequence (can be gzipped).
    This parameter can also be a list of fasta files (gzipped or not); in this case all
    references will be merged into a single fasta reference
    @param aligner Path to the bwa mem executable. Not required if bwa is in your path
    @param align_opt Bwa mem command line options as a string
    @param align_threads Thread count passed through to the Aligner object
    @param align_outdir Directory where to store the sam file
    @param align_outname Name of the output sam file
    @param indexer Path to the bwa index executable. Not required if bwa is in your path
    @param index_opt Bwa index command line options as a string
    @param index_outdir Directory where to store the index files
    @param index_outname Basename of the index file
    @return Path of the output sam file
    """
    # First choice: reuse the index supplied by the caller
    try:
        if index:
            print("Existing index provided")
            bwa_idx = ExistingIndex(index)
        else:
            raise Exception("No index provided")

    # Fallback: nothing supplied, or the supplied index could not be validated
    except Exception as err:
        print(err)

        # Without a reference there is nothing to build an index from
        if not ref:
            raise Exception("Invalid or no fasta file provided. Cannot create an index")

        print("Generating index...")
        mkdir(index_outdir)
        new_index_path = path.join(index_outdir, index_outname)
        bwa_idx = NewIndex(ref, new_index_path, index_opt, indexer)

    # Wrap the index in an Aligner and make sure the output folder exists
    bwa_mem = Aligner(bwa_idx, align_opt, aligner, align_threads)
    mkdir(align_outdir)

    # Run the alignment of R1 (and R2) and hand back whatever the Aligner returns
    sam_path = path.join(align_outdir, align_outname)
    return bwa_mem.align(R1, R2, sam_path)