示例#1
0
    def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
        '''
        infer a tree from self.aln with fasttree, optionally refine it with
        RAxML, and load the result via self.tt_from_file

        Parameters:
        - root: passed through unchanged to self.tt_from_file
        - raxml: if True, run the RAxML branch length optimization (and, when
                 raxml_time_limit>0, a RAxML topology search) starting from
                 the fasttree tree; if False the fasttree tree is used as is
        - raxml_time_limit: wall-clock limit in hours for the RAxML topology
                 search; values <=0 skip the topology search

        side-effects:
            works inside self.run_dir (created here, removed on success)
            overwrites seq.name with seq.id for every record in self.aln
            sets self.is_timetree to False
        '''
        from Bio import Phylo, AlignIO
        import subprocess, glob, shutil
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            for seq in self.aln: seq.name=seq.id
            AlignIO.write(self.aln, 'temp.fasta', 'fasta')

            tree_cmd = ["fasttree"]
            if self.nuc: tree_cmd.append("-nt")
            tree_cmd.extend(["temp.fasta", ">", "initial_tree.newick"])
            os.system(" ".join(tree_cmd))

            out_fname = "tree_infer.newick"
            if raxml:
                # both RAxML invocations below read temp.phyx, so it has to
                # exist even when the topology search is skipped
                AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
                if raxml_time_limit>0:
                    tmp_tree = Phylo.read('initial_tree.newick','newick')
                    resolve_iter = 0
                    resolve_polytomies(tmp_tree)
                    # retry a bounded number of times until the tree is bifurcating
                    while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                        resolve_iter+=1
                        resolve_polytomies(tmp_tree)
                    Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
                    print( "RAxML tree optimization with time limit", raxml_time_limit,  "hours")
                    # using exec to be able to kill process
                    end_time = time.time() + int(raxml_time_limit*3600)
                    process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                    while (time.time() < end_time):
                        if os.path.isfile('RAxML_result.topology'):
                            break
                        # RAxML exited without a result (e.g. it crashed):
                        # there is nothing left to wait for
                        if process.poll() is not None:
                            break
                        time.sleep(10)
                    # only signal a process that is still running, then reap it
                    if process.poll() is None:
                        process.terminate()
                    process.wait()

                    # glob order is arbitrary; sort by modification time so
                    # that the last entry is the most recent checkpoint
                    checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"),
                                              key=os.path.getmtime)
                    if os.path.isfile('RAxML_result.topology'):
                        checkpoint_files.append('RAxML_result.topology')
                    if len(checkpoint_files) > 0:
                        last_tree_file = checkpoint_files[-1]
                        shutil.copy(last_tree_file, 'raxml_tree.newick')
                    else:
                        shutil.copy("initial_tree.newick", 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')

                print("RAxML branch length optimization")
                os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                try:
                    shutil.copy('RAxML_result.branches', out_fname)
                except (IOError, OSError):
                    # os.system does not raise; a missing result file is how
                    # a failed RAxML run shows up here
                    print("RAxML branch length optimization failed")
                    shutil.copy('raxml_tree.newick', out_fname)
            else:
                shutil.copy('initial_tree.newick', out_fname)
            self.tt_from_file(out_fname, root)
        finally:
            # leave the scratch directory even if one of the steps above failed
            os.chdir('..')
        remove_dir(self.run_dir)
        self.is_timetree=False
示例#2
0
    def align(self, fname, debug=False):
        '''
        align sequences using mafft

        Parameters:
        - fname: path the alignment file is moved to (relative paths are
                 resolved against the directory containing self.run_dir)
        - debug: if True, self.run_dir is kept instead of being removed

        side-effects:
            self.aln {MultipleSeqAlignment} reference not present if not in self.seqs
            self.reference_aln {SeqRecord} always set, even if the reference is subsequently discarded
            self.sequence_lookup {dict} map linking seq.id to the alignment, potentially without the reference
            saves the alignment (always including reference) to fname
        '''
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            # materialise the values so the reference can be appended no
            # matter whether values() returns a list or a view
            out_seqs = list(self.seqs.values())
            if not self.reference_in_dataset:
                self.log.notify("Adding reference for alignment step")
                out_seqs.append(self.reference_seq)

            SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
            self.log.notify("Running alignment")
            os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                      " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
            self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # restore the working directory even if the alignment could not be read
            os.chdir("..")
        os.rename(os.path.join(self.run_dir, "temp_out.fasta"), fname)
        if not debug: remove_dir(self.run_dir)

        self.set_reference_alignment()
        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()
        self.set_sequence_lookup()
        self.add_attributes_to_aln()
示例#3
0
    def align(self):
        '''
        align sequences using mafft

        The reference sequence is added to the mafft input when its name is
        not a key of self.seqs, and is filtered out of self.aln again in
        that case.

        side-effects:
            self.sequence_lookup {dict} maps seq.id to the aligned record for
                every record mafft returned (including an added reference)
            self.aln {MultipleSeqAlignment} aligned records, without the
                reference unless it is part of self.seqs
            works inside self.run_dir (created here, removed on success)
        '''
        from Bio import AlignIO
        from Bio.Align import MultipleSeqAlignment
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            ref_in_set = self.reference_seq.name in self.seqs
            # materialise the values so the reference can be appended no
            # matter whether values() returns a list or a view
            out_seqs = list(self.seqs.values())
            if not ref_in_set:
                out_seqs.append(self.reference_seq)
            print("align: reference in set", ref_in_set)
            SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
            os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                      " temp_in.fasta > temp_out.fasta")

            tmp_aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # restore the working directory even if the alignment could not be read
            os.chdir('..')

        self.sequence_lookup = {seq.id: seq for seq in tmp_aln}
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
        self.aln = MultipleSeqAlignment([
            s for s in tmp_aln
            if ref_in_set or s.name != self.reference_seq.name
        ])
        remove_dir(self.run_dir)
示例#4
0
文件: tree.py 项目: zachcp/augur
 def build_newick(self, newick_file, nthreads=2, root='midpoint', raxml=True, raxml_bin='raxml', debug=False, num_distinct_starting_trees=1):
     '''
     build a tree from self.aln with RAxML or fasttree and copy it to newick_file

     Parameters:
     - newick_file: output path, interpreted relative to the parent
                    directory of self.run_dir
     - nthreads: value passed to RAxML via -T
     - root: not used by this method
     - raxml: if True run RAxML, otherwise run fasttree
     - raxml_bin: name/path of the RAxML executable
     - debug: if True, self.run_dir is kept instead of being removed
     - num_distinct_starting_trees: if not 1, passed to RAxML via -N

     side-effects:
         overwrites seq.name with seq.id for every record in self.aln
         calls sys.exit(2) if the RAxML command returns a non-zero exit code
     '''
     from Bio import AlignIO
     import shutil
     make_dir(self.run_dir)
     os.chdir(self.run_dir)
     try:
         for seq in self.aln: seq.name=seq.id
         AlignIO.write(self.aln, 'temp.fasta', 'fasta')
         out_fname = os.path.join("..", newick_file)
         if raxml:
             self.logger("modified RAxML script - no branch length optimisation or time limit", 1)
             AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
             cmd = raxml_bin + " -f d -T " + str(nthreads)
             if num_distinct_starting_trees != 1:
                 self.logger("RAxML running with {} starting trees (longer but better...)".format(num_distinct_starting_trees), 1)
                 cmd += " -N " + str(num_distinct_starting_trees)
             cmd += " -m GTRCAT -c 25 -p 235813 -n tre -s temp.phyx"
             # the with-block guarantees the log file is flushed and closed,
             # including on the sys.exit path
             with open("raxml.log", 'w') as fh:
                 try:
                     check_call(cmd, stdout=fh, stderr=STDOUT, shell=True)
                 except CalledProcessError:
                     self.logger("RAXML TREE FAILED - check {}/raxml.log".format(self.run_dir), 1)
                     sys.exit(2)
             self.logger("RAXML COMPLETED.", 1)
             shutil.copy('RAxML_bestTree.tre', out_fname)
         else:
             tree_cmd = ["fasttree"]
             if self.nuc: tree_cmd.append("-nt")
             tree_cmd.extend(["temp.fasta","1>","initial_tree.newick", "2>", "fasttree_stderr"])
             os.system(" ".join(tree_cmd))
             shutil.copy('initial_tree.newick', out_fname)
     finally:
         # restore the working directory even if tree building failed
         os.chdir('..')
     if not debug:
         remove_dir(self.run_dir)
示例#5
0
    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use;
                          any other value prints a message and leaves
                          self.aln / self.sequence_lookup untouched
        - prune: if true, sequences whose translation contains a stop before
                 the last position are left out of the alignment
        - verbose: if true, print the id of every discarded sequence

        side-effects:
            self.aln is set to the result of pad_nucleotide_sequences
            self.sequence_lookup {dict} maps seq.id to the records of self.aln
            works inside self.run_dir, which is removed afterwards
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            # translate
            aa_seqs = {}
            bad_seq = 0
            for seq in self.seqs.values():
                tempseq = seq.seq.translate()
                # a stop anywhere but at the last position counts as premature
                has_stop = '*' in str(tempseq)[:-1]
                bad_seq += has_stop
                # use only sequences that translate without trouble
                if not has_stop or not prune:
                    aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                    aa_seqs[seq.id].attributes = seq.attributes
                elif verbose:
                    print(seq.id, "has premature stops, discarding")

            print('Number of sequences with stops:', bad_seq, 'out of total',
                  len(self.seqs))
            tmpfname = 'temp_in.fasta'
            SeqIO.write(list(aa_seqs.values()), tmpfname, 'fasta')

            if alignment_tool == 'muscle':
                from Bio.Align.Applications import MuscleCommandline
                aligned_fname = tmpfname[:-5] + 'aligned.fasta'
                cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
                cline()
                aln_aa = AlignIO.read(aligned_fname, "fasta")
            elif alignment_tool == 'mafft':
                from Bio.Align.Applications import MafftCommandline
                try:
                    from StringIO import StringIO
                except ImportError:  # the StringIO module does not exist on Python 3
                    from io import StringIO
                mafft_cline = MafftCommandline(input=tmpfname)
                stdout, stderr = mafft_cline()
                aln_aa = AlignIO.read(StringIO(stdout), "fasta")
            else:
                print('Alignment tool not supported:', alignment_tool)
                aln_aa = None

            if aln_aa is not None:
                #generate nucleotide alignment
                self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
                self.sequence_lookup = {seq.id: seq for seq in self.aln}
                # add attributes to alignment
                for seq in self.seqs.values():
                    if seq.id in self.sequence_lookup:
                        self.sequence_lookup[seq.id].attributes = seq.attributes
        finally:
            # always leave the scratch directory, also for an unsupported
            # tool or a failed aligner run
            os.chdir('..')
        remove_dir(self.run_dir)
示例#6
0
    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use;
                          any other value prints a message and leaves the
                          alignment attributes untouched
        - prune: if true, sequences whose translation contains a stop before
                 the last position are left out of the alignment
        - verbose: if true, print the id of every discarded sequence

        side-effects:
            self.aln is set to the result of pad_nucleotide_sequences
            self.sequence_lookup {dict} maps seq.id to the records of self.aln
            self.reference_aligned is the aligned record of self.reference
                (KeyError if the reference is not part of the alignment)
            works inside self.run_dir, which is removed afterwards
        '''
        from Bio import AlignIO,SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        aln_aa = None
        try:
            # translate
            aa_seqs = {}
            bad_seq = 0
            for seq in self.seqs.values():
                tempseq = seq.seq.translate()
                # a stop anywhere but at the last position counts as premature
                premature_stop = '*' in str(tempseq)[:-1]
                if premature_stop:
                    bad_seq += 1
                # use only sequences that translate without trouble
                if not premature_stop or not prune:
                    aa_seqs[seq.id]=SeqRecord(tempseq,id=seq.id)
                    aa_seqs[seq.id].attributes = seq.attributes
                elif verbose:
                    print(seq.id,"has premature stops, discarding")

            print('Number of sequences with stops:',bad_seq,'out of total',len(self.seqs))
            tmpfname = 'temp_in.fasta'
            SeqIO.write(list(aa_seqs.values()), tmpfname,'fasta')

            if alignment_tool=='muscle':
                from Bio.Align.Applications import MuscleCommandline
                aligned_fname = tmpfname[:-5]+'aligned.fasta'
                cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
                cline()
                aln_aa = AlignIO.read(aligned_fname, "fasta")
            elif alignment_tool=='mafft':
                from Bio.Align.Applications import MafftCommandline
                try:
                    from StringIO import StringIO
                except ImportError:  # the StringIO module does not exist on Python 3
                    from io import StringIO
                mafft_cline = MafftCommandline(input=tmpfname)
                stdout, stderr = mafft_cline()
                aln_aa = AlignIO.read(StringIO(stdout), "fasta")
            else:
                print('Alignment tool not supported:',alignment_tool)

            if aln_aa is not None:
                #generate nucleotide alignment
                self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
                self.sequence_lookup = {seq.id:seq for seq in self.aln}
                self.reference_aligned = self.sequence_lookup[self.reference.id]
                # add attributes to alignment
                for seq in self.seqs.values():
                    if seq.id in self.sequence_lookup:
                        self.sequence_lookup[seq.id].attributes = seq.attributes
        finally:
            # always leave the scratch directory, also for an unsupported
            # tool or a failed aligner run
            os.chdir('..')
        remove_dir(self.run_dir)
示例#7
0
    def align(self):
        '''
        align the sequences in self.seqs using mafft

        side-effects:
            self.aln {MultipleSeqAlignment} alignment read from the mafft output
            self.sequence_lookup {dict} maps seq.id to the records of self.aln
            self.reference_aligned is the aligned record of self.reference
            every aligned record gets the attributes of its input sequence
            works inside self.run_dir (created here, removed on success)
        '''
        from Bio import AlignIO
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            SeqIO.write(list(self.seqs.values()), "temp_in.fasta", "fasta")
            os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta > temp_out.fasta")

            self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # restore the working directory even if the alignment could not be read
            os.chdir('..')
        self.sequence_lookup = {seq.id:seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
        remove_dir(self.run_dir)
示例#8
0
 def build_newick(self, newick_file, nthreads=2, method="raxml", raxml_options=None,
                  iqtree_options=None, debug=False):
     '''
     build a tree from self.aln with the chosen program and write it to newick_file

     Parameters:
     - newick_file: output path, interpreted relative to the parent
                    directory of self.run_dir
     - nthreads: forwarded to self.build_newick_raxml
     - method: one of "raxml", "fasttree", "iqtree"; any other value builds
               nothing and is reported through self.logger
     - raxml_options: dict of extra keyword arguments for self.build_newick_raxml
     - iqtree_options: dict of extra keyword arguments for self.build_newick_iqtree
     - debug: if True, self.run_dir is kept instead of being removed

     side-effects:
         overwrites seq.name with seq.id for every record in self.aln
     '''
     # None instead of {} as default avoids sharing one dict between calls
     raxml_options = {} if raxml_options is None else raxml_options
     iqtree_options = {} if iqtree_options is None else iqtree_options
     make_dir(self.run_dir)
     os.chdir(self.run_dir)
     tree_built = True
     try:
         for seq in self.aln: seq.name=seq.id
         out_fname = os.path.join("..", newick_file)
         if method=="raxml":
             self.build_newick_raxml(out_fname, nthreads=nthreads, **raxml_options)
         elif method=="fasttree":
             self.build_newick_fasttree(out_fname)
         elif method=="iqtree":
             self.build_newick_iqtree(out_fname, **iqtree_options)
         else:
             tree_built = False
     finally:
         # restore the working directory even if the tree builder failed
         os.chdir('..')
     if tree_built:
         self.logger("Saved new tree to %s"%out_fname, 1)
     else:
         # do not claim a tree was saved when no builder ran
         self.logger("Unknown tree building method '%s' - no tree was built"%(method,), 1)
     if not debug:
         remove_dir(self.run_dir)
示例#9
0
    def align(self):
        '''
        align the sequences in self.seqs using mafft

        side-effects:
            self.aln {MultipleSeqAlignment} alignment read from the mafft output
            self.sequence_lookup {dict} maps seq.id to the records of self.aln
            self.reference_aligned is the aligned record of self.reference
            every aligned record gets the attributes of its input sequence
            works inside self.run_dir (created here, removed on success)
        '''
        from Bio import AlignIO
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            SeqIO.write(list(self.seqs.values()), "temp_in.fasta", "fasta")
            os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                      " temp_in.fasta > temp_out.fasta")
            aligned = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # restore the working directory even if the alignment could not be read
            os.chdir('..')

        self.aln = aligned
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
        remove_dir(self.run_dir)