def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
    '''
    build a tree from self.aln with fasttree, optionally refined with RAxML

    All intermediate files are written to self.run_dir, which is removed
    again once the tree has been handed to self.tt_from_file.

    Parameters:
     - root: passed through to self.tt_from_file together with the tree file
     - raxml: if True, run RAxML on top of the fasttree tree (topology
              search followed by branch length optimization)
     - raxml_time_limit: wall-clock limit in hours for the RAxML topology
              search; if <= 0 the topology search is skipped and only the
              branch length optimization is attempted

    side-effects:
     - seq.name is set to seq.id for every record in self.aln
     - self.tt_from_file(<tree file>, root) is called
     - self.is_timetree is set to False
    '''
    from Bio import Phylo, AlignIO
    import glob
    import shutil
    import subprocess

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        for seq in self.aln:
            seq.name = seq.id
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = ["fasttree"]
        if self.nuc:
            tree_cmd.append("-nt")
        tree_cmd.extend(["temp.fasta", ">", "initial_tree.newick"])
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"
        if raxml:
            # both RAxML invocations read temp.phyx, so write it regardless
            # of whether the time-limited topology search is run
            AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
            if raxml_time_limit > 0:
                # the starting tree is passed through resolve_polytomies
                # (up to 11 times) until Bio.Phylo reports it as bifurcating
                tmp_tree = Phylo.read('initial_tree.newick', 'newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter < 10):
                    resolve_iter += 1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree, 'initial_tree.newick', 'newick')

                print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit*3600)
                process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                while time.time() < end_time:
                    # stop waiting when the final result exists or when
                    # RAxML has exited (e.g. crashed) without producing one
                    if os.path.isfile('RAxML_result.topology') or process.poll() is not None:
                        break
                    time.sleep(10)
                if process.poll() is None:
                    process.terminate()
                # reap the child so that it is no longer writing files
                process.wait()

                if os.path.isfile('RAxML_result.topology'):
                    last_tree_file = 'RAxML_result.topology'
                else:
                    checkpoint_files = glob.glob("RAxML_checkpoint*")
                    if checkpoint_files:
                        # glob order is arbitrary: pick the newest checkpoint
                        last_tree_file = max(checkpoint_files, key=os.path.getmtime)
                    else:
                        last_tree_file = "initial_tree.newick"
                shutil.copy(last_tree_file, 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            print("RAxML branch length optimization")
            os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
            try:
                # os.system does not raise when RAxML fails; the failure
                # shows up as a missing result file here
                shutil.copy('RAxML_result.branches', out_fname)
            except (IOError, OSError):
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)
        else:
            shutil.copy('initial_tree.newick', out_fname)
        self.tt_from_file(out_fname, root)
    finally:
        # never leave the process inside the scratch directory
        os.chdir(start_dir)
    remove_dir(self.run_dir)
    self.is_timetree = False
def align(self, fname, debug=False):
    '''
    align sequences using mafft

    Parameters:
     - fname: path (relative to the directory the method was called from)
              the mafft output is moved to
     - debug: if True, self.run_dir is kept instead of being removed

    side-effects:
        self.aln                {MultipleSeqAlignment} reference not present if not in self.seqs
        self.reference_aln      {SeqRecord} always set, even if the reference is subsequently discarded
        self.sequence_lookup    {dict} map linking seq.id to the alignment, potentially without the reference
        saves the alignment (always including reference) to fname
    '''
    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        # list(...) so that the concatenation below also works where
        # dict.values() returns a view rather than a list
        out_seqs = list(self.seqs.values())
        if not self.reference_in_dataset:
            self.log.notify("Adding reference for alignment step")
            out_seqs.append(self.reference_seq)
        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        self.log.notify("Running alignment")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
    finally:
        # restore the working directory even if mafft output is unusable
        os.chdir(start_dir)
    os.rename(os.path.join(self.run_dir, "temp_out.fasta"), fname)
    if not debug:
        remove_dir(self.run_dir)

    self.set_reference_alignment()
    if not self.reference_in_dataset:
        self.remove_reference_from_alignment()
    self.set_sequence_lookup()
    self.add_attributes_to_aln()
def align(self):
    '''
    align sequences using mafft

    The reference sequence is added to the mafft input if its name is not
    a key of self.seqs, and is filtered out of self.aln again afterwards.

    side-effects:
        self.sequence_lookup    {dict} seq.id -> aligned record, built from the
                                full mafft output (i.e. including the reference)
        self.aln                {MultipleSeqAlignment} without the reference
                                unless the reference is part of self.seqs
        the attributes of every entry of self.seqs are attached to the
        matching aligned record
    '''
    from Bio import AlignIO
    from Bio.Align import MultipleSeqAlignment

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        ref_in_set = self.reference_seq.name in self.seqs
        # list(...) so that append works where dict.values() is a view
        out_seqs = list(self.seqs.values())
        if not ref_in_set:
            out_seqs.append(self.reference_seq)
            print("align: reference in set", ref_in_set)
        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta > temp_out.fasta")

        tmp_aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id: seq for seq in tmp_aln}
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
        self.aln = MultipleSeqAlignment(
            [s for s in tmp_aln
             if s.name != self.reference_seq.name or ref_in_set])
    finally:
        # restore the working directory even if the alignment step failed
        os.chdir(start_dir)
    remove_dir(self.run_dir)
def build_newick(self, newick_file, nthreads=2, root='midpoint', raxml=True, raxml_bin='raxml', debug=False, num_distinct_starting_trees=1):
    '''
    infer a tree from self.aln with RAxML or fasttree and save it as newick

    Parameters:
     - newick_file: destination of the tree, relative to the directory the
                    method is called from
     - nthreads: value handed to the RAxML -T option
     - root: not used by this method
     - raxml: if True use RAxML, otherwise fasttree
     - raxml_bin: name or path of the RAxML executable
     - debug: if True, self.run_dir is kept instead of being removed
     - num_distinct_starting_trees: if different from 1, handed to the
                    RAxML -N option

    side-effects:
     - seq.name is set to seq.id for every record in self.aln
     - if the RAxML call fails the failure is logged and sys.exit(2) is
       called; self.run_dir (including raxml.log) is left in place
    '''
    from Bio import AlignIO
    import shutil

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        for seq in self.aln:
            seq.name = seq.id
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        out_fname = os.path.join(start_dir, newick_file)
        if raxml:
            self.logger("modified RAxML script - no branch length optimisation or time limit", 1)
            AlignIO.write(self.aln, "temp.phyx", "phylip-relaxed")
            if num_distinct_starting_trees == 1:
                starting_trees_opt = ""
            else:
                self.logger("RAxML running with {} starting trees (longer but better...)".format(num_distinct_starting_trees), 1)
                starting_trees_opt = " -N " + str(num_distinct_starting_trees)
            cmd = (raxml_bin + " -f d -T " + str(nthreads) + starting_trees_opt
                   + " -m GTRCAT -c 25 -p 235813 -n tre -s temp.phyx")

            # the context manager closes the log even when RAxML fails
            with open("raxml.log", 'w') as fh:
                try:
                    check_call(cmd, stdout=fh, stderr=STDOUT, shell=True)
                    self.logger("RAXML COMPLETED.", 1)
                except CalledProcessError:
                    self.logger("RAXML TREE FAILED - check {}/raxml.log".format(self.run_dir), 1)
                    sys.exit(2)
            shutil.copy('RAxML_bestTree.tre', out_fname)
        else:
            tree_cmd = ["fasttree"]
            if self.nuc:
                tree_cmd.append("-nt")
            tree_cmd.extend(["temp.fasta", "1>", "initial_tree.newick", "2>", "fasttree_stderr"])
            os.system(" ".join(tree_cmd))
            shutil.copy('initial_tree.newick', out_fname)
    finally:
        # also runs on sys.exit: never leave the process in the scratch dir
        os.chdir(start_dir)
    if not debug:
        remove_dir(self.run_dir)
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
    note that this suppresses any compensated frameshift mutations

    Parameters:
     - alignment_tool: ['mafft', 'muscle'] the commandline tool to use;
                       any other value prints a message and leaves
                       self.aln and self.sequence_lookup untouched
     - prune: unless this compares equal to False, sequences whose
              translation contains '*' anywhere but at the last position
              are left out of the alignment
     - verbose: if true, print the id of every discarded sequence

    side-effects:
        self.aln              result of pad_nucleotide_sequences
        self.sequence_lookup  {dict} seq.id -> record of self.aln, with the
                              attributes of the matching entry of self.seqs
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            has_stop = '*' in str(tempseq)[:-1]
            # use only sequences that translate without trouble
            if not has_stop or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            elif verbose:
                print(seq.id, "has premature stops, discarding")
            bad_seq += has_stop
        print('Number of sequences with stops:', bad_seq, 'out of total', len(self.seqs))

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        aln_aa = None
        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            aligned_fname = tmpfname[:-5] + 'aligned.fasta'
            cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
            cline()
            aln_aa = AlignIO.read(aligned_fname, "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            try:
                from StringIO import StringIO  # Python 2
            except ImportError:
                from io import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            # fall through to the cleanup below instead of returning from
            # inside the scratch directory
            print('Alignment tool not supported:', alignment_tool)

        if aln_aa is not None:
            # generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
            self.sequence_lookup = {seq.id: seq for seq in self.aln}
            # add attributes to alignment
            for seq in self.seqs.values():
                if seq.id in self.sequence_lookup:
                    self.sequence_lookup[seq.id].attributes = seq.attributes
    finally:
        # restore the working directory even if translation/alignment failed
        os.chdir(start_dir)
    remove_dir(self.run_dir)
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
    note that this suppresses any compensated frameshift mutations

    Parameters:
     - alignment_tool: ['mafft', 'muscle'] the commandline tool to use;
                       any other value prints a message and leaves
                       self.aln, self.sequence_lookup and
                       self.reference_aligned untouched
     - prune: unless this compares equal to False, sequences whose
              translation contains '*' anywhere but at the last position
              are left out of the alignment
     - verbose: if true, print the id of every discarded sequence

    side-effects:
        self.aln                result of pad_nucleotide_sequences
        self.sequence_lookup    {dict} seq.id -> record of self.aln, with the
                                attributes of the matching entry of self.seqs
        self.reference_aligned  the record of self.reference.id; a KeyError
                                is raised if no aligned record has that id
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            has_stop = '*' in str(tempseq)[:-1]
            # use only sequences that translate without trouble
            if not has_stop or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            elif verbose:
                print(seq.id, "has premature stops, discarding")
            bad_seq += has_stop
        print('Number of sequences with stops:', bad_seq, 'out of total', len(self.seqs))

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        aln_aa = None
        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            aligned_fname = tmpfname[:-5] + 'aligned.fasta'
            cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
            cline()
            aln_aa = AlignIO.read(aligned_fname, "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            try:
                from StringIO import StringIO  # Python 2
            except ImportError:
                from io import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            # fall through to the cleanup below instead of returning from
            # inside the scratch directory
            print('Alignment tool not supported:', alignment_tool)

        if aln_aa is not None:
            # generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
            self.sequence_lookup = {seq.id: seq for seq in self.aln}
            self.reference_aligned = self.sequence_lookup[self.reference.id]
            # add attributes to alignment
            for seq in self.seqs.values():
                if seq.id in self.sequence_lookup:
                    self.sequence_lookup[seq.id].attributes = seq.attributes
    finally:
        # restore the working directory even if translation/alignment failed
        os.chdir(start_dir)
    remove_dir(self.run_dir)
def align(self):
    '''
    align the sequences in self.seqs using mafft

    side-effects:
        self.aln                alignment read back from the mafft output
        self.sequence_lookup    {dict} seq.id -> record of self.aln
        self.reference_aligned  the record of self.reference.id; a KeyError
                                is raised if no aligned record has that id
        the attributes of every entry of self.seqs are attached to the
        matching aligned record
    '''
    from Bio import AlignIO

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta > temp_out.fasta")

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
    finally:
        # restore the working directory even if the alignment step failed
        os.chdir(start_dir)
    remove_dir(self.run_dir)
def build_newick(self, newick_file, nthreads=2, method="raxml", raxml_options={}, iqtree_options={}, debug=False):
    '''
    build a tree from self.aln with the chosen tool and save it as newick

    Parameters:
     - newick_file: destination of the tree, relative to the directory the
                    method is called from
     - nthreads: forwarded to self.build_newick_raxml
     - method: one of 'raxml', 'fasttree', 'iqtree'; any other value builds
               no tree and only logs a message
     - raxml_options: keyword arguments forwarded to self.build_newick_raxml
     - iqtree_options: keyword arguments forwarded to self.build_newick_iqtree
     - debug: if True, self.run_dir is kept instead of being removed

    side-effects:
     - seq.name is set to seq.id for every record in self.aln
    '''
    # NOTE: the dict defaults are only ever unpacked with **, never mutated
    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        for seq in self.aln:
            seq.name = seq.id
        # the builders run inside self.run_dir, hence the leading '..'
        out_fname = os.path.join("..", newick_file)
        tree_built = True
        if method == "raxml":
            self.build_newick_raxml(out_fname, nthreads=nthreads, **raxml_options)
        elif method == "fasttree":
            self.build_newick_fasttree(out_fname)
        elif method == "iqtree":
            self.build_newick_iqtree(out_fname, **iqtree_options)
        else:
            tree_built = False
    finally:
        # restore the working directory even if the tree builder raised
        os.chdir(start_dir)
    if tree_built:
        self.logger("Saved new tree to %s"%out_fname, 1)
    else:
        # do not claim success when nothing was written
        self.logger("Unknown tree building method %r - no tree was built" % (method,), 1)
    if not debug:
        remove_dir(self.run_dir)
def align(self):
    '''
    align the sequences in self.seqs using mafft

    side-effects:
        self.aln                alignment read back from the mafft output
        self.sequence_lookup    {dict} seq.id -> record of self.aln
        self.reference_aligned  the record of self.reference.id; a KeyError
                                is raised if no aligned record has that id
        the attributes of every entry of self.seqs are attached to the
        matching aligned record
    '''
    from Bio import AlignIO

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        mafft_cmd = ("mafft --anysymbol --thread " + str(self.nthreads)
                     + " temp_in.fasta > temp_out.fasta")
        os.system(mafft_cmd)

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
    finally:
        # restore the working directory even if the alignment step failed
        os.chdir(start_dir)
    remove_dir(self.run_dir)