def validateInputs(msa, tree=None):
    """Validate the HYPNO inputs, exiting the process on the first failure.

    Checks, in order: that ``msa`` can be opened and that its first record
    parses as FASTA; if ``tree`` is given, that it can be opened and parses
    as Newick; and finally that ``internet_connected()`` reports True.

    Args:
        msa: Path of the multiple sequence alignment file.
        tree: Optional path of a tree file; skipped when falsy.

    Returns:
        0 when every check passes.

    Raises:
        SystemExit: With status 1, after printing a message, on any failure.
    """
    # Check for existence and proper FASTA formatting of input MSA
    try:
        msaHandle = open(msa, "rU")
    except Exception:
        print('** HYPNO input error: Given MSA file location does not exist or is not accessible: '+msa)
        sys.exit(1)
    try:
        # Pulling the first record is enough to detect a malformed file.
        next(AlignIO.parse(msaHandle, "fasta"))
    except Exception:
        print('** HYPNO input error: improper MSA file format, must be aligned FASTA or a2m format: '+msa)
        sys.exit(1)
    finally:
        msaHandle.close()

    if tree:
        try:
            treeHandle = open(tree, "rU")
        except Exception:
            print('** HYPNO input error: Given tree file location does not exist or is not accessible: '+tree)
            sys.exit(1)
        try:
            Phylo.read(treeHandle, "newick")
        except Exception:
            # Report the tree path here; the original printed the MSA path.
            print('** HYPNO input error: improper tree file format, must be Newick format: '+tree)
            sys.exit(1)
        finally:
            treeHandle.close()

    if not internet_connected():
        print('** HYPNO connection error: Please connect to the internet to enable HYPNO remote database queries.')
        sys.exit(1)
    return 0
def to_Biopython(tree):
    """Convert ``tree`` into a Bio.Phylo tree, carrying node attributes over.

    ``tree`` must provide ``as_newick_string()``, ``as_string(schema=...)``,
    ``leaf_nodes()`` and ``postorder_internal_node_iter()``. The Newick text
    is re-parsed with ``Phylo.read``; afterwards every instance attribute of
    each source node is copied onto the node paired with it by iteration
    order (leaves with leaves, internal nodes in postorder). Values that
    convert to ``float`` are stored as floats, everything else unchanged.

    Returns:
        The Bio.Phylo tree.
    """
    from Bio import Phylo
    from StringIO import StringIO
    from itertools import izip

    def _transfer_attributes(source, target):
        # Copy each attribute, preferring a float value when one can be made.
        for key, value in source.__dict__.iteritems():
            try:
                setattr(target, key, float(value))
            except:
                setattr(target, key, value)

    try:
        bT = Phylo.read(StringIO(tree.as_newick_string()), 'newick')
    except:
        # Fallback: serialise through as_string() and drop its first 5 chars.
        nwk_str = tree.as_string(schema='newick')[5:]
        print("raw string:", nwk_str)
        print("stringIO output:", StringIO(nwk_str).readlines())
        try:
            bT = Phylo.read(StringIO(nwk_str), 'newick')
        except:
            # Last resort: retry with a closing parenthesis appended.
            bT = Phylo.read(StringIO(nwk_str+')'), 'newick')

    leaf_pairs = izip(bT.get_terminals(), tree.leaf_nodes())
    for converted, source in leaf_pairs:
        _transfer_attributes(source, converted)

    internal_pairs = izip(bT.get_nonterminals(order='postorder'),
                          tree.postorder_internal_node_iter())
    for converted, source in internal_pairs:
        _transfer_attributes(source, converted)

    return bT
def annotate_cOTU_tree(cOTU_tree_string,results_list):
    """Write each result's ``fdr_p`` onto the matching node of a Newick tree.

    For every dict in ``results_list`` the subtree named by ``s_nodes`` is
    loaded via ``load_de_numericized_newick_tree``; its leaf names locate the
    common ancestor in the main tree, whose ``confidence`` is set to
    ``float(fdr_p)``.

    Args:
        cOTU_tree_string: Newick text of the tree to annotate.
        results_list: Iterable of dicts with ``'s_nodes'`` and ``'fdr_p'``.

    Returns:
        The annotated tree serialised as a Newick string.
    """
    from Bio import Phylo
    from StringIO import StringIO

    annotated = Phylo.read(StringIO(cOTU_tree_string), 'newick', rooted=True)
    for entry in results_list:
        subtree_newick = load_de_numericized_newick_tree(
            entry['s_nodes'], before="cOTU_", after="")
        subtree = Phylo.read(StringIO(subtree_newick), 'newick', rooted=True)
        # common_ancestor() accepts target specs; match leaves by name.
        leaf_specs = [{"name": leaf.name} for leaf in subtree.get_terminals()]
        ancestor = annotated.common_ancestor(leaf_specs)
        ancestor.confidence = float(entry['fdr_p'])

    buffer = StringIO()
    Phylo.write(annotated, buffer, 'newick')
    return buffer.getvalue()
def test_phylo_read_extra(self):
    """Additional tests to check correct parsing."""
    tree = Phylo.read(StringIO("(A:1, B:-2, (C:3, D:4):-2)"), 'newick')
    self.assertEqual(tree.distance('A'), 1)
    self.assertEqual(tree.distance('B'), -2)
    self.assertEqual(tree.distance('C'), 1)
    self.assertEqual(tree.distance('D'), 2)

    tree = Phylo.read(StringIO("((A:1, B:-2):-5, (C:3, D:4):-2)"), 'newick')
    self.assertEqual(tree.distance('A'), -4)
    self.assertEqual(tree.distance('B'), -7)
    self.assertEqual(tree.distance('C'), 1)
    self.assertEqual(tree.distance('D'), 2)

    # Trees with an unnamed leaf, the second one with embedded newlines and
    # a trailing semicolon. Leaves are identified by their root distance.
    for newick in ("((:1, B:-2):-5, (C:3, D:4):-2)",
                   "((:\n1\n,\n B:-2):-5, (C:3, D:4):-2);"):
        tree = Phylo.read(StringIO(newick), 'newick')
        # Expected root distance -> number of leaves still to be seen.
        distances = {-4.0: 1, -7.0: 1, 1: 1, 2: 1}
        for x in tree.get_terminals():
            entry = int(tree.distance(x))
            # Count each leaf once. The original subtracted the entry from
            # itself, which is always 0 and so could never fail on a
            # duplicated distance.
            distances[entry] -= 1
            self.assertEqual(distances[entry], 0)
def Main():
    """Run ``felsenstein`` and ``finalProb`` on three rooted Newick trees.

    Sets the module globals ``alphabet`` (nucleotide -> index) and
    ``rev_alphabet`` (index -> nucleotide), loads ``tree1.txt``..``tree3.txt``
    from the working directory, then prints a banner and processes the root
    clade of each tree in turn.
    """
    global alphabet, rev_alphabet
    alphabet = dict(zip("ACGT", range(4)))
    rev_alphabet = dict((index, base) for base, index in alphabet.items())

    # get trees from each file (all are loaded before any is processed)
    roots = []
    for path in ("tree1.txt", "tree2.txt", "tree3.txt"):
        parsed = Phylo.read(path, "newick")
        parsed.rooted = True
        roots.append(parsed.clade)

    banners = (
        "------------------ Tree 1 ------------------",
        "\n\n------------------ Tree 2 ------------------",
        "\n\n------------------ Tree 3 ------------------",
    )
    for banner, root in zip(banners, roots):
        print(banner)
        plk = felsenstein(root)
        finalProb(plk)
def GetExec():
    """Load every tree file found in the current working directory.

    A file whose extension starts with ``.dnd`` is read as Newick and one
    whose extension starts with ``.xml`` as PhyloXML; each tree is marked
    rooted.

    Returns:
        ``[trees, index]`` where ``trees`` is a list of ``[tree, 'ok']``
        pairs and ``index`` maps each position ``i`` to ``(i, filename)``.
    """
    readers = (('.dnd', "newick"), ('.xml', "phyloxml"))
    loaded = []
    index = dict()
    for entry in os.listdir(os.getcwd()):
        suffix = os.path.splitext(entry)[1]
        for prefix, fmt in readers:
            if suffix.startswith(prefix):
                parsed = Phylo.read(entry, fmt)
                parsed.rooted = True
                position = len(loaded)
                loaded.append([parsed, 'ok'])
                index[position] = position, str(entry)
                break
    return [loaded, index]
def get_tree(tree_file, name_tree):
    """Build an edge -> branch-length map from a pair of Newick files.

    Internal nodes of ``tree_file`` are renamed ``N0``, ``N1``, ... in
    ``get_nonterminals()`` order. Leaves take their names, by position, from
    the leaves of ``name_tree``. Every node's ``branch_length`` is overwritten
    with its ``confidence`` value.
    NOTE(review): presumably the input stores the wanted lengths where
    Biopython parses them as confidence - confirm against the data files.

    Args:
        tree_file: Path of the Newick file supplying topology and values.
        name_tree: Path of the Newick file supplying the leaf names.

    Returns:
        ``(edge_to_blen, edge_list)``: a dict mapping ``(parent_name,
        child_name)`` to the edge weight, and the list of those edges sorted
        by the integer following the first character of the parent name.

    Raises:
        IndexError: If ``name_tree`` has fewer leaves than ``tree_file``.
    """
    # Context managers close both files; the original leaked the handles.
    with open(tree_file, 'r') as handle:
        tree = Phylo.read(handle, "newick")
    with open(name_tree, 'r') as handle:
        tree_name = Phylo.read(handle, "newick")

    # set node number for nonterminal nodes and specify root node
    for node_number, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(node_number)
        clade.branch_length = clade.confidence

    # Collect the leaf lists once instead of re-walking both trees per leaf.
    named_leaves = tree_name.get_terminals()
    for position, clade in enumerate(tree.get_terminals()):
        clade.branch_length = clade.confidence
        clade.name = named_leaves[position].name

    # NOTE(review): rooted is passed as the string 'True' (truthy), as in
    # the original; a real bool was probably intended.
    tree_phy = tree.as_phyloxml(rooted = 'True')
    tree_nx = Phylo.to_networkx(tree_phy)

    # data=True exposes the branch length as the 'weight' edge attribute.
    edge_to_blen = {}
    for u, v, d in tree_nx.edges(data = True):
        edge_to_blen[(u.name, v.name)] = d['weight']

    # sorted() works on Python 2 and 3, unlike dict.keys().sort().
    edge_list = sorted(edge_to_blen, key = lambda edge: int(edge[0][1:]))
    return edge_to_blen, edge_list
def test_root_with_outgroup(self):
    """Tree.root_with_outgroup: reroot at a given clade."""
    # On a large realistic tree: a deep internal node, an external node,
    # then two calls with an explicit outgroup branch length, which must
    # not change the total tree size.
    big_tree = Phylo.read(EX_APAF, 'phyloxml')
    expected_tips = len(big_tree.get_terminals())
    expected_length = big_tree.total_branch_length()

    reroot_calls = (
        (('19_NEMVE', '20_NEMVE'), {}),
        (('1_BRAFL',), {}),
        (('2_BRAFL',), {'outgroup_branch_length': 0.5}),
        (('36_BRAFL', '37_BRAFL'), {'outgroup_branch_length': 0.5}),
    )
    for targets, options in reroot_calls:
        big_tree.root_with_outgroup(*targets, **options)
        self.assertEqual(expected_tips, len(big_tree.get_terminals()))
        self.assertAlmostEqual(expected_length,
                               big_tree.total_branch_length())

    # On small contrived trees, testing edge cases: reroot at every clade.
    small_newicks = (
        '(A,B,(C,D));',
        '((E,F),((G,H)),(I,J));',
        '((Q,R),(S,T),(U,V));',
        '(X,Y);',
    )
    for small_nwk in small_newicks:
        small_tree = Phylo.read(StringIO(small_nwk), 'newick')
        small_length = small_tree.total_branch_length()
        # Snapshot the clades first; rerooting restructures the tree.
        for node in list(small_tree.find_clades()):
            small_tree.root_with_outgroup(node)
            self.assertAlmostEqual(small_length,
                                   small_tree.total_branch_length())
def test_newick_read_scinot(self):
    """Parse Newick branch lengths in scientific notation."""
    tree = Phylo.read(StringIO("(foo:1e-1,bar:0.1)"), 'newick')
    clade_a = tree.clade[0]
    self.assertEqual(clade_a.name, 'foo')
    self.assertAlmostEqual(clade_a.branch_length, 0.1)

    # Additional tests to check correct parsing
    tree = Phylo.read(StringIO("(A:1, B:-2, (C:3, D:4):-2)"), 'newick')
    self.assertEqual(tree.distance('A'), 1)
    self.assertEqual(tree.distance('B'), -2)
    self.assertEqual(tree.distance('C'), 1)
    self.assertEqual(tree.distance('D'), 2)

    tree = Phylo.read(StringIO("((A:1, B:-2):-5, (C:3, D:4):-2)"), 'newick')
    self.assertEqual(tree.distance('A'), -4)
    self.assertEqual(tree.distance('B'), -7)
    self.assertEqual(tree.distance('C'), 1)
    self.assertEqual(tree.distance('D'), 2)

    # Trees with an unnamed leaf, the second one with embedded newlines and
    # a trailing semicolon. Leaves are identified by their root distance.
    for newick in ("((:1, B:-2):-5, (C:3, D:4):-2)",
                   "((:\n1\n,\n B:-2):-5, (C:3, D:4):-2);"):
        tree = Phylo.read(StringIO(newick), 'newick')
        # Expected root distance -> number of leaves still to be seen.
        distances = {-4.0: 1, -7.0: 1, 1: 1, 2: 1}
        for x in tree.get_terminals():
            entry = int(tree.distance(x))
            # Count each leaf once. The original subtracted the entry from
            # itself, which is always 0 and so could never fail on a
            # duplicated distance.
            distances[entry] -= 1
            self.assertEqual(distances[entry], 0)
def is_starting_tree_valid(starting_tree):
    """Report whether ``starting_tree`` loads as Newick in both libraries.

    The path is read with ``Phylo.read`` and then with
    ``dendropy.Tree.get_from_path``; any exception from either call counts
    as an invalid tree and prints an error message.

    Args:
        starting_tree: Path of the candidate Newick file.

    Returns:
        1 if both parsers accept the file, otherwise 0.
    """
    try:
        Phylo.read(starting_tree, "newick")
        dendropy.Tree.get_from_path(starting_tree, "newick", preserve_underscores=True)
    except Exception:
        # Exception, not a bare except: Ctrl-C and sys.exit() still propagate.
        print("Error with the input starting tree: Is it a valid Newick file?")
        return 0
    return 1
def test_newick_read_single2(self):
    """Read second Newick file with one tree."""
    default_tree = Phylo.read(EX_NEWICK2, 'newick')
    self.assertEqual(len(default_tree.get_terminals()), 33)

    # Comments attached to named taxa must survive parsing.
    expected_comments = (
        ('H**o sapiens', 'modern human'),
        ('Equus caballus', "wild horse; also 'Equus ferus caballus'"),
    )
    for taxon, comment in expected_comments:
        self.assertEqual(default_tree.find_any(taxon).comment, comment)
    self.assertEqual(default_tree.root.confidence, 80)

    # Re-read the same file, this time taking confidences from comments.
    commented_tree = Phylo.read(EX_NEWICK2, 'newick',
                                comments_are_confidence=True)
    self.assertEqual(commented_tree.root.confidence, 100)
def test_draw(self):
    """Run the tree layout algorithm, but don't display it."""
    pyplot.ioff()  # Turn off interactive display
    dollo, apaf = [Phylo.read(path, 'phyloxml')
                   for path in (EX_DOLLO, EX_APAF)]

    for plain_tree in (dollo, apaf):
        Phylo.draw(plain_tree, do_show=False)

    # Fancier options: branch labels as a mapping, then as a callable.
    label_sources = ({apaf.root: 'Root'}, lambda c: c.branch_length)
    for branch_labels in label_sources:
        Phylo.draw(apaf, do_show=False, branch_labels=branch_labels)
def test_draw_with_label_colors_callable(self):
    """Run the tree layout with ``label_colors`` given as a callable.

    The trees are laid out but not displayed.
    """
    pyplot.ioff()  # Turn off interactive display

    def dollo_color(label):
        # Highlight the single label 'f_50'; every other label is black.
        if label == 'f_50':
            return 'r'
        return 'k'

    def apaf_color(label):
        return 'r'

    dollo = Phylo.read(EX_DOLLO, 'phyloxml')
    apaf = Phylo.read(EX_APAF, 'phyloxml')
    for drawn_tree, colorizer in ((dollo, dollo_color), (apaf, apaf_color)):
        Phylo.draw(drawn_tree, label_colors=colorizer, do_show=False)
def test_newick_write(self):
    """Round-trip a Newick tree and check internal node labels survive."""
    # Tree with internal node labels
    mem_file = StringIO()
    tree = Phylo.read(StringIO("(A,B,(C,D)E)F;"), "newick")
    Phylo.write(tree, mem_file, "newick")
    mem_file.seek(0)
    tree2 = Phylo.read(mem_file, "newick")
    # Sanity check
    self.assertEqual(tree2.count_terminals(), 4)
    # Check internal node labels were retained. Filter on the name: the
    # original tested the clade itself against None, which is never true.
    internal_names = set(c.name for c in tree2.get_nonterminals()
                         if c.name is not None)
    self.assertEqual(internal_names, set(("E", "F")))
def handleData(sample, current, n):
    """Summarise ``sample`` into a new ``Node`` linked after ``current``.

    Resets the globals ``total_branch_length`` and ``total_mutations``,
    builds two Newick trees from ``sample`` (mode 1: in terms of time,
    mode 2: in terms of mutations), parses both with ``Phylo.read`` and
    stores everything in ``Node(...)``, assigned to ``current.next``.
    """
    global total_branch_length, total_mutations
    total_branch_length = 0
    total_mutations = 0

    time_newick = newick(sample, 1)  # tree in terms of time
    # Per the original author's note, analysis computes 1) heterozygosity,
    # 2) total branch length, 3) number of mutations; the latter two are
    # presumably written to the globals reset above.
    heterozygosity = analysis(sample)
    mutation_newick = newick(sample, 2)  # tree in terms of mutations

    time_text = str(time_newick)
    mutation_text = str(mutation_newick)
    time_tree = Phylo.read(StringIO(time_text), 'newick')
    mutation_tree = Phylo.read(StringIO(mutation_text), 'newick')

    current.next = Node(n, total_branch_length, total_mutations,
                        heterozygosity, time_tree, mutation_tree)
def get_pairwise_distances(seq_series, tree_file = None, seq_file = None):
    """Compute pairwise tree distances between sequences via ``muscle``.

    Sequences are written as FASTA with ``<pat>-<visit>`` headers,
    ``muscle -tree2`` is run on them, and the resulting Newick tree is read
    back to measure the distance between every pair of leaves.

    Args:
        seq_series: Object whose ``index`` yields ``(pat, visit)`` pairs and
            whose ``values`` yields the matching sequences (iterables of
            strings).
        tree_file: Path for the tree written by muscle; a temporary file
            from ``NTF()`` is used when None.
        seq_file: Path for the FASTA input; a temporary file from ``NTF()``
            is used when None.

    Returns:
        Dict mapping ``(name1, name2)`` to the distance, stored in both
        orders.

    Raises:
        subprocess.CalledProcessError: If muscle exits with a non-zero
            status (assuming ``check_call`` is ``subprocess.check_call``).
    """
    # The original only bound fasta_handle when seq_file was None, so passing
    # a seq_file raised NameError. Mirror the tree_file handling instead.
    fasta_handle = NTF() if seq_file is None else open(seq_file, 'w')
    try:
        tree_handle = NTF() if tree_file is None else open(tree_file, 'w')
        try:
            for (pat, visit), seq in zip(seq_series.index, seq_series.values):
                nheader = '%s-%s' % (pat, visit)
                fasta_handle.write('>%s\n%s\n' % (nheader, ''.join(seq)))
            # Make sure muscle sees the complete file on disk.
            fasta_handle.flush()
            os.fsync(fasta_handle.fileno())

            cmd = 'muscle -in %(ifile)s -tree2 %(treefile)s -gapopen -2.9'
            cmdlist = shlex.split(cmd % {
                'ifile':fasta_handle.name,
                'treefile':tree_handle.name
            })
            check_call(cmdlist)

            # Read the tree before the handles close: a temporary file may
            # disappear on close.
            with open(tree_handle.name) as result_handle:
                tree = Phylo.read(result_handle, 'newick')
        finally:
            tree_handle.close()
    finally:
        fasta_handle.close()

    seq_names = tree.get_terminals()
    dmat = {}
    for p1, p2 in combinations(seq_names, 2):
        d = tree.distance(p1, p2)
        dmat[(p1.name, p2.name)] = d
        dmat[(p2.name, p1.name)] = d
    return dmat
def call_root2tip(self, tree):
    """
    Call jar file that implements a modified version of Andrew Rambaut's
    root-to-tip method (Path-O-Gen).

    The tree is written to ``self.tmpfile``, the jar is run with
    ``self.java``, and its two outputs under ``self.tmp`` are read back: a
    NEXUS time tree and a CSV whose first line holds keys and whose second
    line holds the matching values.

    :param tree: a Newick tree string
    :return: a dictionary with the time-scaled tree as a Newick string under
        "timetree", plus one entry per CSV column (values kept as strings)
    :raises subprocess.CalledProcessError: if the jar exits non-zero
    """
    # write tree to temporary file
    with open(self.tmpfile, "w") as handle:
        handle.write(tree)

    out1 = os.path.join(self.tmp, "anchre.r2t.timetree")
    out2 = os.path.join(self.tmp, "anchre.r2t.csv")

    # The child's stdout is discarded. The original used stdout=PIPE with
    # check_call, which never reads the pipe, so a chatty child could block
    # once the pipe buffer filled.
    with open(os.devnull, "w") as devnull:
        subprocess.check_call(
            [self.java, "-jar", "java/RLRootToTip.jar", "-timetree", out1, "-newick", self.tmpfile, out2],
            stdout=devnull,
        )

    # read outputs
    with open(out1, "rU") as handle:
        timetree = Phylo.read(handle, "nexus")
    with open(out2, "rU") as handle:
        coef = handle.readlines()

    # convert NEXUS to Newick string
    newick = self.phylo2newick(timetree)
    res = {"timetree": newick}

    values = coef[1].strip("\n").split(",")
    for i, key in enumerate(coef[0].strip("\n").split(",")):
        res[key] = values[i]
    return res
def tree(alignment, run_id = 'T%05i' % (0,), bionj = False):
    """Build a tree for ``alignment`` by running ``phyml``.

    Works inside the directory ``config.dataPath('phyml')`` (created if
    missing): writes the alignment in PHYLIP format to ``infile<run_id>``,
    runs phyml on it and parses ``infile<run_id>_phyml_tree.txt``. The
    previous working directory is restored on success and on error.

    Args:
        alignment: Alignment object accepted by ``aio.write``.
        run_id: Suffix that distinguishes this run's files.
        bionj: If true pass ``-o n`` to phyml, otherwise ``-o tlr``.

    Returns:
        The tree parsed from phyml's Newick output.
    """
    old_cwd = os.getcwd()
    new_wd = config.dataPath('phyml')
    if not os.path.isdir(new_wd):
        os.mkdir(new_wd)
    os.chdir(new_wd)
    try:
        infilepath = 'infile{0}'.format(run_id)
        with open(infilepath, 'w') as infile:
            aio.write(alignment, infile, 'phylip')

        command = 'phyml --quiet -i {0} -o {1} '.format(infilepath, 'n' if bionj else 'tlr')
        print(command)
        # Discard phyml's stdout. The original passed stdout=PIPE and never
        # read it, which can block the child once the pipe buffer fills.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(command, shell = True, stdout = devnull)

        treefilepath = infilepath + '_phyml_tree.txt'
        with open(treefilepath) as treefile:
            return phylo.read(treefile, 'newick')
    finally:
        # Always return to the caller's directory, even when phyml fails.
        os.chdir(old_cwd)
def removeParalogs(self):
    """Group this OG's tree leaves by taxon and prune extra paralogs.

    Reads the best RAxML tree for ``self.OG``, opens the renamed alignment
    (the ``.contrem`` variant first, the plain file as fallback), records
    every leaf under its ``ui`` code in ``self.paralogDict``, calls
    ``self.pickParalog(ui)`` for each code with more than one leaf, and
    finally calls ``self.deleteSeqsFromAlignment()``. The alignment handle
    is closed even if one of these steps raises.
    """
    self.getseqsfromCodeFile()
    self.uilist = []
    self.tree_in = Phylo.read(self.PathtoOutput + '/BestRaxTrees/' + self.OG + '_outrax.tree','newick')

    renamed_path = self.PathtoOutput + '/RenamedAlignments/' + self.OG + '_renamed'
    try:
        self.alignment = open(renamed_path + '.contrem','r')
    except IOError:
        # No .contrem variant of the alignment; use the plain renamed file.
        self.alignment = open(renamed_path,'r')

    try:
        for seq in self.tree_in.get_terminals():
            print(self.OG)
            try:
                ui = self.sequenceDict[str(seq).split('_')[0]][1]  # ui is MC_mc_code
                self.paralogDict[ui].append(str(seq))  # so len is # of paralogs per taxon
                if ui not in self.uilist:
                    self.uilist.append(ui)
            except Exception:
                # Best effort: report the OG and carry on with other leaves.
                print('problem with ' + self.OG)

        for ui in self.uilist:
            print('self.paralogDict[ui] ' + str(self.paralogDict[ui]))
            if len(self.paralogDict[ui]) > 1:
                print(ui)
                self.pickParalog(ui)

        print('seq to delete ' + str(self.seqtoDelete))
        self.deleteSeqsFromAlignment()
    finally:
        self.alignment.close()
def make_tree_figure(wanted_seqs, trop_dict, tree_file):
    """Draw the tree for ``wanted_seqs`` as a coloured networkx graph.

    Unnamed tree nodes are relabelled ``Clade-1``, ``Clade-2``, ... and drawn
    white. Named nodes are green when ``trop_dict[name]`` is truthy and red
    otherwise. The layout uses graphviz ``twopi``.

    Args:
        wanted_seqs: Sequences forwarded to ``get_pairwise_distances``.
        trop_dict: Mapping from node name to a truthy/falsy flag.
        tree_file: Path of the Newick tree file to read.

    Raises:
        KeyError: If a named node is missing from ``trop_dict``.
    """
    # Called for its side effect only; the returned distances are unused.
    # NOTE(review): presumably this call is what creates tree_file - confirm.
    get_pairwise_distances(wanted_seqs, tree_file = tree_file)

    with open(tree_file) as handle:
        tree = Phylo.read(handle, 'newick')
    net = Phylo.to_networkx(tree)

    node_mapping = {}
    clade = 1
    for node in net.nodes():
        if node.name is None:
            node_mapping[node] = 'Clade-%i' % clade
            clade += 1
        else:
            node_mapping[node] = node.name
    new_net = networkx.relabel_nodes(net, node_mapping)

    colors = []
    for node in new_net.nodes():
        if node.startswith('Clade'):
            colors.append('w')
        elif trop_dict[node]:
            colors.append('g')
        else:
            # The original followed its truthy/falsy tests with a third,
            # unreachable branch; two cases cover every value.
            colors.append('r')

    pos = networkx.graphviz_layout(new_net, 'twopi')
    networkx.draw_networkx(new_net, pos, with_labels = False, node_color = colors)
def run_paml_per_group(groups, alignment, tree, output_dir, working_dir):
    """Run a PAML analysis for each defined group of genomes.

    For every group, the leaves belonging to the group's genomes are marked
    with ``#1`` (the PAML branch label), the modified tree is saved as
    ``<working_dir>/<group>.tre`` and ``paml_run.ma_m1a`` is run on it with
    the given alignment.

    The working dir is kept separate from the output dir because PAML runs
    executing at the same time may overwrite each other's files. This
    matters in particular when the script runs on more than one processor.

    Args:
        groups: Mapping from group name to a collection of genome names.
        alignment: Alignment passed through to ``paml_run.ma_m1a``.
        tree: Path of the Newick tree; leaf names are ``organism|protein_id``.
        output_dir: Output directory passed to ``paml_run.ma_m1a``.
        working_dir: Directory that receives the per-group tree files.

    Returns:
        Dict mapping each group to its PAML results, or to None when the
        group's genomes are not all in the tree or cover every species in it.
    """
    from Bio import Phylo
    from SelectionAnalysis import paml_run

    cluster_tree = Phylo.read(tree, "newick")  # Read the input tree

    # Names have a pipe sign (|) with the organism|protein_id.
    # Split each label once; the key is the protein_id, the value the organism.
    leaf_labels = [str(clade).split("|") for clade in cluster_tree.get_terminals()]
    clades_in_tree_by_gene_id = {parts[1]: parts[0] for parts in leaf_labels}
    species_in_tree = set(parts[0] for parts in leaf_labels)

    clade_results = dict()

    # Iterate on each group
    for group in groups:
        members = groups[group]
        # Check that all the branches are present on the tree (and is not the only branch)
        if set(members).issubset(species_in_tree) and len(species_in_tree) > len(members):
            dict_new_clade_names = dict()
            for gene_id, genome in clades_in_tree_by_gene_id.items():
                if genome in members:
                    label = genome + "|" + gene_id
                    dict_new_clade_names[label] = label + " #1"

            # Replace the names in the tree and save the tree
            with open(tree) as tree_source:
                old_tree_information = tree_source.read()
            new_tree_information = multiple_replace(dict_new_clade_names, old_tree_information)

            group_tree = working_dir + "/" + group + ".tre"
            with open(group_tree, 'w') as new_tree_file:
                new_tree_file.write(new_tree_information)

            # Run model for the new tree
            paml_results = paml_run.ma_m1a(alignment, group_tree, output_dir, working_dir)
            clade_results[group] = paml_results
        else:
            clade_results[group] = None

    return clade_results
def genTaxTree(resolver, namesdict, logger, taxonomy=None, draw=False):
    """Return a Phylo tree built from TaxonNamesResolver results.

    Query names (whitespace replaced by ``_``) that are keys of
    ``namesdict`` count as resolved; only their ranks and lineages are fed
    to ``TaxDict``/``taxTree``. An ``outgroup`` leaf is then added next to
    the resulting tree.

    Args:
        resolver: Object whose ``retrieve(key)`` returns parallel lists for
            'classification_path_ranks', 'query_name' and
            'classification_path'.
        namesdict: Container of the names considered resolved.
        logger: Logger that receives the unresolved names at debug level.
        taxonomy: Optional taxonomy passed to ``TaxDict``; its length sets
            the outgroup branch length.
        draw: If true, print the tree with ``Phylo.draw_ascii``.

    Returns:
        The Bio.Phylo tree including the outgroup.
    """
    ranks = resolver.retrieve('classification_path_ranks')
    qnames = resolver.retrieve('query_name')
    lineages = resolver.retrieve('classification_path')
    # replace ' ' with '_' for taxon tree (raw string avoids an invalid
    # escape sequence in the pattern literal)
    qnames = [re.sub(r"\s", "_", e) for e in qnames]
    # Direct membership test instead of scanning namesdict.keys().
    resolved_names_bool = [e in namesdict for e in qnames]
    ranks = [ranks[ei] for ei, e in enumerate(resolved_names_bool) if e]
    lineages = [lineages[ei] for ei, e in enumerate(resolved_names_bool) if e]
    # identify unresolved names
    unresolved_names = [qnames[ei] for ei, e in
                        enumerate(resolved_names_bool) if not e]
    idents = [qnames[ei] for ei, e in enumerate(resolved_names_bool) if e]
    statement = "Unresolved names: " + "".join(
        " " + each for each in unresolved_names)
    logger.debug(statement)
    # make taxdict
    taxdict = TaxDict(idents=idents, ranks=ranks, lineages=lineages,
                      taxonomy=taxonomy)
    # make treestring
    treestring = taxTree(taxdict)
    if not taxonomy:
        d = 22  # default_taxonomy + 1 in tnr
    else:
        d = len(taxonomy) + 1
    # add outgroup (the last character of treestring is dropped first)
    treestring = '({0},outgroup:{1});'.format(treestring[:-1], float(d))
    tree = Phylo.read(StringIO(treestring), "newick")
    if draw:
        Phylo.draw_ascii(tree)
    return tree
def rootTree(f, root,output):
    """Reroot the Newick tree in ``f`` and write it to ``output``.

    Args:
        f: Newick source accepted by ``Phylo.read``.
        root: Outgroup taxon name. A comma-separated list selects the
            common ancestor of the listed taxa instead.
        output: Destination accepted by ``Phylo.write``.
    """
    parsed = Phylo.read(f, 'newick')
    if ',' in root:
        outgroup = parsed.common_ancestor(root.split(','))
    else:
        outgroup = root
    parsed.root_with_outgroup(outgroup)
    Phylo.write(parsed, output, 'newick')
def getPhylotasticTree():
    """Fetch a species tree from Phylotastic and report its leaf count.

    Species names are read from ``<prefix>_species_present.txt``, where
    ``<prefix>`` is ``getFileName()`` without its last four characters. The
    Newick tree returned by the service is saved as
    ``<prefix>_species_tree.txt`` and parsed to count its terminal nodes.

    Returns:
        ``response.json`` of a dict with ``vizFile`` (web path of the saved
        tree), ``vizLabel`` and ``got_nodes`` (number of terminal nodes,
        i.e. species, in the returned tree).
    """
    absoluteFileName = getFileName()
    filePrefix = absoluteFileName[:-4]

    # Load the kept nodes and create the comma-delimited species
    # string for sending to PTastic.
    # Need underscores instead of spaces.
    with open(filePrefix+'_species_present.txt') as speciesFile:
        speciesList = [line.strip().replace(' ', '_') for line in speciesFile]
    speciesString = ','.join(speciesList)

    phylotasticUrlBase = 'http://phylotastic-wg.nescent.org/script/phylotastic.cgi?species='
    speciesTreeUrl = phylotasticUrlBase+speciesString+'&tree=mammals&format=newick'

    conn = urllib2.urlopen(speciesTreeUrl)
    try:
        speciesTreeString = conn.read().strip()
    finally:
        conn.close()

    # Close the file before Phylo.read reopens it so the data is flushed.
    speciesTreeFilename = filePrefix+'_species_tree.txt'
    with open(speciesTreeFilename,'w') as speciesTreeFile:
        speciesTreeFile.write(speciesTreeString)

    # Count the terminal nodes (species) of the returned tree. It can be
    # printed or compared with the requested species to test whether all
    # of them were received.
    tree = Phylo.read(speciesTreeFilename, 'newick')
    got_nodes = tree.count_terminals()

    speciesTreeWebFile = getRelativeWebPath('_species_tree.txt')
    return response.json( dict(vizFile = speciesTreeWebFile, vizLabel = "Phylotastic Species Tree", got_nodes=got_nodes ) )
def reroot_tree_with_outgroup(tree_name, outgroups):
    """Reroot the Newick file ``tree_name`` in place on an outgroup clade.

    The outgroup taxa come from ``GubbinsCommon.get_monophyletic_outgroup``.
    The tree is rerooted with Bio.Phylo and written back, then reloaded with
    DendroPy, derooted, and written once more with single quotes stripped
    from the output.

    Args:
        tree_name: Path of the Newick file; it is overwritten.
        outgroups: Candidate outgroup taxa, passed to
            ``get_monophyletic_outgroup``.
    """
    clade_outgroups = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
    outgroups = [{"name": taxon_name} for taxon_name in clade_outgroups]

    tree = Phylo.read(tree_name, "newick")
    tree.root_with_outgroup(*outgroups)
    Phylo.write(tree, tree_name, "newick")

    tree = dendropy.Tree.get_from_path(tree_name, "newick", preserve_underscores=True)
    tree.deroot()
    tree.update_bipartitions()
    output_tree_string = tree.as_string(
        schema="newick",
        suppress_leaf_taxon_labels=False,
        suppress_leaf_node_labels=True,
        suppress_internal_taxon_labels=False,
        suppress_internal_node_labels=False,
        suppress_rooting=True,
        suppress_edge_lengths=False,
        unquoted_underscores=True,
        preserve_spaces=False,
        store_tree_weights=False,
        suppress_annotations=True,
        annotations_as_nhx=False,
        suppress_item_comments=True,
        node_label_element_separator=" ",
    )
    # The with-block closes the file; the original's trailing
    # `output_file.closed` expression did nothing and has been dropped.
    with open(tree_name, "w+") as output_file:
        output_file.write(output_tree_string.replace("'", ""))
def _newick_to_nx(newick, default_lineages=None):
    """Convert a Newick string into a ``networkx.DiGraph``.

    Nodes are keyed by clade name: the root defaults to ``"root"`` and
    other unnamed clades become ``node0``, ``node1``, ... in visiting order.
    Node attributes come from ``_extract_momi_fields`` applied to the clade
    comment. Each parent -> child edge carries ``branch_length``.

    Args:
        newick: Newick text of the tree.
        default_lineages: If not None, stored as ``'lineages'`` on every
            node whose comment did not set it (the same object is shared).

    Returns:
        The directed graph of the tree.
    """
    newick = StringIO(newick)
    phy = Phylo.read(newick, "newick")
    phy.rooted = True
    edges = []
    nodes = []
    clades = [phy.root]
    phy.root.name = phy.root.name or "root"
    i = 0
    while clades:
        clade = clades.pop()
        nd = _extract_momi_fields(clade.comment or "")
        if 'lineages' not in nd and default_lineages is not None:
            nd['lineages'] = default_lineages
        nodes.append((clade.name, nd))
        # Queue each child exactly once. The original extended the stack
        # inside the per-child loop, so a clade with k children pushed all
        # of them k times and whole subtrees were re-processed repeatedly.
        clades.extend(clade.clades)
        for c_clade in clade.clades:
            if c_clade.name is None:
                c_clade.name = "node%d" % i
                i += 1
            ed = {'branch_length': c_clade.branch_length}
            edges.append((clade.name, (c_clade.name), ed))
    t = nx.DiGraph(data=edges)
    t.add_nodes_from(nodes)
    return t
def test_draw_ascii(self):
    """Draw a tree as ASCII art into a buffer, at two column widths."""
    handle = StringIO()
    try:
        tree = Phylo.read(EX_APAF, 'phyloxml')
        Phylo.draw_ascii(tree, file=handle)
        Phylo.draw_ascii(tree, file=handle, column_width=120)
    finally:
        # Release the buffer even when drawing raises.
        handle.close()
def test_find_elements(self):
    """TreeMixin: find_elements() method."""
    # From the docstring example
    octopus_tree = self.phylogenies[5]
    matches = list(octopus_tree.find_elements(PhyloXML.Taxonomy, code='OCTVU'))
    self.assertEqual(len(matches), 1)
    first_match = matches[0]
    self.assertTrue(isinstance(first_match, PhyloXML.Taxonomy))
    self.assertEqual(first_match.code, 'OCTVU')
    self.assertEqual(first_match.scientific_name, 'Octopus vulgaris')

    # Iteration and regexps
    geo_tree = self.phylogenies[10]
    expected_altitudes = (472, 10, 452)
    found_points = geo_tree.find_elements(geodetic_datum=r'WGS\d{2}')
    for point, alt in zip(found_points, expected_altitudes):
        self.assertTrue(isinstance(point, PhyloXML.Point))
        self.assertEqual(point.geodetic_datum, 'WGS84')
        self.assertAlmostEqual(point.alt, alt)

    # class filter
    events_tree = self.phylogenies[4]
    events = list(events_tree.find_elements(PhyloXML.Events))
    self.assertEqual(len(events), 2)
    speciation_event, duplication_event = events[0], events[1]
    self.assertEqual(speciation_event.speciations, 1)
    self.assertEqual(duplication_event.duplications, 1)

    # string filter & find_any
    bacteria_tree = self.phylogenies[3]
    taxonomy = bacteria_tree.find_any("B. subtilis")
    self.assertEqual(taxonomy.scientific_name, "B. subtilis")

    # integer filter
    apaf_tree = Phylo.read(EX_APAF, 'phyloxml')
    domains = list(apaf_tree.find_elements(start=5))
    self.assertEqual(len(domains), 8)
    for dom in domains:
        self.assertEqual(dom.start, 5)
        self.assertEqual(dom.value, 'CARD')
def test_ancestral():
    """Check TreeAnc ancestral reconstruction and likelihood normalization.

    Part 1 loads the h3n2_na_20 example tree and alignment found next to
    this file, reconstructs ancestral sequences with ``method='ml'`` in
    marginal and in joint mode, and compares the root sequence with a fixed
    expected string each time.

    Part 2 builds a three-leaf tree and a 64-column alignment, runs a
    marginal reconstruction under a custom GTR model, and asserts that
    ``exp(sequence_LH)`` summed over positions 0..63 is 1 within 1e-6.
    """
    import os
    from Bio import AlignIO
    import numpy as np
    from treetime import TreeAnc, GTR
    # The example data is addressed relative to this file's directory.
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.fasta'))
    nwk = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.nwk'))

    # Both reconstruction modes must produce the same root sequence.
    for marginal in [True, False]:
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        print('ancestral reconstruction' + ("marginal" if marginal else "joint"))
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    print('testing LH normalization')
    from Bio import Phylo,AlignIO
    # NOTE(review): StringIO is not imported inside this function; it is
    # presumably imported at module level - confirm.
    tiny_tree = Phylo.read(StringIO("((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"), 'newick')
    # The 64 columns enumerate every (A, B, C) nucleotide triple exactly
    # once: A changes every 16 columns, B every 4, C every column.
    tiny_aln = AlignIO.read(StringIO(">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                                     ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                                     ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n"), 'fasta')

    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']), pi = np.array([0.9, 0.06, 0.02, 0.02]), W=np.ones((4,4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)
    # 4**3 == 64 positions, one per possible column pattern, so the
    # exponentiated per-position values are expected to sum to one.
    lhsum = np.exp(t.sequence_LH(pos=np.arange(4**3))).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    # Called last without any assertion on its result.
    t.optimize_branch_len()
def test_raxml(self):
    """Run RAxML using the wrapper."""
    cmd = RaxmlCommandline(raxml_exe,
                           sequences=EX_PHYLIP, model="PROTCATWAG",
                           name="test")
    # The parsimony seed should be set automatically.
    # assertIn/assertTrue replace the deprecated ``assert_`` alias.
    self.assertIn('-p', str(cmd))
    # Smoke test
    try:
        out, err = cmd()
        self.assertTrue(len(out) > 0)
        self.assertTrue(len(err) == 0)
        # Check the output tree
        tree = Phylo.read('RAxML_result.test', 'newick')
        self.assertEqual(tree.count_terminals(), 4)
    finally:
        # Remove RAxML-generated files, or RAxML will complain bitterly
        # during the next run
        for fname in ['RAxML_info.test',
                      'RAxML_log.test',
                      'RAxML_parsimonyTree.test',
                      'RAxML_result.test',
                      # Present in 7.2.X+ but not 7.0.4:
                      'RAxML_bestTree.test',
                      ]:
            if os.path.isfile(fname):
                os.remove(fname)
while len(labels) > 1: x,y = lowest_cell(table) join_table(table,x,y) join_labels(labels,x,y) return labels[0] def alpha_labels(start,end): labels = [] for i in range(ord(start), ord(end)+1): labels.append(chr(i)) return labels M_labels = alpha_labels("A","E") M = [ [], [0.189], [0.110,0.179], [0.113,0.192,0.094], [0.215,0.211,0.205,0.214]] u = (UPGMA(M,M_labels)) u = u.replace("A","Gorila") u = u.replace("B","Oragontango") u = u.replace("C","Humano") u = u.replace("D","Chimpanze") u = u.replace("E","Gibao") handle = StringIO(u) tree = Phylo.read(handle,"newick") Phylo.draw(tree)
dist = distance.euclidean(vec1, vec2) return dist if __name__ == '__main__': parser = argparse.ArgumentParser( description="Down sample sequences from FASTA", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--world_tree", type=str, required=True, help="path to a tree file") parser.add_argument("--sampled_tree", type=str, required=True, help="path to a tree file") parser.add_argument("--output", required=True, help="FASTA output file") args = parser.parse_args() tree1 = Phylo.read(args.world_tree, 'newick') tree2 = Phylo.read(args.sampled_tree, 'newick') tree1.root_with_outgroup({'name': 'Wuhan-Hu-1/2019'}) tree2.root_with_outgroup({'name': 'Wuhan-Hu-1/2019'}) tree1 = prune_world_tree(tree1, tree2) mapper = map_tree_to_vector_idx(tree2) dist = get_trees_distance(tree1, tree2, mapper) np.savetxt(args.output, dist)
def _render_tree_images(wdir, tag, stem):
    """Render ``<wdir>/<tag><stem>.new`` to PNG and SVG, collecting garbage after each render."""
    base = wdir + "/" + tag + stem
    for ext in (".png", ".svg"):
        RenderTree.render_annotate(base + ".new", base + ext)
        gc.collect()


def main(argv):
    """Command-line entry point for AnnotateTreeCmd.

    ``argv`` is the full argument vector (program name first).  Two forms
    are accepted:

    * ``prog -t`` runs ``conduct_tests()`` and exits.
    * ``prog seqnumfile seqfile treefile cdrfile tag wd`` validates the
      inputs, runs ``Dnaml().run_dnaml`` and, if that returns a result,
      writes the CDR analysis, renders the annotated trees and builds a
      sequence logo in the working directory ``wd``.  An empty ``cdrfile``
      argument disables the CDR-related steps.

    Every failure path prints a message and exits with status 0, as in the
    original implementation.

    NOTE(review): the nesting of the post-processing steps (tree rendering
    and logo generation) under ``if int_aas is not None`` was reconstructed
    from a flattened source - confirm against the upstream file.
    """
    print("AnnotateTreeCmd v1.0")
    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        sys.exit(0)
    elif len(argv) != 7:
        print('usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.')
        sys.exit(0)

    for path in argv[1:4]:
        check_file(path)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except Exception:
        print("Error creating directory %s." % wdir)
        sys.exit(0)

    # ``except Exception`` (rather than a bare ``except``) is deliberate:
    # SystemExit is not an Exception subclass, so the stop-codon exit below
    # is no longer swallowed and re-reported as a parse error.
    try:
        msa = Alignment()
        msa.read_nt(seqfile)
        # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                print("Stop codon found in sequence %s." % seq.id)
                sys.exit(0)
    except Exception as err:
        print("Error parsing %s: %s." % (seqfile, err))
        sys.exit(0)

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except Exception as err:
        print("Error parsing %s: %s." % (seqnumfile, err))
        sys.exit(0)

    if cdrfile is not None:
        # Constructed only to validate the CDR file early; the result is
        # rebuilt from the inferred sequences further down.
        try:
            acdr = AnalyseCDR(msa, file_name=cdrfile)
        except Exception as err:
            print("Error parsing %s: %s." % (cdrfile, err))
            sys.exit(0)

    # The alignment may be FASTA or PHYLIP: try FASTA first.
    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except Exception:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except Exception as err:
            print("Error parsing %s: %s." % (seqfile, err))
            sys.exit(0)

    try:
        tree = Phylo.read(treefile, "newick")
    except Exception as err:
        print("Error parsing %s: %s." % (treefile, err))
        sys.exit(0)

    dnaml = Dnaml()
    # NOTE(review): ``report`` is not defined in this function; presumably a
    # module-level callable - verify.
    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report, tag)

    if int_aas is not None:
        try:
            if cdrfile is not None:
                acdr = AnalyseCDR(int_aas, file_name=cdrfile)
                cdr_output = acdr.analyse()
                with open(wdir + "/" + tag + "cdr_analysis.html", "w") as fo:
                    fo.write(cdr_output)
        except Exception as err:
            print("Warning: CDRs were not analysed: " + str(err))

        try:
            gc.collect()
            _render_tree_images(wdir, tag, "annotated_treefile")
            if cdrfile is not None:
                _render_tree_images(wdir, tag, "annotated_treefile_sum")
                _render_tree_images(wdir, tag, "annotated_treefile_tot")
            _render_tree_images(wdir, tag, "intermediates_treefile")
        except Exception as err:
            print("Error rendering trees: " + str(err))

        # Build the logo input: skip the first record and every inferred
        # intermediate (ids containing "node_").
        first = True
        orig_recs = []
        for rec in SeqIO.parse(wdir + "/" + tag + "aa_alignment.fa", "fasta"):
            if not first and "node_" not in rec.id:
                orig_recs.append(rec)
            first = False

        logo_alignment_file = wdir + "/" + tag + "alignment_for_logo.fa"
        SeqIO.write(orig_recs, logo_alignment_file, "fasta")

        with open(wdir + "/" + tag + "weblogo_status.txt", "w") as fo:
            retcode = subprocess.call(
                "seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag,
                cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
            if retcode == 1:
                fo.write("Trying seqlogo.pl instead.\n")
                # Flush so this line lands before the child's own output,
                # which is written straight to the same file descriptor.
                fo.flush()
                retcode = subprocess.call(
                    "seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag,
                    cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
                if retcode == 1:
                    print("Weblogo not installed: logo plot will not be generated.")
# Variables
version = 'checkTreeFormat v1.0'  # Script version
arguments = ""  # Arguments from ArgParse
tree = ""  # Tree variable

# Grab arguments
arguments = check_arg(sys.argv[1:])

# Checking if file exists
if not os.path.exists(arguments.tree_file):
    print("Tree file not found.")
    sys.exit(1)

# Read file, check it is in the correct format.
try:
    tree = Phylo.read(arguments.tree_file, arguments.tree_format)
except Exception as err:
    # Report the problem and exit with a non-zero status.  The previous
    # version re-raised here, which made the following sys.exit(1)
    # unreachable and dumped a traceback instead of a clean message.
    print("Tree file not in %s format." % arguments.tree_format)
    sys.stderr.write("%s\n" % err)
    sys.exit(1)

# If format != newick convert to canonical format.
if arguments.tree_format != "newick":
    print("Tree file not in canonical format. Converting to newick...")
else:
    print("Tree is already in newick format, printing...")

# Writing tree in newick format
""" BioE231 Vivian Fu, Jessica Wu, Zihui Xu Use Biopython's Phylo to visualize tree.nwk. """ from Bio import Phylo from io import StringIO import sys tree = Phylo.read(sys.stdin, 'newick') Phylo.draw(tree)
d = 1. - np.mean(temp) Wg = np.exp(-((d**2) / (d0**2))) s = g.Shannon(l) if s != None: entropy.append(Wg * s) score.append(sum(entropy)) ranking[l] = 1. + float(sum(score)) return ranking ###==================================================================================================== ### MAIN ###==================================================================================================== if __name__ == "__main__": msa = MSA(args.msa_file) tree = Phylo.read(args.tree_file, 'newick') tree.ladderize() # Flip branches so deeper clades are displayed at top clades = list(tree.find_clades(order='level')) subfamily = {} leaves = [] for i, clade in enumerate(clades): leaf = False if clade.is_terminal(): leaf = True if not leaf: clade.name = 'N%d' % i subfamily[clade] = Clade(msa, [ msa.sequence_indices[x.name] for x in list(clade.get_terminals()) ], clade.branch_length, clade.name) if leaf: leaves.append(subfamily[clade])
def run(args):
    """Run mugration inference for every requested metadata column.

    Parameters
    ----------
    args : namespace
        Command line arguments as parsed by argparse.  The attributes read
        here are ``tree``, ``metadata``, ``columns``, ``confidence`` and
        ``sampling_bias_correction``; the whole namespace is also handed to
        ``get_json_name``.

    For each column the inferred state of every node (plus confidence and
    entropy when ``args.confidence`` is set) is collected, the GTR model is
    recorded and dumped to ``<prefix><column>.mugration_model.txt`` when one
    was inferred, and finally everything is written out as JSON.
    """
    tree_fname = args.tree
    traits, columns = read_metadata(args.metadata)

    from Bio import Phylo
    T = Phylo.read(tree_fname, 'newick')

    # Warn when not a single internal node carries a name.
    unnamed_flags = [clade.name is None for clade in T.get_nonterminals()]
    if np.all(unnamed_flags):
        warning_lines = (
            "\n*** WARNING: Tree has no internal node names!",
            "*** Without internal node names, ancestral traits can't be linked up to the correct node later.",
            "*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.",
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:",
            "*** augur refine --tree %s --output-tree <filename>.nwk" % (tree_fname),
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'",
        )
        for warning_line in warning_lines:
            print(warning_line)

    mugration_states = defaultdict(dict)
    models = defaultdict(dict)
    # Everything before the last '.' of the tree file name.
    out_prefix = '.'.join(args.tree.split('.')[:-1])

    for column in args.columns:
        T, gtr, alphabet = mugration_inference(
            tree=tree_fname,
            seq_meta=traits,
            field=column,
            confidence=args.confidence,
            sampling_bias_correction=args.sampling_bias_correction)
        if T is None:  # something went wrong
            continue

        for node in T.find_clades():
            node_states = mugration_states[node.name]
            node_states[column] = getattr(node, column)
            if args.confidence:
                confidence_key = column + '_confidence'
                entropy_key = column + '_entropy'
                node_states[confidence_key] = getattr(node, confidence_key)
                node_states[entropy_key] = getattr(node, entropy_key)

        if not gtr:
            continue

        # add gtr models to json structure for export
        column_model = models[column]
        column_model['rate'] = gtr.mu
        column_model['alphabet'] = [alphabet[k] for k in sorted(alphabet.keys())]
        column_model['equilibrium_probabilities'] = list(gtr.Pi)
        column_model['transition_matrix'] = [list(row) for row in gtr.W]

        # human-readable dump of the same model
        with open(out_prefix + '%s.mugration_model.txt' % column, 'w') as ofile:
            ofile.write('Map from character to field name\n')
            for char, state in alphabet.items():
                ofile.write(char + ':\t' + str(state) + '\n')
            ofile.write('\n\n')
            ofile.write(str(gtr))

    out_name = get_json_name(args, out_prefix + '_traits.json')
    write_json({"models": models, "nodes": mugration_states}, out_name)

    print(
        "\nInferred ancestral states of discrete character using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n",
        file=sys.stdout)
    print("results written to", out_name, file=sys.stdout)
def mugration_inference(tree=None, seq_meta=None, field='country',
                        confidence=True, infer_gtr=True, root_state=None,
                        missing='?', sampling_bias_correction=None):
    """Infer ancestral states of a discrete trait with a time-reversible model.

    Parameters
    ----------
    tree : str
        Name of the Newick tree file.
    seq_meta : dict
        Metadata per sequence name; each value is a dict of fields.
    field : str, optional
        Metadata field whose states are inferred.
    confidence : bool, optional
        Attach ``<field>_entropy`` and ``<field>_confidence`` to every node.
    infer_gtr : bool, optional
        Passed through to TreeTime's ancestral inference.
    root_state : None, optional
        If given, it is added to the set of states; nothing else in this
        function uses it.
    missing : str, optional
        Value stored in the returned alphabet for the missing-data character.
    sampling_bias_correction : optional
        If truthy, the inferred GTR rate ``mu`` is multiplied by this value
        and the inference is repeated without re-inferring the GTR.

    Returns
    -------
    T : Phylo.Tree
        Biopython tree with the trait attached to every node.
    gtr : treetime.GTR or None
        The GTR model; ``None`` when only one state was found.
    alphabet : dict
        Mapping from single-character codes to trait values.

    ``(None, None, None)`` is returned when there are no states or more
    than 180 of them.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    tips_by_name = {leaf.name: leaf for leaf in T.get_terminals()}

    # Determine alphabet only counting tips in the tree
    places = {
        meta[field]
        for name, meta in seq_meta.items()
        if field in meta and name in tips_by_name
    }
    if root_state is not None:
        places.add(root_state)

    places = sorted(places)
    nc = len(places)

    if nc > 180:
        print("ERROR: geo_inference: can't have more than 180 places!",
              file=sys.stderr)
        return None, None, None
    if nc == 0:
        print("ERROR: geo_inference: list of places is empty!",
              file=sys.stderr)
        return None, None, None

    if nc == 1:
        print(
            "WARNING: geo_inference: only one place found -- set every internal node to %s!"
            % places[0],
            file=sys.stderr)
        # Degenerate case: a single state, no model needed.
        alphabet = {'A': places[0]}
        alphabet_values = ['A']
        gtr = None
        for node in T.find_clades():
            node.sequence = ['A']
            node.marginal_profile = np.array([[1.0]])
    else:
        # set up a flat model over single-letter codes starting at 'A'
        alphabet = {chr(65 + idx): place for idx, place in enumerate(places)}
        flat_pi = np.ones(nc, dtype=float) / nc
        model = GTR.custom(pi=flat_pi,
                           W=np.ones((nc, nc)),
                           alphabet=np.array(sorted(alphabet.keys())))

        # the character right after the last state code stands for missing data
        missing_char = chr(65 + nc)
        alphabet[missing_char] = missing
        model.profile_map[missing_char] = np.ones(nc)
        model.ambiguous = missing_char
        alphabet_rev = {value: code for code, value in alphabet.items()}

        # construct pseudo alignment: one character per tip
        pseudo_seqs = []
        for name, meta in seq_meta.items():
            if name not in tips_by_name:
                continue
            if field in meta:
                code = alphabet_rev[meta[field]]
            else:
                code = missing_char
            pseudo_seqs.append(SeqRecord(Seq(code), name=name, id=name))
        aln = MultipleSeqAlignment(pseudo_seqs)

        # set up treetime and infer
        from treetime import TreeAnc
        tt = TreeAnc(tree=tree, aln=aln, gtr=model,
                     convert_upper=False, verbose=0)
        tt.use_mutation_length = False
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr,
                                     store_compressed=False,
                                     pc=1.0,
                                     marginal=True,
                                     normalized_rate=False)

        if sampling_bias_correction:
            tt.gtr.mu *= sampling_bias_correction
            tt.infer_ancestral_sequences(infer_gtr=False,
                                         store_compressed=False,
                                         marginal=True,
                                         normalized_rate=False)

        T = tt.tree
        gtr = tt.gtr
        alphabet_values = tt.gtr.alphabet

    # attach inferred states as e.g. node.region = 'africa'
    for node in T.find_clades():
        setattr(node, field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        for node in T.find_clades():
            pdis = node.marginal_profile[0]
            S = -np.sum(pdis * np.log(pdis + TINY))

            # rank states by probability, most likely first
            ranked = sorted(
                ((alphabet[code], pdis[idx])
                 for idx, code in enumerate(alphabet_values)),
                key=lambda pair: pair[1],
                reverse=True)
            # only take stuff over .1% and the top 4 elements
            kept = [(state, prob) for state, prob in ranked if prob > 0.001][:4]

            setattr(node, field + "_entropy", S)
            setattr(node, field + "_confidence", dict(kept))

    return T, gtr, alphabet
def get_morpheme_tree(clauses, scenario, tree_name, reconstructed=False):
    """Label the "matter" phylogeny with the morphemes used for ``scenario``.

    For every construction id in ``clauses`` the function/morpheme pairs
    from ``function_paradigms()`` are grouped per language, a set of
    hard-coded languages is added on top, and each named node of the tree
    is renamed to ``<name> <morphs>`` (run through ``generate_markup``).
    Nodes without data for ``scenario`` get ``-``.

    Parameters:
        clauses: iterable of construction ids to collect paradigms for.
        scenario: key into the per-language function dict (e.g. ``"1>3"``).
        tree_name: NOTE(review): not used anywhere in this body.
        reconstructed: when true, morphemes whose first counterpart's
            cognate set id is not ``"NA"`` are listed by cognate set
            instead of by morpheme id.

    Returns:
        Whatever ``get_clade_as_json`` returns for the tree's root clade.
    """
    # construction id -> {function -> [morpheme entries]}
    set_1 = {}
    for i in clauses:
        set_1[i] = {}
    for entry in function_paradigms():
        if entry["Construction"] in set_1:
            if entry["Function"] not in set_1[entry["Construction"]].keys():
                set_1[entry["Construction"]][entry["Function"]] = [
                    entry["Morpheme"]
                ]
            else:
                set_1[entry["Construction"]][entry["Function"]].append(
                    entry["Morpheme"])
    # Re-key the collected paradigms by the language of each construction.
    lang_clauses = {}
    for clause in set_1:
        cons = DBSession.query(Construction).filter(
            Construction.id == clause)[0]
        lang_clauses[cons.language.id] = set_1[clause]
    # Hard-coded paradigms; these overwrite any database-derived entry for
    # the same language id.
    lang_clauses["kax"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["w-"]],
        "1+2>3": [["k(ɨt)-"]],
        "3>1": [["j-"], ["Ø-"]],
        "3>2": [["o(w)-"]]
    }
    lang_clauses["bak"] = {
        "3>1+2": [["k-"]],
        "1>2": [["ə-"]],
        "2>1": [["j-"]],
        "1>3": [["s-"]],
        "1+2>3": [["kɨd-"]],
        "3>1": [["ɨ-"], ["j-"]],
        "3>2": [["ə-"]]
    }
    lang_clauses["yuk"] = {
        "3>1+2": [["ɨp", "n-"]],
        "1>2": [["aw", "oj-"]],
        "2>1": [["am", "j-"]],
        "1>3": [["aw", "Ø-"]],
        "1+2>3": [["ɨp", "Ø-"]],
        "3>1": [["aw", "j-"]],
        "3>2": [["am", "oj-"]]
    }
    lang_clauses["aku"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["i-"], ["Ø-"]],
        "1+2>3": [["kɨt-"]],
        "3>1": [["jː-"], ["Øː-"]],
        "3>2": [["ə-"]]
    }
    lang_clauses["cum"] = {
        "1>2": [["kaj-"], ["kən-"], ["k-"]],
        "2>1": [["kaj-"], ["k-"]],
        "1>3": [["w-"], ["i-"]],
    }
    lang_clauses["tam"] = {
        "3>1+2": ["?"],
        "1>2": ["?"],
        "2>1": ["?"],
        "1>3": [["t-"]],
        "1+2>3": [["kɨt͡ʃ-"]]
    }
    lang_clauses["car"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["i-"]],
        "1+2>3": [["kɨt-"]],
        "3>1": [["j-"], ["ji-"], ["voice"]],
        "3>2": [["əj-"]]
    }
    # NOTE(review): "1>3" is a plain string here, unlike the list-of-lists
    # used elsewhere, so the loops below iterate it character by character
    # ("s", then "-") - confirm whether [["s-"]] was intended.
    lang_clauses["pem"] = {
        "1>3": "s-",
        "1>2": ["?"],
        "2>1": ["?"],
    }
    my_tree = Phylo.read(io.StringIO(Phylogeny.get("matter").newick),
                         "newick")
    for node in my_tree.find_clades():
        # Unnamed nodes are left untouched.
        if node.name == None:
            continue
        # Leaves are languages: drop any "?" and prefix with "lg:".
        if node.is_terminal():
            node.name = node.name.replace("?", "")
            new_name = "lg:" + node.name
        else:
            new_name = node.name
        if node.name in lang_clauses.keys():
            if scenario in lang_clauses[node.name].keys():
                all_morphs = []
                # Each combo is one alternative; its members are joined
                # with "£", alternatives are later joined with " OR ".
                for morpheme_combo in lang_clauses[node.name][scenario]:
                    this_morph = []
                    for morpheme in morpheme_combo:
                        # Known morpheme id in the database?
                        if DBSession.query(Morpheme).filter(
                                Morpheme.id == morpheme).count() >= 1:
                            if not reconstructed or DBSession.query(
                                    Morpheme).filter(
                                        Morpheme.id == morpheme
                                    )[0].counterparts[0].cognateset.id == "NA":
                                this_morph.append(
                                    "morph:" + morpheme
                                )
                            else:
                                # Reconstructed view: list every cognate
                                # set the morpheme belongs to.
                                for counterpart in DBSession.query(
                                        Morpheme).filter(
                                            Morpheme.id == morpheme)[0].counterparts:
                                    this_morph.append(
                                        "cogset:" + counterpart.cognateset.id)
                        else:
                            # Not a database id: emit the literal form.
                            this_morph.append("obj:" + morpheme)
                    all_morphs.append("£".join(this_morph))
            else:
                all_morphs = ["-"]
        else:
            all_morphs = ["-"]
        node.name = new_name + " " + " OR ".join(all_morphs)
        node.name = generate_markup(node.name)
    return get_clade_as_json(my_tree.clade)
def add_default_branch_lengths(s, branch_length):
    """Return Newick string ``s`` with ``:<branch_length>`` inserted before every ``)``, ``,`` and ``;``."""
    suffix = ':' + str(branch_length)
    # Same order as the delimiters are handled one after another.
    for delimiter in (')', ',', ';'):
        s = s.replace(delimiter, suffix + delimiter)
    return s


# The input is read in groups of three lines: the first holds a tree, the
# second the two node names; the third is ignored.
with open(infile, "r") as f:
    data = [raw.strip() for raw in f.readlines()]

trees = list(zip(data[::3], data[1::3]))

for tree, nodes in trees:
    # Default branch lengths can be added first with
    # add_default_branch_lengths(tree, 1) when a tree has none.

    # Parse the Newick text
    ntree = Phylo.read(StringIO(tree), "newick")

    # The two nodes whose distance is requested
    node1, node2 = nodes.split()

    # Print the distance, truncated to an integer, space separated
    node_distance = ntree.distance(node1, node2)
    print(int(node_distance), end=" ")
print(node) row = ref.loc[ref['saccver'].str.contains(node)] # get the row, for which the node is a substring in the saccver column org_name = row['organism_name'].to_string() # get the organism name org_name = org_name.lstrip('0123456789.- ') print(org_name) gcf = row['subject_gcf'].to_string() gcf = gcf.lstrip('0123456789.- ') label = org_name + ' ' + gcf # make the label by concatenating the organism name and gcf label = label.replace(' ', '_') line = pd.Series({'node': node, 'label': label}) results = results.append(line, ignore_index=True) i += 1 print(i) return results tree = Phylo.read(sys.argv[1], 'newick') # open the tree file names = lookup_by_names(tree) # use the function to get the dictionary nm = [i for i in names] #print(nm) #print(len(nm)) query = sys.argv[1].split('-')[0] #print(query) ref = get_reference_table(query) print(ref.head()) results = node_label_table(nm) print(results.head(10)) print(len(results))
def midpoint(input_fn, output_fn):
    """Read the Newick tree in ``input_fn``, root it at its midpoint and write it to ``output_fn``."""
    newick = 'newick'
    rerooted = Phylo.read(input_fn, newick)
    rerooted.root_at_midpoint()  # modifies the tree in place
    Phylo.write(rerooted, output_fn, newick)
def align_hits(fasta_file, record_df):
    """Align a gene's FASTA hits with ClustalW and save the best-ranked one.

    REQUIREMENTS - download ClustalW from
    http://www.clustal.org/download/current/ and put the folder in
    Applications (the executable path is hard-coded below).

    Parameters:
        fasta_file: path object of the FASTA file; the name of its parent
            directory is taken as the gene name.
        record_df: data frame whose rows with ``HGNC == gene`` describe the
            sequences; ``entrez_id``, ``sequence_length`` and ``Sequence``
            columns are used.

    Nothing is done when the directory already contains an ``.aln`` file.
    Otherwise the records are ranked by fewest gaps inside the region
    spanned by the first and last fully conserved column (ties broken by
    longest sequence), the top record is written to
    ``<entrez_id>_sequence.fa`` and the guide tree is drawn to ``tree.png``.
    """
    from Bio.Align.Applications import ClustalwCommandline
    from Bio import AlignIO
    from Bio import Phylo

    workdir = fasta_file.parent
    gene = workdir.stem
    print('analysis {}'.format(gene))

    # check if alignment has already been done
    if list(workdir.rglob('*.aln')):
        print('{} alignment already done, nothing to do here'.format(gene))
        return

    # import information about the gene from dataframe
    records = record_df[record_df.HGNC == gene].copy()

    # now do the alignment
    clustalw_exe = r"/Applications/clustalw-2.1-macosx/clustalw2"
    clustalw_cline = ClustalwCommandline(clustalw_exe,
                                         infile=fasta_file,
                                         stats=workdir / 'stats.txt')
    stdout, stderr = clustalw_cline()

    # find alignment files
    align_file = list(workdir.rglob('*.aln'))[0]
    tree_file = list(workdir.rglob('*.dnd'))[0]
    alignment = AlignIO.read(align_file, "clustal")

    # columns marked '*' in the consensus line are fully conserved
    star_spans = [
        hit.span(0)
        for hit in re.finditer(
            r"\*", alignment.column_annotations['clustal_consensus'])
    ]
    if star_spans:
        region_start = star_spans[0][0]
        region_end = star_spans[-1][1]
        consensus = alignment[:, region_start:region_end]
    else:
        consensus = alignment[:, ::]

    gap_count = {row.id: row.seq.count('-') for row in consensus}

    records.loc[:, 'alignment_gaps'] = records.entrez_id.map(gap_count)
    records.sort_values(by=['alignment_gaps', 'sequence_length'],
                        ascending=[True, False],
                        inplace=True)
    records.reset_index(drop=True, inplace=True)

    # save top ranked to output file
    best_sequence = Seq(records.Sequence.loc[0], IUPAC.IUPACAmbiguousDNA())
    top_sequence = SeqRecord(best_sequence,
                             id=records.entrez_id.loc[0],
                             name=gene)
    SeqIO.write(top_sequence,
                workdir / '{}_sequence.fa'.format(top_sequence.id),
                'fasta')

    # draw the guide tree produced by ClustalW
    tree = Phylo.read(tree_file, "newick")
    tree.ladderize()
    Phylo.draw(tree)
    plt.savefig(tree_file.parent / 'tree.png')
    plt.close('all')
## initialize hash tables LOG_TAVARE_CONDITIONAL_LIKELIHOOD_DICT = {} TIME_BETA_DICT = {} TIME_ALPHA_DICT = {} idxsSamplesCorrectlyPolarized = [] individualMargEsts = [] numSamplesWronglyPolarized = 0 ind_i_hats = [] ind_i_sel_hats = [] branch_lengths = [] individualMargEsts = np.zeros( (numImportanceSamples, len(S_GRID), len(I_SEL), len(ds), len(FREQS))) for (k, line) in enumerate(lines): nwk = line.rstrip().split()[-1] derTree = Phylo.read(StringIO(nwk), 'newick') ancTree = Phylo.read(StringIO(nwk), 'newick') mixTree = Phylo.read(StringIO(nwk), 'newick') Phylo.read(StringIO(nwk), 'newick') n = len(derInds) m = len(ancInds) if k == 0: if args.popFreq != None: if args.popFreq != 1: discretizedPopFreqIdx = np.digitize(args.popFreq, FREQS) hPlus = FREQS[discretizedPopFreqIdx] - args.popFreq hMinus = args.popFreq - FREQS[discretizedPopFreqIdx - 1] sign = -1 * np.random.binomial(1, hPlus / (hPlus + hMinus)) discretizedPopFreqIdx += sign else:
# And counter-search to check if it's really a good match, the top hit should be the query sequence
# NOTE(review): these commands are built by string concatenation and run
# through the shell; a path in ``output`` containing spaces or shell
# metacharacters will break them.
call('tblastx -db ../db/a_n_genes.fasta -query ' + output + 'b_g_GOI.fa -out ' + output +
     'counterBlast.blast -num_threads 4 -max_target_seqs 1 -outfmt "7 sseqid evalue"',
     shell=True)

################################
# Identify motifs in sequences #
################################

# Do we need to do this?

#################################
# Make a phylogeny of sequences #
#################################

# NOTE(review): only the append step is taken to be conditional on
# foundBlu; the nesting was reconstructed from a flattened source.
if foundBlu:
    # Append the gene-of-interest sequence to the unaligned set.  Context
    # managers make sure both handles are closed even if a read/write fails.
    with open(output + 'b_g_GOI.fa', 'r') as bluFile:
        addition = bluFile.read()
    with open(output + 'unaligned.fa', 'a') as unaln:
        unaln.write(addition)

print('Creating phylogenetic tree...')

# Align, convert to relaxed PHYLIP and build the tree with PhyML.
call('clustalo -i ' + output + 'unaligned.fa -o ' + output + 'alignedAll.aln --force --outfmt=clu',
     shell=True)
AlignIO.convert(output + 'alignedAll.aln', 'clustal', output + 'phyAlign.phy', 'phylip-relaxed')
cmdline = PhymlCommandline(input=output + 'phyAlign.phy', alpha='e', bootstrap=1, sequential=False)
call(str(cmdline), shell=True)
my_tree = Phylo.read(output + "phyAlign.phy_phyml_tree.txt", "newick")
Phylo.draw(my_tree, show_confidence=False)

# Got to print 'Done' at the end
print('Done')
# NOTE: ``list`` (and ``format`` / ``input`` used below) shadow builtins;
# the names are kept because the rest of the script refers to them.
list = args.list
action = args.action[0]
index = args.index
output = args.output

# Example values for interactive debugging:
#   input = path + "metadata.tsv"
#   format = 'tsv'
#   list = path + 'seqList.txt'
#   action = 'keep'
#   output = path + 'output_ren.tsv'

# Read the target list, skipping blank lines and '#' comments.  The file is
# closed as soon as it has been read.
with open(list, "r") as target_handle:
    targets = [target.strip() for target in target_handle if target[0] not in ['\n', '#']]

if format == 'tree':
    tree = Phylo.read(input, 'newick')
    print('Starting tree file processing...')

    # rename clade names
    if action == 'rename':
        # Split every "old<TAB>new" line once instead of once per clade.
        rename_pairs = []
        for line in targets:
            columns = line.split("\t")
            rename_pairs.append((columns[0], columns[1].strip()))

        for clade in tree.find_clades():
            # Pairs are applied in file order, so a later pair may match a
            # name assigned by an earlier one.
            for oldName, newName in rename_pairs:
                if str(clade.name) == oldName:
                    print('Renaming ' + oldName + ' as ' + newName)
                    clade.name = newName
        Phylo.write([tree], output, 'newick')
        print('\nTree file successfully renamed: \'' + output)
return leftSibling #Case 3: Only the right sibling exists so return it elif rightSibling != None and rightSibling.genomeFragments != None and len(rightSibling.genomeFragments) > 0: return rightSibling #Case 4: None of the siblings exist so return NULL else: return None ###################################################### # main ###################################################### print('Starting application...') startTime = time.time() print('Reading newick tree from file: %s...' % (newickFileName)) newickTree = Phylo.read(newickFileName, 'newick') Phylo.draw(newickTree) globals.initialize() #Initialize the globals file globals.strains = strains #Assign pointer to the global strains array so we can access it anywhere createFile(outputFileName, newickTree) #Creates file where data will be output #Traverses the newick tree recursively reconstructing ancestral genomes print('Traversing newick tree...') result = traverseNewickTree(newickTree.clade, None) #Output newick tree after the ancestors have been added to it Phylo.draw(newickTree) #Need to traverse tree to ouput appropriate content to file newickTree.clade.name = '' #Make sure that the output for the root is not output
def main():
    """Perform the main routine.

    Parses ``sys.argv`` and dispatches on the sub-command:

    * no sub-command: print the help text;
    * ``version``: print the package version;
    * ``test``: run the bundled test suite;
    * ``smuggle``: read a newick tree, optionally collapse low-support
      nodes (``-c``), resolve polytomies (unless ``-b``), rescale branch
      supports (``-m``) and print the tree to stdout with the requested
      branch-length precision (``-p``).

    Exits via ``sys.exit`` with a message when the tree file of ``smuggle``
    does not exist.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
        Given a newick tree, use this program to resolve polytomies
        (convert to bifurcating), and/or change the precision of branch
        lengths, and/or collapse.""")

    # Options shared by the tree-processing sub-command.
    subparser_args1 = argparse.ArgumentParser(add_help=False)
    subparser_args1.add_argument("tree", help="Input newick tree")
    subparser_args1.add_argument("-p", "--precision",
                                 help="""Branch length precision (i.e.,
                                 number of decimal places to print).""",
                                 default=None, type=int)
    subparser_args1.add_argument("-m", "--support_multiplier",
                                 help="""Multiply branch supports by this
                                 value. Use, for example, to convert scale
                                 of 0 to 1 to percentages. """,
                                 default=None, choices=[0.1, 100],
                                 type=float)
    subparser_args1.add_argument(
        "-b", "--dont_bifurcate_polytomies",
        help="Switch off conversion of node polytomies to bifurcating",
        default=False, action="store_true")
    subparser_args1.add_argument(
        "-c", "--collapse",
        help="Collapse nodes with support values less than this.",
        default=None, type=float)

    subparser_modules = parser.add_subparsers(title="Sub-commands help",
                                              help="",
                                              metavar="",
                                              dest="subparser_name")
    subparser_modules.add_parser(
        "smuggle",
        help="Smuggle the budgie.",
        description="Process the tree.",
        parents=[subparser_args1],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparser_modules.add_parser("version",
                                 help="Print version.",
                                 description="Print version.")
    subparser_modules.add_parser(
        "test",
        help="Run test suite.",
        description="Run test suite.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()

    if not args.subparser_name:
        parser.print_help()
    elif args.subparser_name == "version":
        from budgitree import __version__ as version
        print(version)
    elif args.subparser_name == "test":
        import unittest
        from .tests.test_suite import suite
        runner = unittest.TextTestRunner(verbosity=2)
        runner.run(suite())
    elif args.subparser_name == "smuggle":
        import sys
        from pathlib import Path
        # Check if file exists
        if not Path(args.tree).exists():
            sys.exit(
                f"File '{Path(args.tree).absolute()}' not found. Exiting.")
        from Bio import Phylo
        from Bio.Phylo.NewickIO import Writer
        from io import StringIO

        # Read the tree
        tree = Phylo.read(args.tree, "newick")

        # Collapse the low-support nodes if requested
        if args.collapse is not None:
            print(f"Collapsing nodes with support < {args.collapse}.",
                  file=sys.stderr)
            tree.collapse_all(lambda c: c.confidence is not None and
                              c.confidence < args.collapse)

        if not args.dont_bifurcate_polytomies:
            print("Removing polytomies.", file=sys.stderr)
            from ete3 import Tree
            # Round-trip through ete3, whose standardize() resolves
            # polytomies, then back into a Bio.Phylo tree.
            t = Tree(tree.format("newick"))
            t.standardize()
            tree = Phylo.read(StringIO(t.write(format=0)), "newick")

        if args.support_multiplier is not None:
            print(f"Multiplying branch supports by {args.support_multiplier}",
                  file=sys.stderr)
            for non_terminal in tree.get_nonterminals():
                if non_terminal.confidence is None:
                    continue
                scaled = float(non_terminal.confidence *
                               args.support_multiplier)
                # Round to whole numbers for percentages, two decimals for
                # the 0.1 multiplier.
                if args.support_multiplier == 100:
                    non_terminal.confidence = float("{0:.0f}".format(scaled))
                elif args.support_multiplier == 0.1:
                    non_terminal.confidence = float("{0:.2f}".format(scaled))

        # Polytomies created by collapsing nodes still need to be parseable.
        # Achieve this by raising the recursion limit in proportion to the
        # tree size, but never below 3000.  (Previously the limit was set
        # to 3000 and then immediately overwritten with
        # ``count_terminals() * 2``, which for small trees is far below
        # the interpreter default and can itself raise RecursionError.)
        sys.setrecursionlimit(max(3000, tree.count_terminals() * 2))

        if args.support_multiplier == 100:
            format_confidence = '%1.0f'
        else:
            format_confidence = '%1.2f'

        if args.precision is not None:
            print(
                f"Reformatting branch lengths to {args.precision} decimal places.",
                file=sys.stderr)
            format_branch_length = f"%1.{args.precision}f"
        else:
            format_branch_length = "%g"

        trees = Writer([tree]).to_strings(
            format_branch_length=format_branch_length,
            format_confidence=format_confidence)
        # there is only one tree in trees, so:
        print(next(trees))
import numpy as np
import matplotlib.pyplot as plt
from augur.utils import read_node_data
import argparse
from Bio import Phylo

# Command line: a tree, its node-data file, optionally a titer-model file,
# and an output file.
parser = argparse.ArgumentParser(description="Analyze TMRCA.")
for flag, description in (("--tree", "tree file"),
                          ("--node-data", "node_data file"),
                          ("--titers", "titer_model file"),
                          ("--output", "output file")):
    parser.add_argument(flag, help=description)
args = parser.parse_args()

T = Phylo.read(args.tree, 'newick')

# Node data always comes from --node-data; the titer file is added when given.
if args.titers:
    of = [args.node_data, args.titers]
else:
    of = [args.node_data]
node_data = read_node_data(of)

T.root.up = None
# Post-order, so every child has been annotated before its parent.
for n in T.find_clades(order='postorder'):
    node_annotations = node_data["nodes"][n.name]
    n.numdate = node_annotations["numdate"]
    if args.titers:
        n.cTiter = node_annotations["cTiter"]
        n.dTiter = node_annotations["dTiter"]

    if n.is_terminal():
        # A tip counts itself and contributes only its own branch.
        n.ntips = 1
        n.tree_length = n.branch_length
        if args.titers:
            n.antigenic_length = n.dTiter
    else:
        # Internal nodes accumulate over their children.
        n.ntips = np.sum([c.ntips for c in n])
        n.tree_length = n.branch_length + np.sum([c.tree_length for c in n])
def subset_tree(args):
    """Prune a Newick tree down to a list of taxa and write the result.

    Args:
        args: mapping with three entries:
            'tree'  -- path of the input tree (read as Newick),
            'taxon' -- path of a text file with one taxon per line,
            'out'   -- path the subset tree is written to (Newick).

    Side effects:
        Writes the output tree, prints timestamped progress messages (and a
        warning listing taxa that were not found) to stdout, and creates two
        temporary files named '<out>.tmp_1.tree' / '<out>.tmp_2.tree' which
        are removed again before returning, even if a step fails.
    """
    tree_file_in = args['tree']
    group_to_taxon_file = args['taxon']
    tree_file_out = args['out']

    # define tmp file name
    tree_file_tmp_1 = '%s.tmp_1.tree' % tree_file_out
    tree_file_tmp_2 = '%s.tmp_2.tree' % tree_file_out

    time_format = '[%Y-%m-%d %H:%M:%S] '

    # read in tree
    tree_in = Phylo.read(tree_file_in, 'newick')

    # Read in all identified taxons. Blank lines are skipped so that an empty
    # string never counts as a taxon, and the file handle is closed promptly.
    with open(group_to_taxon_file) as taxon_handle:
        identified_taxon_list = {
            each_group.strip() for each_group in taxon_handle if each_group.strip()
        }
    print(datetime.now().strftime(time_format) + 'The number of provided taxon: %s' % len(identified_taxon_list))

    # Remove unwanted nodes round by round until a round deletes nothing.
    print(datetime.now().strftime(time_format) + 'Recursively removing unwanted nodes')
    deleted_leaf_num = 1
    n = 0
    tree_in_copy = copy.deepcopy(tree_in)
    while deleted_leaf_num > 0:
        tree_in_copy, deleted_leaf_num = remove_unwanted_leaf_nodes(
            tree_in_copy, identified_taxon_list)
        n += 1
        print(datetime.now().strftime(time_format) + 'Removed %s nodes in %sth round' % (deleted_leaf_num, n))

    try:
        # Round-trip through a tmp file, as the original pipeline does.
        Phylo.write(tree_in_copy, tree_file_tmp_1, 'newick')

        # remove "100:" in clade name (keep what follows the first colon)
        tree_tmp_1 = Phylo.read(tree_file_tmp_1, 'newick')
        for clade in tree_tmp_1.find_clades():
            clade_name = str(clade.name)
            if ':' in clade_name:
                clade.name = clade_name.split(':')[1]
        Phylo.write(tree_tmp_1, tree_file_tmp_2, 'newick')

        # Rename leaves whose name is a ';'-separated list to the provided
        # taxon that appears in that list.
        tree_tmp_2 = Phylo.read(tree_file_tmp_2, 'newick')
        for leaf_node in tree_tmp_2.get_terminals():
            leaf_node_name_str = str(leaf_node.name)
            if ';' not in leaf_node_name_str:
                continue

            # Remove one space at the beginning and/or end of each segment.
            # startswith/endswith are safe on empty segments ("a;;b", "a;").
            leaf_node_name_split_no_space = []
            for each_name in leaf_node_name_str.split(';'):
                if each_name.startswith(' '):
                    each_name = each_name[1:]
                if each_name.endswith(' '):
                    each_name = each_name[:-1]
                leaf_node_name_split_no_space.append(each_name)

            # Take the right-most matching segment so the outcome does not
            # depend on set iteration order when several segments match;
            # as before, the name becomes '' when nothing matches.
            leaf_node_name_new = ''
            for each_name in reversed(leaf_node_name_split_no_space):
                if each_name in identified_taxon_list:
                    leaf_node_name_new = each_name
                    break
            leaf_node.name = leaf_node_name_new

        # write out tree
        Phylo.write(tree_tmp_2, tree_file_out, 'newick')
    finally:
        # remove tmp files, including after a failure part-way through
        for tmp_file in (tree_file_tmp_1, tree_file_tmp_2):
            if os.path.isfile(tmp_file):
                os.remove(tmp_file)

    # report
    print(datetime.now().strftime(time_format) + 'Tree subset exported to: %s' % tree_file_out)

    # print warning message if some provided node(s) were not found
    extracted_leaf_nodes = tree_tmp_2.get_terminals()
    if len(extracted_leaf_nodes) < len(identified_taxon_list):
        extracted_leaf_node_names = {str(leaf.name) for leaf in extracted_leaf_nodes}
        un_extracted_nodes = sorted(
            provided_node for provided_node in identified_taxon_list
            if provided_node not in extracted_leaf_node_names)
        print(datetime.now().strftime(time_format) + 'Warning!!! Found %s of %s provided nodes, missed: %s' % (len(extracted_leaf_nodes), len(identified_taxon_list), ', '.join(un_extracted_nodes)))
# Fitch and Margoliash Method to build tree structure in newick format # point_dict_copy = point_dictionary.copy() fm = FitchMargoliash(hamming_table,point_dictionary) handle = fm.run() print(handle) for i in range(len(files)): print(files[i],' ',point_dict_copy[i]) # Phylogenetic Tree construction using phyloXML file (unrooted phylogram) from io import StringIO handle = StringIO(handle) tree = Phylo.read(handle,'newick') tree.name = TREE_NAME tree.id = process_id tree.ladderize() def tabulate_names(tree): names = {} for idx, clade in enumerate(tree.find_clades()): if clade.name: new_name = sequences[point_dict_copy.index(clade.name)].name clade.name = '%d_%s' % (idx, new_name) else: clade.name = "{}_inner".format(idx) names[clade.name] = clade return names
label=name) plt.legend(loc='best', fontsize=12) plt.savefig('offline-scores.png', bbox_inches='tight') plt.figure() plt.xlim([0, n_iters]) # plt.ylim(ymin=-400) plt.xlabel("Iterations", fontsize=fontsize) plt.ylabel("Data Log Likelihood", fontsize=fontsize) for name, likelihood in likelihoods.items(): plt.plot(likelihood, label=name) plt.legend(loc='best', fontsize=12) plt.savefig('offline-likelihoods.png', bbox_inches='tight') for type, model in models.items(): final_tree = model.copy() plt.figure() plot_tree_2d(final_tree, X, pca) for node in final_tree.dfs(): if node.is_leaf(): node.point = y[node.point] newick = final_tree.to_newick() tree = Phylo.read(StringIO(newick), 'newick') Phylo.draw_graphviz(tree, prog='neato') plt.savefig('tree-%s.png' % type, bbox_inches='tight') plt.show()
options = get_options()

# The 'Agg' backend is selected before pyplot is imported.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
import os
import pandas as pd
import numpy as np
from Bio import Phylo

t = Phylo.read(options.tree, 'newick')

# Largest root-to-tip distance, used later to scale the plots.
mdist = max(t.distance(t.root, leaf) for leaf in t.get_terminals())

# Load the roary spreadsheet and index it by group name.
roary = pd.read_table(options.spreadsheet, sep=',', low_memory=False)
roary = roary.set_index('Gene')

# Drop the leading informational columns.
info_columns = list(roary.columns[:options.skipped_columns - 1])
roary = roary.drop(info_columns, axis=1)

# Any cell matching the pattern (2 to 100 characters) becomes 1 (presence).
roary = roary.replace('.{2,100}', 1, regex=True)
#validate gene tree names against the species tree and flag up any discrepancies.
#usage: python check_valid_species_names.py species_tree genetreedir/
from Bio import Phylo
import os, sys, re

# Tip names of the species tree; a set keeps the per-tip lookup below O(1).
sptree = Phylo.read(sys.argv[1], "newick")
spnames = set()
for taxa in sptree.get_terminals():
    spnames.add(taxa.name)

#now scan gene trees for any unexpected species names
odd_names = {}
genetree_dir = sys.argv[2]
to_check = [fname for fname in os.listdir(genetree_dir) if fname.endswith("ufboot")]
for fname in to_check:
    # os.path.join works whether or not the directory argument ends in '/'.
    trees = Phylo.parse(os.path.join(genetree_dir, fname), "newick")
    for t in trees:
        for tip in t.get_terminals():
            # The species label is the part of the tip name before the first '_'.
            fields = re.split("_", tip.name)
            if fields[0] not in spnames:
                odd_names[fields[0]] = 1

# print(x) with a single argument behaves the same on Python 2 and Python 3.
for element in odd_names:
    print(element)
# Re-register the sample set stored under `key` as `realKey`, carrying over
# its time window and starting with empty 'time' and 'lineages' lists.
tmpParam = sampSets[key]
sampSets[realKey] = {
    'startTime': tmpParam['startTime'],
    'endTime': tmpParam['endTime'],
    'time': [],
    'lineages': []
}
params = sampSets[realKey]
startTime = params['startTime']
endTime = params['endTime']
# `time` and `num_lineages` alias the lists stored in the dict, so appending
# to them also updates sampSets[realKey].
time = params['time']
num_lineages = params['lineages']
T = Phylo.read(treefile, 'newick')
node_data = read_node_data([branchfile, cladefile])
#raw_strain_info = collect_strain_info(node_data, metadatafile)
# NOTE(review): node_data is rebound here, replacing the read_node_data result
# above — confirm the first read is still needed.
node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(
    T, [branchfile, cladefile], metadatafile)
rate = node_data['clock']['rate']
# Copy per-node annotations onto the tree clades.
for node in T.find_clades(order='postorder'):
    data = node_data['nodes'][node.name]
    node.clade_membership = data['clade_membership']
    node.date = data['date']
    node.num_date = data['numdate']
    #raw_data = raw_strain_info[node.name]
    raw_data = node_attrs[node.name]
    # Region falls back to an empty string when the attribute is absent.
    node.region = raw_data['region'] if 'region' in raw_data else ''
    # Branch length is rescaled by the clock rate
    # (presumably divergence -> time units; verify against the clock model).
    node.branch_length = data['branch_length'] / rate
nargs='?', default='newick', help='phylogeny format (%s)' % (','.join(bp._io.supported_formats.keys()))) parser.add_argument('-t', '--taxonomy_format', nargs='?', default='newick', help='taxonomy format') parser.add_argument('-o', '--output_format', nargs='?', default='newick', help='output format') parser.add_argument('-r', '--root', nargs='?', default=None, help='name of OTU to use as root of taxonomy') args = parser.parse_args() # read in the tree and taxonomy phylogeny = bp.read(args.phylogeny_file, args.phylogeny_format) taxonomy = bp.read(args.taxonomy_file, args.taxonomy_format) label_tree(phylogeny, taxonomy, tax_root=args.root) # write output to stdout print phylogeny.format(args.output_format)
# makes a data frame where first column are node names, other columns are bbh, psi, and negatives count def lookup_by_names(tree, df): names = {} for clade in tree.find_clades(): if clade.name: if clade.name in names: raise ValueError("Duplicate key: %s" % clade.name) names[clade.name] = clade for node in names.keys(): for ind, val in df.iterrows(): if val[0] in node: df.iloc[ind, 0] = node return df tree = Phylo.read('16s-epsilon-outgroup-labelled-r.tree', 'newick') # open the tree file pfla = lookup_by_names(tree, pfla) # use the function to get the dictionary pflb = lookup_by_names(tree, pflb) # make a column of which results column has most species in it pfla = max_col(pfla) pflb = max_col(pflb) def binary_prot(df): df2 = df.iloc[:, :2] df2 = df2.reindex(columns=['accession', 'binary', 'binary2']) df2.iloc[:, 1:] = np.nan print(df2.head()) for ind, val in df['max'].iteritems(): if val == 'BBH':
#%% Preparatory file generation and organization. ## Read in the genome_data file. genome_data = pd.read_csv(ref_dir_domain + 'genome_data.csv', header=0, index_col=0) genome_data['clade'] = np.nan genome_data['tip_name'] = np.nan genome_data['npaths_actual'] = np.nan genome_data['branch_length'] = np.nan ## Get the clade number of each assembly and add this information to ## genome_data. tree = Phylo.read(tree, 'phyloxml') assemblies = [] for clade in tree.get_terminals(): clade_number = int(clade.confidence) print clade_number assembly = clade.name assembly = assembly.strip('@') if domain == 'eukarya': assembly = re.split('_', assembly)[0] else: assembly = re.split('_', assembly)
def phylo_from_str(tree_str):
    """Parse a Newick-formatted string and return the Bio.Phylo tree.

    The string is wrapped in an in-memory file object because Phylo.read
    is given a handle here rather than the text itself.
    """
    return Phylo.read(StringIO.StringIO(tree_str), format='newick')
def test_to_networkx(self):
    """Tree to Graph conversion, if networkx is available."""
    # Convert the example phyloXML tree and check the expected node count.
    graph = Phylo.to_networkx(Phylo.read(EX_DOLLO, 'phyloxml'))
    node_count = len(graph.nodes())
    self.assertEqual(node_count, 659)