def get_tree(taxa, savename=None, genomes=None):
    """Generate a taxonomic tree from the NCBI taxonomy and save it to disk.

    The tree is written both as a newick file and as a pickle inside
    ``config_utils.datadir``.

    :param taxa: iterable of NCBI taxids to place in the tree; a ``0`` entry,
        if present, is ignored
    :param savename: optional prefix for the output files; when None the
        files are named ``mastertree.nwk`` / ``mastertree.pkl``, otherwise
        ``<savename>_master_tree.nwk`` / ``<savename>_master_tree.pkl``
    :param genomes: iterable of leaf names expected in the final tree; leaves
        missing after the NCBI topology is built are fetched from Entrez and
        grafted in. Defaults to the string form of every taxid in ``taxa``.
        NOTE(review): the previous body read ``genomes`` before it was ever
        assigned, so the intended source of this value should be confirmed.
    :return: tree_string: a newick string
             tree: an ete3 object
    """
    ncbi = ete3.NCBITaxa()
    tax = set(taxa)
    # discard (not remove) so that input without a 0 placeholder is accepted
    tax.discard(0)
    if genomes is None:
        genomes = {str(taxid) for taxid in tax}
    else:
        genomes = set(genomes)
    print(len(tax))

    tree = ete3.PhyloTree(name='')
    tree.add_child(name='131567')
    topo = ncbi.get_topology(tax, collapse_subspecies=False)
    tree.add_child(topo)

    orphans = list(genomes - set([x.name for x in tree.get_leaves()]))
    print('missing taxa:')
    print(len(orphans))

    Entrez.email = config_utils.email
    orphans_info1 = {}  # parent taxid -> orphan
    orphans_info2 = {}  # orphan -> taxids of its lineage
    for orphan in orphans:
        search_handle = Entrez.efetch('taxonomy', id=str(orphan),
                                      retmode='xml')
        try:
            record = next(Entrez.parse(search_handle))
        finally:
            search_handle.close()
        print(record)
        orphans_info1[record['ParentTaxId']] = orphan
        orphans_info2[orphan] = [
            step['TaxId'] for step in record['LineageEx']
        ]

    # attach each orphan next to the node named after its parent taxid
    for n in tree.traverse():
        if n.name in orphans_info1:
            n.add_sister(name=orphans_info1[n.name])
            print(n)

    tree = add_orphans(orphans_info2, tree, genomes)
    tree_string = tree.write(format=1)

    if savename is None:
        stem = 'mastertree'
    else:
        stem = savename + '_master_tree'
    with open(config_utils.datadir + stem + '.nwk', 'w') as nwkout:
        nwkout.write(tree_string)
    with open(config_utils.datadir + stem + '.pkl', 'wb') as pklout:
        pickle.dump(tree, pklout)
    return tree_string, tree
def main(treefile, ingroup, outfile):
    """Root, ladderize and collapse the tree in *treefile*, then write nexus.

    The tree is read with ete3 newick ``format=2``, rooted via ``root_tree``
    using *ingroup*, and the collapsed newick string produced by
    ``collapse_groups`` is post-processed so that ``INTERNAL_..._SUPPORT``
    markers become ``[&label=...]`` annotations before being handed to
    ``write_nexus`` together with *outfile*.
    """
    phylo = ete3.PhyloTree(treefile, format=2)
    clade_by_taxon = get_clade_dict(get_tax_ids(phylo))

    root_tree(phylo, ingroup)
    phylo.ladderize(direction=1)
    prepare_node_leave_names(phylo)

    newick = collapse_groups(phylo, clade_by_taxon)
    # turn the placeholder markers into nexus label annotations
    for marker, replacement in (('INTERNAL_', "[&label="), ('_SUPPORT', ']')):
        newick = newick.replace(marker, replacement)

    write_nexus(outfile, newick, phylo)
def parse_phylo(phy_fn, phy_id):
    """Build a table of orthology edges from the gene tree in *phy_fn*.

    The tree is loaded with ete3, species are taken from the part of each
    leaf name before the first underscore, polytomies are resolved, and
    every speciation event reported by ``get_descendant_evol_events``
    contributes one row per (in_seq, out_seq) pair.

    :param phy_fn: path (or newick string) accepted by ``ete3.PhyloTree``
    :param phy_id: label used as a prefix in log messages
    :return: pandas DataFrame with columns ``in_gene``, ``out_gene``,
        ``branch_support``, ``ev_type`` and ``sos`` (object dtype); rows
        containing a missing value are dropped
    """
    # load input
    phy = ete3.PhyloTree("%s" % (phy_fn))
    logging.info("%s num nodes = %i" % (phy_id, len(phy)))

    # assign species names to tree
    phy.set_species_naming_function(lambda node: node.name.split("_")[0])

    # resolve polytomies in a random fashion
    phy.resolve_polytomy(recursive=True)

    # check if tree is rooted, apply midpoint root if unrooted
    # NOTE(review): this check runs after polytomies were resolved, which
    # presumably leaves a multifurcating root with two children already, so
    # the midpoint branch may rarely be taken - confirm the intended order.
    phy_root = phy.get_tree_root()
    phy_outg = phy_root.get_children()
    is_root = len(phy_outg) == 2
    if is_root:
        logging.info("%s Tree is rooted, pass" % phy_id)
    else:
        logging.info("%s Tree is unrooted, apply midpoint root" % phy_id)
        phy_outgroup = phy.get_midpoint_outgroup()
        phy.set_outgroup(phy_outgroup)

    # find evolutionary events (duplications and speciations)
    evev = phy.get_descendant_evol_events(sos_thr=0)

    # Collect one edge per in/out sequence pair of every speciation event.
    # A growing list is used instead of a pre-sized array so that events
    # with many sequence pairs cannot run past the end of a fixed buffer.
    rows = []
    for ev in evev:
        if ev.etype != "S":
            continue
        for ii in ev.in_seqs:
            for oi in ev.out_seqs:
                rows.append(
                    (ii, oi, ev.branch_supports[0], ev.etype, ev.sos))

    evou_d = pd.DataFrame(
        rows,
        columns=["in_gene", "out_gene", "branch_support", "ev_type", "sos"],
        dtype=object,
    ).dropna()
    return evou_d
def __init__(self, TreePath, AlignementPath, uniprotTaxonomy):
    """Load the tree, its alignment and the parsed uniprot taxonomy.

    :param TreePath: path to the Newick tree
    :param AlignementPath: path to the fasta alignment the tree was derived
        from; the file is rewritten in place when any header has to be
        truncated at its first space
    :param uniprotTaxonomy: path to the parsed uniprot taxonomy, a
        tab-separated file whose first three columns are read as the uniprot
        id, the NCBI id (text before the first space) and the species (text
        after the last comma)
    """
    self.TreePath = TreePath
    self.AlignementPath = AlignementPath

    # Keep only the first space-delimited token of each fasta header.
    with open(self.AlignementPath) as handle:
        original = handle.readlines()
    cleaned = []
    for line in original:
        if line.startswith('>'):
            # strip the line ending before re-adding it, otherwise a header
            # without spaces would gain an extra blank line on every run
            cleaned.append(line.split(' ')[0].rstrip('\r\n') + '\n')
        else:
            cleaned.append(line)
    if cleaned != original:
        with open(self.AlignementPath, 'w') as handle:
            handle.writelines(cleaned)

    self.tree = ete3.PhyloTree(newick=TreePath, alignment=AlignementPath)
    self.tree.set_species_naming_function(self.parse_sp_name)

    self.uniprot2ncbi = {}
    self.uniprot2species = {}
    self.ncbiID2species = {}
    self.ncbi = ete3.NCBITaxa()
    with open(uniprotTaxonomy) as handle:
        for line in handle:
            s = line.strip().split('\t')
            uniprotID = s[0]
            ncbiID = s[1].split(' ')[0]
            specie = s[2].split(',')[-1]
            self.uniprot2ncbi[uniprotID] = ncbiID
            self.uniprot2species[uniprotID] = specie
            self.ncbiID2species[ncbiID] = specie

    # Rename every leaf to "<ncbiID>_<uniprotID>" and collect the taxa.
    self.treeTaxa = []
    for leaf in self.tree.get_leaves():
        uniprotID = leaf.name.split('|')[0].split('_')[1]
        ncbiID = self.uniprot2ncbi[uniprotID]
        leaf.name = "%s_%s" % (ncbiID, uniprotID)
        self.treeTaxa.append(int(ncbiID))
    self.NCBITaxonomy = self.ncbi.get_topology(self.treeTaxa,
                                               intermediate_nodes=True)
def read_ancestral_tree(rst_file_name):
    """Parse a tree with node labels and sequences out of an ``rst`` file.

    Three things are read from the file:

    * the first line starting with ``(`` supplies branch lengths;
    * the line following the "tree with node labels for Rod Page's TreeView"
      marker supplies the labelled topology (leaf names lose their numeric
      prefix up to the first underscore);
    * subsequent lines whose first token is a leaf name or ``node`` supply
      the ``sequence`` attribute of the matching node.

    :param rst_file_name: path to the rst file
    :return: ete3 PhyloTree with ``sequence`` set on matched nodes and
        ``dist`` copied from the branch-length tree wherever the leaf sets
        agree
    :raises ValueError: if the file contains no branch-length tree or no
        node-labelled tree
    """
    marker = "tree with node labels for Rod Page's TreeView"
    length_tree = None
    species_tree = None
    species_list = []
    line_set = set()
    expect_labelled_tree = False  # previous line was the marker
    reading_sequences = False     # labelled tree has been parsed

    with open(rst_file_name) as rst_file:
        for line in rst_file:
            if length_tree is None and line.startswith('('):
                length_tree = ete3.Tree(line.strip())

            if expect_labelled_tree:
                species_tree = ete3.PhyloTree(line.strip(), format=8)
                re_root = re.search(r'\)\s+([_\-\.\w]+)\s+;', line)
                if re_root:
                    species_tree.name = re_root.group(1)
                for node in species_tree.traverse():
                    if node.is_leaf():
                        node.name = '_'.join(node.name.split('_')[1:])
                        species_list.append(node.name)
                line_set = set(species_list + ['node'])
                expect_labelled_tree = False
                reading_sequences = True

            if reading_sequences:
                cols = line.split()
                # an empty token list (blank / whitespace-only line) is
                # skipped instead of being indexed
                if cols and cols[0] in line_set:
                    if cols[0] in species_list:
                        (species_tree & cols[0]).sequence = ''.join(cols[1:])
                    else:
                        # "node #N ..." -> node named N
                        (species_tree & cols[1][1:]).sequence = \
                            ''.join(cols[2:])

            if line.startswith(marker):
                expect_labelled_tree = True

    if length_tree is None or species_tree is None:
        raise ValueError(
            "%s does not contain both a branch-length tree and a "
            "node-labelled tree" % rst_file_name)

    # Index branch lengths by leaf set once; later nodes in preorder win,
    # matching a scan that keeps the last node whose leaf set is equal.
    dist_by_leaves = {}
    for length_node in length_tree.traverse('preorder'):
        dist_by_leaves[frozenset(length_node.get_leaf_names())] = \
            length_node.dist
    for node in species_tree.traverse('preorder'):
        leaves = frozenset(node.get_leaf_names())
        if leaves in dist_by_leaves:
            node.dist = dist_by_leaves[leaves]
    return species_tree
def main(inputtree, outbase, div=True, features=None, stem_or_crown="crown",
         byrank='', byage=None, bylist=None, bysize=None):
    """Collapse clades of *inputtree* and write per-clade statistics.

    byrank: when the rank is included in or equal to 'byrank';
    byage: collapse any node of age <= byage;
    bylist: read list of nodes from file;
    bysize: collapse oldest nodes with size < bysize.

    Three files are written, all prefixed with *outbase* plus a suffix
    derived from the collapsing options: a ``.tsv`` table (one row per
    collapsed clade), a ``.subtrees.nwk`` file (one newick subtree per
    clade) and a ``.nwk`` file with the collapsed tree.

    :param inputtree: newick tree readable by ete3 with ``format=1``
    :param outbase: prefix of the output file names
    :param div: also compute diversification rate, gamma statistic and NCBI
        species sampling for each clade
    :param features: optional node features to write and to summarise with
        the function returned by ``def_group_feature_rate``
    :param stem_or_crown: ``'stem'`` adds the clade's own branch to its age
        and total length
    :return: 1 if any output file already exists (nothing is written),
        otherwise None
    """
    group_feature_rate = def_group_feature_rate(stem_or_crown)
    tree = ete3.PhyloTree(inputtree, format=1, quoted_node_names=False)

    outsuffix = '-stem' if stem_or_crown == 'stem' else ''
    if byrank:
        outsuffix += '-%s' % byrank
    if byage:
        outsuffix += '-age%g' % byage
    if bylist:
        outsuffix += '-list' + op.splitext(op.basename(bylist))[0]
    if bysize:
        outsuffix += '-size%d' % bysize

    outnames = {
        'tsv': (outbase + '%s.tsv' % outsuffix),
        'subtrees': (outbase + '%s.subtrees.nwk' % outsuffix),
        'tree': (outbase + '%s.nwk' % outsuffix)
    }
    # never overwrite previous results
    for out in outnames.values():
        if op.exists(out):
            logger.error("%r already exists, quitting", out)
            return 1

    columns = [outsuffix.lstrip('-'), 'size', 'branches', 'age', 'tot_len']
    if div:
        columns.extend(('div_rate', 'gamma', 'ncbi_sp_sampling'))
    if features:
        columns.extend(features)

    if byrank or div:
        logger.info("Loading taxonomy")
        ncbi = ete3.NCBITaxa()
        # Won't return anything for names not found
        name2taxid = ncbi.get_name_translator(
            [node.name.replace('_', ' ') if node.is_leaf() else node.name
             for node in tree.traverse()])
        taxid2name = ncbi.get_taxid_translator(chain(*name2taxid.values()))
    else:
        name2taxid, taxid2name = None, None

    is_leaf_fn = make_is_leaf_fn(byrank, byage, bylist, bysize, name2taxid,
                                 taxid2name)

    with open(outnames['tsv'], 'w') as outtsv, \
            open(outnames['subtrees'], 'w') as outsub:
        outtsv.write('\t'.join(columns) + '\n')
        logger.info("Iterating over found clades")
        for node in tree.iter_leaves(is_leaf_fn):
            outsub.write(
                node.write(features, format=1, format_root_node=True) + '\n')

            # Collapse
            size = len(node)
            branches = len(node.get_descendants())
            _, age = node.get_farthest_leaf()
            tot_len = sum(d.dist for d in node.iter_descendants())
            if stem_or_crown == 'stem':
                age += node.dist
                tot_len += node.dist
            values = [node.name, size, branches, age, tot_len]

            if div:
                div_rate = float(size) / age if age else np.nan
                gamma_stat = div_gamma(node)
                try:
                    nodetaxids = name2taxid[node.name.replace('_', ' ')]
                    if len(nodetaxids) > 1:
                        # several taxa share this name: pick the matching one
                        nodetaxids = [
                            match_duplicate_taxid(nodetaxids, node,
                                                  taxid2name, ncbi)
                        ]
                except KeyError:
                    # This clade isn't in the taxonomy (example:
                    # Atlantogenata): take descendant nodes and join them
                    valid_tax_children = get_valid_tax_children(
                        node, name2taxid)
                    vtc_names = [
                        vtc.name.replace(' ', '_')
                        for vtc in valid_tax_children
                    ]
                    logger.warning(
                        '%r not found in NCBI Taxonomy. Merging '
                        'the node children %s to get the '
                        'descendant counts.', node.name, vtc_names)
                    nodetaxids = []
                    for vtc_n, vtc in zip(vtc_names, valid_tax_children):
                        vtc_taxids = name2taxid[vtc_n]
                        if len(vtc_taxids) == 1:
                            nodetaxids.append(vtc_taxids[0])
                        else:
                            nodetaxids.append(
                                match_duplicate_taxid(vtc_taxids, vtc,
                                                      taxid2name, ncbi))

                ncbi_sp = list(chain(*(
                    ncbi.get_descendant_taxa(nt, rank_limit='species')
                    for nt in nodetaxids)))
                # no known descendant species -> sampling is undefined
                sp_sampling = (float(size) / len(ncbi_sp)
                               if ncbi_sp else np.nan)
                values.extend((div_rate, gamma_stat, sp_sampling))

            if features:
                ft_rates = group_feature_rate(node, features)
                values += ft_rates.tolist()

            outtsv.write('\t'.join(str(v) for v in values) + '\n')

    tree.write(outfile=outnames['tree'], format=1, is_leaf_fn=is_leaf_fn,
               format_root_node=True)
import re
from math import modf
import seaborn as sns
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import ete3


def get_nodes(tree, clade):
    """Return the names of all nodes in the subtree of *tree* named *clade*."""
    return [member.name for member in (tree & clade).traverse()]


tree = ete3.PhyloTree('species_trees/MethHikHalo_f30_19000_clean.tree')

# Internal nodes are named after their (integer) support value so that the
# clade roots below can be addressed by that label.
for n in tree.traverse():
    if n.is_leaf():
        continue
    n.name = str(int(n.support))

# clade name -> label of the internal node that roots it
clade_roots = {
    'Halobacteria': '107',
    'Hikarchaea': '93',
    'Methanomicrobia': '98',
    'Archaeoglobales': '101',
    'Methanocellales': '91',
    'Methanosarcinales': '104',
    'Syntrophoarchaea': '62'
}

# node name -> name of the clade it belongs to
clades = {}
for clade_name, root_label in clade_roots.items():
    for member_name in get_nodes(tree, root_label):
        clades[member_name] = clade_name
else: outgroup = [] # select clustering method valid_methods = ["mcl", "louvain", "lpa", "mclw"] if method in valid_methods: clusters_function_string = "clusters_%s" % method else: print("Error, invalid clustering method \'%s\'!" % method) print("Valid methods are: %s" % valid_methods) sys.exit() # use species tree reconciliation? if spstree is not None: do_sps_reconciliation = True phs = ete3.PhyloTree(spstree) else: do_sps_reconciliation = False ######################### ####### FUNCTIONS ####### ######################### # logging logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s]\t%(message)s", handlers=[logging.StreamHandler()]) def write_tree(phy, out,
# # define species phylogeny # sp = "((((Hsa,Ptr),(Mmu,Mms)),Cfa),Dme);" # #sp = "((((Hsa,Ptr),(Mmu,Mms)),(Cfa,(Lup,(Can,Cam)))),(Dme,Ano));" # phs = ete3.PhyloTree(sp) # phs.set_species_naming_function(lambda node: node.name ) # print(phs) # gene tree phy_fn = "/home/xavi/dades/Anotacions/orthofinder_Ano14sps_noclu_9oct19/output/Trees/OG0000058.iqt.treefile" #phy = ete3.PhyloTree("%s" % (phy_fn)) #phy.set_species_naming_function(lambda node: node.name.split("_")[0] ) # species tree phs_fn = "/home/xavi/Documents/auto-orthology/orthofinder_Ano14sp/tree.newick" phs = ete3.PhyloTree("%s" % (phs_fn)) phs.set_species_naming_function(lambda node: node.name.split("_")[0] ) # define a dictionary of species-two-species relative ages # for each species in the phyolgeny def species_age_dict(phs): # init dict of dicts sps_age_dict = dict() sps_list = phs.get_leaf_names() age_root = 0 for n,i in enumerate(sps_list): # init dict for species i sps_age_dict[i] = dict() for m,j in enumerate(sps_list):
import pandas as pd
import ete3
import sys

# usage: <script> TREEFILE FOCUS_TAXON COLUMN
treefile = sys.argv[1]
focus_taxon = sys.argv[2]
column = sys.argv[3]

df = pd.read_csv("taxonomy_orthofinder_selection.csv", sep='\t')
tree = ete3.PhyloTree(treefile, format=2)

# rows whose COLUMN value mentions the focus taxon get it as their group
in_focus = df[column].apply(lambda value: focus_taxon in value)
df.loc[in_focus, 'group'] = focus_taxon

# prefix every leaf with the group of its species (text before '..')
for l in tree.iter_leaves():
    sp = l.name.split('..')[0]
    first_match = df[df['Name'] == sp].iloc[0]
    group = first_match['group'].replace(' ', '_').replace('"', '')
    l.name = f"{group}..{l.name}"

outfile = treefile.replace('.treefile', f'.{focus_taxon}.treefile')
tree.write(outfile=outfile, format=2)
align = AlignIO.read(child.stdout, "clustal")
print(align)

# convert into PHYLIP format
phylip = "seqs.phy"
with open(phylip, 'w') as out:
    AlignIO.write(align, out, 'phylip')

###
# reconstruct phylogenetic tree
from Bio import Phylo
from Bio.Phylo.Applications import PhymlCommandline, FastTreeCommandline

# the tree builder writes its newick output next to the PHYLIP file
newick_path = phylip + ".nw"

#cmd = PhymlCommandline(input=phylip, datatype='aa')
cmd = FastTreeCommandline(input=phylip, out=newick_path)
out_log, err_log = cmd()
tree = Phylo.read(newick_path, 'newick')  # '_phyml_tree.txt'
Phylo.draw_ascii(tree)

# ete
import ete3

t = ete3.PhyloTree(newick_path)
# root by mid-point
midpoint = t.get_midpoint_outgroup()
t.set_outgroup(midpoint)
print(t)
t.show()
t.render(fn + ".svg")
elif node_type == 2: get_node_order(node, None) elif node_type == 3: get_node_order(node, mypartitions[node.name]) # Now we get the node orders of the tree print(",".join(root.order)) def test_tree(): all_partitions = write_partitions(mytree) generate_node_orders(all_partitions) if __name__ == "__main__": if len(sys.argv) != 2: print("usage: python generate_orders.py mytree") exit(0) scr, mytree_file = sys.argv with open(mytree_file) as f: mytree = ete3.PhyloTree(f.next().strip(), format=1) all_partitions = write_partitions(mytree) generate_node_orders(all_partitions)
def combine_features(data, dsizes, tree, taxid2sp, prot2taxid, taxa_to_merge):
    """ create a ete3 tree with domain features from jackhmmer

    The function works in four passes: (1) collect, per species, the proteins
    grouped by domain architecture together with the motifs to draw for each
    protein; (2) merge the species below every taxon listed in
    ``taxa_to_merge`` into a single entry and detach that taxon's children
    from the tree; (3) rename tree nodes from taxid to species name; (4) hang
    one leaf per domain architecture under each species leaf and attach a
    ``SeqMotifFace`` to it.

    Side effects visible in the body: the ``"QPos"`` entry of every
    ``data[...]`` item is deleted, ``taxid2sp`` is rewritten (sanitised names,
    merged taxa added, merged leaves removed) and ``tree`` is modified in
    place.

    :param data: mapping domain -> {"QPos": (start, stop), protein: {...}};
        each protein entry is read for ``"Tpos"`` (optional) and ``"Hit"``
    :param dsizes: mapping protein -> size, used as the drawn sequence length
    :param tree: ete3 tree whose node names are looked up as taxids
    :param taxid2sp: mapping taxid -> species name
    :param prot2taxid: mapping protein -> taxid
    :param taxa_to_merge: iterable of ``"taxid,name"`` strings
    :return: the same ``tree`` object
    """
    # motif example from http://etetoolkit.org/docs/latest/tutorial/tutorial_drawing.html#phylogenetic-trees-and-sequence-domains
    #simple_motifs = [
    ## seq.start, seq.end, shape, width, height, fgcolor, bgcolor
    #[10, 60, "[]", None, 10, "black", "rgradient:blue", "arial|8|white|long text clipped long text clipped"],
    #[120, 150, "o", None, 10, "blue", "pink", None],
    #[200, 300, "()", None, 10, "blue", "red", "arial|8|white|hello"],
    #]

    # add domain match
    # and count number of sequences by taxid
    # and get size
    #dsize = dict()
    motifs = dict()   # species -> protein -> list of motif descriptions
    dcnt_sp = dict()  # species -> domain architecture -> list of proteins
    for tremolo_dom in data:
        # the query positions are unpacked but not used further in this body
        tremolo_dom_start, tremolo_dom_stop = data[tremolo_dom]["QPos"]
        # removed so that the remaining keys are all proteins
        del data[tremolo_dom]["QPos"]
        for prot in data[tremolo_dom]:
            taxid = prot2taxid[prot]
            sp = taxid2sp[taxid]
            # strip characters that have a meaning in newick strings
            sp = sp.replace("(", "").replace(")", "").replace(",", "").replace(";", "")
            taxid2sp[taxid] = sp
            full_domains = data[tremolo_dom][prot].get("Tpos", list())[:]
            target_domains = filter_domain_arch(full_domains)
            dom_arch = list()
            ordered_domains = sorted(target_domains)
            for start, stop, dom in ordered_domains:
                dom_arch.append(dom)
            # the architecture key is the ';'-joined list of sorted domains
            dom_arch = ";".join(dom_arch)
            if sp not in dcnt_sp:
                dcnt_sp[sp] = dict()
            if dom_arch not in dcnt_sp[sp]:
                dcnt_sp[sp][dom_arch] = list()
            dcnt_sp[sp][dom_arch].append(prot)
            # add domains
            for start, stop, dom in ordered_domains:
                color = get_domain_color(dom)
                motifs.setdefault(sp, dict())\
                    .setdefault(prot, list())\
                    .append([start+1, stop, "[]", None, 10, "black", color, "arial|1|black|{}".format(dom)])
            #motifs[sp][prot].sort()
            # every hit of the query domain is drawn as a red circle motif
            for hitnb in data[tremolo_dom][prot]["Hit"]:
                start, stop = data[tremolo_dom][prot]["Hit"][hitnb]["Tali"]
                motifs.setdefault(sp, dict())\
                    .setdefault(prot, list())\
                    .append([start, stop, "o", None, 10, "black", "red", "arial|1|black|HCAdom {}-{}".format(tremolo_dom, hitnb)])
    #print(motifs)

    # merge taxonomic groups
    for taxid_taxa in taxa_to_merge:
        taxid, taxa = taxid_taxa.split(",")
        lnode = tree.search_nodes(name=taxid)
        if len(lnode) > 0:
            taxnode = lnode[0]
            children = [child.name for child in taxnode.children]
            leaves = list()
            for child in taxnode.traverse():
                if child.is_leaf():
                    leaves.append(child)
                    #print(taxid_taxa, child.name)
            taxid2sp[taxid] = taxa
            dcnt_sp[taxa] = dict()
            motifs[taxa] = dict()
            for leafnode in leaves:
                nodeid = leafnode.name
                nodesp = taxid2sp[nodeid]
                #print(taxid_taxa, nodeid, nodesp)
                # merge protein list
                for dom_arch in dcnt_sp[nodesp]:
                    for prot in dcnt_sp[nodesp][dom_arch]:
                        dcnt_sp[taxa].setdefault(dom_arch, list()).append(prot)
                # merge motif
                for prot in motifs[nodesp]:
                    for m in motifs[nodesp][prot]:
                        motifs[taxa].setdefault(prot, list()).append(m)
                # delete obsoletes species
                del dcnt_sp[nodesp]
                del motifs[nodesp]
                del taxid2sp[nodeid]
            #for dom_arch in dcnt_sp[taxa]:
                #for prot in dcnt_sp[taxa][dom_arch]:
                    #print(prot, dom_arch)
            # the merged taxon becomes a leaf: drop everything below it
            for child in children:
                node = tree.search_nodes(name=child)[0]
                node.detach()
            #print(taxnode.get_ascii(show_internal=True))
        else:
            print("Unable to find taxid {} in tree".format(taxid))
            print(lnode)

    # rename the remaining nodes from taxid to species name
    for taxid in taxid2sp:
        #print(taxid)
        node = tree.search_nodes(name=taxid)
        if node != []:
            node[0].name = taxid2sp[taxid]
        else:
            print("Unable to find node for taxid {}".format(taxid))

    # expand taxonomic tree by the number of sequences in each taxa
    # NOTE(review): children are added to the tree while it is being
    # iterated; the newly added leaves are presumably skipped only because
    # their names are not keys of dcnt_sp - confirm.
    for node in tree:
        if node.is_leaf():
            if node.name in dcnt_sp:
                node_sp = node.name
                proteins = list()
                features = dict()
                for dom_arch in dcnt_sp[node_sp]:
                    sizes_and_proteins = list()
                    for prot in dcnt_sp[node_sp][dom_arch]:
                        new_name = node_sp + " | " + prot.split("|")[1]
                        # "[+N]" tells how many other proteins share this architecture
                        if len(dcnt_sp[node_sp][dom_arch]) > 1:
                            new_name += " [+{}]".format(
                                len(dcnt_sp[node_sp][dom_arch]) - 1)
                        sizes_and_proteins.append(
                            (dsizes[prot], new_name, dom_arch, prot))
                    # the largest protein represents the architecture
                    sizes_and_proteins.sort(reverse=True)
                    sizes, names, dom_archs, prots = zip(*sizes_and_proteins)
                    # ':' separates name and branch length in newick
                    proteins.append(names[0].replace(":", " "))
                    features[names[0].replace(":", " ")] = (prots[0],
                                                            dom_archs[0])
                subtree = ete3.PhyloTree("({});".format(", ".join(proteins)))
                node.add_child(subtree)
                for new_node in subtree:
                    prot, dom_arch = features[new_node.name]
                    # placeholder sequence: only its length matters for drawing
                    seq = "G" * dsizes[prot]
                    m = motifs[node_sp][prot]
                    seqFace = SeqMotifFace(seq, seq_format="line", motifs=m)
                    new_node.add_face(seqFace, 0, "aligned")
    #for dom in domain_color:
        #print(dom, domain_color[dom])
    return tree
def parse_tree_data(args, c):
    """Turn the ASR tree stored in *c* into per-node dictionaries.

    The newick string under ``cft.reconstruction:asr_tree`` is parsed with
    ete3 and every node (postorder) is decorated with its sequences,
    sequence metadata, parent, branch length and distance to
    ``args.inferred_naive_name``.

    :param args: object providing ``inferred_naive_name`` and ``verbose``
    :param c: nested mapping holding the tree, sequence sets and seqmeta
    :return: lazy ``map`` over the nodes yielding one dict per node
    """
    # create a phylo tree object
    asr_tree = c['cft.reconstruction:asr_tree']
    newick_tree = asr_tree['tripl.file:contents']
    newick_tree_path = str(asr_tree['tripl.file:path'])
    tree = ete3.PhyloTree(newick_tree, format=1)

    # parse out sequences and other sequence metadata
    aa_seqs_dict = create_seqs_dict(
        c['cft.reconstruction:cluster_aa']['bio.seq:set'])
    nt_seqs_dict = create_seqs_dict(
        c['cft.reconstruction:asr_seqs']['bio.seq:set'])
    seqmeta_dict = create_seqmeta_dict(
        c['cft.reconstruction:seqmeta']['tripl.csv:data'])

    # Note that this function is impure; it mutates the tree nodes
    def process_node(node):
        node.label = node.id = node.name
        node.nt_seq = nt_seqs_dict[node.name]
        node.aa_seq = aa_seqs_dict[node.name]

        # metadata attribute -> parser (None keeps the raw value)
        attr_parsers = (
            ('cft.seq:multiplicity', int),
            ('cft.seq:timepoint_multiplicities', listofint),
            ('cft.seq:cluster_multiplicity', int),
            ('cft.seq:cluster_timepoint_multiplicities', listofint),
            ('cft.seq:timepoint', None),
            ('cft.seq:timepoints', listof),
            ('cft.seq:cluster_timepoints', listof),
            ('cft.seq:affinity', float),
            ('cft.tree.node:lbi', float),
            ('cft.tree.node:lbr', float),
        )
        seqmeta = seqmeta_dict.get(node.name, {})
        for attr, parser in attr_parsers:
            raw = seqmeta.get(attr)
            if not raw:
                parsed = None
            elif parser:
                parsed = parser(raw)
            else:
                parsed = raw
            # stored under the part of the key after the namespace
            vars(node)[attr.split(':')[1]] = parsed

        node.type = "leaf" if node.is_leaf() else "node"
        if node.up:
            # get parent info, distance for non root
            node.parent = node.up.name
            node.length = node.get_distance(node.up)
            try:
                node.distance = node.get_distance(args.inferred_naive_name)
            except Exception as e:
                if args.verbose:
                    warnings.warn("Unable to compute distance to naive '" +
                                  str(args.inferred_naive_name) +
                                  "' in file " + newick_tree_path)
                    print("newick tree:", newick_tree)
                raise e
        else:
            # node is root
            node.type = "root"
            node.parent = None
            node.length = 0.0
            node.distance = 0.0

        # change this to real list of key value objects for timepoint
        # multiplicities for #56
        timepoint_multiplicities = [
            {'timepoint': t, 'multiplicity': m}
            for t, m in zip(node.timepoints or [],
                            node.timepoint_multiplicities or [])
        ]
        cluster_timepoint_multiplicities = [
            {'timepoint': t, 'multiplicity': m}
            for t, m in zip(node.cluster_timepoints or [],
                            node.cluster_timepoint_multiplicities or [])
        ]
        return {
            'id': node.id,
            'label': node.label,
            'type': node.type,
            'parent': node.parent,
            'length': node.length,
            'distance': node.distance,
            'nt_seq': node.nt_seq,
            'aa_seq': node.aa_seq,
            'affinity': node.affinity,
            'lbi': node.lbi,
            'lbr': node.lbr,
            'timepoint': node.timepoint,
            'multiplicity': node.multiplicity,
            'cluster_multiplicity': node.cluster_multiplicity,
            'timepoint_multiplicities': timepoint_multiplicities,
            'cluster_timepoint_multiplicities':
                cluster_timepoint_multiplicities,
        }

    # map through and process the nodes
    return map(process_node, tree.traverse('postorder'))
def parse_phylo(phy_fn, phy_id, do_root, do_allpairs, clusters_function_string, outgroup=outgroup, do_sps_reconciliation=do_sps_reconciliation):
    """Load a gene tree, optionally root it, and cluster its evolutionary events.

    Besides its parameters the body reads the module-level names
    ``split_ch``, ``itermidroot``, ``min_support_node`` and ``phs``. The
    defaults of ``outgroup`` and ``do_sps_reconciliation`` are taken from the
    module-level variables of the same name when the function is defined.

    :param phy_fn: tree file passed to ``ete3.PhyloTree``
    :param phy_id: label used as a prefix in log and progress messages
    :param do_root: when true, root the tree (iterative midpoint rooting if
        ``itermidroot`` is not None, plain midpoint rooting otherwise)
    :param do_allpairs: forwarded to the event parser
    :param clusters_function_string: name of the clustering function; it is
        resolved with ``eval``
    :param outgroup: forwarded to the event parser
    :param do_sps_reconciliation: use ``parse_events_sps_reconciliation``
        (with the species tree ``phs``) instead of ``parse_events``
    :return: tuple ``(evs, eva, phy, phy_lis, clu)`` - the four values
        returned by the event parser followed by the clustering result
    """
    # load input
    phy = ete3.PhyloTree("%s" % (phy_fn))
    logging.info("%s num nodes = %i" % (phy_id, len(phy)))
    logging.info("%s clustering function is %s" %
                 (phy_id, clusters_function_string))
    # NOTE(review): eval executes arbitrary code; this is only safe while the
    # string is restricted to known function names by the caller - confirm.
    clusters_function = eval(clusters_function_string)
    if do_sps_reconciliation:
        logging.info("%s do species tree reconciliation" % (phy_id))

    # assign species names to tree
    phy.set_species_naming_function(lambda node: node.name.split(split_ch)[0])

    # resolve polytomies (randomly)
    phy.resolve_polytomy(recursive=True)

    # try to find root if unrooted
    if do_root:
        # shall we do it with iterative midpoint rooting?
        if itermidroot is not None:
            niter = itermidroot
            num_evs_per_iter = np.zeros(niter)
            out_nod_per_iter = np.empty(niter, dtype=object)

            # then, iterate to try to find better candidates
            # (work on a copy so the candidate rootings do not alter phy)
            phy_it = phy.copy(method="newick")
            phy_outgroup_it = phy_it.get_midpoint_outgroup()
            phy_it.set_outgroup(phy_outgroup_it)

            # select very short length to shorten every rooting candidate branch (based on in-tree branch distribution)
            dist_lengths = [
                node.dist for node in phy.traverse("postorder")
                if node.dist > 0
            ]
            shrunk_length = np.quantile(dist_lengths, 0.1)

            for roi in range(niter):
                # parse events and re-do clustering
                evs_it, _, _, phy_lis_it = parse_events(
                    phy=phy_it,
                    outgroup=outgroup,
                    do_allpairs=False,
                    min_support_node=min_support_node)
                clu_it = clusters_function(evs=evs_it,
                                           node_list=phy_lis_it,
                                           verbose=False)

                # store number of orthogroups in this particular iteration
                num_evs_per_iter[roi] = len(np.unique(
                    clu_it["cluster"].values))
                out_nod_per_iter[roi] = phy_outgroup_it
                print("%s Iterative midpoint root | %i/%i | n OGs = %i" %
                      (phy_id, roi + 1, niter, num_evs_per_iter[roi]))

                # in subsequent iterations, shrink the previous outgroup branch, and try to find second-best candidate
                phy_outgroup_it.dist = shrunk_length
                phy_outgroup_it = phy_it.get_midpoint_outgroup()
                phy_it.set_outgroup(phy_outgroup_it)

            # select outgroup that minimises number of OGs (more agglomerative)
            phy_outgroup_ix = np.argmin(num_evs_per_iter)

            # outgroup node in iterated tree
            phy_outgroup_from_it = out_nod_per_iter[phy_outgroup_ix]
            phy_outgroup_descendants = [
                t for t in phy_outgroup_from_it.get_leaf_names()
            ]

            # outgroup node in original tree
            phy_outgroup = phy.get_common_ancestor(phy_outgroup_descendants)
            # the candidate is rejected when its leaves do not form a clade
            # of the original tree (the common ancestor holds extra leaves)
            if len(phy_outgroup_descendants) != len(
                    phy_outgroup.get_leaf_names()):
                print(
                    "%s Iterative midpoint root found and impossible root, default to midpoint"
                    % (phy_id))
                phy_outgroup_ix = 0
                phy_outgroup = phy.get_midpoint_outgroup()

            # set outgroup
            # print(phy_outgroup_ix, phy_outgroup)
            logging.info("%s Best root at iteration | %i/%i | n OGs = %i" %
                         (phy_id, phy_outgroup_ix + 1, niter,
                          num_evs_per_iter[phy_outgroup_ix]))

        # ...or shall we do it with simple midpoint rooting?
        else:
            # set outgroup using normal midpoint rooting
            logging.info("%s Midpoint root" % phy_id)
            phy_outgroup = phy.get_midpoint_outgroup()

        # set root
        phy.set_outgroup(phy_outgroup)

    # ignore rooting
    else:
        pass
        logging.info("%s Skip rooting (assume tree is already rooted)" %
                     phy_id)

    # ladderise phylogeny
    phy.ladderize()

    # parse events
    if do_sps_reconciliation:
        evs, eva, phy, phy_lis = parse_events_sps_reconciliation(
            phy=phy, phs=phs, outgroup=outgroup, do_allpairs=do_allpairs)
    else:
        evs, eva, phy, phy_lis = parse_events(
            phy=phy,
            outgroup=outgroup,
            do_allpairs=do_allpairs,
            min_support_node=min_support_node)
    clu = clusters_function(evs=evs, node_list=phy_lis)

    # output from event parsing
    return evs, eva, phy, phy_lis, clu
import ete3


def get_ancestor(nodes):
    """Return the common ancestor of all *nodes*, folding them pairwise."""
    common = nodes[0]
    for member in nodes:
        common = common.get_common_ancestor(member)
    return common


tree = ete3.PhyloTree('astral/astral_tree.main_tree', quoted_node_names=False)
treepp = ete3.PhyloTree('astral/astral_tree_pp.new', quoted_node_names=False)

# Each internal node gets a unique integer as temporary support value; the
# combined "support/posterior" label is substituted for it in the newick text.
counter = 0
c2supp = {}
for n in tree.traverse():
    if n.is_leaf():
        continue
    kids = [leaf.name for leaf in n.get_leaves()]
    pp_leaves = [treepp.get_leaves_by_name(k)[0] for k in kids]
    pp_n = get_ancestor(pp_leaves)
    c2supp[counter] = f"{round(n.support, 1)}/{pp_n.support}"
    n.support = counter
    counter += 1

tree_str = tree.write()
for c, s in c2supp.items():
    placeholder = ")" + str(c) + ":"
    tree_str = tree_str.replace(placeholder, ")'" + s + "':")

with open('astral/astral_tree_combined.new', 'w') as out:
    print(tree_str, file=out)
def parse_tree_data(args, c):
    """Turn the newick tree stored in *c* into a dict of per-node records.

    Every node (postorder) is decorated with its sequences, sequence
    metadata, parent, branch length and distance to
    ``args.inferred_naive_name``. Metadata values a parser rejects with
    ``ValueError`` are stored as None.

    :param args: object providing ``inferred_naive_name`` and ``verbose``
    :param c: nested mapping holding ``"newick"``, sequence sets and seqmeta
    :return: dict mapping node name -> record dict
    """
    # create a phylo tree object
    newick = c["newick"]
    tree = ete3.PhyloTree(newick, format=1)

    # parse out sequences and other sequence metadata
    aa_seqs_dict = create_seqs_dict(
        c["cft.reconstruction:cluster_aa"]["bio.seq:set"])
    dna_seqs_dict = create_seqs_dict(
        c["cft.reconstruction:asr_seqs"]["bio.seq:set"])
    seqmeta_dict = create_seqmeta_dict(
        c["cft.reconstruction:seqmeta"]["tripl.csv:data"])

    # Note that this function is impure; it mutates the tree nodes
    def process_node(node):
        node.dna_seq = dna_seqs_dict[node.name]
        node.aa_seq = aa_seqs_dict[node.name]

        # metadata attribute -> parser (None keeps the raw value)
        attr_parsers = (
            ("cft.seq:multiplicity", int),
            ("cft.seq:timepoint_multiplicities", listofint),
            ("cft.seq:cluster_multiplicity", int),
            ("cft.seq:cluster_timepoint_multiplicities", listofint),
            ("cft.seq:timepoint", None),
            ("cft.seq:timepoints", listof),
            ("cft.seq:cluster_timepoints", listof),
            ("cft.seq:affinity", float),
            ("cft.tree.node:lbi", float),
            ("cft.tree.node:lbr", float),
        )
        seqmeta = seqmeta_dict.get(node.name, {})
        for attr, parser in attr_parsers:
            raw = seqmeta.get(attr)
            try:
                if not raw:
                    value = None
                elif parser:
                    value = parser(raw)
                else:
                    value = raw
            except ValueError:
                value = None
            # stored under the part of the key after the namespace
            vars(node)[attr.split(":")[1]] = value

        node.type = "leaf" if node.is_leaf() else "node"
        if node.up:
            # get parent info, distance for non root
            node.parent = node.up.name
            node.length = node.get_distance(node.up)
            try:
                node.distance = node.get_distance(args.inferred_naive_name)
            except Exception as e:
                if args.verbose:
                    warnings.warn(
                        "Unable to compute distance to naive '{}' in file {}".
                        format(
                            str(args.inferred_naive_name),
                            str(c["cft.reconstruction:asr_tree"]
                                ["tripl.file:path"]),
                        ))
                    print("newick:", newick)
                raise e
        else:
            # node is root
            node.type = "root"
            node.parent = None
            node.length = 0.0
            node.distance = 0.0

        # change this to real list of key value objects for timepoint
        # multiplicities for #56
        timepoint_multiplicities = [
            {"timepoint_id": t, "multiplicity": m}
            for t, m in zip(node.timepoints or [],
                            node.timepoint_multiplicities or [])
        ]
        cluster_timepoint_multiplicities = [
            {"timepoint_id": t, "multiplicity": m}
            for t, m in zip(node.cluster_timepoints or [],
                            node.cluster_timepoint_multiplicities or [])
        ]
        return {
            "id": node.name,
            "type": node.type,
            "parent": node.parent,
            "length": node.length,
            "distance": node.distance,
            "dna_seq": node.dna_seq,
            "aa_seq": node.aa_seq,
            "affinity": node.affinity,
            "lbi": node.lbi,
            "lbr": node.lbr,
            "timepoint_id": node.timepoint,
            "multiplicity": node.multiplicity,
            "cluster_multiplicity": node.cluster_multiplicity,
            "timepoint_multiplicities": timepoint_multiplicities,
            "cluster_timepoint_multiplicities":
                cluster_timepoint_multiplicities,
        }

    # map through and process the nodes
    records = {}
    for n in tree.traverse("postorder"):
        records[n.name] = process_node(n)
    return records
sis_up = sis.up sis.detach() new_node = ete3.PhyloNode() sis_up.add_child(new_node) new_node.add_child(sis) new_node.add_child(anc) topologies = [(['Picozoa'], ['Rhodelphis', 'Rhodophyta']), (['Picozoa'], ['Rhodophyta']), (['Picozoa'], ['Rhodelphis']), (['Picozoa'], ['Viridiplantae', 'Glaucophyta']), (['Picozoa'], ['Glaucophyta']), (['Picozoa'], ['Viridiplantae']), (['Picozoa'], ['Archaeplastida']), (['Picozoa'], ['Telonemia']), (['Picozoa'], ['Telonemia', 'Rhizaria', 'Stramenopila', 'Alveolata']), (['Picozoa'], ['Cryptista']), (['Picozoa', 'Cryptista'], ['Rhodophyta', 'Rhodelphis']), (['Picozoa', 'Cryptista'], ['Rhodophyta']), (['Picozoa', 'Cryptista'], ['Viridiplantae', 'Glaucophyta']), (['Picozoa', 'Cryptista'], ['Glaucophyta']), (['Picozoa', 'Cryptista'], ['Viridiplantae'])] tree = ete3.PhyloTree("orig_topology.new", format=0) with open('all_topologies2test.new', 'w') as out: for c1, c2 in topologies: # print("({}),({})".format(','.join([c for c in c1]), ','.join([c for c in c2]))) make_sisters(tree, c1, c2) print(tree.write(format=9), file=out)
import os


def get_mono_clades(tree, taxon):
    """Return the roots of the maximal clades made only of *taxon* leaves.

    Every leaf whose ``clade`` feature equals *taxon* is walked upwards for
    as long as all leaves below its parent belong to *taxon*. The walk stops
    at the tree root, so a tree consisting only of *taxon* yields the root.

    :return: set of nodes
    """
    nodes = set()
    for seed in tree.get_leaves():
        if seed.clade != taxon:
            continue
        n = seed
        while n.up is not None and all(
                l.clade == taxon for l in n.up.get_leaves()):
            n = n.up
        nodes.add(n)
    return nodes


def get_sister_id(node):
    """Return the set of ``clade`` values found in the sister of *node*.

    Only the first two children of the parent are considered. None is
    returned when *node* has no parent, has no sibling, or is not one of the
    first two children.
    """
    parent = node.up
    if parent is None:
        return None
    children = parent.get_children()
    if len(children) < 2:
        return None
    if children[0] == node:
        return set(l.clade for l in children[1])
    if children[1] == node:
        return set(l.clade for l in children[0])
    return None


with open('sisters_picozoa_sgt.csv', 'w') as out:
    for f in glob.glob("trees_for_fabien_renamed/*.new"):
        clst = os.path.basename(f).replace('.new', '')
        tree = ete3.PhyloTree(f, format=2)
        # the clade of a leaf is the part of its name before the first '_'
        for l in tree.iter_leaves():
            l.add_feature(pr_name='clade',
                          pr_value=l.name.replace("'", "").split('_')[0])
        clades = get_mono_clades(tree, 'Picozoa')
        for i, n in enumerate(clades, 1):
            sister = get_sister_id(n)
            if sister:
                # sorted so that the output does not depend on set ordering
                print("{}\tclade {}\t{}".format(clst, i,
                                                ";".join(sorted(sister))),
                      file=out)
if all([l in prr for l in anc.get_leaves()]): return anc.support else: return 0.0 steps = ["step{}".format(i) for i in range(11)] clades = [ 'Picozoa+Rhodelphis+Rhodophyta', 'Rhodelphis+Rhodophyta', 'Archaeplastida', "Archaeplastida+Cryptista", 'Amorphea', 'TSAR', 'SAR' ] df = pd.DataFrame(index=steps, columns=clades) for f in glob.glob("*.treefile"): step = f.replace('.treefile', '') tree = ete3.PhyloTree(f) df.loc[step, 'Picozoa+Rhodelphis+Rhodophyta'] = check_monophyly_support( tree, ['Picozoa', 'Rhodelphis', 'Rhodophyta']) df.loc[step, 'Rhodelphis+Rhodophyta'] = check_monophyly_support( tree, ['Rhodelphis', 'Rhodophyta']) df.loc[step, 'Archaeplastida'] = check_monophyly_support( tree, ['Picozoa', 'Archaeplastida']) df.loc[step, 'Archaeplastida+Cryptista'] = check_monophyly_support( tree, ['Picozoa', 'Archaeplastida', 'Cryptista']) df.loc[step, 'Amorphea'] = check_monophyly_support( tree, ['Opisthokonta', 'Amoebozoa', 'Apusomonadida', 'Breviatea']) df.loc[step, 'TSAR'] = check_monophyly_support( tree, ['Telonemia', 'Stramenopila', 'Rhizaria', 'Alveolata']) df.loc[step, 'SAR'] = check_monophyly_support( tree, ['Stramenopila', 'Rhizaria', 'Alveolata'])
def load_spTree(fileName):
    """Load a species tree and remove every leaf whose name is not 5 characters long.

    :param fileName: tree file passed to ``ete3.PhyloTree``; species names
        are derived with ``whole_name``
    :return: the pruned ete3 PhyloTree
    """
    spTree = ete3.PhyloTree(fileName, sp_naming_function=whole_name)
    # Take a snapshot of the leaves first: delete() restructures the tree,
    # and removing nodes while a lazy traversal is still running is fragile.
    for leaf in spTree.get_leaves():
        if len(leaf.name) != 5:
            leaf.delete()
    return spTree
def create_tree(
        # Base
        newick=None,
        name=None,
        format=0,
        dist=1.0,
        support=1.0,
        quoted_node_names=False,
        # ClusterTree
        text_array=None,
        fdist=None,
        # PhyloTree
        alignment=None,
        alg_format='fasta',
        sp_naming_function=None,
        # PhyloxmlTree
        phyloxml_clade=None,
        phyloxml_phylogeny=None,
        # Constructor
        node_prefix="y",
        into=ete3.Tree,
        prune=None,
        force_bifuraction=True,
        # Keywords (never mutated here, only unpacked with **)
        tree_kws=dict(),
        bifurcation_kws=dict(recursive=True),
):
    """
    Build a tree of the class given by `into` and post-process it.

    Construction arguments are forwarded per target class:

    * ``ete3.Tree``: `newick`, `format`, `quoted_node_names`
    * ``ete3.ClusterTree``: `newick`, `text_array` (a DataFrame is first
      passed through ``dataframe_to_matrixstring``), `fdist`
    * ``ete3.PhyloTree``: `newick`, `alignment`, `alg_format`,
      `sp_naming_function`, `format`
    * ``ete3.PhyloxmlTree``: `phyloxml_clade`, `phyloxml_phylogeny`
    * ``skbio.TreeNode``: built as ``ete3.Tree`` and converted at the end
      with ``ete_to_skbio``

    `tree_kws` is passed to every constructor as extra keyword arguments.

    After construction the root's `name`, `dist` and `support` are set, the
    tree is pruned to `prune` (if given), polytomies are resolved with
    `bifurcation_kws` when `force_bifuraction` is true and the tree has fewer
    than ``n_leaves - 1`` internal nodes, and nodes are renamed through
    ``name_tree_nodes`` when `node_prefix` is not None.

    Raises ValueError if `into` is none of the classes listed above.

    Next: Convert to NetworkX
    """
    # skbio trees are assembled as plain ete3 trees and converted on return
    convert_to_skbio = into is skbio.TreeNode
    if convert_to_skbio:
        into = ete3.Tree

    # ete3 construction
    if into is ete3.Tree:
        tree = ete3.Tree(newick=newick, format=format,
                         quoted_node_names=quoted_node_names, **tree_kws)
    elif into is ete3.ClusterTree:
        if isinstance(text_array, pd.DataFrame):
            text_array = dataframe_to_matrixstring(text_array)
        tree = ete3.ClusterTree(newick=newick, text_array=text_array,
                                fdist=fdist, **tree_kws)
    elif into is ete3.PhyloTree:
        tree = ete3.PhyloTree(newick=newick, alignment=alignment,
                              alg_format=alg_format,
                              sp_naming_function=sp_naming_function,
                              format=format, **tree_kws)
    elif into is ete3.PhyloxmlTree:
        tree = ete3.PhyloxmlTree(phyloxml_clade=phyloxml_clade,
                                 phyloxml_phylogeny=phyloxml_phylogeny,
                                 **tree_kws)
    else:
        # Fail here with a clear message instead of an unbound `tree` below
        raise ValueError("Unsupported tree type for `into`: %r" % (into,))

    # Set base attributes
    tree.name = name
    tree.dist = dist
    tree.support = support

    # Prune
    if prune is not None:
        tree.prune(prune)

    # Bifurcation
    if force_bifuraction:
        # Count leaves and internal nodes in a single traversal
        n_leaves = 0
        n_internal_nodes = 0
        for node in tree.traverse():
            if node.is_leaf():
                n_leaves += 1
            else:
                n_internal_nodes += 1
        if n_internal_nodes < (n_leaves - 1):
            tree.resolve_polytomy(**bifurcation_kws)

    # Node prefix
    if node_prefix is not None:
        tree = name_tree_nodes(tree, node_prefix=node_prefix)

    if convert_to_skbio:
        return ete_to_skbio(tree, node_prefix=None)
    return tree
# Command-line inputs: tree file and output prefix,
# e.g. "set_raxml.newick" and "set_raxml.out_ete".
phy_fn, out_fn = sys.argv[1], sys.argv[2]

# Log to "<out_fn>.log" (overwritten on each run) and to the console.
log_handlers = [
    logging.FileHandler("%s.log" % out_fn, mode="w"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)-5.5s]\t%(message)s",
    handlers=log_handlers,
)

# Read the phylogeny.
phy = ete3.PhyloTree(phy_fn)
logging.info("Phylogeny = %s", phy_fn)
logging.info("Nodes = %i", len(phy))

# The species of a sequence is the part of its name before the first "_".
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
phy_leaves = phy.get_leaves()
phy_sps = [leaf.species for leaf in phy_leaves]
phy_seq = [leaf.name for leaf in phy_leaves]
phy_sps_set = set(phy_sps)

# The tree is treated as rooted when its root has exactly two children.
phy_root = phy.get_tree_root()
phy_outg = phy_root.get_children()
is_root = len(phy_outg) == 2
if is_root:
    pass
"n_presences": mat_pres.sum(axis=1) }, columns=[ "orthogroup", "presence", "gain", "loss", "n_gains", "n_losses", "n_presences" ]) # output return dat, mat_gain, mat_loss, mat_pres #### MAIN WORK #### # load input tree print("# Load tree from %s" % phs_fn) phs = ete3.PhyloTree("%s" % (phs_fn), format=1) # assign species names to tree phs.set_species_naming_function(lambda node: node.name) # resolve polytomies in a random fashion #phs.resolve_polytomy(recursive=True) # load orthoclusters print("# Load orthoclusters from %s" % ort_fn) ort = pd.read_csv(ort_fn, sep="\t") ort = ort[[gene_col, clus_col]] # obtain species-to-species dictionary of relative ages print("# Species-to-species relative ages from %s" % phs_fn) species_orig_dict, species_ages_dict, sps_list, anc_list = do_species_orig_dict( phs=phs) dat, mat_gain, mat_loss, mat_pres = do_ancestral_reconstruction(
# Command-line input: the tree file, e.g. "set_raxml.newick".
phy_fn = sys.argv[1]
# Output prefix is the input path up to its first "." plus ".out_ete",
# e.g. "set_raxml.out_ete".
# NOTE(review): everything after the first "." is dropped, including any
# directory component that contains a dot -- confirm this is intended.
out_fn = "%s.out_ete" % phy_fn.partition(".")[0]

# Log to "<out_fn>.log" (overwritten on each run) and to the console.
log_handlers = [
    logging.FileHandler("%s.log" % out_fn, mode="w"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)-5.5s]\t%(message)s",
    handlers=log_handlers,
)

# Read the phylogeny.
phy = ete3.PhyloTree(str(phy_fn))
logging.info("Phylogeny = %s", phy_fn)
logging.info("Nodes = %i", len(phy))

# The species of a sequence is the part of its name before the first "_".
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
phy_leaves = phy.get_leaves()
phy_sps = [leaf.species for leaf in phy_leaves]
phy_seq = [leaf.name for leaf in phy_leaves]
phy_sps_set = set(phy_sps)

# The tree is treated as rooted when its root has exactly two children.
phy_root = phy.get_tree_root()
phy_outg = phy_root.get_children()
is_root = len(phy_outg) == 2
if is_root:
    pass
def parse_phylo(phy_fn, phy_id, is_root):
    """Load a gene tree and list the gene pairs separated by speciations.

    Relies on two module-level names defined elsewhere in this file:
    ``split_ch`` (separator between the species prefix and the rest of a
    leaf name) and ``sos`` (passed as ``sos_thr`` to
    ``get_descendant_evol_events``).

    :param phy_fn: tree file handed to ``ete3.PhyloTree``
    :param phy_id: label used to prefix the log messages
    :param is_root: if false, the tree is re-rooted at its midpoint outgroup.
        Rootedness is not detected here; the caller decides.
    :return: tuple ``(evs, evd, phy, phy_lis)``:
        ``evs`` is a DataFrame with columns ``in_gene``, ``out_gene``,
        ``branch_support``, ``ev_type`` and ``sos``, one row per
        (in_seq, out_seq) pair of every event of type "S";
        ``evd`` is an unfilled ``len(events)**2 x 5`` object array
        (duplication pairs are currently not extracted);
        ``phy`` is the processed tree and ``phy_lis`` its leaf names.
    """
    # load input
    phy = ete3.PhyloTree(str(phy_fn))
    logging.info("%s num nodes = %i", phy_id, len(phy))

    # assign species names to tree
    phy.set_species_naming_function(lambda node: node.name.split(split_ch)[0])

    # resolve polytomies in a random fashion
    phy.resolve_polytomy(recursive=True)

    # rooting is driven by the caller-supplied flag
    if is_root:
        logging.info("%s Tree is rooted, pass", phy_id)
    else:
        logging.info("%s Tree is unrooted, apply midpoint root", phy_id)
        phy.set_outgroup(phy.get_midpoint_outgroup())

    # ladderise phylogeny
    phy.ladderize()

    # list of genes in phylogeny
    phy_lis = phy.get_leaf_names()

    # find evolutionary events (duplications and speciations)
    evev = phy.get_descendant_evol_events(sos_thr=sos)

    # speciation events: collect the rows directly instead of staging them
    # in a NaN-filled len(evev)**2 scratch array
    ev_columns = ["in_gene", "out_gene", "branch_support", "ev_type", "sos"]
    rows = []
    for ev in evev:
        if ev.etype != "S":
            continue
        for ii in ev.in_seqs:
            for oi in ev.out_seqs:
                rows.append((ii, oi, ev.branch_supports[0], ev.etype, ev.sos))
    # object dtype and dropna() keep the frame identical to what the
    # array-based version produced (rows holding a missing value are dropped)
    evs = pd.DataFrame(rows, columns=ev_columns, dtype="object").dropna()

    # duplications: not extracted; the unfilled array is kept because it is
    # part of the returned tuple
    evd = np.empty((len(evev) * len(evev), 5), dtype="object")

    return evs, evd, phy, phy_lis