#euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate print sys.argv[1] + "\tSingleton" #euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree #print len(eukaryote_seqs) else: try: answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name") if answer[0] == True: ca = tree.get_common_ancestor(eukaryote_seqs) print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) elif answer[0] == False: mono_groups = [] target_group = '' for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"): if target_leaf in node: target_group = node else: mono_groups.append(node) size_target_group = len(target_group) #get distance shortest_distance = 999999999999999.0 closest_other_group = '' for subtree in mono_groups: curr_distance = tree.get_distance(target_group, subtree, topology_only=True) if curr_distance < shortest_distance: shortest_distance = curr_distance closest_other_group = subtree #attempt to calculate distance on a version of the tree in which branches below some support threshold have been deleted # closest_leaves = []
#read the ML tree, set up the taxonomy stuff, and calculate the number of clades per label, and the sizes of those clades (to report at the end) ml_tree = Tree(sys.argv[1]) for leaf in ml_tree: taxonomy = parse_taxonomy(leaf.name) name_to_tax_info[leaf.name] = taxonomy taxa_names.append(leaf.name) leaf.add_feature("tax", taxonomy[target_label]) labels[taxonomy[target_label]] = 1 groups = labels.keys() #compute the number of clades per label in the ML tree, and their sizes ML_groups = defaultdict( list ) #the list is the size of each clade, len(list) is the number of clades for that label in the ML tree for label in groups: for node in ml_tree.get_monophyletic(values=[label], target_attr="tax"): size_clade = 0 for leaf in node: size_clade += 1 ML_groups[label].append(size_clade) treeNum = -1 tree_sample_handle = open(sys.argv[2]) for line in tree_sample_handle: treeNum += 1 tree = Tree(line.rstrip()) for leaf in tree: tax = name_to_tax_info[ leaf.name] #this should set up taxonomy correctly... leaf.add_feature( "tax", tax[target_label]
def extract_subtrees(tree, ali, target_species, ref_species, treedir, outali, olore, oaore,
                     species_groups, restrict_sp=None):
    """
    Build the AORe and LORe constrained topologies for every subtree of a gene tree.

    Each clade made only of `target_species` genes is copied, optionally reduced to
    `restrict_sp`, and turned into two alternative topologies. When both could be built
    and they differ, both trees (nhx) and the matching sub-alignment (fasta) are written,
    named after the outgroup gene of the subtree.

    Args:
        tree (str): tree file in nhx format for the considered gene family
        ali (str): alignment fasta file for the considered gene family
        target_species (list of str): duplicated+outgroup species
        ref_species (list of str): outgroup(s) species
        treedir (str): directory with SCORPiOs constrained gene tree topologies
        outali (str): output directory for the alignment
        olore (str): output directory for the lore topology (should exist)
        oaore (str): output directory for the aore topology (should exist)
        species_groups (list of str): groups of species for the LORe topology
        restrict_sp (list of str, optional): restrict the set of duplicated species
                                             to this set
    """
    tree = Tree(tree)

    # Tag "duplicated" for historical reasons: the tagged set is outgroup + duplicated
    # species, and the clades fetched below contain only genes of those species.
    tag_duplicated_species(tree.get_leaves(), target_species)

    for clade in tree.get_monophyletic(values=["Y"], target_attr="duplicated"):
        subtree_copy = clade.copy()

        if restrict_sp:
            kept_leaves = [leaf for leaf in subtree_copy.get_leaves() if leaf.S in restrict_sp]
            # Too few genes left after restriction: nothing worth comparing.
            if len(kept_leaves) <= 3:
                continue
            subtree_copy.prune(kept_leaves, preserve_branch_length=True)

        if not check_copy_number(subtree_copy, ref_species):
            continue

        # --- Constrained AORe topology ---
        gene_list = {leaf.name: leaf.S for leaf in subtree_copy.get_leaves()}
        genes_with_ctree = [
            gene for gene in gene_list if os.path.isfile(treedir + '/C_' + gene + ".nh")
        ]

        if len(genes_with_ctree) > 1:
            # More than one SCORPiOs constrained tree matches this subtree: skip it.
            continue

        if genes_with_ctree:
            outgr_gene = genes_with_ctree[0]
            treefile = treedir + '/C_' + outgr_gene + ".nh"
            ctree_aore = get_scorpios_aore_tree(gene_list, treefile, ref_species, outgr_gene)
        else:
            dup_sp = set(target_species).difference(ref_species)
            ctree_aore, outgr_gene = check_aore_consistent_tree(subtree_copy, ref_species,
                                                                dup_sp)

        # --- Constrained LORe topology ---
        ctree_lore, _ = make_tree_from_groups(subtree_copy.get_leaves(), species_groups)

        # Both topologies must exist, cover the same genes, and actually differ.
        if ctree_aore is None or ctree_lore is None:
            continue

        assert {i.name for i in ctree_lore.get_leaves()} ==\
               {i.name for i in ctree_aore.get_leaves()}, f"{ctree_aore}, {ctree_lore}"

        aore_vs_lore = ctree_aore.compare(ctree_lore)
        lore_vs_aore = ctree_lore.compare(ctree_aore)
        shared_edges = max(aore_vs_lore['source_edges_in_ref'],
                           lore_vs_aore['source_edges_in_ref'])
        if shared_edges == 1:
            # Identical topologies: nothing to test, nothing written.
            continue

        ctree_lore.write(outfile=olore + '/' + outgr_gene + '.nh', format=9, features=["D"])
        ctree_aore.write(outfile=oaore + '/' + outgr_gene + '.nh', format=9, features=["D"])

        gene_names = [leaf.name for leaf in subtree_copy.get_leaves()]
        sub_alignment = ut.get_subali(ali, gene_names)
        ut.write_fasta(sub_alignment, outali + '/' + outgr_gene + '.fa')
# Rebuild the tree from its own serialisation (newick format 1 with the features listed
# in `all_features`, root node included).
# NOTE(review): presumably done to obtain a fresh copy carrying only those features --
# confirm against the surrounding code, which is not visible here.
t = Tree(t.write(format=1, features=all_features, format_root_node=True))

for wgd in wgds_dict:

    leaves = t.get_leaves()

    # A single-leaf tree has no internal node to inspect.
    if len(leaves) == 1:
        continue

    # Find all monophyletic teleost groups: tag the leaves belonging to the species
    # listed for this WGD...
    tag_duplicated_species(leaves, wgds_dict[wgd])

    # ...then fetch all clades of teleost genes. By definition, corrected subtrees
    # will only contain duplicated species.
    subtrees = t.get_monophyletic(values=["Y"], target_attr="duplicated")

    for subtree in subtrees:

        if subtree.is_leaf():
            continue

        # If there are corrected leaves on each side of the node, the node itself is
        # a corrected node.
        # The two-name unpacking raises ValueError if the node does not have exactly
        # two children.
        child1, child2 = subtree.get_children()

        # Correction tags whose text mentions the current WGD.
        tags_wgd = [i for i in COR_TAGS_ALL if wgd in i]

        ok_child1 = corr_tag_below_node(child1, tags_wgd)
def _line_style(fgcolor="#000000", size=0, bgcolor=None):
    """Return the NodeStyle (black 4px branch lines) shared by all drawings of the tree."""
    node_style = NodeStyle()
    if bgcolor is not None:
        node_style["bgcolor"] = bgcolor
    node_style["fgcolor"] = fgcolor
    node_style["size"] = size
    node_style["vt_line_color"] = "#000000"
    node_style["hz_line_color"] = "#000000"
    node_style["vt_line_width"] = 4
    node_style["hz_line_width"] = 4
    # 0 solid, 1 dashed, 2 dotted
    # NOTE(review): 8 is outside that documented range; kept as in the original.
    node_style["vt_line_type"] = 8
    node_style["hz_line_type"] = 8
    return node_style


def _load_domain_colours(adres):
    """Map every domain name found in the domain tables to an entry of dic_domain_pic_pic.

    One table is read from for_pic/3_domain/ per file name listed in for_pic/1_tree_nwk/
    (sorted). Entries are handed out in order of first appearance and reused cyclically.
    """
    domain_all_legend = {}
    table_names = os.listdir(adres + "/for_pic/1_tree_nwk/")
    # The macOS metadata file is only present on some machines; an unconditional
    # remove() raised ValueError everywhere else.
    if ".DS_Store" in table_names:
        table_names.remove(".DS_Store")
    table_names.sort()

    next_colour = 0
    for table_name in table_names:
        with open(adres + "/for_pic/3_domain/" + table_name, 'r') as table:
            for line in table:
                fields = line.split("\t")
                # Rows without a third column carry no domain name.
                if len(fields) < 3 or fields[2] in domain_all_legend:
                    continue
                try:
                    domain_all_legend[fields[2]] = dic_domain_pic_pic[next_colour]
                except (IndexError, KeyError):
                    # Best effort, as before: a domain that cannot get a colour is skipped.
                    continue
                next_colour += 1
                # Wrap around BEFORE running off the end. The previous ">" test never
                # fired, so once the palette was used up every new domain was dropped.
                if next_colour >= len(dic_domain_pic_pic):
                    next_colour = 0
    return domain_all_legend


def _paint_monophyletic_clades(tt, report):
    """Give each single-colour clade a coloured background and list its leaves in `report`."""
    for clad_index, _clad in enumerate(all_clad):
        collor = collor_list[clad_index]
        for clade in tt.get_monophyletic(values=[collor], target_attr="color"):
            members = [leaf for leaf in clade]
            # NOTE(review): the original indentation was lost; the listing below is
            # assumed to belong to the "more than one leaf" case together with the
            # styling -- confirm against the author's copy.
            if len(members) > 1:
                tt.get_common_ancestor(members).set_style(_line_style(bgcolor=collor))
                for leaf in members:
                    report.write(str(leaf.name) + "\t" + " - " + collor + "\n")
                report.write("\n")


def get_example_tree(File):
    """Load, root, colour and annotate the gene tree named `File`, and return it.

    All paths are relative to the current working directory. The function reads
    for_pic/1_tree_nwk/<File> (newick), for_pic/2_MSA/<File> (fasta),
    for_pic/3_domain/<File> (tab-separated: name, ?, domain, start, end),
    domain_legend.txt and for_pic/4_color_node/out_list_gene2.txt. It writes
    out_spliment/<File>, node/<File> and 123123123.txt, and renders
    out_legend_all.png and out_legend/<File minus its last 4 characters>.png.

    It relies on module-level settings defined elsewhere in the file:
    dic_domain_pic_pic, ancestor_grop, all_clad, collor_list, seq_seq, layout,
    save_node and dell_node.

    Args:
        File (str): file name shared by the tree, alignment and domain table.

    Returns:
        The ete3 tree with styles and sequence/domain faces attached, after the
        nodes not selected by save_node/dell_node have been deleted.
    """
    adres = os.getcwd()

    with open(adres + "/out_spliment/" + File, 'w') as file_out_supliment:
        with open(adres + "/node/" + File, 'w') as node_file:
            # Read all domains of all trees so that colours are consistent between figures.
            domain_all_legend = _load_domain_colours(adres)

            with open(adres + "/for_pic/1_tree_nwk/" + File, 'r') as newick_file:
                tt = Tree(newick_file.read(), format=0)

            # One traversal reaches every node, so a single pass both styles the
            # branches and records the node names.
            style = _line_style()
            for node in tt.traverse("levelorder"):
                node.img_style = style
                if len(node.name) > 1:
                    node_file.write(node.name + "\n")

        # Look for the root: the first node (postorder) whose name starts with the
        # first matching prefix of ancestor_grop, ignoring names starting with "PPE".
        ancestor1 = ""
        found = False
        for prefix in ancestor_grop:
            for node in tt.traverse("postorder"):
                node_name = str(node.name)
                if node_name.startswith(prefix) and not node_name.startswith("PPE"):
                    ancestor1 = node_name
                    found = True
                    break
            if found:
                break

        if ancestor1 != "":
            tt.set_outgroup(ancestor1)
            print(str(ancestor1)+" - предок")
            file_out_supliment.write(str(ancestor1)+"\t"+" - предполагаемый корень"+"\n")
        else:
            print("Не нашел предка")

        file_out_supliment.write("\n\n\n Выявленные клады\n")

        # Colour the leaves according to the clade their name prefix belongs to.
        for leaf in tt:
            leaf_name = str(leaf.name)
            for clad_index, clad in enumerate(all_clad):
                collor = collor_list[clad_index]
                for prefix in clad:
                    if leaf_name.startswith(prefix):
                        leaf.add_features(color=collor)

        # Highlight the clades made of a single colour.
        _paint_monophyletic_clades(tt, file_out_supliment)

        file_out_supliment.write("\n\n\n Легенда доменного состава\n")

        # Protein sequences and domain table are read ONCE (they used to be re-opened
        # and re-parsed for every node of the tree, leaking one handle each time).
        first_seq_by_id = {}
        with open(adres + "/for_pic/2_MSA/" + File) as fasta_handle:
            for record in SeqIO.parse(fasta_handle, "fasta"):
                first_seq_by_id.setdefault(str(record.id), str(record.seq))
        with open(adres + "/for_pic/3_domain/" + File, 'r') as domain_table:
            domain_rows = [line.split("\t") for line in domain_table]

        # Attach the domain layout to every node that has one.
        dic_seq = {}
        dic_domain = {}
        list_legend_domain3 = []  # domains of this tree, in order of first appearance
        for node in tt.traverse("postorder"):
            node_name = str(node.name)
            if node_name in first_seq_by_id:
                dic_seq.setdefault(node_name, first_seq_by_id[node_name])

            motifs = []
            for row in domain_rows:
                if row[0] != node_name:
                    continue
                domain = row[2]
                first_time = domain not in list_legend_domain3
                if first_time:
                    list_legend_domain3.append(domain)
                motifs.append([int(row[3]), int(row[4]), "()", None, 15, "black",
                               domain_all_legend[domain], "arial|9|black|"+domain])
                dic_domain.setdefault(node_name, motifs)
                if first_time:
                    file_out_supliment.write(domain+"\t"+domain_all_legend[domain]+"\n")

        for leaf_name in dic_domain:
            try:
                seqFace = SeqMotifFace(seq=dic_seq[leaf_name], motifs=dic_domain[leaf_name],
                                       seq_format="line")
                (tt & leaf_name).add_face(seqFace, 0, "aligned")
            except Exception:
                # Fall back to the bare sequence when the motif face cannot be built.
                seqFace = SeqMotifFace(seq=dic_seq[leaf_name], seq_format="line",
                                       gapcolor="red")
                (tt & leaf_name).add_face(seqFace, 0, "aligned")
                print("except")

        # Draw the legends.
        circular_style = TreeStyle()
        circular_style.show_leaf_name = False
        circular_style.show_branch_length = True
        circular_style.show_branch_support = True
        circular_style.scale = 75
        circular_style.tree_width = 50

        # Report the rows of the domain table that have no domain column.
        for row in domain_rows:
            if len(row) < 3:
                print("не понял что это за домен")

        # Read the domain legend: domain id -> label made safe for use as a newick leaf name.
        label_table = str.maketrans({" ": "_", "(": "_", ")": "_", ",": "_",
                                     ":": "_", ".": "_", "\n": None})
        file_domain_legend2 = {}
        with open(adres + "/domain_legend.txt", 'r') as legend_file:
            for line in legend_file:
                fields = line.split("\t")
                file_domain_legend2.setdefault(fields[0], fields[1].translate(label_table))

        # Legend with every domain of the legend file.
        tree_domen_all = Tree("(" + ",".join(file_domain_legend2.values()) + ");")
        for domain, legend_leaf in file_domain_legend2.items():
            if domain not in domain_all_legend:
                continue  # no colour was assigned to this domain: nothing to draw
            try:
                seqFace = SeqMotifFace(
                    seq=seq_seq,
                    motifs=[[10, 90, "()", None, 15, "black", domain_all_legend[domain],
                             "arial|9|black|"+domain]],
                    seq_format="line")
            except Exception:
                continue
            try:
                (tree_domen_all & legend_leaf).add_face(seqFace, 0, "aligned")
            except Exception:
                print("не нашел узел")
        circular_style.layout_fn = layout
        tree_domen_all.render(adres+"/out_legend_all.png", tree_style=circular_style)

        # Legend restricted to the domains present in this tree.
        with open(adres + "/123123123.txt", 'w') as file_domain_out:
            tree_domen = Tree(
                "(" + ",".join(file_domain_legend2[d] for d in list_legend_domain3) + ");")
            for domain in list_legend_domain3:
                file_domain_out.write(domain+"\n")
                legend_motif = [10, 90, "()", None, 15, "black", domain_all_legend[domain],
                                "arial|9|black|"+domain]
                try:
                    seqFace = SeqMotifFace(seq=seq_seq, motifs=[legend_motif],
                                           seq_format="line")
                    (tree_domen & file_domain_legend2[domain]).add_face(seqFace, 0, "aligned")
                except Exception:
                    # Best effort, as before: a legend entry that cannot be drawn is skipped.
                    pass
        circular_style.layout_fn = layout
        tree_domen.render(adres+"/out_legend/"+File[:-4]+".png", tree_style=circular_style)

        # Remove part of the nodes. Statement order is kept exactly as in the original:
        # nodes are deleted while the tree is being traversed.
        for node in tt.traverse("postorder"):
            try:
                seqFace = SeqMotifFace(seq=dic_seq[str(node.name)],
                                       motifs=dic_domain[str(node.name)], seq_format="line")
                (tt & node.name).add_face(seqFace, 0, "aligned")
                # Short names and save_node prefixes keep the node; a dell_node prefix
                # overrides both.
                keep = len(node.name) < 2
                for element_save in save_node:
                    if node.name.startswith(element_save):
                        keep = True
                for element_dell in dell_node:
                    if node.name.startswith(element_dell):
                        keep = False
                if not keep:
                    node.delete()
            except Exception:
                # Named node without sequence and/or domains: draw what exists, then drop it.
                if len(node.name) > 0:
                    seqFace = SeqMotifFace(seq=dic_seq[str(node.name)], seq_format="line",
                                           gapcolor="red")
                    (tt & node.name).add_face(seqFace, 0, "aligned")
                    node.delete()

        # Special points: nodes listed in out_list_gene2.txt get a red marker.
        with open(adres + "/for_pic/4_color_node/out_list_gene2.txt", 'r') as file_node_color:
            node_color = {line.replace("\n", "") for line in file_node_color}
        for node in tt.traverse("postorder"):
            if node.name in node_color:
                node.set_style(_line_style(fgcolor="Red", size=9))

        # Second highlighting pass, on the pruned tree. As before it also appends the
        # clade listing to the supplement file: the earlier `file_out_supliment.close`
        # was never actually called, so these writes have always taken place.
        _paint_monophyletic_clades(tt, file_out_supliment)

    return tt
def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes, out):
    """
    Scans every tree of a gene tree forest for orthologs between the outgroup and each
    clade of duplicated-species genes.

    Clades without any phylogenetic ortholog are written to `out`, together with the
    candidate outgroup genes. High-confidence orthologs and paralogs are written to two
    companion files, later used to optimize the synteny support threshold to call
    orthology.

    Args:
        forest (str): Name of the gene trees forest file
        duplicated_sp (list of str): List of all duplicated species for the considered WGD
        outgroup (str): Non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): All gene positions for each species
        out (str): Output file to write genes without phylogenetic orthologs

    Returns:
        dict: Orthologs of outgroup genes in each duplicated species

    Note (FIXME):
        Written to work within scorpios as orthologs and paralogs file names are derived
        from output file patterns, assuming it contains an '_'.
    """

    def _gene_entries(genes, species):
        """Format positioned genes as 'name_species|chromosome|index' strings."""
        species_tag = species.replace(' ', '.')
        return [g.name + '_' + species_tag + '|' + str(g.chromosome) + '|' + str(g.index)
                for g in genes]

    ortho = {e: {} for e in duplicated_sp}

    # Companion file names: the text before the first '_' of the file name is swapped
    # (everywhere it occurs in `out`) for "orthologs" / "paralogs".
    name_prefix = out.split("/")[-1].split('_')[0]
    orthofile = out.replace(name_prefix, "orthologs")
    parafile = out.replace(name_prefix, "paralogs")

    with open(out, 'w') as outfile, open(forest, 'r') as infile, open(parafile, 'w') as out_para,\
         open(orthofile, 'w') as out_ortho:

        sys.stderr.write("Browsing gene trees for orthologies with the outgroup...\n")

        for newick in ut.read_multiple_objects(infile):

            tree = Tree(newick.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = list(tree.get_leaves())

            # Tag genes of duplicated species, then fetch the clades made only of them.
            tag_duplicated_species(leaves, duplicated_sp)
            subtrees = tree.get_monophyletic(values=["Y"], target_attr="duplicated")

            outgroup_genes = [leaf for leaf in leaves if leaf.S == outgroup]

            for subtree in subtrees:

                subtree_leaves = subtree.get_leaves()

                # (topological distance, branch distance, outgroup leaf) per candidate.
                candidates = []

                # First choice: outgroup genes separated from the clade by a speciation
                # (or dubious duplication) node.
                for outgr_leaf in outgroup_genes:
                    lca = tree.get_common_ancestor(subtree, outgr_leaf)
                    if org.is_speciation(lca):
                        candidates.append((len(node2leaves[lca]),
                                           tree.get_distance(subtree, outgr_leaf),
                                           outgr_leaf))

                # Otherwise accept outgroup genes whose common ancestor with the clade
                # only holds outgroup and duplicated-species genes.
                if not candidates:
                    for outgr_leaf in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, outgr_leaf)
                        if all(gene.duplicated == "Y" or gene.S == outgroup
                               for gene in lca.get_leaves()):
                            candidates.append((len(node2leaves[lca]),
                                               tree.get_distance(subtree, outgr_leaf),
                                               outgr_leaf))

                if candidates:
                    # Closest candidate: smallest topological, then branch, distance
                    # (first one wins on ties).
                    closest = min(candidates, key=lambda cand: (cand[0], cand[1]))
                    outgroup_gene = closest[2].name

                    content = []
                    for species in duplicated_sp:
                        names = [leaf.name for leaf in subtree_leaves if leaf.S == species]
                        genes = get_genes_positions(names, species, dict_genes)
                        ortho[species].setdefault(outgroup_gene, []).extend(genes)
                        content += _gene_entries(genes, species)

                    # Any outgroup gene of the tree that is not a candidate ortholog
                    # is a paralog; one of them is drawn at random.
                    ortholog_names = {cand[2].name for cand in candidates}
                    paralogs = [leaf.name for leaf in outgroup_genes
                                if leaf.name not in ortholog_names]

                    if paralogs:
                        paralog = random.choice(paralogs)
                        outgr_genome = dict_genes[outgroup]

                        if paralog in outgr_genome and outgroup_gene in outgr_genome:
                            family = ' '.join(content) + '\t'

                            ortholog_pos = outgr_genome[outgroup_gene]
                            out_ortho.write(family)
                            out_ortho.write('|'.join([str(outgroup_gene),
                                                      str(ortholog_pos.chromosome),
                                                      str(ortholog_pos.index),
                                                      str(0), str(0)]) + '\n')

                            paralog_pos = outgr_genome[paralog]
                            out_para.write(family)
                            out_para.write('|'.join([str(paralog),
                                                     str(paralog_pos.chromosome),
                                                     str(paralog_pos.index),
                                                     str(0), str(0)]) + '\n')

                # No ortholog found: write the orphan genes along with all outgroup
                # genes of the tree (potential candidates for orthology).
                elif any(leaf.name in dict_genes[outgroup] for leaf in outgroup_genes):

                    missed_genes = []
                    for species in duplicated_sp:
                        names = [leaf.name for leaf in subtree_leaves if leaf.S == species]
                        genes = get_genes_positions(names, species, dict_genes)
                        missed_genes += _gene_entries(genes, species)

                    if not missed_genes:
                        continue

                    outfile.write(' '.join(missed_genes)+'\t')

                    outgr_names = [leaf.name for leaf in outgroup_genes]

                    # Outgroup genes that are direct neighbours in the tree.
                    in_paralogs = [
                        first + '|' + second
                        for first, second in itertools.combinations(outgr_names, 2)
                        if tree.get_distance(first, second, topology_only=True) == 1
                    ]

                    genome = dict_genes[outgroup]
                    outgr_write = []
                    for gene in outgr_names:
                        if gene not in genome:
                            continue
                        lca = tree.get_common_ancestor(subtree, gene)
                        branch_distance = tree.get_distance(subtree, gene)
                        topo_distance = len(node2leaves[lca])
                        outgr_write.append('|'.join([str(gene),
                                                     str(genome[gene].chromosome),
                                                     str(genome[gene].index),
                                                     str(topo_distance),
                                                     str(branch_distance)]))

                    outfile.write(' '.join(outgr_write)+'\t'+' '.join(in_paralogs)+'\n')

    sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")
    return ortho
from ete3 import Tree


def _first_node_named(tree, name):
    """Return the first node of `tree` called `name`, or None when there is none."""
    matches = tree.search_nodes(name=name)
    return matches[0] if matches else None


t = Tree("((((((4, e), i), o),h), u), ((3, 4), (i, june)));")

# We annotate the tree using external data: each leaf gets the colour registered
# for its name, or "none" when the name is not in the table.
colors = {"a": "red", "e": "green", "i": "yellow",
          "o": "black", "u": "purple", "4": "green",
          "3": "yellow", "1": "white", "5": "red",
          "june": "yellow"}
for leaf in t:
    leaf.add_features(color=colors.get(leaf.name, "none"))
print(t.get_ascii(attributes=["name", "color"], show_internal=False))

print("Green-yellow clusters:")
# And obtain clusters exclusively green and yellow
for node in t.get_monophyletic(values=["green", "yellow"], target_attr="color"):
    print(node.get_ascii(attributes=["color", "name"], show_internal=False))

#%%
# Finding and saving nodes by their names.
# The tree above has no node called "C", "H" or "I" (its leaves are lowercase), so the
# former `t&"C"` lookups always raised TreeError and stopped the script. The lookup
# below yields None for a missing name instead.
# NOTE(review): if the lowercase leaves "h" / "i" were meant, change the names here.
C = _first_node_named(t, "C")
H = _first_node_named(t, "H")
I = _first_node_named(t, "I")
#%%