def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Label the ancestors of a timetree using the NCBI taxonomy.

    The tree is loaded from `timetreefile`, every leaf is matched to an NCBI
    taxid by name (underscores in leaf labels are read as spaces), and leaves
    whose name is unknown to NCBI are removed from the tree. Internal nodes
    are then annotated and renamed, and the result is printed to stdout.

    Parameters:
        timetreefile: newick tree, read with ete3 `format=1`.
        to_table: when true, print one "oldname<TAB>sci_name" line per
            internal node instead of the newick string.
        ete3_algo: when true, annotate with `NCBITaxa.annotate_tree`;
            otherwise with the file's own `myannotate`.
        uniq: forwarded unchanged to `matchrename_ncbitax`.
    """
    logger.info('Loading data')
    # NOTE: `quoted_node_names` is only supported from ete3 v3.1.1 onwards.
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    # One batched name -> taxid query for all leaves.
    query_names = [label.replace('_', ' ')
                   for label in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(query_names)

    for tip in timetree.get_leaves():
        try:
            taxid = name2taxid[tip.name.replace('_', ' ')][0]
        except KeyError:
            # Unknown species: drop the leaf but keep the tree dichotomic
            # and the remaining branch lengths consistent.
            logger.warning('Species %r not found', tip.name)
            tip.delete(prevent_nondicotomic=True, preserve_branch_length=True)
        else:
            tip.add_feature('taxid', taxid)

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            # `oldname` is presumably set by matchrename_ncbitax -- verify.
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
def main():
    """Count gene copies per species for every gene tree in a file.

    Command line:
        --genetree      file with one newick/NHX gene tree per line.
        --speciesorder  comma-separated species list; it fixes the column
                        order and the priority used to pick the reference
                        species of each tree.

    For each tree, leaf names must be of the form ``gene_species``. A row is
    built holding the number of leaves per species plus, under the key
    ``'gene'``, the gene of the first leaf belonging to the first species of
    ``--speciesorder`` that is present in the tree. All rows are handed to
    ``printTSV`` with columns ``gene`` followed by the species list.

    Raises:
        Exception: if a leaf is not in ``gene_species`` format, or if none of
            the requested species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True,
                        help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True,
                        help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]

    table = []
    with open(args.genetree, "r") as f:
        # One gene tree per line.
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but
            # break ete3.
            line = line.replace('[&&NHX]', '')
            # Blank lines (e.g. a trailing empty line) are not trees; passing
            # them to PhyloTree would abort the whole run.
            if not line.strip():
                continue

            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()
            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" % leaf)

            species_counter = collections.Counter(
                parts[1] for parts in leaves_parts)

            # Reference species: the first entry of species_list that occurs
            # in this tree.
            for ref_species in species_list:
                if ref_species in species_counter:
                    break
            else:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'"
                    % line)

            # The row is labelled with the gene of the first leaf of the
            # reference species.
            for leaf_parts in leaves_parts:
                if leaf_parts[1] == ref_species:
                    species_counter['gene'] = leaf_parts[0]
                    break

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree into two and update the to-do file.

    The tree in `tree_file_name` is midpoint-rooted, the user selects an
    optional algae clade, an optional outlier group and one monophyletic
    family, and `yesMake` is called twice: once with the leaves outside the
    selected family and once with the remaining leaves (algae and outlier
    names are appended to both lists). In the file named by ``sys.argv[2]``
    the entry `gene` is replaced by the two names returned by `yesMake`.

    Parameters:
        tree_file_name: tree file loaded with `PhyloTree`.
        gene: name of the gene being split; removed from the to-do file.
        algae_choice: the user's earlier answer; anything whose first element
            is "y" means an algae clade must be defined. An empty answer
            counts as "no".

    NOTE(review): `clade_to_tree` is assumed to return a list of leaf-name
    strings and `yesMake` the name of the gene it created -- confirm.
    """
    t = PhyloTree(tree_file_name)
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)
    gene_names = t.get_leaf_names()

    # `answer and answer[0]` instead of a bare `answer[0]`: pressing Enter
    # without typing anything must mean "no", not raise IndexError.
    if algae_choice and algae_choice[0] == "y":
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_choice = raw_input(
        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
    )
    if outlier_choice and outlier_choice[0] == "y":
        print(
            "\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade"
        )
        outlier_list = clade_to_tree(t)
        other_copies = raw_input(
            "If there are other genes in the outlier group, enter them here, separated by a space, or else enter n."
        )
        if other_copies != "n":
            # split() without an argument ignores repeated/trailing blanks,
            # so no empty gene names end up in the list.
            outlier_list = outlier_list + other_copies.split()
    else:
        outlier_list = []

    print(
        "\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed."
    )
    group_list = clade_to_tree(t)

    # First list: everything outside the selected family.
    in_group = set(group_list)
    cut_list = [name for name in gene_names if name not in in_group]
    cut_list = cut_list + algae_list + outlier_list
    gene1 = yesMake(cut_list, gene, tree_file_name)

    # Second list: the leaves not already in the first list.
    in_first_cut = set(cut_list)
    cut_list1 = [name for name in gene_names if name not in in_first_cut]
    cut_list1 = cut_list1 + algae_list + outlier_list
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace `gene` by its two halves in the to-do file.
    todo_path = sys.argv[2]
    with open(todo_path, "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [entry for entry in todo_list if entry != gene]
    todo_list.append(gene1)
    todo_list.append(gene2)
    with open(todo_path, "w") as todo:
        for entry in todo_list:
            todo.write(entry + "\n")
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree into two and update the to-do file.

    The tree in `tree_file_name` is midpoint-rooted, the user selects an
    optional algae clade, an optional outlier group and one monophyletic
    family, and `yesMake` is called twice: once with the leaves outside the
    selected family and once with the remaining leaves (algae and outlier
    names are appended to both lists). In the file named by ``sys.argv[2]``
    the entry `gene` is replaced by the two names returned by `yesMake`.

    Parameters:
        tree_file_name: tree file loaded with `PhyloTree`.
        gene: name of the gene being split; removed from the to-do file.
        algae_choice: the user's earlier answer; anything whose first element
            is "y" means an algae clade must be defined. An empty answer
            counts as "no".

    NOTE(review): `clade_to_tree` is assumed to return a list of leaf-name
    strings and `yesMake` the name of the gene it created -- confirm.
    """
    t = PhyloTree(tree_file_name)
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)
    gene_names = t.get_leaf_names()

    # Guard against an empty answer: `""[0]` would raise IndexError.
    if algae_choice and algae_choice[0] == "y":
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
    if outlier_choice and outlier_choice[0] == "y":
        print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
        outlier_list = clade_to_tree(t)
        other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
        if other_copies != "n":
            # Whitespace-tolerant split: repeated blanks must not produce
            # empty gene names.
            outlier_list = outlier_list + other_copies.split()
    else:
        outlier_list = []

    print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
    group_list = clade_to_tree(t)

    # tree1: everything outside the selected family.
    selected = set(group_list)
    cut_list = [name for name in gene_names if name not in selected]
    cut_list = cut_list + algae_list + outlier_list
    gene1 = yesMake(cut_list, gene, tree_file_name)

    # tree2: the leaves that were not part of the first list.
    already_cut = set(cut_list)
    cut_list1 = [name for name in gene_names if name not in already_cut]
    cut_list1 = cut_list1 + algae_list + outlier_list
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Swap `gene` for its two halves in the to-do file.
    todo_path = sys.argv[2]
    with open(todo_path, "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [entry for entry in todo_list if entry != gene]
    todo_list.extend([gene1, gene2])
    with open(todo_path, "w") as todo:
        for entry in todo_list:
            todo.write(entry + "\n")
def main():
    """Classify pairwise homologies between the leaves of one gene tree.

    Command line:
        --genetree    gene tree file in NHX format (mandatory).
        --out_format  'tabular' (default) or 'csv'.
        --filters     comma-separated homology types.

    Leaves without an underscore are ignored; the others are expected to be
    ``gene_species`` (optionally prefixed by ``something:``). Every pair of
    leaves is classified as 'paralogs' (same species) or as
    '<one|many>-to-<one|many>' depending on whether each species occurs once
    or several times in the tree.

    Output:
        tabular: one "gene1<TAB>gene2<TAB>type" line for each pair whose type
            is listed in --filters.
        csv: the comma-joined leaf names, printed only when every homology
            type that has at least one pair is listed in --filters.
    """
    usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--out_format', type='string', default='tabular',
                      help='Choose output format')
    parser.add_option('--filters', default='', help='Filter families')
    options, args = parser.parse_args()
    if options.genetree is None:
        parser.error(
            "--genetree option must be specified, GeneTree in nhx format")

    with open(options.genetree, 'r') as handle:
        contents = handle.read()
    # Empty NHX tags can be emitted by TreeBest and break the ete3 parser.
    contents = contents.replace('[&&NHX]', '')
    genetree = PhyloTree(contents)

    # Only leaves that can be in gene_species format are considered.
    leaves_list = [name for name in genetree.get_leaf_names() if '_' in name]

    # species -> "one" on first sighting, "many" from the second one on.
    species_dict = {}
    for name in leaves_list:
        species = name.split("_")[1]
        species_dict[species] = "many" if species in species_dict else "one"

    homologies = {
        'one-to-one': [],
        'one-to-many': [],
        'many-to-one': [],
        'many-to-many': [],
        'paralogs': []
    }

    # Identifier of each leaf with any "prefix:" removed.
    ids = [name.split(":")[1] if ":" in name else name
           for name in leaves_list]

    # Visit every unordered pair of leaves exactly once.
    for position, id1 in enumerate(ids):
        for id2 in ids[position + 1:]:
            species1 = id1.split("_")[1]
            species2 = id2.split("_")[1]
            if species1 != species2:
                homology_type = (species_dict[species1] + "-to-" +
                                 species_dict[species2])
            else:
                homology_type = 'paralogs'
            homologies[homology_type].append((id1, id2))

    options.filters = options.filters.split(",")
    wanted = options.filters

    if options.out_format == 'tabular':
        for homology_type, homologs_list in homologies.items():
            if homology_type not in wanted:
                continue
            for gene1, gene2 in homologs_list:
                print("%s\t%s\t%s" % (gene1, gene2, homology_type))
    elif options.out_format == 'csv':
        # The family is printed only if no populated homology type falls
        # outside the filter list.
        print_family = all(
            (not homologs_list) or (homology_type in wanted)
            for homology_type, homologs_list in homologies.items())
        if print_family:
            print(','.join(leaves_list))
def pre_prune(gene):
    """Interactively split the tree of `gene` into monophyletic gene copies.

    The tree ``<gene>/<gene>.3.fa.tre`` is copied to a new directory
    ``<gene>_all100``. Each copy in the work list is then shown to the user,
    who may repeatedly split off one monophyletic group (optionally together
    with an algae clade and an outlier group). Every split writes the
    selected group to a new ``<gene>_all<N>`` directory, overwrites the
    current copy with the remaining leaves, and queues the new copy so that
    it is offered for splitting too. Finally all copy names are appended to
    the file named by ``sys.argv[1]``.

    Parameters:
        gene: gene name; also the directory and file stem of the input tree.

    NOTE(review): `clade_to_tree` is assumed to return a list of leaf-name
    strings and `view_rooted_tree` to only display the tree -- confirm.
    NOTE(review): the algae/outlier clades are picked on `full_tree`, which is
    loaded once per copy and not refreshed after a split -- confirm this is
    intended.
    """
    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, str(m))
    # Create the directory through `os` instead of a shell string so that
    # unusual characters in the gene name cannot be interpreted by a shell.
    if not os.path.isdir(start_gene):
        os.makedirs(start_gene)
    full_tree.write(format=1,
                    outfile="{}/{}.3.fa.tre".format(start_gene, start_gene))
    m = m + 1

    # Work list; it grows while being iterated, so copies created by a split
    # are visited later in this same loop.
    l = [start_gene]
    for item in l:
        item_tree_file = "{}/{}.3.fa.tre".format(item, item)
        full_tree = PhyloTree(item_tree_file)
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        # `answer and answer[0]`: an empty answer means "no" instead of
        # raising IndexError.
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        if c and c[0] == "y":
            algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
            outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")

        while c and c[0] == "y":
            # Current state of this copy, midpoint-rooted.
            tree1 = PhyloTree(item_tree_file)
            R = tree1.get_midpoint_outgroup()
            tree1.set_outgroup(R)
            known_genes = set(tree1.get_leaf_names())

            if algae_choice and algae_choice[0] == "y":
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []

            if outlier_choice and outlier_choice[0] == "y":
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
                while out_choice and out_choice[0] == "y":
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
                other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
                while other_choice and other_choice[0] == "y":
                    other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
                    other_list = other_copies.split()
                    # Validate here: an unknown name would otherwise only be
                    # detected by prune(), after the split has started.
                    if all(name in known_genes for name in other_list):
                        outlier_list = outlier_list + other_list
                        other_choice = raw_input("Are there more genes to enter? (y/n)")
                    else:
                        other_choice = raw_input("\nAt least one gene is not found on the tree. Reenter genes? y/n")
            else:
                outlier_list = []

            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()

            if len(group_list) == len(gene_names):
                # Nothing would be left over: either accept the copy as it is
                # or let the user pick another group.
                c1 = raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
                if c1 == "y":
                    c = "n"
                else:
                    print("\nGroup crosses root. Unable to make group.\nChoose new group.")
                    c = "y"
            else:
                in_group = set(group_list)
                cut_list = [name for name in gene_names
                            if name not in in_group]
                cut_list = cut_list + algae_list + outlier_list

                # Register the new copy only now that it is really created;
                # queueing it earlier left non-existent or duplicated entries
                # in the work list.
                b = "{}_all{}".format(gene, str(m))
                l.append(b)
                if not os.path.isdir(b):
                    os.makedirs(b)

                tree2 = PhyloTree(item_tree_file)
                R = tree2.get_midpoint_outgroup()
                tree2.set_outgroup(R)
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b, b))

                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=item_tree_file)
                m = m + 1

                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if c and c[0] == "y":
                    algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
                    outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")

    with open(sys.argv[1], "a") as p:
        for name in l:
            p.write(name + "\n")
EventSummary.append("event(%i,duplication)" %(n.S)) else: n.Ev = "S" logger.debug("name: %s",n.name) logger.debug("S: %s",n.S) logger.debug("Ev: %s",n.Ev) logger.debug("ND: %s",n.ND) i+=1 EventsFile = OutPrefixName + ".events.txt" with open(EventsFile,"w") as File: File.write("\n".join(EventSummary)+"\n") recon_tree.prune(genetree.get_leaf_names(),preserve_branch_length=True) i=0 node_2events_and_sp = {} for n in recon_tree.traverse("postorder"): n.ND = i node_2events_and_sp[n.ND]={"S": n.S, "Ev":n.Ev} i+=1 logger.debug(node_2events_and_sp) i=0 for n in genetree.traverse("postorder"): n.ND=i n.S=node_2events_and_sp[n.ND]["S"]
def pre_prune(gene):
    """Interactively split the tree of `gene` into monophyletic gene copies.

    The tree ``<gene>/<gene>.3.fa.tre`` is copied to a new directory
    ``<gene>_all100``. Each copy in the work list is then shown to the user,
    who may repeatedly split off one monophyletic group (optionally together
    with an algae clade and an outlier group). Every split writes the
    selected group to a new ``<gene>_all<N>`` directory, overwrites the
    current copy with the remaining leaves, and queues the new copy so that
    it is offered for splitting too. Finally all copy names are appended to
    the file named by ``sys.argv[1]``.

    Parameters:
        gene: gene name; also the directory and file stem of the input tree.

    NOTE(review): `clade_to_tree` is assumed to return a list of leaf-name
    strings and `view_rooted_tree` to only display the tree -- confirm.
    NOTE(review): the algae/outlier clades are picked on `full_tree`, which is
    loaded once per copy and not refreshed after a split -- confirm this is
    intended.
    """
    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, str(m))
    # Directory creation goes through `os` rather than a shell command line,
    # so the gene name is never parsed by a shell.
    if not os.path.isdir(start_gene):
        os.makedirs(start_gene)
    full_tree.write(format=1,
                    outfile="{}/{}.3.fa.tre".format(start_gene, start_gene))
    m = m + 1

    # Work list; copies appended during the loop are processed by it as well.
    l = [start_gene]
    for item in l:
        item_tree_file = "{}/{}.3.fa.tre".format(item, item)
        full_tree = PhyloTree(item_tree_file)
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        # Empty answers are treated as "no" (a bare `c[0]` would raise).
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        if c and c[0] == "y":
            algae_choice = raw_input(
                "\nIs there an algae group that is sister to all shown families? (y/n)"
            )
            outlier_choice = raw_input(
                "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
            )

        while c and c[0] == "y":
            # Current state of this copy, midpoint-rooted.
            tree1 = PhyloTree(item_tree_file)
            R = tree1.get_midpoint_outgroup()
            tree1.set_outgroup(R)
            known_genes = set(tree1.get_leaf_names())

            if algae_choice and algae_choice[0] == "y":
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []

            if outlier_choice and outlier_choice[0] == "y":
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input(
                    "\nIs there a monophyletic clade in the outlier group? (y/n)"
                )
                while out_choice and out_choice[0] == "y":
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input(
                        "\nIs there another monopyletic clade to add to the outlier group? (y/n)"
                    )
                other_choice = raw_input(
                    "Are there additional genes in the outlier group? (y/n)")
                while other_choice and other_choice[0] == "y":
                    other_copies = raw_input(
                        "\nEnter genes to include, separated by a space. Enter only up to ten genes at a time."
                    )
                    other_list = other_copies.split()
                    # Check the names now; prune() would reject unknown ones
                    # only after the split has begun.
                    if all(name in known_genes for name in other_list):
                        outlier_list = outlier_list + other_list
                        other_choice = raw_input(
                            "Are there more genes to enter? (y/n)")
                    else:
                        other_choice = raw_input(
                            "\nAt least one gene is not found on the tree. Reenter genes? y/n"
                        )
            else:
                outlier_list = []

            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()

            if len(group_list) == len(gene_names):
                # The selection covers the whole tree: keep the copy as it is
                # or ask for another group.
                c1 = raw_input(
                    "\nList includes all copies on tree.\nMake gene with all copies? (y/n)"
                )
                if c1 == "y":
                    c = "n"
                else:
                    print(
                        "\nGroup crosses root. Unable to make group.\nChoose new group."
                    )
                    c = "y"
            else:
                in_group = set(group_list)
                cut_list = [name for name in gene_names
                            if name not in in_group]
                cut_list = cut_list + algae_list + outlier_list

                # Queue the new copy only when it is actually written, so the
                # work list never holds missing or repeated entries.
                b = "{}_all{}".format(gene, str(m))
                l.append(b)
                if not os.path.isdir(b):
                    os.makedirs(b)

                tree2 = PhyloTree(item_tree_file)
                R = tree2.get_midpoint_outgroup()
                tree2.set_outgroup(R)
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b, b))

                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=item_tree_file)
                m = m + 1

                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if c and c[0] == "y":
                    algae_choice = raw_input(
                        "\nIs there an algae group that is sister to all shown families? (y/n)"
                    )
                    outlier_choice = raw_input(
                        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
                    )

    with open(sys.argv[1], "a") as p:
        for name in l:
            p.write(name + "\n")
def process_tree(treepath):
    '''
    Processes a tree to extract orthology relationships between the target
    taxid and the rest of species, organized by orthology type and species
    code.

    The tree is midpoint-rooted and standardized; if that fails and
    args.pairs_table is set, a tree with more than one leaf is re-rooted on
    its first leaf instead, otherwise an empty result is returned. Every
    speciation event in which one side contains the target taxid produces
    one tab-separated line per species found on the other side.

    Reads the module-level names args, ncbi, conversion, target_taxid,
    collapse and get_species.

    Returns:
        event_lines (list of str) when args.pairs_table is false, otherwise
        the tuple (event_lines, all_ortholgs_pairs), where the second item
        lists every (source name, ortholog name) pair.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        # `Exception` rather than a bare except, so that Ctrl-C and
        # SystemExit are not swallowed.
        if len(t) == 1:
            sys.stderr.write(treefile + 'len(t) == 1' + '\n')
            return ([], []) if args.pairs_table else []
        sys.stderr.write(treefile + 'len(t) != 1' + '\n')
        if not args.pairs_table:
            return []
        # Fallback rooting: use the first leaf as outgroup.
        t.set_outgroup(t.get_leaf_names()[0])

    # Leaf names start with "<taxid>."; map each taxid to its scientific
    # name, or to '' when the lookup fails.
    names = {}
    for leaf in t:
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            sci_name = ncbi.get_taxid_translator([sp])
            names[sp] = sci_name[int(sp)]
        except Exception:
            names[sp] = ''

        if args.conv_table:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                # No usable conversion entry: keep the original name.
                good_name = leaf.name
            leaf.good_name = good_name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        # True when every leaf under _node belongs to the target species.
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    # Collapse each target-only subtree into a single pseudo-leaf named after
    # the leaves it contained.
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                n.taxid = target_taxid
                n.name = "{%s}" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                if args.conv_table:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in node2content[n]]))

    def _label(_leaf):
        # Name used in the output: converted name if a table was given.
        return _leaf.good_name if args.conv_table else _leaf.name

    all_ortholgs_pairs = []
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]
        if target_taxid:
            sp_1 = set(leaf.taxid for leaf in source_seqs)
            sp_2 = set(leaf.taxid for leaf in ortho_seqs)
            if str(target_taxid) in sp_1:
                pass  # the target is already on the source side
            elif str(target_taxid) in sp_2:
                source_seqs, ortho_seqs = ortho_seqs, source_seqs
            else:
                # The target species is not involved in this event.
                continue

        co_orthologs = sorted(_label(leaf) for leaf in source_seqs)

        # Ortholog names on the other side, grouped by species code.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            sp = str(leaf.name.split('.')[0])
            orthologs[sp].add(_label(leaf))

        if len(source_seqs) == 1:
            _otype = "one-to-"
        else:
            _otype = "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile, names[sp], '\n'
            ]))

        if args.pairs_table:
            source_seqs_names = [_label(leaf)
                                 for node in source_seqs for leaf in node]
            ortho_seqs_names = [_label(leaf)
                                for node in ortho_seqs for leaf in node]
            all_ortholgs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if args.pairs_table:
        return (event_lines, all_ortholgs_pairs)
    return event_lines