def generateCoalescentTrees(choice, num, fout, length):
    """Simulate contained-coalescent gene trees and write them as Newick.

    Args:
        choice: Species-tree selector. Only 1 and 2 are recognised, and in
            this version both select the same eight-taxon template.
        num: Number of gene trees to simulate.
        fout: Path of the Newick file that receives the gene trees.
        length: Number substituted into every ``%d`` slot of the template
            (``%d`` truncates it to an integer).

    Raises:
        ValueError: If ``choice`` is neither 1 nor 2. Previously this
            surfaced later as an unbound ``sp_tree_str``.
    """
    if choice not in (1, 2):
        raise ValueError("choice must be 1 or 2, got %r" % (choice,))

    # NOTE(review): the template opens eight parentheses but closes seven,
    # and the numbers follow ")" without a ":" so a Newick reader would take
    # them as node labels rather than branch lengths -- confirm the intent.
    sp_tree_str = (
        "[&R] ((((((((A,B)%d,C)%d,D)%d,E)%d,F)%d,G)%d,H)%d);\n"
        % ((float(length),) * 7)
    )

    sp_tree = dendropy.Tree.get_from_string(sp_tree_str, "newick")
    gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
        containing_taxon_namespace=sp_tree.taxon_namespace,
        num_contained=1)

    gene_tree_list = TreeList()
    for _ in range(num):
        # One simulation per requested tree; the second call whose result
        # was discarded has been dropped.
        gene_tree = treesim.contained_coalescent_tree(
            containing_tree=sp_tree,
            gene_to_containing_taxon_map=gene_to_species_map)
        for leaf in gene_tree.leaf_nodes():
            # Keep only the first whitespace-separated token of the label.
            leaf.taxon.label = leaf.taxon.label.split()[0]
        gene_tree_list.append(gene_tree)
    gene_tree_list.write_to_path(fout, 'newick')
def generateCoalescentTrees(choice, num, fout, length):
    """Simulate contained-coalescent gene trees and write them as Newick.

    Args:
        choice: 1 selects the eight-taxon caterpillar species tree,
            2 the eight-taxon balanced species tree.
        num: Number of gene trees to simulate.
        fout: Path of the Newick file that receives the gene trees.
        length: Base branch length; each branch of the species tree is an
            integer multiple of ``float(length)``.

    Raises:
        ValueError: If ``choice`` is neither 1 nor 2. Previously this
            surfaced later as an unbound ``sp_tree_str``.
    """
    unit = float(length)
    # Each multiplier scales ``unit`` for the matching %f slot, left to right.
    if choice == 1:
        template = ("((((((((A:%f,B:%f):%f,C:%f):%f,D:%f):%f,E:%f):%f,"
                    "F:%f):%f,G:%f):%f,H:%f):%f);")
        multipliers = (1, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7)
    elif choice == 2:
        template = ("(((A:%f,B:%f):%f,(C:%f,D:%f):%f):%f,"
                    "((E:%f,F:%f):%f,(G:%f,H:%f):%f):%f);")
        multipliers = (1, 1, 1, 1, 1, 2, 4, 1, 1, 2, 1, 1, 1, 4)
    else:
        raise ValueError("choice must be 1 or 2, got %r" % (choice,))
    sp_tree_str = template % tuple(m * unit for m in multipliers)

    sp_tree = dendropy.Tree.get_from_string(sp_tree_str, "newick")
    gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
        containing_taxon_namespace=sp_tree.taxon_namespace,
        num_contained=1)

    gene_tree_list = TreeList()
    for _ in range(num):
        # One simulation per requested tree; the second call whose result
        # was discarded has been dropped.
        gene_tree = dendropy.simulate.treesim.contained_coalescent_tree(
            containing_tree=sp_tree,
            gene_to_containing_taxon_map=gene_to_species_map)
        for leaf in gene_tree.leaf_nodes():
            # Keep only the first whitespace-separated token of the label.
            leaf.taxon.label = leaf.taxon.label.split()[0]
        gene_tree_list.append(gene_tree)
    gene_tree_list.write_to_path(fout, 'newick')
def main():
    """Report whether two trees read from NEXUS files have the same topology.

    The paths come from ``sys.argv[1]`` and ``sys.argv[2]``. Both files are
    read into one shared tree list and the first two trees of that list are
    compared with ``treecompare.symmetric_difference``; the verdict is
    printed to standard output.
    """
    treefile1 = sys.argv[1]
    treefile2 = sys.argv[2]
    treelist = TreeList()
    for path in (treefile1, treefile2):
        # Close each handle as soon as it has been parsed.
        with open(path, 'rU') as handle:
            treelist.read(file=handle, schema="nexus")
    if treecompare.symmetric_difference(treelist[0], treelist[1]) == 0:
        print("trees are identical")
    else:
        print("trees are NOT identical")
def readTree(filename, quiet=False):
    """Read the trees of one Newick file into a new ``TreeList``.

    Args:
        filename: Path of the Newick file to read.
        quiet: When false, a short progress banner is printed first.

    Returns:
        The ``TreeList`` populated from ``filename``.

    If the file cannot be opened or parsed, an error message naming the
    file is printed and the process terminates via ``sys.exit()``.
    """
    if not quiet:
        print()
        print("Reading in files...")
        print()
    temp = TreeList()
    try:
        with open(filename, 'r') as handle:
            temp.read(file=handle, schema="newick", preserve_underscores=True)
    except Exception:
        # The message used to format the undefined name ``f``, which turned
        # every read error into a NameError.
        print("Error with file '{}': please only use files with newick tree format".format(filename))
        sys.exit()
    return temp
def get_bs_trees(self, bin_name):
    """Load the bootstrap trees stored for one supergene bin.

    The file read is ``<self.path>/supergenes/<bin_name>/RAxML_bootstrap.bootstrap``
    (Newick, underscores preserved). ``tree_upper`` is applied to the first
    tree only before the whole list is returned.
    """
    bootstrap_file = os.path.join(
        self.path, 'supergenes', bin_name, 'RAxML_bootstrap.bootstrap')
    bootstrap_trees = TreeList.get(
        path=bootstrap_file,
        preserve_underscores=True,
        schema='newick')
    tree_upper(bootstrap_trees[0])
    return bootstrap_trees
def create_tree(filepath='bird_phylogenic_tree.nex', num_trees=1):
    """Build one leaf-keyed map for each of the first ``num_trees`` trees.

    Args:
        filepath: NEXUS file holding the trees.
        num_trees: How many trees to process, taken from the start of the
            file; ``-1`` means all of them.

    Returns:
        A list with one dict per processed tree. Each dict is keyed by
        ``convert_name(str(leaf.taxon))`` and holds the value returned by
        ``create_inner_map(leaf, other_leaves)``, where ``other_leaves`` are
        the remaining leaves below the leaf's parent node.
    """
    treelist = TreeList.get(path=filepath, schema="nexus")
    if num_trees == -1:
        num_trees = len(treelist)

    maps = []
    for index in range(num_trees):
        tree = treelist[index]
        outer_map = {}
        # Internal nodes only, visited in descending age order.
        for node in tree.ageorder_node_iter(include_leaves=False, descending=True):
            for child_node in node.child_node_iter():
                if not child_node.is_leaf():
                    continue
                # Every leaf under this parent except the child itself.
                other_leaves = node.leaf_nodes()
                other_leaves.remove(child_node)
                child_name = convert_name(str(child_node.taxon))
                outer_map[child_name] = create_inner_map(child_node, other_leaves)
        maps.append(outer_map)
    return maps
def readTrees(filenames, namespace, quiet=False):
    """Read each Newick file into its own ``TreeList``.

    Args:
        filenames: Iterable of paths to Newick files.
        namespace: Currently unused; each list gets its own namespace.
        quiet: When false, a short progress banner is printed first.

    Returns:
        A list of ``TreeList`` objects, one per entry of ``filenames``, in
        the same order.

    If a file cannot be opened or parsed, an error message naming it is
    printed and the process terminates via ``sys.exit()``.
    """
    if not quiet:
        print()
        print("Reading in files...")
        print()
    sample_tree_list = []
    for f in filenames:
        temp = TreeList()
        try:
            # Close each input as soon as it has been parsed.
            with open(f, 'r') as handle:
                temp.read(file=handle, schema="newick", preserve_underscores=True)
        except Exception:
            print(
                "Error with file '{}': please only use files with newick tree format"
                .format(f))
            sys.exit()
        sample_tree_list.append(temp)
    return sample_tree_list
def _prune_and_keep(gene_trees, gtree, onodes, nodes):
    """Prune ``nodes`` from ``gtree`` and keep it if more than three leaves remain."""
    if len(nodes) < len(onodes) - 3:
        # Tree with missing leaves
        gtree.prune_taxa(nodes, update_bipartitions=False, suppress_unifurcations=True)
        gene_trees.append(gtree)
    # Otherwise the whole tree counts as missing (three leaves or fewer
    # would be left, which is not an unrooted tree) and it is dropped.


def main(folder=None, seed=None):
    """Remove leaves at random from the gene trees of one folder.

    Reads every ``g_trees*.trees`` file under ``args.sd/folder``, removes
    leaves according to ``args.mk`` ("random" or "byindividual") and writes
    the surviving trees to ``args.sd/folder/args.o`` in Newick format.

    Args:
        folder: Sub-directory of ``args.sd`` holding the gene-tree files.
        seed: Seed for the ``numpy.random.RandomState`` that drives removal.
    """
    # Format inside the call: ``print(...) % (...)`` only works while print
    # is a statement and fails with a TypeError once it is a function.
    print("Folder %s, seed %s" % (folder, seed))
    r = numpy.random.RandomState(seed)
    gene_trees = TreeList()
    taxa = dendropy.TaxonNamespace()
    treefiles = glob.glob(args.sd + "/" + folder + "/g_trees*.trees")
    tree_yielder = Tree.yield_from_files(
        files=treefiles, schema="newick", rooting="default-rooted",
        preserve_underscores=True, taxon_namespace=taxa)

    # Modify gene trees
    if args.mk == "random":
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            nodes = remove_taxa_prov(r, onodes, args.pr)
            _prune_and_keep(gene_trees, gtree, onodes, nodes)
    elif args.mk == "byindividual":
        tagProbs = None
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            if not tagProbs:
                # Draw one probability per leaf label from the first tree
                # that has leaves; later trees reuse the same table.
                tagProbs = {}
                probs = truncated_normal(
                    r, n=len(onodes), mean=args.pr, sd=args.ist,
                    min=args.itmin, max=args.itmax)
                for leafi, leaf in enumerate(onodes):
                    tagProbs[leaf.taxon.label] = probs[leafi]
            nodes = remove_taxa_tagprobs(r, onodes, tagProbs)
            _prune_and_keep(gene_trees, gtree, onodes, nodes)
    else:
        print("Yet unsupported option")

    # Write gene trees
    gene_trees.write(path=args.sd + "/" + folder + "/" + args.o, schema="newick")
#! /usr/bin/env python
# Convert a Newick tree file (first argument) into a NEXUS tree file
# (second argument).
from sys import argv

from dendropy import TreeList

newick_path = argv[1]
trees = TreeList()
trees.read(path=newick_path, schema="newick")

nexus_path = argv[2]
trees.write(path=nexus_path, schema="nexus")
def main(args):
    """Run ARGweaver on a .trees file and check the SMC -> ARG -> tree-sequence round trip.

    Converts ``args.trees_file`` to an ARGweaver ``.sites`` file, runs the
    ARGweaver sampler on it, converts the resulting ``.smc.gz`` file to
    NEXUS both directly and via node/edge text files, and compares the two
    sets of trees.

    Attributes read from ``args``: ``outputdir``, ``trees_file``,
    ``ARGweaver_executable_dir``, ``ARGweaver_sample_executable``,
    ``ARGweaver_smc2arg_executable``, ``effective_population_size``,
    ``recombination_rate``, ``mutation_rate`` and ``random_seed``.

    Raises:
        AssertionError: If the .sites file is empty, the expected .smc.gz
            file is missing, or the tree counts / labels differ.
        Exception: If a pair of trees has a non-zero symmetric difference.
    """
    import os
    import itertools
    import subprocess
    from dendropy import TreeList
    from dendropy.calculate import treecompare
    import ts_extras

    def ts_txts_to_trees(ts_nodes, ts_edges, trees_outname=None):
        """Load node/edge text files into a tree sequence and simplify it.

        On a load failure the two text files are copied to ``bad.nodes`` /
        ``bad.edges``; on a simplify failure the tree sequence is dumped to
        ``bad.trees``. In both cases the exception is re-raised. The
        simplified tree sequence is dumped to ``trees_outname`` when that
        is given, and returned.
        """
        import shutil
        import msprime
        logging.info("== Converting new ts ARG to .trees ===")
        try:
            ts = msprime.load_text(nodes=ts_nodes, edges=ts_edges)
        except:
            logging.warning(
                "Can't load the texts file properly. Saved copied to 'bad.nodes' & 'bad.edges' for inspection"
            )
            shutil.copyfile(ts_nodes.name, "bad.nodes")
            shutil.copyfile(ts_edges.name, "bad.edges")
            raise
        logging.info("== loaded {}, {}===".format(ts_nodes.name, ts_edges.name))
        try:
            simple_ts = ts.simplify()
        except:
            ts.dump("bad.trees")
            logging.warning(
                "Can't simplify. .trees file dumped to 'bad.trees'")
            raise
        if trees_outname:
            simple_ts.dump(trees_outname)
        return (simple_ts)

    # NOTE(review): msprime is imported only inside the helper above, so
    # this line relies on a module-level import not visible here -- confirm.
    msprime.TreeSequence.write_nexus_trees = ts_extras.write_nexus_trees
    # Used both as the number of sampler iterations and as the sample step.
    iterations = 20
    # Output prefix: the output directory plus the input file's base name
    # without its extension.
    full_prefix = os.path.join(
        args.outputdir,
        os.path.splitext(os.path.basename(args.trees_file))[0])
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm where the outer "with" ends.
    with open(full_prefix + ".sites", "w+") as aw_in:
        tsfile_to_ARGweaver_in(args.trees_file, aw_in)
        cmd = [
            os.path.join(args.ARGweaver_executable_dir,
                         args.ARGweaver_sample_executable), '--sites',
            aw_in.name, '--popsize',
            str(args.effective_population_size), '--recombrate',
            str(args.recombination_rate), '--mutrate',
            str(args.mutation_rate), '--overwrite', '--randseed',
            str(int(args.random_seed)), '--iters',
            str(iterations), '--sample-step',
            str(iterations), '--output', full_prefix
        ]
        assert os.stat(aw_in.name).st_size > 0, "Initial .sites file is empty"
        logging.debug("running '{}'".format(" ".join(cmd)))
        # The exit status is not inspected; a failed run is caught by the
        # check for the .smc.gz file below.
        subprocess.call(cmd)
        #now check that the smc file produced can be converted to nodes
        smc = full_prefix + "." + str(iterations) + ".smc.gz"
        assert os.path.isfile(smc), "No output file names {}".format(smc)
        smc_nex = smc.replace(".smc.gz", ".nex")
        with open(smc_nex, "w+") as smc_nex_out:
            ARGweaver_smc_to_nexus(smc, smc_nex_out)
        arg_nex = smc.replace(".smc.gz", ".ts_nex")
        with open(smc.replace(".smc.gz", ".TSnodes"), "w+") as nodes, \
                open(smc.replace(".smc.gz", ".TSedges"), "w+") as edges, \
                open(arg_nex, "w+") as ts_nex:
            ARGweaver_smc_to_ts_txts(
                os.path.join(args.ARGweaver_executable_dir,
                             args.ARGweaver_smc2arg_executable),
                smc.replace(".smc.gz", ""), nodes, edges)
            ts = ts_txts_to_trees(nodes, edges)
            ts.write_nexus_trees(ts_nex)
        smc_trees = TreeList.get(path=smc_nex, schema="nexus")
        # Reuse the first list's taxon namespace for the second list.
        arg_trees = TreeList.get(path=arg_nex,
                                 schema="nexus",
                                 taxon_namespace=smc_trees[0].taxon_namespace)
        #zero_based_tip_numbers assumed False)
        #Check the smc trees against the ts-imported equivalents
        #NB, the ARGweaver output does not specify where mutations occur on the ARG, so we cannot
        #reconstruct the sequences implied by this ARG for testing purposes, and thus cannot compare
        #the original sequences with the reconstructed ones
        assert len(smc_trees) == len(arg_trees)
        # Tree labels are compared after conversion to integers.
        assert [int(float(t.label)) for t in smc_trees
                ] == [int(float(t.label)) for t in arg_trees]
        for i, (smc_tree, arg_tree) in enumerate(zip(smc_trees, arg_trees)):
            if treecompare.symmetric_difference(smc_tree, arg_tree) == 0:
                print(
                    "✓ Tree " + str(i + 1) +
                    " in AW SMC file is identical to that produced by SMC->ARG->STS"
                )
            else:
                raise Exception("Tree {} differs\n".format(i+1) + \
                    smc_tree.label + " (smc) = " + smc_tree.as_string(schema="newick", suppress_edge_lengths=True, suppress_internal_node_labels = True, suppress_rooting = True) + \
                    arg_tree.label + " (arg) = " + arg_tree.as_string(schema="newick", suppress_edge_lengths=True, suppress_internal_node_labels = True, suppress_rooting = True))
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

# Command line: a single positional argument, the Newick tree file.
parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    help='A newick-format tree')
args = parser.parse_args()

# Only the first tree of the file is used.
trees = TreeList.get(
    file=args.treefile,
    schema='newick',
    preserve_underscores=True,
    rooting='default-rooted')
tree = trees[0]

# Count how often each internal-node label occurs, after stripping a
# trailing "_<digits>_" suffix and lower-casing.
count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$', '', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Labels seen more than once, each starting with a counter of zero.
dups = dict.fromkeys((name for name, n in count.items() if n > 1), 0)
#collect a list of genus names
mode = args["mode"] if args["mode"] else 'per-species' print(mode) k = int(args["k"]) if args["k"] else None outdir = args["outdir"] if args["outdir"] else splitext( intrees)[0] + "_kshrink" mkdir(outdir) if args["tempdir"]: tempdir = args["tempdir"] mkdir(tempdir) else: tempdir = check_output(["mktemp", "-d"]).rstrip() trees = TreeList.get_from_path(intrees, 'newick', preserve_underscores=True) gene_list = [[] for i in range(len(trees))] species_map = {} occ = {} removing_sets = [[[] for i in range(len(trees))] for j in range(len(quantiles))] for t, a_tree in enumerate(trees): # solve k-shrink a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"]) a_filter.optFilter(d=k) # compute species feature (i.e. the max ratio associated with each species for this gene tree) mapping = {} for i in range(1, len(a_filter.min_diams)): r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
def main(args): if len(args) < 2: print """USAGE: %s [tree_file] [outgroups] [-mrca -mrca-dummy (optional)] [output name (optional)] [-igerr (optional)] -- tree_file: a path to the newick tree file -- outgroups: a list of outgroups, separated by comma. The script goes through the list of outgroups. If the outgroup is found in the tree, the tree is rooted at that outgroup. Otherwise, the next outgroup in the list is used. Each element in the comma-delimited list is itself a + delimited list of taxa. By default the script makes sure that this list of taxa are monophyletic in the tree and roots the tree at the node leading to the clade represented by outgroups given in the + delimited list. Alternatively, you can specify -m which will result in mid-point rooting. Example: HUMAN,ANOCA,STRCA+TINMA first tries to root at HUMAN, if not present, tries to use ANOCA, if not present, tries to root at parent of STRCA and TINMA which need to be monophyletic. If not monophyletic, roots at STRCA. -- (optional) -mrca: using this option the mono-phyletic requirement is relaxed and always the mrca of the + delimited list of outgroups is used. -- (optional) -mrca-dummy: is like -mrca, but also adds a dummy taxon as outgroup to the root. """ % args[ 0 ] sys.exit(1) treeName = args[1] outgroups = [x.replace("_", " ") for x in args[2].split(",")] use_mrca = True if len(args) > 3 and (args[3] == "-mrca" or args[3] == "-mrca-dummy") else False add_dummy = True if len(args) > 3 and (args[3] == "-mrca-dummy") else False resultsFile = ( args[4] if len(args) > 4 else ("%s.rooted" % treeName[:-9] if treeName.endswith("unrooted") else "%s.rooted" % treeName) ) ignore = True if len(args) > 5 and args[5] == "-igerr" else False print "Reading input trees %s ..." 
% treeName, trees = dendropy.TreeList.get_from_path(treeName, "newick", rooted=True) print "%d tree(s) found" % len(trees) i = 0 outtrees = TreeList() for tree in trees: i += 1 print ".", oldroot = tree.seed_node # print "Tree %d:" %i if outgroups[0] == "-m": print "Midpoint rooting ... " tree.reroot_at_midpoint(update_splits=False) else: mrca = None for outgroup in outgroups: outs = outgroup.split("+") outns = [] for out in outs: n = tree.find_node_with_taxon_label(out) if n is None: print "outgroup not found %s," % out, continue outns.append(n.taxon) if len(outns) != 0: # Find an ingroup and root the tree there for n in tree.leaf_iter(): if n.taxon not in outns: ingroup = n break # print "rerooting at ingroup %s" %ingroup.taxon.label """reroot at an ingroup, so that outgroups form monophyletic groups, if possible""" if ingroup.edge.length is not None: tree.reroot_at_edge( ingroup.edge, update_splits=True, length1=ingroup.edge.length / 2, length2=ingroup.edge.length / 2, ) else: tree.reroot_at_edge(ingroup.edge, update_splits=True) mrca = tree.mrca(taxa=outns) break if mrca is None: if ignore: print >> sys.stderr, "Outgroups not found: %s" % outgroups continue else: raise KeyError("Outgroups not found %d: %s" % (i, outgroups)) # print mrca.leaf_nodes() # if not mono-phyletic, then use the first if not use_mrca and len(mrca.leaf_nodes()) != len(outns): print >> sys.stderr, "selected set is not monophyletic. Using %s instead. " % outns[0] mrca = tree.find_node_with_taxon_label(outns[0].label) if mrca.parent_node is None: print >> sys.stderr, "Already rooted at the root." 
# print "rerooting on %s" % [s.label for s in outns] # tree.reroot_at_midpoint() elif mrca.edge.length is not None: # print "rerooting at %s" %mrca.as_newick_string() if ingroup.edge.length is not None: tree.reroot_at_edge( mrca.edge, update_splits=False, length1=mrca.edge.length / 2, length2=mrca.edge.length / 2 ) else: tree.reroot_at_edge(mrca.edge, update_splits=False) else: tree.reroot_at_edge(mrca.edge, update_splits=False) if add_dummy: dummy = tree.seed_node.new_child(taxon=Taxon(label="outgroup"), edge_length=1) tree.reroot_at_edge(dummy.edge, update_splits=False) outtrees.append(tree) """This is to fix internal node labels when treated as support values""" while oldroot.parent_node != tree.seed_node and oldroot.parent_node != None: oldroot.label = oldroot.parent_node.label oldroot = oldroot.parent_node if len(oldroot.sister_nodes()) > 0: oldroot.label = oldroot.sister_nodes()[0].label # tree.reroot_at_midpoint(update_splits=False) print >> sys.stderr, "writing results to %s" % resultsFile outtrees.write(open(resultsFile, "w"), "newick", edge_lengths=True, internal_labels=True, write_rooting=False)
from os import walk
import glob

### Main ###

### Argparse
parser = argparse.ArgumentParser(
    prog="strictunroot.py",
    description="Reads a newick trees and reroots it with a basal trifurcation")
# Both options are required string arguments.
for option, option_help in (("-i", "Input newick tree name"),
                            ("-o", "Output file name")):
    parser.add_argument(option, required=True, type=str, help=option_help)
args = parser.parse_args()

###Main
itrees = TreeList.get(
    path=args.i,
    schema="newick",
    rooting="default-rooted",
    preserve_underscores=True)

# Collapse the basal bifurcation of every input tree, keeping input order.
otrees = TreeList()
for tree in itrees:
    tree.collapse_basal_bifurcation()
    otrees.append(tree)

otrees.write(
    path=args.o,
    schema="newick",
    unquoted_underscores=True,
    suppress_rooting=True)
print("Done!")
redundant_count += 1 break else: tree_list.append(tree) return tree_list, redundant_count if __name__ == '__main__': #inputs# mle_tree = raw_input("File with Maximum Likelihood tree: ") mcmc_trees = raw_input("File with MCMC trees: ") burnin = int(raw_input("Burnin: ")) outfile = raw_input("Name of outfile: ") uts = [] #list of unique topologies taxa = dendropy.TaxonSet() #initialize TaxonSet object mle_tree = dendropy.Tree.get_from_path(mle_tree, 'nexus', taxon_set=taxa) uts.append(mle_tree) #MLE tree is the first topology in unique list uts, redundant_count = unique_trees(uts, mcmc_trees, 'nexus', burnin, taxonset=taxa) print "\nNumber of redundant trees: %d" % redundant_count print "Number of unique trees: %d\n" % len(uts) unique_tree_list = TreeList(uts) unique_tree_list.write_to_path(outfile, 'newick', suppress_edge_lengths=True)
#!/opt/local/bin/python

### Imports ###
import dendropy
from dendropy import TreeList,Tree
import sys
import argparse
from os import walk
import glob

### Main ###

### Argparse
parser = argparse.ArgumentParser(
    prog="strictunroot.py",
    description="Reads a newick trees and reroots it with a basal trifurcation")
# Both options are required string arguments.
for option, option_help in (("-i", "Input newick tree name"),
                            ("-o", "Output file name")):
    parser.add_argument(option, required=True, type=str, help=option_help)
args = parser.parse_args()

###Main
itrees = TreeList.get(
    path=args.i,
    schema="newick",
    rooting="default-rooted",
    preserve_underscores=True)

# Collapse the basal bifurcation of every input tree, keeping input order.
otrees = TreeList()
for tree in itrees:
    tree.collapse_basal_bifurcation()
    otrees.append(tree)

otrees.write(
    path=args.o,
    schema="newick",
    unquoted_underscores=True,
    suppress_rooting=True)
print("Done!")
def main(args): if len(args) < 2: print '''USAGE: %s [tree_file] [outgroups] [-mrca -mrca-dummy (optional)] [output name (optional)] [-igerr (optional)] -- tree_file: a path to the newick tree file -- outgroups: a list of outgroups, separated by comma. The script goes through the list of outgroups. If the outgroup is found in the tree, the tree is rooted at that outgroup. Otherwise, the next outgroup in the list is used. Each element in the comma-delimited list is itself a + delimited list of taxa. By default the script makes sure that this list of taxa are monophyletic in the tree and roots the tree at the node leading to the clade represented by outgroups given in the + delimited list. Alternatively, you can specify -m which will result in mid-point rooting. Example: HUMAN,ANOCA,STRCA+TINMA first tries to root at HUMAN, if not present, tries to use ANOCA, if not present, tries to root at parent of STRCA and TINMA which need to be monophyletic. If not monophyletic, roots at STRCA. -- (optional) -mrca: using this option the mono-phyletic requirement is relaxed and always the mrca of the + delimited list of outgroups is used. -- (optional) -mrca-dummy: is like -mrca, but also adds a dummy taxon as outgroup to the root. ''' % args[0] sys.exit(1) treeName = args[1] outgroups = [x.replace("_", " ") for x in args[2].split(",")] # uym2 editted: keep underscore #outgroups = [x for x in args[2].split(",")] use_mrca = True if len(args) > 3 and ( args[3] == "-mrca" or args[3] == "-mrca-dummy") else False add_dummy = True if len(args) > 3 and (args[3] == "-mrca-dummy") else False resultsFile = args[4] if len(args) > 4 else ( "%s.rooted" % treeName[:-9] if treeName.endswith("unrooted") else "%s.rooted" % treeName) ignore = True if len(args) > 5 and args[5] == "-igerr" else False print >> sys.stderr, "Reading input trees %s ..." 
% treeName, #trees = dendropy.treelist.get_from_path(treename, 'newick',rooted=true) # uym2 edited: hack for dendropy4 trees = dendropy.TreeList.get_from_path(treeName, "newick") print >> sys.stderr, "%d tree(s) found" % len(trees) i = 0 outtrees = TreeList() for tree in trees: i += 1 print >> sys.stderr, ".", oldroot = tree.seed_node #print "Tree %d:" %i if outgroups[0] == "-m": print >> sys.stderr, "Midpoint rooting ... " tree.reroot_at_midpoint(update_splits=False) else: mrca = None for outgroup in outgroups: outs = outgroup.split("+") outns = [] for out in outs: n = tree.find_node_with_taxon_label(out) if n is None: print >> sys.stderr, "outgroup not found %s," % out, continue outns.append(n.taxon) if len(outns) != 0: # Find an ingroup and root the tree there for n in tree.leaf_node_iter(): if n.taxon not in outns: ingroup = n break #print "rerooting at ingroup %s" %ingroup.taxon.label '''reroot at an ingroup, so that outgroups form monophyletic groups, if possible''' if ingroup.edge.length is not None: #tree.reroot_at_edge(ingroup.edge, update_splits=True,length1=ingroup.edge.length/2,length2=ingroup.edge.length/2) # uym2 editted: hack for dendropy4 tree.reroot_at_edge(ingroup.edge, length1=ingroup.edge.length / 2, length2=ingroup.edge.length / 2) else: #tree.reroot_at_edge(ingroup.edge, update_splits=True) tree.reroot_at_edge(ingroup.edge) mrca = tree.mrca(taxa=outns) break if mrca is None: if ignore: print >> sys.stderr, "Outgroups not found: %s" % outgroups print >> sys.stdout, tree.as_string(schema="newick"), continue else: print >> sys.stderr, "Outgroups not found: %s" % outgroups continue #raise KeyError("Outgroups not found %d: %s" %(i,outgroups)) #print mrca.leaf_nodes() #if not mono-phyletic, then use the first if not use_mrca and len(mrca.leaf_nodes()) != len(outns): print >> sys.stderr, "selected set is not monophyletic. Using %s instead. 
" % outns[ 0] mrca = tree.find_node_with_taxon_label(outns[0].label) if mrca.parent_node is None: print >> sys.stderr, "Already rooted at the root." #print "rerooting on %s" % [s.label for s in outns] #tree.reroot_at_midpoint() elif mrca.edge.length is not None: #print "rerooting at %s" %mrca.as_newick_string() if ingroup.edge.length is not None: #tree.reroot_at_edge(mrca.edge, update_splits=False,length1=mrca.edge.length/2,length2=mrca.edge.length/2) #uym2 editted: hack for dendropy4 tree.reroot_at_edge(mrca.edge, length1=mrca.edge.length / 2, length2=mrca.edge.length / 2) else: #tree.reroot_at_edge(mrca.edge, update_splits=False) #uym2 editted: hack for dendropy4 tree.reroot_at_edge(mrca.edge) else: tree.reroot_at_edge(mrca.edge, update_splits=False) if add_dummy: dummy = tree.seed_node.new_child(taxon=Taxon(label="outgroup"), edge_length=1) tree.reroot_at_edge(dummy.edge, update_splits=False) outtrees.append(tree) '''This is to fix internal node labels when treated as support values''' while oldroot.parent_node != tree.seed_node and oldroot.parent_node != None: oldroot.label = oldroot.parent_node.label oldroot = oldroot.parent_node if len(oldroot.sister_nodes()) > 0: oldroot.label = oldroot.sister_nodes()[0].label #tree.reroot_at_midpoint(update_splits=False) print >> sys.stderr, "writing results to %s" % resultsFile #outtrees.write(open(resultsFile,'w'),'newick',edge_lengths=True, internal_labels=True,write_rooting=False) #uym2 editted: hack for dendropy4 outtrees.write( path=resultsFile, schema='newick', suppress_rooting=True ) #,edge_lengths=True, internal_labels=True,write_rooting=False)
# Command-line driver: annotate every tree of a Newick file and write one
# vector file per tree, spreading the trees over a pool of processes.

# The text is the description of the tool. Passed positionally it was
# taken as ``prog`` and ended up in the usage line instead.
parser = ArgumentParser(
    description='Return CP- or CPM-vectors for a set of trees\n' +
                'The vectors are written to a separate file each,\n' +
                'named {tree_file}.tree_{tree_number}.vector')
parser.add_argument('-t', type=str,
                    help='Tree file in Newick format')
parser.add_argument('-u', action='store_true',
                    help='Produce unrooted (CPM) labelling')
parser.add_argument('--hash', action='store_true',
                    help='Produce hashed labelling')
parser.add_argument('--processes', type=int, default=0,
                    help='Number of processes. Defaults to processor number')
args = parser.parse_args()

start = time()
# 0 (the default) means "one process per CPU".
process_count = args.processes if args.processes else cpu_count()
print('Using {} processes'.format(process_count), file=stderr)

# NOTE(review): the output prefix is everything before the FIRST dot of
# the path as typed, so "./trees.nwk" yields an empty prefix -- confirm.
file_mask = args.t.split('.')[0]+'_tree{}.vector'
trees = TreeList.get_from_path(args.t, schema='newick')
print('Loaded {} trees'.format(len(trees)), file=stderr)
counter = 0

# -u selects the unrooted labelling, otherwise the rooted one.
f = leaf_enumeration_annotation if args.u else annotate_rooted_tree
func_args = [(tree, f, file_mask.format(str(i)), args.hash)
             for i, tree in enumerate(trees)]

# starmap blocks until every tree is written; the context manager then
# releases the worker processes, which were previously never cleaned up.
with Pool(process_count) as p:
    _ = p.starmap(write_tree, func_args, chunksize=1)

print('Processed {} trees in {} seconds using {} processes'.format(
    str(len(trees)), time()-start, process_count), file=stderr)
'''Label all unnamed nodes with an underscore + number.
'''
import argparse
import re
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

# Command line: a single positional argument, the Newick tree file.
parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    help='A newick-format tree')
args = parser.parse_args()

# Only the first tree of the file is used.
trees = TreeList.get(
    file=args.treefile,
    schema='newick',
    preserve_underscores=True,
    rooting='default-rooted')
tree = trees[0]

# Count how often each internal-node label occurs, after stripping a
# trailing "_<digits>_" suffix and lower-casing.
count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$', '', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Labels seen more than once, each starting with a counter of zero.
dups = dict.fromkeys((name for name, n in count.items() if n > 1), 0)

#collect a list of genus names
genera = OrderedDict()
def trees_from_newick_str_list(newick_list):
    """Parse a list of Newick strings into one ``TreeList``.

    The strings are joined with single spaces and read as one Newick
    stream; the trees are attached to a freshly created ``TaxonSet``.
    """
    newick_stream = StringIO(" ".join(newick_list))
    return TreeList(
        stream=newick_stream,
        taxon_set=TaxonSet(),
        schema="NEWICK")
#! /usr/bin/env python
# Read the Newick trees named by the first command-line argument and pass
# them to tree_lib.compute_diameter.
from sys import argv

from dendropy import TreeList

from tree_lib import compute_diameter

infile = argv[1]
treelist = TreeList.get(
    path=infile,
    schema="newick")
compute_diameter(treelist)
def do_sim(birth_rate , death_rate, num_leaves, rng=None):
    """Simulate a tree, infer it back with seq-gen + PAUP, and score the splits.

    A birth-death tree is simulated, seq-gen generates sequence data on it,
    PAUP infers a tree from that data, and every internal non-root node of
    the true tree is checked for recovery in the inferred tree.

    Args:
        birth_rate: Birth rate passed to ``treesim.birth_death``.
        death_rate: Death rate passed to ``treesim.birth_death``.
        num_leaves: Number of taxa of the simulated tree.
        rng: Optional random number generator; the ``random`` module is
            used for the seq-gen seed when it is ``None``.

    Returns:
        A sorted list of ``(depth, edge_length, recovered)`` tuples, one per
        internal non-root node of the true tree; ``recovered`` is 1 if the
        node's split is present in the inferred tree and 0 otherwise.

    The process exits via ``sys.exit`` if seq-gen or PAUP fails; in that
    case the temporary directory is left in place.
    """
    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
                                     death_rate=death_rate,
                                     ntax=num_leaves,
                                     rng=rng)
    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    print("temp_dir = %s" % temp_dir)
    # The file must be closed (and so flushed) before seq-gen is invoked;
    # the with-block guarantees that even if the write raises.
    with open(mtf, 'w') as treefile_obj:
        treefile_obj.write("%s;\n" % str(model_tree))

    import subprocess
    command_line = ['seq-gen', '-mHKY', '-on']
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # NOTE(review): ``seed`` is not defined in this function; this
        # relies on a module-level name not visible here -- confirm.
        sg_seed = seed
    elif rng is None:
        sg_seed = random.randint(0,100000)
    else:
        sg_seed = rng.randint(0,100000)
    command_line.append('-z%d' % sg_seed)
    command_line.append('simtree')
    seq_gen_proc = subprocess.Popen(command_line,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=temp_dir)
    dataset = seq_gen_proc.communicate()[0]
    # seq-gen does not exit with an error code when it fails, so empty
    # output is treated as failure as well.
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')

    sd = os.path.join(temp_dir, 'simdata.nex')
    # Closed before PAUP is invoked, for the same reason as above.
    with open(sd, 'w') as d:
        d.write(dataset)

    ################################################################################
    # PAUP
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    with open(pcf, 'w') as pc:
        pc.write('''execute simdata.nex ;
hsearch nomultrees ;
savetree file=inferred.tre format = NEXUS;
quit;
''')
    paup_proc = subprocess.Popen(['paup', '-n', pcf],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 cwd=temp_dir)
    (o, e) = paup_proc.communicate()
    paup_output = os.path.join(temp_dir, 'inferred.tre')
    # A missing output tree is treated as a PAUP failure even when the exit
    # status is zero.
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(e)

    # Read the inferred tree into the true tree's taxon set so that the
    # splits of the two trees are comparable.
    inf_tree_list = TreeList.get_from_path(paup_output,
                                           "NEXUS",
                                           taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]

    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)

    # sort the nodes of the true tree by depth and ask whether or not they
    # were recovered
    node_depth_TF_list = []
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            # Postorder guarantees the child's depth is already set.
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            recovered = 0 if node.edge.split_bitmask in missing else 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            # Leaves and the root get depth zero and are not scored.
            node.depth = 0.0
    node_depth_TF_list.sort()

    for leftover in (pcf, paup_output, sd, mtf):
        os.remove(leftover)
    os.rmdir(temp_dir)
    return node_depth_TF_list
# Simulate contained-coalescent gene trees inside a species tree.
# Arguments: species-tree Newick file, number of gene trees, output file.
fin = sys.argv[1]
num = int(sys.argv[2])
fout = sys.argv[3]

# Read the whole file in one go and close it even if reading fails; the
# "[&R] " prefix is put in front of the text that was read.
with open(fin, "r") as f:
    sp_tree_str = "[&R] " + f.read()

sp_tree = dendropy.Tree.get_from_string(sp_tree_str,
                                        "newick",
                                        preserve_underscores=True)
gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
    containing_taxon_namespace=sp_tree.taxon_namespace,
    num_contained=1)

gene_tree_list = TreeList()
for i in range(num):
    gene_tree = treesim.contained_coalescent_tree(
        containing_tree=sp_tree,
        gene_to_containing_taxon_map=gene_to_species_map)
    for t in gene_tree.leaf_nodes():
        # Keep only the first whitespace-separated token of the label.
        t.taxon.label = t.taxon.label.split()[0]
    gene_tree_list.append(gene_tree)
gene_tree_list.write_to_path(fout, 'newick')
required=True, help="Sampling time") parser.add_argument("-r", "--rootAge", required=False, help="Root age") parser.add_argument("-t", "--timeTree", required=True, help="The output trees with branch lengths in time unit") parser.add_argument("-c", "--composite", required=False, action='store_true', help="Do composite optimization. Default: NO") args = vars(parser.parse_args()) myTrees = TreeList.get_from_path(args["input"], 'newick') smpl_times = {} rootAge = float(args["rootAge"]) if args["rootAge"] else None with open(args["samplingTime"], "r") as fin: fin.readline() for line in fin: name, time = line.split() smpl_times[name] = float(time) for tree in myTrees: if args["composite"]: s = calibrate_composite_opt(tree, smpl_times, root_age=rootAge) else: s = calibrate_log_opt(tree, smpl_times,
def getOpenTreesFromOneZoom(OpenTreeFile, output_dir, include_var, phy_files, verbose=False):
    '''Extract Open Tree of Life (OToL) subtrees referenced from phy files.

    Every tree read from ``phy_files`` is scanned (in preorder) for nodes whose
    taxon label looks like ``<name>_ott<ids>@<depth>``. For each such node the
    external ``subtree_extract.pl`` script is run in ``output_dir``, the
    resulting ``<ott>.nwk`` file is pruned with ``prune_tree`` and written to
    ``<output_dir>/<subfile_name>.phy``, and a ``$tree.substitute(...)`` line
    naming that file is printed to stdout.

    Parameters:
        OpenTreeFile: path to the full OpenTree newick file. If it is not an
            existing file the user is prompted interactively and
            ``get_species_level_tree`` is called to obtain it.
        output_dir: directory in which the extraction script runs and where
            the ``.nwk`` / ``.phy`` files are expected / written.
        include_var: if a number, it is treated as the default recursion depth
            and everything is kept; otherwise it is passed to ``prune_tree``
            as the set of names to keep (the original text describes it as a
            dictionary of names).
        phy_files: iterable of .phy or .PHY filenames; the name "-" means
            read the trees from standard input.
        verbose: when true, progress messages are emitted through ``warn``.

    Returns:
        None. Results are files on disk plus lines printed to stdout.
    '''
    from numbers import Number
    # The Perl extraction script is looked up next to this source file.
    ExtractionUtility = os.path.join(os.path.dirname(os.path.realpath(__file__)), "subtree_extract.pl")
    #find all nodes that end in ott plus a number and (optionally) some other numbers starting with underscore, ending in
    # an at sign followed optionally by another number (giving the max depth) are OpenTree subnode IDs
    # the first number after the ott is always the ott number to use as the filename.
    # the 1) ott123 2) ott_123: use the name, not the ott id
    # ottRE groups: (1) name before "_ott", (2) the id string made of digits, '-' and '~',
    # (3) the optional digits after '@'.
    ottRE = re.compile(r"^(.*)_ott([-~\d]+)\@(\d*)$")
    # id_pattern groups: (1) leading digits, then an optional '~', (2) trailing digits/hyphens.
    id_pattern = re.compile(r"(\d*)~?([-\d]*)$")

    if not os.path.isfile(OpenTreeFile):
        OpenTreeURL = "http://files.opentreeoflife.org/synthesis/opentree9.1/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre"
        warn("Could not find the OpenTree file {}. Do you want to download it from {}".format(OpenTreeFile, OpenTreeURL))
        # Only an exact upper-case "N" aborts; any other answer continues.
        if (input("Press Enter to accept, or N to abort... ") == "N"):
            sys.exit(0)
        # NOTE(review): OpenTreeURL is only used in the message above; what
        # get_species_level_tree actually fetches is not visible here. A
        # failure only produces a warning and processing continues.
        if not get_species_level_tree(OpenTreeFile):
            warn("Could not get the Open Tree of Life newick file to save at {}".format(OpenTreeFile))

    if isinstance(include_var, Number):
        keep = True #means keep all of the species down to a certain depth, i.e. do not use an include list
        default_recursion_depth = include_var
    else:
        keep = include_var
        # NaN: neither "< 0" nor finite, so no -d option is passed unless the label gives a depth.
        default_recursion_depth = float('nan')

    for file in phy_files:
        if file == "-":
            trees = TreeList.get_from_stream(sys.stdin, schema="newick", preserve_underscores=True, rooting='default-rooted')
            file = "<stdin>"
        else:
            try:
                with open(file, 'r', encoding="utf8") as stream:
                    trees = TreeList.get_from_stream(stream, schema="newick", preserve_underscores=True, rooting='default-rooted')
            except Exception as e:
                # An unreadable file is reported and then treated as containing no trees.
                trees = []
                warn("Problem reading tree from {}: {}".format(file, e))

        for tree in trees:
            # Visit only nodes that carry a taxon whose label matches ottRE.
            for i, include_ott in enumerate(tree.preorder_node_iter(
                    lambda node: True if hasattr(node, "taxon") and node.taxon is not None and ottRE.search(node.taxon.label) else False
                )):
                if i==0:
                    # Header line, printed once per tree that has at least one match.
                    print("\n//;# == {} ==, from file {}".format(tree.seed_node.label, file))
                #each of these is a file to @include
                #first get recursion depth from the end of the string
                match = ottRE.search(include_ott.taxon.label)
                name = match.group(1)
                ottIDs = match.group(2)
                if default_recursion_depth < 0:
                    # A negative default overrides any depth given in the label.
                    recursion_depth = abs(default_recursion_depth)
                else:
                    # Otherwise the depth after '@' wins, falling back to the default when absent.
                    recursion_depth = float(match.group(3)) if len(match.group(3)) else default_recursion_depth
                match = id_pattern.match(ottIDs)
                if match:
                    # Output file is named after the leading number, or after the name if there is none.
                    subfile_name = match.group(1) or name
                    del_otts = (match.group(2) or '').split('-') #split by minus signs
                    base_ott = del_otts.pop(0) or match.group(1) #first number after '=' is the tree to extract.
                    system_call = [ExtractionUtility]
                    # The depth option is only passed in "keep everything" mode with a finite depth.
                    if keep==True and math.isfinite(recursion_depth):
                        system_call.append("-d={}".format(int(recursion_depth)))
                    # The script runs with cwd=output_dir, hence the path relative to it.
                    system_call.append(os.path.relpath(OpenTreeFile, output_dir))
                    system_call.append(base_ott)
                    OpenSubTreeFile = os.path.join(output_dir, base_ott + ".nwk")
                    if verbose:
                        warn("For "+include_ott.taxon.label+": extracting tree into " + OpenSubTreeFile, prefix='');
                    call(system_call, cwd=output_dir) #should create many ottID.nwk files
                    OutputFilename = os.path.join(output_dir, subfile_name + ".phy")
                    if os.path.isfile(OpenSubTreeFile):
                        removed = "" if len(del_otts)==0 else " removed {}".format(del_otts)
                        # NOTE(review): prune_tree is defined elsewhere; presumably it loads the
                        # newick file, drops del_otts and applies the keep filter -- confirm.
                        subtree = prune_tree(OpenSubTreeFile, keep, del_otts)
                        if keep == True:
                            if verbose:
                                warn("Found file {} with {} leaf taxa,{} and extracted to max depth: {}".format(OpenSubTreeFile, len(subtree.taxon_namespace), removed, recursion_depth), prefix='')
                        else:
                            subtree_size = len(subtree.leaf_nodes())
                            # The trailing del_otts argument has no placeholder in the format string.
                            if verbose:
                                warn("Found file with {} leaf taxa, {}, and simplified to only selected taxa ({} {})".format(len(subtree.taxon_namespace), removed, subtree_size, 'leaf' if subtree_size==1 else 'leaves', del_otts), prefix='')
                        # The following bare string is inert, disabled code kept by the original author.
                        '''this is not needed until the OpenTree has branch lengths subtree.ultrametricize() #maybe use subtree.calc_node_ages() warn("ultrametricized\n", prefix="") #subtree->get_root()->set_branch_length(undef); stem_height = include_ott.edge_length - subtree.calc_tree_height if (stem_height < 0):'''
                        if verbose:
                            warn("Now writing to {}".format(OutputFilename), prefix='')
                        with open(OutputFilename, 'w', encoding='UTF-8') as outputstream:
                            subtree.write_to_stream(outputstream,'newick', unquoted_underscores=True, suppress_rooting=True)
                        # stem_height is computed but only referenced by the commented-out print below.
                        max_tree_height = 0
                        if include_ott.edge_length is not None and include_ott.edge_length > max_tree_height:
                            stem_height = include_ott.edge_length- max_tree_height
                        else:
                            stem_height = 0
                        # print(r'$tree.substitute_with_fn_last("{}_ott{}@\\d*", {}, "{}", {}); //;# "user/OpenTree/{}");'.format(name, ottIDs, stem_height, name, len(subtree.taxon_namespace), OutputFilename))
                        # OpenTrees are currently not dated, so we should omit the 'stem_length' value, so that the node becomes
                        # 'date unknown'
                        print(r"$tree.substitute('{}_ott{}@\\d*', '{}');".format(name, ottIDs, OutputFilename))
                    else:
                        warn("File " + OpenSubTreeFile + " does not exist, skipping\n")
import argparse
import re
import sys

from dendropy import TreeList

# Report, for the first tree of every newick file given on the command line,
# which internal-node labels occur more than once.
parser = argparse.ArgumentParser(
    description='Check which nodes have duplicated names')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    nargs='+',
    help='Any number of newick-format tree files')
args = parser.parse_args()

for f in args.treefile:
    # Only the first tree in each file is inspected.
    tree = TreeList.get(file=f, schema='newick', preserve_underscores=True)[0]

    # Tally how often each non-empty internal-node label occurs.
    count = {}
    for node in tree.preorder_internal_node_iter():
        label = node.label
        if not label:
            continue
        count[label] = count.get(label, 0) + 1

    # The total sums every occurrence of a duplicated label, not just the extras.
    tot = 0
    for name, n in count.items():
        if n <= 1:
            continue
        print("Node name '{}' duplicated {} times".format(name, n))
        tot += n
    print("Total dups for {}: {}".format(f.name, tot))
def main():
    """Command-line driver: detect and remove outlier leaves from gene trees.

    Steps, as implemented below:
      1. Parse the command line and create the output and temporary directories.
      2. For every input tree, run ``TreeFilter.optFilter`` and record, per
         leaf label, the largest ratio ``min_diams[i-1] / min_diams[i]`` seen
         among the removal sets that contain it.
      3. Depending on the mode ('per-gene', 'per-species', 'all-genes', or
         'auto', which resolves to one of the last two), write those ratios to
         ``.dat`` files and obtain one threshold per quantile by running an R
         script through ``Rscript``.
      4. For each quantile, write the list of removed labels per tree and the
         pruned trees into the output directory.

    Takes no arguments (reads ``sys.argv`` via argparse) and returns None.
    The temporary directory is deleted at the end unless ``-t`` was given.
    """
    import treeshrink
    from treeshrink.optimal_filter_lib import TreeFilter
    from treeshrink.tree_lib import prune_tree
    from sys import argv, stdout
    from math import sqrt
    from subprocess import check_output, call
    import argparse
    from dendropy import Tree, TreeList
    from os.path import basename, dirname, splitext, realpath, join, normpath
    from os import mkdir, getcwd, rmdir
    from copy import deepcopy
    from tempfile import mkdtemp
    from shutil import rmtree
    import dendropy

    print("Launching " + treeshrink.PROGRAM_NAME + " version " + treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="Input trees")
    parser.add_argument("-d", "--outdir", required=False, help="Output directory. Default: inferred from the input trees")
    parser.add_argument("-t", "--tempdir", required=False, help="Directory to keep temporary files. If specified, the temp files will be kept")
    parser.add_argument("-o", "--output", required=False, help="The name of the output trees. Default: inferred from the input trees")
    parser.add_argument("-c", "--centroid", required=False, action='store_true', help="Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO")
    parser.add_argument("-k", "--k", required=False, help="The maximum number of leaves that can be removed. Default: auto-select based on the data")
    parser.add_argument("-q", "--quantiles", required=False, help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument("-m", "--mode", required=False, help="Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto")

    # Directory of this source file; the R scripts are looked up below it.
    wdir = dirname(realpath(__file__))

    args = vars(parser.parse_args())

    # A species seen in fewer than MIN_OCC trees, or fewer than MIN_TREE_NUM
    # trees overall, makes 'auto' mode fall back to 'all-genes'.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    # Quantiles are kept as strings: they are passed to Rscript and used in file names.
    quantiles = [q for q in args["quantiles"].split() ] if args["quantiles"] else ["0.05"]
    #print(quantiles)

    intrees = args["input"]
    treeName, treeExt = splitext(basename(intrees))
    outtrees = args["output"] if args["output"] else treeName + "_shrunk" + treeExt

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    outdir = args["outdir"] if args["outdir"] else splitext(intrees)[0] + "_treeshrink"
    # NOTE: mkdir raises if the directory already exists.
    mkdir(outdir)
    if args["tempdir"]:
        tempdir = args["tempdir"]
        mkdir(tempdir)
    else:
        tempdir = mkdtemp()  #check_output(["mktemp","-d"]).rstrip()

    trees = TreeList.get(path=intrees, schema='newick', preserve_underscores=True)

    if mode == 'auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) + " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode = 'all-genes'

    # gene_list[t]: (species, ratio) pairs for tree t.
    gene_list = [[] for i in range(len(trees))]
    # species_map[s]: list of ratios of species s across trees (later a (list, thresholds) tuple).
    species_map = {}
    # occ[s]: number of trees in which species s appears as a leaf.
    occ = {}
    # removing_sets[i][t]: labels to remove from tree t for quantile i.
    removing_sets = [[[] for i in range(len(trees))] for j in range(len(quantiles))]

    for t, a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1, len(a_filter.min_diams)):
            r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s], r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map[s] = [mapping[s]] if s not in species_map else species_map[s] + [mapping[s]]
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s, mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = normpath(join(tempdir, "gene_" + str(t) + ".dat"))
            with open(filename, 'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
                #n_missing = len(list(a_tree.leaf_node_iter())) - len(mapping)
                #for i in range(n_missing):
                #    f.write("1.0")
                #    f.write("\n")
            if len(mapping) > 1:
                for i, q in enumerate(quantiles):
                    # The first 4 characters of the stripped Rscript output are discarded
                    # (presumably an R output prefix such as "[1] " -- confirm against the R script).
                    threshold = float(check_output(["Rscript", normpath(join(wdir, "R_scripts", "find_threshold_loglnorm.R")), filename, q]).lstrip().rstrip()[4:])
                    #print("Threshold: ", threshold)
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)

        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = 1 if not s in occ else occ[s] + 1

    # Resolve 'auto' (and warn in 'per-species') based on species occupancy.
    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print("Species " + s + " only exists in " + str(occ[s]) + " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print("There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode")
            else:
                print("WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode")
        elif mode == 'auto':
            mode = 'per-species'
            print("Finish preprocessing. TreeShrink will run in 'Per-species' mode")

    # fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in species_map:
            # Pad with 1 for every tree that contains the species but has no recorded ratio for it.
            l = len(species_map[s])
            for i in range(occ[s] - l):
                species_map[s].append(1)
            filename = normpath(join(tempdir, s + ".dat"))
            with open(filename, 'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [0 for i in range(len(quantiles))]
            for i, q in enumerate(quantiles):
                # Here the first 5 characters of the stripped Rscript output are discarded.
                thresholds[i] = float(check_output(["Rscript", normpath(join(wdir, "R_scripts", "find_threshold_lkernel.R")), wdir, filename, q]).lstrip().rstrip()[5:])
            # From here on species_map[s] is a (ratios, thresholds) tuple.
            species_map[s] = (species_map[s], thresholds)

        for t, gene in enumerate(gene_list):
            for s, r in gene:
                for i, threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = normpath(join(tempdir, "all_genes" + ".dat"))
        with open(filename, 'w') as f:
            for gene in gene_list:
                for s, r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i, q in enumerate(quantiles):
            threshold = float(check_output(["Rscript", normpath(join(wdir, "R_scripts", "find_threshold_lkernel.R")), wdir, filename, q]).lstrip().rstrip()[5:])
            for t, gene in enumerate(gene_list):
                for s, r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # Dendropy's filter_leaf_nodes() seems to have problem
    # i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
    # use home-made code to prune the tree instead

    # Both pairs below are derived from the same output name.
    treeName, treeExt = splitext(outtrees)
    fName, ext = splitext(outtrees)
    for i, RS in enumerate(removing_sets):
        # Prune a fresh copy per quantile so the original trees stay intact.
        trees_shrunk = deepcopy(trees)
        outfile = normpath(join(outdir, fName + "_RS_" + quantiles[i] + ".txt"))
        # One line per tree: tab-terminated labels of the removed leaves.
        with open(outfile, 'w') as f:
            for item in RS:
                for s in item:
                    f.write(s + "\t")
                f.write("\n")
        for t, tree in enumerate(trees_shrunk):
            #filt = lambda node: False if (node.taxon is not None and node.taxon.label in RS[t]) else True
            #tree.filter_leaf_nodes(filt,update_bipartitions=True)
            prune_tree(tree, RS[t])
        trees_shrunk.write_to_path(normpath(join(outdir, treeName + "_" + quantiles[i] + treeExt)), 'newick')

    # Temporary files are kept only when the user supplied a temp directory.
    if not args["tempdir"]:
        rmtree(tempdir)
    #    call(["rm","-r",tempdir])

    print("Output files written to " + outdir)
from argparse import ArgumentParser
from dendropy import TreeList, TaxonNamespace
from dendropy.simulate import treesim
import os

# Generate three sets of 100 simulated trees (two birth-death settings and a
# pure Kingman coalescent) with a given number of tips, and write each set as
# a newick file into the output directory.

# The text is a description; passed positionally it would have been taken as
# the program name and shown in the usage line.
parser = ArgumentParser(
    description='Generate trees of a given size with different algos')
parser.add_argument('-n', type=int, help='Tree size', default=100)
# The directory is needed unconditionally below, so let argparse report a
# missing -d instead of failing later with a TypeError on None.
parser.add_argument('-d', type=str, help='Output directory', required=True)
args = parser.parse_args()

if not os.path.isdir(args.d):
    # makedirs also creates missing parent directories.
    os.makedirs(args.d)
os.chdir(args.d)


def _birth_death_trees(death_rate):
    """Return a TreeList of 100 birth-death trees with args.n extant tips."""
    return TreeList([
        treesim.birth_death_tree(birth_rate=1.0,
                                 death_rate=death_rate,
                                 num_extant_tips=args.n,
                                 repeat_until_success=True)
        for _ in range(100)
    ])


bd2 = _birth_death_trees(0.5)
bd2.write_to_path('birth_death2.nwk', schema='newick')

bd5 = _birth_death_trees(0.2)
bd5.write_to_path('birth_death5.nwk', schema='newick')

# Kingman trees share one namespace with taxa labelled T1 .. Tn.
taxa = TaxonNamespace(['T{}'.format(x) for x in range(1, args.n + 1)])
king = TreeList(
    [treesim.pure_kingman_tree(taxon_namespace=taxa) for _ in range(100)])
king.write_to_path('kingman.nwk', schema='newick')
@author: smirarab ''' import dendropy import sys import os import copy import os.path from dendropy import TreeList if __name__ == '__main__': if len(sys.argv) < 4: print "USAGE: count [output|-] treefile*" sys.exit(1) count= int(sys.argv[1]) out=open(sys.argv[2],'w') if sys.argv[2] != "-" else sys.stdout c={} trees = None for treeName in sys.argv[3:]: a = dendropy.TreeList.get_from_path(treeName, 'nexus',rooted=True, tree_offset=200) if trees: trees.append(a) else: trees = a import random samples = TreeList(random.sample(trees,count)) samples.write(out,'newick',write_rooting=False) if out != sys.stdout: out.close()
metavar="input") parser.add_argument("-c", type=str, help="Tree to constrain the search ala RAxML's -g", metavar="constrain") parser.add_argument("-o", type=str, help="Output file name", metavar="output") #parser.add_argument("-s",type=int,help="Random number generator seed",metavar="seed") args = parser.parse_args() ###Random number machinery initialization #if args.s: # seed=args.s #else: # seed=random.randint(0,sys.maxint) #random.seed(seed) #print("Seed: %d" % seed) ###Input trees gene_trees = TreeList.get(path=args.i, schema="newick", rooting="force-unrooted") constrainTree = Tree.get(path=args.c, schema="newick") consensus = gene_trees.constrained_consensus(constrainTree=constrainTree, summarize_splits=False, min_freq=0) #Write gene trees consensus.write(path=args.o, schema="newick", suppress_rooting=True) print("Done!")
def dendropy_read_treefile(treefiles, quiet=False, preserve_underscores=False, **kwargs):
    """Read trees from files (or stdin), trying nexus first and then newick.

    Parameters:
        treefiles: iterable of tree file paths. If empty/None, the tree text
            is read from standard input instead.
        quiet: when true, progress and parse-error messages are suppressed
            (the final "could not read" exit message is still produced).
        preserve_underscores: forwarded to the DendroPy readers.
        **kwargs: forwarded to the DendroPy readers, except for ``writer``
            which, if given, is the stream used for progress messages on the
            file path (default ``sys.stderr``).

    Returns:
        A TreeList holding the trees of all inputs, in order.

    Raises:
        SystemExit: if a listed file does not exist, or if an input can be
            parsed neither as nexus nor as newick.
    """
    out_stream = kwargs.pop('writer', sys.stderr)
    intrees = TreeList()
    if not treefiles:
        if not quiet:
            sys.stderr.write('NOTE: reading trees from stdin\n')
        trees = sys.stdin.read()
        # try two input formats; the newick attempt is nested in its own try
        # so that its failure is handled here as well
        try:
            intrees.extend(TreeList.get_from_string(trees, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
        except (DataParseError, NexusReader.NotNexusFileError, AttributeError) as e:
            if not quiet:
                # str(e) is used because exceptions have no .message attribute on Python 3
                sys.stderr.write('%s\n' % str(e))
            try:
                intrees.extend(TreeList.get_from_string(trees, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
            except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as e:
                if not quiet:
                    sys.stderr.write('%s\n' % str(e))
                # there is no file name on this path, so name the source explicitly
                sys.exit('Could not read file %s in nexus or newick format ...\n' % '<stdin>')
    else:
        for tf in treefiles:
            if not os.path.isfile(tf):
                out_stream.write('TreeFile %s does not exist' % tf)
                sys.exit()
            # try two input formats
            try:
                if not quiet:
                    out_stream.write('Reading file %s in nexus format ...\n' % tf)
                intrees.extend(TreeList.get_from_path(tf, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
            #except (DataParseError, dendropy.dataio.nexusreader.NotNexusFileError) as e:
            except (DataParseError, NexusReader.NotNexusFileError, AttributeError) as e:
                try:
                    if not quiet:
                        out_stream.write('Reading file %s in newick format ...\n' % tf)
                    intrees.extend(TreeList.get_from_path(tf, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
                except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as e:
                    if not quiet:
                        sys.stderr.write('%s\n' % str(e))
                    sys.exit('Could not read file %s in nexus or newick format ...\n' % tf)
    return intrees
import os, sys from subprocess import Popen, PIPE from MSTrees import backend as GrapeTree from MSTrees2 import backend as GrapeTree2 from dendropy import Tree, TreeList def appendTo(nwk, label, tlist): tre = Tree.get_from_string(nwk, 'newick') tre.label = label tlist.append(tre) if __name__ == '__main__': sim_data = sys.argv[1] trees = TreeList() tree = GrapeTree(profile=sim_data, method='MSTreeV2') appendTo(tree, 'MSTreeV2', trees) tree = GrapeTree(profile=sim_data, method='MSTree', missing_data='as_allele', matrix_type='symmetric', edge_weight='eBurst', branch_recrafting='F') appendTo(tree, 'goeBurstA', trees) tree = GrapeTree(profile=sim_data, method='MSTree', missing_data='pair_delete', matrix_type='symmetric',
def main():
    """Command-line driver: detect and remove outlier leaves from gene trees.

    Input is either one tree file (``-t``) or a parent directory (``-i``)
    whose subdirectories each hold one tree file (and optionally one
    alignment). Steps, as implemented below:
      1. Parse options; in directory mode concatenate the per-gene tree files
         into one temporary file.
      2. For every tree, run ``TreeFilter.optFilter`` and record, per leaf
         label, the largest ratio ``min_diams[i-1] / min_diams[i]`` seen among
         the removal sets that contain it.
      3. Depending on the mode ('per-gene', 'per-species', 'all-genes', or
         'auto', which resolves to one of the last two), write those ratios to
         temporary ``.dat`` files and obtain one threshold per quantile from
         an R script run through ``Rscript``.
      4. For each quantile write the removed labels and the pruned trees; in
         directory mode also write a filtered copy of each alignment whose
         names match the tree.

    Takes no arguments (reads ``sys.argv`` via argparse) and returns None.
    The temporary directory is deleted at the end unless ``-p`` was given.
    """
    print("Launching " + treeshrink.PROGRAM_NAME + " version " + treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()
    parser.add_argument("-i","--indir",required=False,help="The parent input directory where the trees (and alignments) can be found")
    parser.add_argument("-t","--tree",required=False,help="The name of the input tree/trees. If the input directory is specified (see -i option), each subdirectory under it must contain a tree with this name. Otherwise, all the trees can be included in this one file. Default: input.tre")
    parser.add_argument("-a","--alignment",required=False,help="The name of the input alignment; can only be used when the input directory is specified (see -i option). Each subdirectory under it must contain an alignment with this name. Default: input.fasta")
    parser.add_argument("-c","--centroid",required=False,action='store_true',help="Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO")
    parser.add_argument("-k","--k",required=False,help="The maximum number of leaves that can be removed. Default: auto-select based on the data; see also -s")
    parser.add_argument("-s","--kscaling",required=False,help="If -k not given, we use k=min(n/a,b*sqrt(n)) by default; using this option, you can set the a,b constants; Default: '5,2'")
    parser.add_argument("-q","--quantiles",required=False,help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument("-b","--minimpact",required=False,help="Do not remove species on the per-species test if their impact on diameter is less than MINIPACT%% where x is the given value. Default: 5")
    parser.add_argument("-m","--mode",required=False,help="Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto")
    parser.add_argument("-o","--outdir",required=False,help="Output directory. Default: the same as input directory (if it is specified) or the same as the input trees")
    parser.add_argument("-p","--tempdir",required=False,help="Directory to keep temporary files. If specified, the temp files will be kept")
    parser.add_argument("-r","--libdir",required=False,help="Directory of the R libraries and scripts. Default: 2 layers above the treeshrink package")

    args = vars(parser.parse_args())

    # A species seen in fewer than MIN_OCC trees, or fewer than MIN_TREE_NUM
    # trees overall, makes 'auto' mode fall back to 'all-genes'.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    libdir = args["libdir"] if args["libdir"] else dirname(dirname(realpath(treeshrink.__file__)))
    tempdir = set_tmp_dir(args["tempdir"])

    # Quantiles stay strings: they are passed to Rscript and used in file names.
    quantiles = args["quantiles"].split() if args["quantiles"] else ["0.05"]
    # Percentage -> multiplicative ratio (e.g. 5 -> 1.05).
    minimpact = (float(args["minimpact"])/100)+1 if args["minimpact"] else 1.05
    scaling = [int(x) for x in args["kscaling"].split(",")] if args["kscaling"] else [5,2]

    # Resolve the documented default once, so every later use (including the
    # output naming at the end) works when -t is omitted.
    tree_filename = args["tree"] if args["tree"] else "input.tre"

    if args["indir"]:
        treename = splitext(tree_filename)[0]
        # Only subdirectories that actually contain the tree file are processed.
        subdirs = [d for d in listdir(args["indir"]) if exists(normpath(join(args["indir"],d,tree_filename)))]
        intrees = get_tmp_file(treename + ".trees")
        # Concatenate the per-gene tree files, in subdirs order, into one file.
        with open(intrees,'w') as fout:
            for d in subdirs:
                treefile = normpath(join(args["indir"],d,tree_filename))
                if exists(treefile):
                    with open(treefile,'r') as fin:
                        fout.write(fin.read())
    else:
        intrees = tree_filename

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    if args["outdir"]:
        outdir = args["outdir"]
        check_dir(outdir)
    elif args["indir"]:
        outdir = args["indir"]
    else:
        outdir = splitext(intrees)[0] + "_treeshrink"
        mkdir(outdir)

    # Check to make sure output can be written: touch the first quantile's
    # removal file in every subdirectory before doing any expensive work.
    if args["indir"]:
        i = 0
        fName,ext = splitext(basename(intrees))
        for sd in subdirs:
            outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[i] + ".txt"))
            with open(outfile,'w') as f:
                pass

    trees = TreeList.get(path=intrees,schema='newick',preserve_underscores=True)

    if mode=='auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) + " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode='all-genes'

    # gene_list[t]: (species, ratio) pairs for tree t.
    gene_list = [[] for i in range(len(trees))]
    # species_map[s]: ratios of species s across trees (later a (list, thresholds) tuple).
    species_map = {}
    # occ[s]: number of trees in which species s appears as a leaf.
    occ = {}
    # removing_sets[i][t]: labels to remove from tree t for quantile i.
    removing_sets = [ [ [ ] for i in range(len(trees)) ] for j in range(len(quantiles)) ]

    for t,a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree,centroid_reroot=args["centroid"],scaling=scaling)
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        #print(a_filter.min_diams)
        for i in range(1,len(a_filter.min_diams)):
            if a_filter.min_diams[i] == 0:
                # Stop before dividing by a zero diameter.
                print("Warning: tree %d has no diameter (has only zero branch lengths) after removing %d sequences." %(t+1,i))
                break
            r = a_filter.min_diams[i-1]/a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s],r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map[s] = [mapping[s]] if s not in species_map else species_map[s]+[mapping[s]]
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s,mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = get_tmp_file("gene_%s.dat" %str(t))
            with open(filename,'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
                #n_missing = len(list(a_tree.leaf_node_iter())) - len(mapping)
                #for i in range(n_missing):
                #    f.write("1.0")
                #    f.write("\n")
            if len(mapping) > 1:
                for i,q in enumerate(quantiles):
                    # The first 4 characters of the stripped Rscript output are discarded
                    # (presumably an R output prefix -- confirm against the R script).
                    threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_loglnorm.R")),filename,q]).lstrip().rstrip()[4:])
                    #print("Threshold: ", threshold)
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)

        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = 1 if not s in occ else occ[s]+1

    # Resolve 'auto' (and warn in 'per-species') based on species occupancy.
    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print ("Species " + s + " only exists in " + str(occ[s]) + " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print ("There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode")
            else:
                print ("WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode")
        elif mode == 'auto':
            mode = 'per-species'
            print("Finish preprocessing. TreeShrink will run in 'Per-species' mode ... ")

    # fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in sorted(species_map):
            # Pad with 1 for every tree that contains the species but has no recorded ratio for it.
            l = len(species_map[s])
            for i in range(occ[s]-l):
                species_map[s].append(1)
            filename = get_tmp_file(s + ".dat")
            with open(filename,'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [ 0 for i in range(len(quantiles)) ]
            for i,q in enumerate(quantiles):
                # Never go below the minimum-impact ratio; the first 5 characters of
                # the stripped Rscript output are discarded.
                thresholds[i] = max(minimpact,float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:]))
                print("%s:\n\t will be cut in %d trees where its impact is above %f for quantile %s" %(s,sum(1 for x in species_map[s] if x>thresholds[i]),thresholds[i],q,))
            # From here on species_map[s] is a (ratios, thresholds) tuple.
            species_map[s] = (species_map[s],thresholds)

        for t,gene in enumerate(gene_list):
            for s,r in gene:
                for i,threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = get_tmp_file("all_genes" + ".dat")
        with open(filename,'w') as f:
            for gene in gene_list:
                for s,r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i,q in enumerate(quantiles):
            threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:])
            for t,gene in enumerate(gene_list):
                for s,r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    print("Writing output ...\n")
    # Dendropy's filter_leaf_nodes() seems to have problem
    # i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
    # use home-made code to prune the tree instead

    #treeName,treeExt = splitext(basename(intrees))
    #outtrees = args["output"] if args["output"] else treeName + "_shrunk" + treeExt
    fName,ext = splitext(basename(intrees))
    # Per-gene output names derive from the (defaulted) tree file name.
    treeName,treeExt = splitext(tree_filename)

    for i,RS in enumerate(removing_sets):
        # Prune a fresh copy per quantile so the original trees stay intact.
        trees_shrunk = deepcopy(trees)

        if args["indir"] is None:
            # Single-file mode: one removal file (one line per tree) and one tree file.
            outfile = normpath(join(outdir,fName + "_RS_" + quantiles[i] + ".txt"))
            with open(outfile,'w') as f:
                for item in RS:
                    for s in item:
                        f.write(s + "\t")
                    f.write("\n")
            for tree,rs in zip(trees_shrunk,RS):
                prune_tree(tree,rs)
            trees_shrunk.write_to_path(normpath(join(outdir,fName + "_" + quantiles[i] + ext)),'newick')
        else:
            # Directory mode: trees are in subdirs order, so zip pairs each
            # subdirectory with its own tree and removal set.
            for sd,item in zip(subdirs,RS):
                outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[i] + ".txt"))
                with open(outfile,'w') as f:
                    for s in item:
                        f.write(s + "\t")
            for sd,tree,rs in zip(subdirs,trees_shrunk,RS):
                # Leaf labels before pruning, used to validate the alignment names.
                L = set(x.taxon.label for x in tree.leaf_node_iter())
                prune_tree(tree,rs)
                treefile = normpath(join(outdir,sd, treeName + "_shrunk_" + quantiles[i] + treeExt))
                tree.write_to_path(treefile,'newick',unquoted_underscores=True,real_value_format_specifier=".16g")

                aln_filename = args["alignment"] if args["alignment"] else "input.fasta"
                alnName,alnExt = splitext(aln_filename)
                input_aln = normpath(join(args["indir"],sd,aln_filename))
                if isfile(input_aln):
                    output_aln = normpath(join(outdir,sd,alnName+"_shrunk"+quantiles[i]+alnExt))
                    alg = CompactAlignment()
                    alg.read_file_object(input_aln,'fasta')
                    S=set(alg.keys())
                    # The alignment is only filtered when its names match the tree's leaves exactly.
                    if (L.difference(alg.keys())) or S.difference(L):
                        print("ERROR: For gene %s, alignment names don't match tree names. Will skip it.\n\tonly in tree:\t%s\n\tonly in alignment:\t%s"%(sd,str(L.difference(S)),str(S.difference(L))))
                    else:
                        alg.remove_all(rs)
                        alg.mask_gapy_sites(1)
                        alg.write(output_aln,'fasta')

    # Temporary files are kept only when the user supplied a temp directory.
    if not args["tempdir"]:
        rmtree(tempdir)
    #    call(["rm","-r",tempdir])

    print("Output files written to " + outdir)
for ut in tree_list: sd = treecalc.symmetric_difference(tree,ut) #print sd ## error check if sd == 0: redundant_count +=1 break else: tree_list.append(tree) return tree_list, redundant_count if __name__ == '__main__': #inputs# mle_tree = raw_input("File with Maximum Likelihood tree: ") mcmc_trees = raw_input("File with MCMC trees: ") burnin = int(raw_input("Burnin: ")) outfile = raw_input("Name of outfile: ") uts = [] #list of unique topologies taxa = dendropy.TaxonSet() #initialize TaxonSet object mle_tree = dendropy.Tree.get_from_path(mle_tree, 'nexus', taxon_set=taxa) uts.append(mle_tree) #MLE tree is the first topology in unique list uts, redundant_count = unique_trees(uts,mcmc_trees,'nexus',burnin,taxonset=taxa) print "\nNumber of redundant trees: %d" % redundant_count print "Number of unique trees: %d\n" % len(uts) unique_tree_list = TreeList(uts) unique_tree_list.write_to_path(outfile,'newick',suppress_edge_lengths=True)
import dendropy from dendropy import TreeList,Taxon,Node import sys import argparse parser = argparse.ArgumentParser(description="Parses a Newick tree file, modifying the branch lengths from number of generations to years and adding an outgroup") parser.add_argument("-gt",type=float,default=0,required=False,help="Generation time") parser.add_argument("-od",type=float,default=0,required=False,help="Outgroup branch length") parser.add_argument("-i",type=str,default="infile.tree",required=True,help="Input Newick tree file") parser.add_argument("-o",type=str,default="outtree.tree",required=False,help="Output Newick tree file") args = parser.parse_args() trees=TreeList.get_from_path(args.i,schema="newick",rooting="force-rooted") if args.gt != 0: print "Scaling branch lengths to time with generation time %d\n" % args.gt for tree in trees: for edge in tree.preorder_edge_iter(): #print "DEBUG: %s" % edge.length if edge.length != None: edge.length=edge.length/args.gt if args.od != 0: print "Adding outgroup with branch length %d\n" % args.od namespace=trees.taxon_namespace outgroup= Taxon("outgroup") namespace.add_taxon(outgroup) ntree=0 labels=namespace.labels() labels.remove("outgroup") for tree in trees: outgroup_node=Node(taxon=outgroup,edge_length=args.od)
### Main ### ### Argparse parser = argparse.ArgumentParser( description="Converts a newick tree file in a Nexus file", prog="newicktonexusphylonet.py") parser.add_argument("-i", required=True, type=str, help="Input newick tree name") parser.add_argument("-o", required=True, type=str, help="Output file name") args = parser.parse_args() ###Main itrees = TreeList.get(path=args.i, schema="newick", rooting="default-rooted", preserve_underscores=True) itrees.write(path=args.o, schema="nexus", unquoted_underscores=True, suppress_rooting=True) namespace = itrees.taxon_namespace labels = namespace.labels() regex = re.compile("(.+)_.+_.+") speciesmap = defaultdict(list) for label in labels: match = regex.match(label).group(1) speciesmap[match].append(label) textlisttrees = "(" + ",".join(str(x) for x in xrange(1,