Пример #1
0
def generateCoalescentTrees(choice, num, fout, length):
    """Simulate ``num`` contained-coalescent gene trees and write them to ``fout``.

    Parameters:
        choice: species-tree selector; 1 and 2 are accepted and currently
            both select the same eight-taxon caterpillar tree (A..H).
        num: number of gene trees to simulate.
        fout: path of the newick file that receives the gene trees.
        length: value converted with ``float()`` and substituted into the
            species-tree string.

    Raises:
        ValueError: if ``choice`` is neither 1 nor 2.
    """
    if choice not in (1, 2):
        raise ValueError("choice must be 1 or 2, got %r" % (choice,))

    # NOTE(review): the value is written with %d, so it is truncated to an
    # integer, and it sits directly after each ")" (the node-label position in
    # newick), not after a ":" -- confirm this is the intended species tree.
    sp_tree_str = """\
        [&R]  ((((((((A,B)%d,C)%d,D)%d,E)%d,F)%d,G)%d,H)%d);
        """ % ((float(length),) * 7)

    sp_tree = dendropy.Tree.get_from_string(sp_tree_str, "newick")
    gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
        containing_taxon_namespace=sp_tree.taxon_namespace,
        num_contained=1)
    gene_tree_list = TreeList()

    # Exactly one simulation per output tree.
    for _ in range(num):
        gene_tree = treesim.contained_coalescent_tree(
            containing_tree=sp_tree,
            gene_to_containing_taxon_map=gene_to_species_map)
        # Keep only the first whitespace-separated token of each leaf label.
        for leaf in gene_tree.leaf_nodes():
            leaf.taxon.label = leaf.taxon.label.split()[0]
        gene_tree_list.append(gene_tree)

    gene_tree_list.write_to_path(fout, 'newick')
Пример #2
0
def generateCoalescentTrees(choice, num, fout, length):
    """Simulate ``num`` contained-coalescent gene trees and write them to ``fout``.

    Parameters:
        choice: species-tree selector; 1 builds a caterpillar tree over A..H,
            2 builds a tree of four two-taxon clades over A..H.
        num: number of gene trees to simulate.
        fout: path of the newick file that receives the gene trees.
        length: base branch length; converted with ``float()`` and scaled by
            small integer factors for some of the internal branches.

    Raises:
        ValueError: if ``choice`` is neither 1 nor 2.
    """
    unit = float(length)
    if choice == 1:
        sp_tree_str = """((((((((A:%f,B:%f):%f,C:%f):%f,D:%f):%f,E:%f):%f,F:%f):%f,G:%f):%f,H:%f):%f);""" % (
            unit, unit, unit,
            unit, 2 * unit,
            unit, 3 * unit,
            unit, 4 * unit,
            unit, 5 * unit,
            unit, 6 * unit,
            unit, 7 * unit)
    elif choice == 2:
        sp_tree_str = """(((A:%f,B:%f):%f,(C:%f,D:%f):%f):%f,((E:%f,F:%f):%f,(G:%f,H:%f):%f):%f);""" % (
            unit, unit, unit,
            unit, unit, 2 * unit,
            4 * unit,
            unit, unit, 2 * unit,
            unit, unit, unit,
            4 * unit)
    else:
        raise ValueError("choice must be 1 or 2, got %r" % (choice,))

    sp_tree = dendropy.Tree.get_from_string(sp_tree_str, "newick")
    gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
        containing_taxon_namespace=sp_tree.taxon_namespace,
        num_contained=1)
    gene_tree_list = TreeList()

    # Exactly one simulation per output tree.
    for _ in range(num):
        gene_tree = dendropy.simulate.treesim.contained_coalescent_tree(
            containing_tree=sp_tree,
            gene_to_containing_taxon_map=gene_to_species_map)
        # Keep only the first whitespace-separated token of each leaf label.
        for leaf in gene_tree.leaf_nodes():
            leaf.taxon.label = leaf.taxon.label.split()[0]
        gene_tree_list.append(gene_tree)

    gene_tree_list.write_to_path(fout, 'newick')
Пример #3
0
def main():
    """Report whether the first two trees read from two NEXUS files are identical.

    The two file paths come from ``sys.argv[1]`` and ``sys.argv[2]``.  Both
    files are read into one ``TreeList`` and the trees at positions 0 and 1
    of that combined list are compared with
    ``treecompare.symmetric_difference``; a distance of 0 is reported as
    identical.  The verdict is printed to standard output.
    """
    treefile1 = sys.argv[1]
    treefile2 = sys.argv[2]

    treelist = TreeList()
    for treefile in (treefile1, treefile2):
        # Plain 'r' is used because the 'U' flag was removed in Python 3.11;
        # the context manager closes each handle once it has been read.
        with open(treefile, 'r') as handle:
            treelist.read(file=handle, schema="nexus")

    if treecompare.symmetric_difference(treelist[0], treelist[1]) == 0:
        print("trees are identical")
    else:
        print("trees are NOT identical")
Пример #4
0
def readTree(filename, quiet=False):
    """Read every tree in a newick file into a new ``TreeList``.

    Parameters:
        filename: path of the newick file to read.
        quiet: when false, a short progress banner is printed first.

    Returns:
        The populated ``TreeList``.

    If the file cannot be opened or parsed, an error message naming
    ``filename`` is printed and the process exits via ``sys.exit()``.
    """
    if not quiet:
        print()
        print("Reading in files...")
        print()

    temp = TreeList()
    try:
        # The context manager guarantees the handle is closed even on failure.
        with open(filename, 'r') as handle:
            temp.read(file=handle, schema="newick", preserve_underscores=True)
    except Exception:
        print("Error with file '{}': please only use files with newick tree format".format(filename))
        sys.exit()

    return temp
 def get_bs_trees(self, bin_name):
     """Load the RAxML bootstrap trees stored for ``bin_name``.

     The file read is ``<self.path>/supergenes/<bin_name>/RAxML_bootstrap.bootstrap``
     (newick, underscores preserved).  The first tree is passed to
     ``tree_upper`` before the whole list is returned.
     """
     bootstrap_file = os.path.join(self.path, 'supergenes', bin_name,
                                   'RAxML_bootstrap.bootstrap')
     bootstrap_trees = TreeList.get(path=bootstrap_file,
                                    schema='newick',
                                    preserve_underscores=True)
     tree_upper(bootstrap_trees[0])
     return bootstrap_trees
Пример #6
0
def create_tree(filepath='bird_phylogenic_tree.nex', num_trees=1):
    """Build one leaf-keyed map per tree read from a NEXUS file.

    Parameters:
        filepath: path of the NEXUS tree file.
        num_trees: how many trees (from the start of the file) to process;
            -1 means every tree in the file.

    Returns:
        A list with one dict per processed tree.  Each dict maps the converted
        name of a leaf (``convert_name`` applied to the string form of its
        taxon) to the result of ``create_inner_map`` called with that leaf and
        the other leaves found below the leaf's parent.
    """
    treelist = TreeList.get(path=filepath, schema="nexus")
    if num_trees == -1:
        num_trees = len(treelist)

    maps = []
    for tree_index in range(num_trees):
        tree = treelist[tree_index]
        outer_map = {}
        # Internal nodes only, visited in descending age order.
        for node in tree.ageorder_node_iter(include_leaves=False,
                                            descending=True):
            for child_node in node.child_node_iter():
                if not child_node.is_leaf():
                    continue
                # Every leaf under this parent except the child itself.
                other_leaves = node.leaf_nodes()
                other_leaves.remove(child_node)
                inner_map = create_inner_map(child_node, other_leaves)
                child_name = convert_name(str(child_node.taxon))
                outer_map[child_name] = inner_map
        maps.append(outer_map)
    return maps
Пример #7
0
def readTrees(filenames, namespace, quiet=False):
    """Read each newick file into its own ``TreeList``.

    Parameters:
        filenames: iterable of paths to newick files.
        namespace: accepted for interface compatibility; not used here.
        quiet: when false, a short progress banner is printed first.

    Returns:
        A list of ``TreeList`` objects, one per entry of ``filenames`` and in
        the same order (an empty list for empty input).

    If a file cannot be opened or parsed, an error message naming it is
    printed and the process exits via ``sys.exit()``.
    """
    if not quiet:
        print()
        print("Reading in files...")
        print()

    sample_tree_list = []
    for f in filenames:
        temp = TreeList()
        try:
            # The context manager closes the handle even if parsing fails.
            with open(f, 'r') as handle:
                temp.read(file=handle,
                          schema="newick",
                          preserve_underscores=True)
        except Exception:
            print(
                "Error with file '{}': please only use files with newick tree format"
                .format(f))
            sys.exit()

        sample_tree_list.append(temp)
    return sample_tree_list
Пример #8
0
def main(folder=None, seed=None):
    """Remove leaves from the gene trees of one folder and write the survivors.

    Reads every ``g_trees*.trees`` file under ``<args.sd>/<folder>``, chooses
    taxa to remove from each tree according to ``args.mk`` and writes the
    pruned trees to ``<args.sd>/<folder>/<args.o>`` in newick format.

    Parameters:
        folder: name of the sub-directory of ``args.sd`` to process.
        seed: seed for the ``numpy.random.RandomState`` used by the removal
            helpers.

    Modes (``args.mk``):
        "random": the removal set comes from ``remove_taxa_prov`` with
            ``args.pr``.
        "byindividual": one probability per leaf label is drawn with
            ``truncated_normal`` from the leaves of the first tree and then
            used for every tree through ``remove_taxa_tagprobs``.
        Any other value prints a notice; the (empty) tree list is still
        written.

    A tree is kept only when more than three leaves would remain after
    removal; otherwise it is dropped entirely.
    """
    print("Folder %s, seed %s" % (folder, seed))
    r = numpy.random.RandomState(seed)
    gene_trees = TreeList()
    taxa = dendropy.TaxonNamespace()
    treefiles = glob.glob(args.sd + "/" + folder + "/g_trees*.trees")
    tree_yielder = Tree.yield_from_files(files=treefiles,
                                         schema="newick",
                                         rooting="default-rooted",
                                         preserve_underscores=True,
                                         taxon_namespace=taxa)
    if args.mk == "random":
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            nodes = remove_taxa_prov(r, onodes, args.pr)
            # Keep the tree only if more than three leaves survive.
            if len(nodes) < len(onodes) - 3:
                gtree.prune_taxa(nodes, update_bipartitions=False, suppress_unifurcations=True)
                gene_trees.append(gtree)
    elif args.mk == "byindividual":
        tagProbs = None
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            if not tagProbs:
                # One probability per leaf, keyed by the leaf's taxon label.
                tagProbs = {}
                probs = truncated_normal(r, n=len(onodes), mean=args.pr, sd=args.ist, min=args.itmin, max=args.itmax)
                for leafi, leaf in enumerate(onodes):
                    tagProbs[leaf.taxon.label] = probs[leafi]
            nodes = remove_taxa_tagprobs(r, onodes, tagProbs)
            # Keep the tree only if more than three leaves survive.
            if len(nodes) < len(onodes) - 3:
                gtree.prune_taxa(nodes, update_bipartitions=False, suppress_unifurcations=True)
                gene_trees.append(gtree)
    else:
        print("Yet unsupported option")
    # Write gene trees
    gene_trees.write(path=args.sd + "/" + folder + "/" + args.o, schema="newick")
Пример #9
0
#! /usr/bin/env python

from dendropy import TreeList
from sys import argv

# Read every tree from the newick file named by the first argument ...
newick_path = argv[1]
trees = TreeList()
trees.read(schema="newick", path=newick_path)

# ... and write them back out as NEXUS to the path given by the second.
nexus_path = argv[2]
trees.write(schema="nexus", path=nexus_path)
Пример #10
0
def main(args):
    """Run ARGweaver on a tree-sequence file and verify the SMC -> ARG round trip.

    Steps, all driven by attributes of ``args``:

    1. Convert ``args.trees_file`` to an ARGweaver ``.sites`` file under
       ``args.outputdir`` and run the ARGweaver sampler on it.
    2. Convert the resulting ``.smc.gz`` file to NEXUS directly, and also via
       node/edge text files loaded with msprime and written out as NEXUS.
    3. Read both NEXUS files with DendroPy and compare them tree by tree.

    Raises:
        AssertionError: if the ``.sites`` file is empty, the expected
            ``.smc.gz`` file is missing, or the two tree lists differ in
            length or in tree labels.
        Exception: if any pair of corresponding trees has a non-zero
            symmetric difference.
    """
    import os
    import subprocess
    # msprime is needed in this scope (TreeSequence is patched below) as well
    # as inside the helper, so it is imported here rather than in the helper.
    import msprime
    from dendropy import TreeList
    from dendropy.calculate import treecompare
    import ts_extras

    def ts_txts_to_trees(ts_nodes, ts_edges, trees_outname=None):
        """Load node/edge text files with msprime and return the simplified result.

        On a load failure the two text files are copied to ``bad.nodes`` and
        ``bad.edges``; on a simplify failure the unsimplified object is dumped
        to ``bad.trees``.  In both cases the exception is re-raised.  When
        ``trees_outname`` is given, the simplified object is also dumped there.
        """
        import shutil
        logging.info("== Converting new ts ARG to .trees ===")
        try:
            ts = msprime.load_text(nodes=ts_nodes, edges=ts_edges)
        except Exception:
            logging.warning(
                "Can't load the texts file properly. Saved copied to 'bad.nodes' & 'bad.edges' for inspection"
            )
            shutil.copyfile(ts_nodes.name, "bad.nodes")
            shutil.copyfile(ts_edges.name, "bad.edges")
            raise
        logging.info("== loaded {}, {}===".format(ts_nodes.name,
                                                  ts_edges.name))
        try:
            simple_ts = ts.simplify()
        except Exception:
            ts.dump("bad.trees")
            logging.warning(
                "Can't simplify. .trees file dumped to 'bad.trees'")
            raise
        if trees_outname:
            simple_ts.dump(trees_outname)
        return simple_ts

    # Give every TreeSequence a NEXUS writer taken from ts_extras.
    msprime.TreeSequence.write_nexus_trees = ts_extras.write_nexus_trees
    iterations = 20
    full_prefix = os.path.join(
        args.outputdir,
        os.path.splitext(os.path.basename(args.trees_file))[0])
    with open(full_prefix + ".sites", "w+") as aw_in:
        tsfile_to_ARGweaver_in(args.trees_file, aw_in)
        cmd = [
            os.path.join(args.ARGweaver_executable_dir,
                         args.ARGweaver_sample_executable), '--sites',
            aw_in.name, '--popsize',
            str(args.effective_population_size), '--recombrate',
            str(args.recombination_rate), '--mutrate',
            str(args.mutation_rate), '--overwrite', '--randseed',
            str(int(args.random_seed)), '--iters',
            str(iterations), '--sample-step',
            str(iterations), '--output', full_prefix
        ]
        assert os.stat(aw_in.name).st_size > 0, "Initial .sites file is empty"
        logging.debug("running '{}'".format(" ".join(cmd)))
        subprocess.call(cmd)
        # now check that the smc file produced can be converted to nodes
        smc = full_prefix + "." + str(iterations) + ".smc.gz"
        assert os.path.isfile(smc), "No output file names {}".format(smc)
        smc_nex = smc.replace(".smc.gz", ".nex")
        with open(smc_nex, "w+") as smc_nex_out:
            ARGweaver_smc_to_nexus(smc, smc_nex_out)
        arg_nex = smc.replace(".smc.gz", ".ts_nex")
        with open(smc.replace(".smc.gz", ".TSnodes"), "w+") as nodes, \
            open(smc.replace(".smc.gz", ".TSedges"), "w+") as edges, \
            open(arg_nex, "w+") as ts_nex:
            ARGweaver_smc_to_ts_txts(
                os.path.join(args.ARGweaver_executable_dir,
                             args.ARGweaver_smc2arg_executable),
                smc.replace(".smc.gz", ""), nodes, edges)

            ts = ts_txts_to_trees(nodes, edges)
            ts.write_nexus_trees(ts_nex)

        # Both lists share one taxon namespace so the trees can be compared.
        smc_trees = TreeList.get(path=smc_nex, schema="nexus")
        arg_trees = TreeList.get(path=arg_nex,
                                 schema="nexus",
                                 taxon_namespace=smc_trees[0].taxon_namespace)
        # zero_based_tip_numbers assumed False)
        # Check the smc trees against the ts-imported equivalents
        # NB, the ARGweaver output does not specify where mutations occur on the ARG, so we cannot
        # reconstruct the sequences implied by this ARG for testing purposes, and thus cannot compare
        # the original sequences with the reconstructed ones

        assert len(smc_trees) == len(arg_trees)
        assert [int(float(t.label)) for t in smc_trees
                ] == [int(float(t.label)) for t in arg_trees]
        for i, (smc_tree, arg_tree) in enumerate(zip(smc_trees, arg_trees)):
            if treecompare.symmetric_difference(smc_tree, arg_tree) == 0:
                print(
                    "✓ Tree " + str(i + 1) +
                    " in AW SMC file is identical to that produced by SMC->ARG->STS"
                )
            else:
                raise Exception("Tree {} differs\n".format(i+1) + \
                    smc_tree.label + " (smc) = " + smc_tree.as_string(schema="newick",
                        suppress_edge_lengths=True,
                        suppress_internal_node_labels = True,
                        suppress_rooting = True) + \
                    arg_tree.label + " (arg) = " + arg_tree.as_string(schema="newick",
                        suppress_edge_lengths=True,
                        suppress_internal_node_labels = True,
                        suppress_rooting = True))
Пример #11
0
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

# Command line: a single positional argument, the newick tree file.
parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus')
parser.add_argument('treefile',
                    help='A newick-format tree',
                    type=argparse.FileType('r'))

args = parser.parse_args()

# Only the first tree in the file is used.
trees = TreeList.get(file=args.treefile,
                     rooting='default-rooted',
                     preserve_underscores=True,
                     schema='newick')

tree = trees[0]

# Count how often each internal-node label occurs, ignoring case and a
# trailing "_<digits>_" suffix.

count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$', '', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Labels seen more than once, each starting with a counter of zero.
dups = dict.fromkeys((name for name, n in count.items() if n > 1), 0)

#collect a list of genus names
Пример #12
0
# Fall back to 'per-species' when no mode was supplied.
mode = args["mode"] or 'per-species'

print(mode)

# k stays None unless a value was supplied.
if args["k"]:
    k = int(args["k"])
else:
    k = None

# Default output directory: the input path without its extension + "_kshrink".
outdir = args["outdir"] or splitext(intrees)[0] + "_kshrink"
mkdir(outdir)

# Use the requested temporary directory, or ask `mktemp -d` for a fresh one.
tempdir = args["tempdir"]
if tempdir:
    mkdir(tempdir)
else:
    tempdir = check_output(["mktemp", "-d"]).rstrip()

trees = TreeList.get_from_path(intrees, 'newick', preserve_underscores=True)

# Per-tree and per-quantile accumulators, all starting empty.
gene_list = [[] for _tree_idx in range(len(trees))]
species_map = {}
occ = {}
removing_sets = [[[] for _tree_idx in range(len(trees))]
                 for _quantile_idx in range(len(quantiles))]

# Process each gene tree in turn; `t` is the tree's position in `trees`.
# NOTE(review): this loop is cut off in the visible source, so only its
# opening statements are documented here.
for t, a_tree in enumerate(trees):
    # solve k-shrink
    a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
    a_filter.optFilter(d=k)

    # compute species feature (i.e. the max ratio associated with each species for this gene tree)
    mapping = {}
    for i in range(1, len(a_filter.min_diams)):
        # Ratio of each entry of min_diams to the entry that follows it.
        r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
Пример #13
0
def main(args):
    if len(args) < 2:
        print """USAGE: %s [tree_file] [outgroups] [-mrca -mrca-dummy (optional)] [output name (optional)] [-igerr (optional)]

-- tree_file: a path to the newick tree file

-- outgroups: a list of outgroups, separated by comma.
The script goes through the list of outgroups. If the outgroup is found in the tree, 
the tree is rooted at that outgroup. Otherwise, the next outgroup in the list is used. 
Each element in the comma-delimited list is itself a + delimited list of taxa.
By default the script makes sure that this list of taxa are monophyletic
in the tree and roots the tree at the node leading to the clade represented 
by outgroups given in the + delimited list.
Alternatively, you can specify -m which will result in mid-point rooting.

Example: HUMAN,ANOCA,STRCA+TINMA first tries to root at HUMAN, if not present, 
tries to use ANOCA, if not present, tries to root at parent of STRCA and TINMA
which need to be monophyletic. If not monophyletic, roots at STRCA.

-- (optional) -mrca: using this option the mono-phyletic requirement is relaxed 
and always the mrca of the + delimited list of outgroups is used.
-- (optional) -mrca-dummy: is like -mrca, but also adds a dummy taxon as outgroup to the root. 
""" % args[
            0
        ]
        sys.exit(1)
    treeName = args[1]

    outgroups = [x.replace("_", " ") for x in args[2].split(",")]

    use_mrca = True if len(args) > 3 and (args[3] == "-mrca" or args[3] == "-mrca-dummy") else False
    add_dummy = True if len(args) > 3 and (args[3] == "-mrca-dummy") else False
    resultsFile = (
        args[4]
        if len(args) > 4
        else ("%s.rooted" % treeName[:-9] if treeName.endswith("unrooted") else "%s.rooted" % treeName)
    )
    ignore = True if len(args) > 5 and args[5] == "-igerr" else False
    print "Reading input trees %s ..." % treeName,
    trees = dendropy.TreeList.get_from_path(treeName, "newick", rooted=True)
    print "%d tree(s) found" % len(trees)
    i = 0
    outtrees = TreeList()
    for tree in trees:
        i += 1
        print ".",
        oldroot = tree.seed_node
        # print "Tree %d:" %i
        if outgroups[0] == "-m":
            print "Midpoint rooting ... "
            tree.reroot_at_midpoint(update_splits=False)
        else:
            mrca = None
            for outgroup in outgroups:
                outs = outgroup.split("+")
                outns = []
                for out in outs:
                    n = tree.find_node_with_taxon_label(out)
                    if n is None:
                        print "outgroup not found %s," % out,
                        continue
                    outns.append(n.taxon)
                if len(outns) != 0:
                    # Find an ingroup and root the tree there
                    for n in tree.leaf_iter():
                        if n.taxon not in outns:
                            ingroup = n
                            break
                    # print "rerooting at ingroup %s" %ingroup.taxon.label
                    """reroot at an ingroup, so that outgroups form monophyletic groups, if possible"""
                    if ingroup.edge.length is not None:
                        tree.reroot_at_edge(
                            ingroup.edge,
                            update_splits=True,
                            length1=ingroup.edge.length / 2,
                            length2=ingroup.edge.length / 2,
                        )
                    else:
                        tree.reroot_at_edge(ingroup.edge, update_splits=True)

                    mrca = tree.mrca(taxa=outns)
                    break
            if mrca is None:
                if ignore:
                    print >> sys.stderr, "Outgroups not found: %s" % outgroups
                    continue
                else:
                    raise KeyError("Outgroups not found %d: %s" % (i, outgroups))
            # print mrca.leaf_nodes()
            # if not mono-phyletic, then use the first
            if not use_mrca and len(mrca.leaf_nodes()) != len(outns):
                print >> sys.stderr, "selected set is not monophyletic. Using %s instead. " % outns[0]
                mrca = tree.find_node_with_taxon_label(outns[0].label)
            if mrca.parent_node is None:
                print >> sys.stderr, "Already rooted at the root."
                # print "rerooting on %s" % [s.label for s in outns]
                # tree.reroot_at_midpoint()
            elif mrca.edge.length is not None:
                # print "rerooting at %s" %mrca.as_newick_string()
                if ingroup.edge.length is not None:
                    tree.reroot_at_edge(
                        mrca.edge, update_splits=False, length1=mrca.edge.length / 2, length2=mrca.edge.length / 2
                    )
                else:
                    tree.reroot_at_edge(mrca.edge, update_splits=False)
            else:
                tree.reroot_at_edge(mrca.edge, update_splits=False)
            if add_dummy:
                dummy = tree.seed_node.new_child(taxon=Taxon(label="outgroup"), edge_length=1)
                tree.reroot_at_edge(dummy.edge, update_splits=False)
            outtrees.append(tree)
        """This is to fix internal node labels when treated as support values"""
        while oldroot.parent_node != tree.seed_node and oldroot.parent_node != None:
            oldroot.label = oldroot.parent_node.label
            oldroot = oldroot.parent_node
        if len(oldroot.sister_nodes()) > 0:
            oldroot.label = oldroot.sister_nodes()[0].label
            # tree.reroot_at_midpoint(update_splits=False)

    print >> sys.stderr, "writing results to %s" % resultsFile
    outtrees.write(open(resultsFile, "w"), "newick", edge_lengths=True, internal_labels=True, write_rooting=False)
Пример #14
0
from os import walk
import glob

### Main ###

### Argparse
# Both the input and the output path are mandatory options.
parser = argparse.ArgumentParser(
    prog="strictunroot.py",
    description="Reads a newick trees and reroots it with a basal trifurcation")
parser.add_argument("-i", type=str, required=True, help="Input newick tree name")
parser.add_argument("-o", type=str, required=True, help="Output file name")
args = parser.parse_args()

### Read, collapse, write
read_options = {
    "schema": "newick",
    "rooting": "default-rooted",
    "preserve_underscores": True,
}
write_options = {
    "schema": "newick",
    "unquoted_underscores": True,
    "suppress_rooting": True,
}

itrees = TreeList.get(path=args.i, **read_options)
otrees = TreeList()
for tree in itrees:
    # Collapse the bifurcation at the base of the tree, then keep the tree.
    tree.collapse_basal_bifurcation()
    otrees.append(tree)
otrees.write(path=args.o, **write_options)
print("Done!")
Пример #15
0
                redundant_count += 1
                break
        else:
            tree_list.append(tree)
    return tree_list, redundant_count


if __name__ == '__main__':
    #inputs#
    mle_tree = raw_input("File with Maximum Likelihood tree: ")
    mcmc_trees = raw_input("File with MCMC trees: ")
    burnin = int(raw_input("Burnin: "))
    outfile = raw_input("Name of outfile: ")

    uts = []  #list of unique topologies
    taxa = dendropy.TaxonSet()  #initialize TaxonSet object
    mle_tree = dendropy.Tree.get_from_path(mle_tree, 'nexus', taxon_set=taxa)
    uts.append(mle_tree)  #MLE tree is the first topology in unique list

    uts, redundant_count = unique_trees(uts,
                                        mcmc_trees,
                                        'nexus',
                                        burnin,
                                        taxonset=taxa)
    print "\nNumber of redundant trees: %d" % redundant_count
    print "Number of unique trees: %d\n" % len(uts)
    unique_tree_list = TreeList(uts)
    unique_tree_list.write_to_path(outfile,
                                   'newick',
                                   suppress_edge_lengths=True)
Пример #16
0
#!/opt/local/bin/python

### Imports ###
import dendropy
from dendropy import TreeList,Tree
import sys
import argparse
from os import walk
import glob


### Main ###

### Argparse
parser = argparse.ArgumentParser(
    description="Reads a newick trees and reroots it with a basal trifurcation",
    prog="strictunroot.py",
)
# The input and output paths are both mandatory.
for flag, help_text in (("-i", "Input newick tree name"),
                        ("-o", "Output file name")):
    parser.add_argument(flag, required=True, type=str, help=help_text)
args = parser.parse_args()

### Main: collapse the basal bifurcation of every tree and write them out
itrees = TreeList.get(
    path=args.i,
    schema="newick",
    rooting="default-rooted",
    preserve_underscores=True,
)
otrees = TreeList()
for tree in itrees:
    tree.collapse_basal_bifurcation()
    otrees.append(tree)
otrees.write(
    path=args.o,
    schema="newick",
    unquoted_underscores=True,
    suppress_rooting=True,
)
print("Done!")
Пример #17
0
def main(args):
    if len(args) < 2:
        print '''USAGE: %s [tree_file] [outgroups] [-mrca -mrca-dummy (optional)] [output name (optional)] [-igerr (optional)]

-- tree_file: a path to the newick tree file

-- outgroups: a list of outgroups, separated by comma.
The script goes through the list of outgroups. If the outgroup is found in the tree, 
the tree is rooted at that outgroup. Otherwise, the next outgroup in the list is used. 
Each element in the comma-delimited list is itself a + delimited list of taxa.
By default the script makes sure that this list of taxa are monophyletic
in the tree and roots the tree at the node leading to the clade represented 
by outgroups given in the + delimited list.
Alternatively, you can specify -m which will result in mid-point rooting.

Example: HUMAN,ANOCA,STRCA+TINMA first tries to root at HUMAN, if not present, 
tries to use ANOCA, if not present, tries to root at parent of STRCA and TINMA
which need to be monophyletic. If not monophyletic, roots at STRCA.

-- (optional) -mrca: using this option the mono-phyletic requirement is relaxed 
and always the mrca of the + delimited list of outgroups is used.
-- (optional) -mrca-dummy: is like -mrca, but also adds a dummy taxon as outgroup to the root. 
''' % args[0]
        sys.exit(1)
    treeName = args[1]

    outgroups = [x.replace("_", " ") for x in args[2].split(",")]

    # uym2 editted: keep underscore
    #outgroups = [x for x in args[2].split(",")]

    use_mrca = True if len(args) > 3 and (
        args[3] == "-mrca" or args[3] == "-mrca-dummy") else False
    add_dummy = True if len(args) > 3 and (args[3] == "-mrca-dummy") else False
    resultsFile = args[4] if len(args) > 4 else (
        "%s.rooted" %
        treeName[:-9] if treeName.endswith("unrooted") else "%s.rooted" %
        treeName)
    ignore = True if len(args) > 5 and args[5] == "-igerr" else False
    print >> sys.stderr, "Reading input trees %s ..." % treeName,
    #trees = dendropy.treelist.get_from_path(treename, 'newick',rooted=true)
    # uym2 edited: hack for dendropy4
    trees = dendropy.TreeList.get_from_path(treeName, "newick")
    print >> sys.stderr, "%d tree(s) found" % len(trees)
    i = 0
    outtrees = TreeList()
    for tree in trees:
        i += 1
        print >> sys.stderr, ".",
        oldroot = tree.seed_node
        #print "Tree %d:" %i
        if outgroups[0] == "-m":
            print >> sys.stderr, "Midpoint rooting ... "
            tree.reroot_at_midpoint(update_splits=False)
        else:
            mrca = None
            for outgroup in outgroups:
                outs = outgroup.split("+")
                outns = []
                for out in outs:
                    n = tree.find_node_with_taxon_label(out)
                    if n is None:
                        print >> sys.stderr, "outgroup not found %s," % out,
                        continue
                    outns.append(n.taxon)
                if len(outns) != 0:
                    # Find an ingroup and root the tree there
                    for n in tree.leaf_node_iter():
                        if n.taxon not in outns:
                            ingroup = n
                            break
                    #print "rerooting at ingroup %s" %ingroup.taxon.label
                    '''reroot at an ingroup, so that outgroups form monophyletic groups, if possible'''
                    if ingroup.edge.length is not None:
                        #tree.reroot_at_edge(ingroup.edge, update_splits=True,length1=ingroup.edge.length/2,length2=ingroup.edge.length/2)
                        # uym2 editted: hack for dendropy4
                        tree.reroot_at_edge(ingroup.edge,
                                            length1=ingroup.edge.length / 2,
                                            length2=ingroup.edge.length / 2)
                    else:
                        #tree.reroot_at_edge(ingroup.edge, update_splits=True)
                        tree.reroot_at_edge(ingroup.edge)

                    mrca = tree.mrca(taxa=outns)
                    break
            if mrca is None:
                if ignore:
                    print >> sys.stderr, "Outgroups not found: %s" % outgroups
                    print >> sys.stdout, tree.as_string(schema="newick"),
                    continue
                else:
                    print >> sys.stderr, "Outgroups not found: %s" % outgroups
                    continue
                    #raise KeyError("Outgroups not found %d: %s" %(i,outgroups))
            #print mrca.leaf_nodes()
            #if not mono-phyletic, then use the first
            if not use_mrca and len(mrca.leaf_nodes()) != len(outns):
                print >> sys.stderr, "selected set is not monophyletic. Using %s instead. " % outns[
                    0]
                mrca = tree.find_node_with_taxon_label(outns[0].label)
            if mrca.parent_node is None:
                print >> sys.stderr, "Already rooted at the root."
                #print "rerooting on %s" % [s.label for s in outns]
                #tree.reroot_at_midpoint()
            elif mrca.edge.length is not None:
                #print "rerooting at %s" %mrca.as_newick_string()
                if ingroup.edge.length is not None:
                    #tree.reroot_at_edge(mrca.edge, update_splits=False,length1=mrca.edge.length/2,length2=mrca.edge.length/2)
                    #uym2 editted: hack for dendropy4
                    tree.reroot_at_edge(mrca.edge,
                                        length1=mrca.edge.length / 2,
                                        length2=mrca.edge.length / 2)
                else:
                    #tree.reroot_at_edge(mrca.edge, update_splits=False)
                    #uym2 editted: hack for dendropy4
                    tree.reroot_at_edge(mrca.edge)
            else:
                tree.reroot_at_edge(mrca.edge, update_splits=False)
            if add_dummy:
                dummy = tree.seed_node.new_child(taxon=Taxon(label="outgroup"),
                                                 edge_length=1)
                tree.reroot_at_edge(dummy.edge, update_splits=False)
            outtrees.append(tree)
        '''This is to fix internal node labels when treated as support values'''
        while oldroot.parent_node != tree.seed_node and oldroot.parent_node != None:
            oldroot.label = oldroot.parent_node.label
            oldroot = oldroot.parent_node
        if len(oldroot.sister_nodes()) > 0:
            oldroot.label = oldroot.sister_nodes()[0].label
            #tree.reroot_at_midpoint(update_splits=False)

    print >> sys.stderr, "writing results to %s" % resultsFile
    #outtrees.write(open(resultsFile,'w'),'newick',edge_lengths=True, internal_labels=True,write_rooting=False)
    #uym2 editted: hack for dendropy4
    outtrees.write(
        path=resultsFile, schema='newick', suppress_rooting=True
    )  #,edge_lengths=True, internal_labels=True,write_rooting=False)
Пример #18
0
            
            
# The text is passed as `description`; the first positional parameter of
# ArgumentParser is `prog`, which would turn it into the program name.
parser = ArgumentParser(
    description='Return CP- or CPM-vectors for a set of trees\n'+
                'The vectors are written to a separate file each,\n'+
                'named {tree_file_prefix}_tree{tree_number}.vector')
# -t is mandatory: the tree file name is needed to build the output names.
parser.add_argument('-t', type=str, required=True,
                    help='Tree file in Newick format')
parser.add_argument('-u', action='store_true',
                    help='Produce unrooted (CPM) labelling')
parser.add_argument('--hash', action='store_true',
                    help='Produce hashed labelling')
parser.add_argument('--processes', type=int, default=0,
                    help='Number of processes. Defaults to processor number')
args = parser.parse_args()

start = time()
process_count = args.processes if args.processes else cpu_count()
print('Using {} processes'.format(process_count), file=stderr)
# Output names are built from the part of the -t value before its first '.'.
file_mask = args.t.split('.')[0]+'_tree{}.vector'
trees = TreeList.get_from_path(args.t, schema='newick')
print('Loaded {} trees'.format(len(trees)), file=stderr)
counter = 0
# Pick the labelling function: unrooted when -u is given, rooted otherwise.
f = leaf_enumeration_annotation if args.u else annotate_rooted_tree
func_args = [(tree, f, file_mask.format(str(i)), args.hash)
             for i, tree in enumerate(trees)]
# starmap blocks until every tree is written; leaving the `with` block then
# shuts the worker pool down.
with Pool(process_count) as p:
    p.starmap(write_tree, func_args, chunksize=1)
print('Processed {} trees in {} seconds using {} processes'.format(
                                                                str(len(trees)),
                                                                time()-start,
                                                                process_count),
      file=stderr)
Пример #19
0
'''Label all unnamed nodes with an underscore + number.
'''

import argparse
import re
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

# Command line: a single newick tree file, opened for reading by argparse.
parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    help='A newick-format tree')

args = parser.parse_args()

trees = TreeList.get(
    file=args.treefile,
    schema='newick',
    preserve_underscores=True,
    rooting='default-rooted')

# Only the first tree in the file is used.
tree = trees[0]

# Tally the labels of internal nodes, after removing a trailing
# "_<digits>_" suffix and lower-casing.
count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$','', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Every name seen more than once, each mapped to an initial value of 0.
dups = {name: 0 for name, n in count.items() if n > 1}

# Ordered mapping that will hold the genus names.
genera = OrderedDict()
Пример #20
0
def trees_from_newick_str_list(newick_list):
    """Build a TreeList from a list of newick strings.

    The strings are joined with single spaces and handed to ``TreeList`` as
    one in-memory stream, together with a newly created ``TaxonSet``.
    """
    newick_stream = StringIO(" ".join(newick_list))
    fresh_taxa = TaxonSet()
    return TreeList(stream=newick_stream, taxon_set=fresh_taxa, schema="NEWICK")
Пример #21
0
#! /usr/bin/env python

from dendropy import TreeList
from sys import argv
from tree_lib import compute_diameter

# The newick tree file is the only command-line argument; fail with a usage
# message instead of a bare IndexError when it is missing.
if len(argv) < 2:
    raise SystemExit("usage: %s <newick_tree_file>" % argv[0])
infile = argv[1]

treelist = TreeList.get(path=infile, schema="newick")

compute_diameter(treelist)
def do_sim(birth_rate, death_rate, num_leaves, rng=None):
    """Simulate a tree and data, infer a tree with PAUP, and score the result.

    A birth-death tree is simulated, ``seq-gen`` generates a dataset on it,
    ``paup`` runs a heuristic search on that dataset, and the inferred tree is
    compared with the simulated (model) tree split by split.

    Args:
        birth_rate: passed to ``treesim.birth_death`` as ``birth_rate``.
        death_rate: passed to ``treesim.birth_death`` as ``death_rate``.
        num_leaves: passed to ``treesim.birth_death`` as ``ntax``.
        rng: optional random number generator; handed to the tree simulator
            and, when given, used to draw the seq-gen seed (the ``random``
            module is used for the seed otherwise).

    Returns:
        A sorted list of ``(depth, edge_length, recovered)`` tuples, one for
        every node of the model tree that has both children and a parent.
        ``recovered`` is 0 when the node's split is among the splits missing
        from the inferred tree and 1 otherwise.

    The process is terminated with ``sys.exit`` when seq-gen or PAUP fails;
    in that case the temporary directory is left in place.
    """
    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
                            death_rate=death_rate,
                            ntax=num_leaves,
                            rng=rng)
    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("temp_dir = %s" % temp_dir)
    # The file must be closed (and therefore flushed) before seq-gen is
    # invoked; the context manager guarantees that even if the write fails.
    with open(mtf, 'w') as treefile_obj:
        treefile_obj.write("%s;\n" % str(model_tree))

    import subprocess
    command_line = ['seq-gen',
                    '-mHKY',
                    '-on',
                ]
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # NOTE(review): ``seed`` is not defined in this function; presumably
        # it is a module-level value derived from the environment variable.
        # Confirm, otherwise this branch raises NameError.
        sg_seed = seed
    elif rng is None:
        sg_seed = random.randint(0,100000)
    else:
        sg_seed = rng.randint(0,100000)
    command_line.append('-z%d' % sg_seed)
    # seq-gen runs with cwd=temp_dir, so the tree file is named relatively.
    command_line.append('simtree')

    seq_gen_proc = subprocess.Popen(command_line,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=temp_dir)

    dataset = seq_gen_proc.communicate()[0]

    # seq-gen does not exit with an error code when it fails, so empty output
    # is treated as a failure as well.
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')
    sd = os.path.join(temp_dir, 'simdata.nex')
    # As above: the data must be on disk before PAUP is invoked.
    with open(sd, 'w') as d:
        d.write(dataset)

    ################################################################################
    # PAUP
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    paup_commands = ('execute simdata.nex ; \n'
                     '    hsearch nomultrees ; \n'
                     '    savetree file=inferred.tre format = NEXUS;\n'
                     '    quit;\n'
                     '    ')
    with open(pcf, 'w') as pc:
        pc.write(paup_commands)
    paup_proc = subprocess.Popen(['paup', '-n', pcf],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 cwd=temp_dir)
    (o, e) = paup_proc.communicate()

    paup_output = os.path.join(temp_dir, 'inferred.tre')
    # A non-zero exit status or a missing output tree both count as a PAUP
    # failure; PAUP's stderr is used as the exit message.
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(e)

    # Read the inferred tree (nexus) against the model tree's taxon set so
    # that the two trees can be compared.
    inf_tree_list = TreeList.get_from_path(paup_output,
                                           "NEXUS",
                                           taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]

    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)
    # Walk the true tree in postorder so that each child's depth is set before
    # its parent's, and record whether each internal, non-root node was
    # recovered.
    node_depth_TF_list = []
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            recovered = 0 if node.edge.split_bitmask in missing else 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            node.depth = 0.0

    node_depth_TF_list.sort()

    # Remove the files created above; rmdir only succeeds if nothing else was
    # left in the directory.
    os.remove(pcf)
    os.remove(paup_output)
    os.remove(sd)
    os.remove(mtf)
    os.rmdir(temp_dir)

    return node_depth_TF_list
Пример #23
0
# Command-line arguments: species tree file, number of gene trees to
# simulate, and the output path.
fin = sys.argv[1]
num = int(sys.argv[2])
fout = sys.argv[3]

# Read the whole species-tree file; the context manager closes it even if
# reading fails.
with open(fin, "r") as f:
    sp_tree_str = f.read()

# Prepend the newick "[&R]" rooting comment to the tree text.
sp_tree_str = "[&R] " + sp_tree_str

sp_tree = dendropy.Tree.get_from_string(sp_tree_str,
                                        "newick",
                                        preserve_underscores=True)
# One contained (gene) taxon per containing (species) taxon.
gene_to_species_map = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
    containing_taxon_namespace=sp_tree.taxon_namespace, num_contained=1)
gene_tree_list = TreeList()

for i in range(num):
    gene_tree = treesim.contained_coalescent_tree(
        containing_tree=sp_tree,
        gene_to_containing_taxon_map=gene_to_species_map)
    # Keep only the first whitespace-delimited token of each leaf label.
    for t in gene_tree.leaf_nodes():
        t.taxon.label = t.taxon.label.split()[0]
    gene_tree_list.append(gene_tree)

gene_tree_list.write_to_path(fout, 'newick')
Пример #24
0
                    required=True,
                    help="Sampling time")
parser.add_argument("-r", "--rootAge", required=False, help="Root age")
parser.add_argument("-t",
                    "--timeTree",
                    required=True,
                    help="The output trees with branch lengths in time unit")
parser.add_argument("-c",
                    "--composite",
                    required=False,
                    action='store_true',
                    help="Do composite optimization. Default: NO")

# Work with the parsed options as a plain dict keyed by long option name.
args = vars(parser.parse_args())

myTrees = TreeList.get_from_path(args["input"], 'newick')
# Maps each name in the sampling-time file to its time as a float.
smpl_times = {}
# Root age is optional; None when it was not given on the command line.
rootAge = float(args["rootAge"]) if args["rootAge"] else None

with open(args["samplingTime"], "r") as fin:
    # The first line is read and discarded (presumably a header; confirm
    # against the expected file format).
    fin.readline()
    # Every remaining line must split into exactly two whitespace-separated
    # fields, name and time; anything else raises ValueError on unpacking.
    for line in fin:
        name, time = line.split()
        smpl_times[name] = float(time)

for tree in myTrees:
    if args["composite"]:
        s = calibrate_composite_opt(tree, smpl_times, root_age=rootAge)
    else:
        s = calibrate_log_opt(tree,
                              smpl_times,
Пример #25
0
def getOpenTreesFromOneZoom(OpenTreeFile, output_dir, include_var, phy_files, verbose=False):
    '''Extract OpenTree of Life (OToL) subtrees referenced from phy files.

    Every tree in every input file is scanned for nodes whose taxon label has
    the form ``<name>_ott<ids>@<depth>``. For each such node the external
    ``subtree_extract.pl`` utility is run to extract the subtree into
    ``output_dir``, the result is pruned with ``prune_tree`` and written as a
    ``.phy`` file, and a ``$tree.substitute(...)`` line is printed to stdout.

    Parameters:
        OpenTreeFile: path to the full OpenTree newick file. If it does not
            exist the user is asked whether to download it.
        output_dir: directory in which the extraction utility is run and the
            ``.nwk`` / ``.phy`` files are created.
        include_var: if a number, treated as the default recursion depth (a
            negative number overrides any per-node depth with its absolute
            value); otherwise passed to ``prune_tree`` as the names to keep.
        phy_files: iterable of .phy or .PHY filenames; ``"-"`` means stdin.
        verbose: when true, progress messages are emitted through ``warn``.
    '''
    from numbers import Number
    ExtractionUtility = os.path.join(os.path.dirname(os.path.realpath(__file__)), "subtree_extract.pl")
    # Nodes whose label ends in "_ott" plus a number (optionally followed by
    # further numbers), then an at sign optionally followed by another number
    # (giving the max depth), are OpenTree subnode IDs.
    #
    # The first number after the ott is the ott number to use as the filename.
    # If it is empty, the name is used instead of the ott id.
    ottRE = re.compile(r"^(.*)_ott([-~\d]+)\@(\d*)$")
    id_pattern = re.compile(r"(\d*)~?([-\d]*)$")

    def is_include_node(node):
        """True for nodes whose taxon label marks an OpenTree inclusion point."""
        taxon = getattr(node, "taxon", None)
        return taxon is not None and ottRE.search(taxon.label) is not None

    if not os.path.isfile(OpenTreeFile):
        OpenTreeURL = "http://files.opentreeoflife.org/synthesis/opentree9.1/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre"
        warn("Could not find the OpenTree file {}. Do you want to download it from {}".format(OpenTreeFile, OpenTreeURL))
        if (input("Press Enter to accept, or N to abort... ") == "N"):
            sys.exit(0)
        if not get_species_level_tree(OpenTreeFile):
            warn("Could not get the Open Tree of Life newick file to save at {}".format(OpenTreeFile))
    if isinstance(include_var, Number):
        keep = True #means keep all of the species down to a certain depth, i.e. do not use an include list
        default_recursion_depth = include_var
    else:
        keep = include_var
        # NaN is neither negative nor finite, so no depth override and no
        # "-d" option result from it.
        default_recursion_depth = float('nan')

    for phy_file in phy_files:
        if phy_file == "-":
            trees = TreeList.get_from_stream(sys.stdin, schema="newick", preserve_underscores=True, rooting='default-rooted')
            source_name = "<stdin>"
        else:
            source_name = phy_file
            try:
                with open(phy_file, 'r', encoding="utf8") as stream:
                    trees = TreeList.get_from_stream(stream, schema="newick", preserve_underscores=True, rooting='default-rooted')
            except Exception as e:
                # Best effort: report the unreadable file and carry on with the rest.
                trees = []
                warn("Problem reading tree from {}: {}".format(source_name, e))

        for tree in trees:
            for i, include_ott in enumerate(tree.preorder_node_iter(is_include_node)):
                if i == 0:
                    print("\n//;#  == {} ==, from file {}".format(tree.seed_node.label, source_name))
                # Each of these nodes is a file to @include.
                # First get the recursion depth from the end of the label.
                label_match = ottRE.search(include_ott.taxon.label)
                name = label_match.group(1)
                ottIDs = label_match.group(2)
                if default_recursion_depth < 0:
                    recursion_depth = abs(default_recursion_depth)
                else:
                    recursion_depth = float(label_match.group(3)) if len(label_match.group(3)) else default_recursion_depth
                id_match = id_pattern.match(ottIDs)
                if not id_match:
                    # ID strings that do not fit the expected pattern are skipped silently.
                    continue
                subfile_name = id_match.group(1) or name
                del_otts = (id_match.group(2) or '').split('-') #split by minus signs
                base_ott = del_otts.pop(0) or id_match.group(1) #first number after '=' is the tree to extract.
                system_call = [ExtractionUtility]
                if keep is True and math.isfinite(recursion_depth):
                    system_call.append("-d={}".format(int(recursion_depth)))
                # The utility runs inside output_dir, so the tree path is made relative to it.
                system_call.append(os.path.relpath(OpenTreeFile, output_dir))
                system_call.append(base_ott)
                OpenSubTreeFile = os.path.join(output_dir, base_ott + ".nwk")
                if verbose:
                    warn("For "+include_ott.taxon.label+": extracting tree into " + OpenSubTreeFile, prefix='')
                call(system_call, cwd=output_dir) #should create many ottID.nwk files
                OutputFilename = os.path.join(output_dir, subfile_name + ".phy")
                if not os.path.isfile(OpenSubTreeFile):
                    warn("File " + OpenSubTreeFile + " does not exist, skipping\n")
                    continue

                removed = "" if len(del_otts)==0 else " removed {}".format(del_otts)
                subtree = prune_tree(OpenSubTreeFile, keep, del_otts)
                if keep is True:
                    if verbose:
                        warn("Found file {} with {} leaf taxa,{} and extracted to max depth: {}".format(OpenSubTreeFile, len(subtree.taxon_namespace), removed, recursion_depth), prefix='')
                else:
                    subtree_size = len(subtree.leaf_nodes())
                    if verbose:
                        warn("Found file with {} leaf taxa, {}, and simplified to only selected taxa ({} {})".format(len(subtree.taxon_namespace), removed, subtree_size, 'leaf' if subtree_size==1 else 'leaves'), prefix='')
                # Not needed until the OpenTree has branch lengths:
                #
                #   subtree.ultrametricize() #maybe use subtree.calc_node_ages()
                #   warn("ultrametricized\n", prefix="")
                #   #subtree->get_root()->set_branch_length(undef);
                #   stem_height = include_ott.edge_length - subtree.calc_tree_height
                #   if (stem_height < 0):

                if verbose:
                    warn("Now writing to {}".format(OutputFilename), prefix='')
                with open(OutputFilename, 'w', encoding='UTF-8') as outputstream:
                    subtree.write_to_stream(outputstream,'newick', unquoted_underscores=True, suppress_rooting=True)

                # print(r'$tree.substitute_with_fn_last("{}_ott{}@\\d*", {}, "{}", {}); //;# "user/OpenTree/{}");'.format(name, ottIDs, stem_height, name, len(subtree.taxon_namespace), OutputFilename))
                # OpenTrees are currently not dated, so the 'stem_length' value is omitted and the node becomes
                # 'date unknown'. The stem-height computation that fed the line above was dead code and has been dropped.
                print(r"$tree.substitute('{}_ott{}@\\d*', '{}');".format(name, ottIDs, OutputFilename))
Пример #26
0
import argparse
import re
import sys
from dendropy import TreeList

# Command line: one or more newick tree files, each opened for reading by argparse.
parser = argparse.ArgumentParser(
    description='Check which nodes have duplicated names')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    nargs='+',
    help='Any number of newick-format tree files')

args = parser.parse_args()

for f in args.treefile:
    trees = TreeList.get(file=f, schema='newick', preserve_underscores=True)

    # Only the first tree of each file is inspected.
    tree = trees[0]

    # Count how many internal nodes carry each (non-empty) label.
    count = {}
    for node in tree.preorder_internal_node_iter():
        if not node.label:
            continue
        count[node.label] = count.get(node.label, 0) + 1

    # Report every label used more than once and add up their occurrences.
    tot = 0
    for name, n in count.items():
        if n <= 1:
            continue
        print("Node name '{}' duplicated {} times".format(name, n))
        tot += n
    print("Total dups for {}: {}".format(f.name, tot))
Пример #27
0
def main():
    """Command-line entry point: filter outlier leaves from a set of gene trees.

    Steps, in order:
      1. Parse the command line and create the output (and temp) directory.
      2. Read the input newick trees and run ``TreeFilter.optFilter`` on each.
      3. For each tree, give every removable species the largest ratio of
         consecutive ``min_diams`` values among the removal sets it is in.
      4. Depending on the mode ('per-gene', 'per-species', 'all-genes', or
         'auto', which resolves to one of the last two), call an R script to
         turn those ratios into a threshold for every requested quantile, and
         collect the species whose ratio exceeds the threshold.
      5. For every quantile, write the removal sets to a text file and the
         pruned trees to a newick file in the output directory.

    The temporary directory is deleted at the end unless one was given with
    ``--tempdir``.
    """
    import treeshrink
    from treeshrink.optimal_filter_lib import TreeFilter
    from treeshrink.tree_lib import prune_tree
    from sys import argv, stdout
    from math import sqrt
    from subprocess import check_output, call
    import argparse
    from dendropy import Tree, TreeList
    from os.path import basename, dirname, splitext, realpath, join, normpath
    from os import mkdir, getcwd, rmdir
    from copy import deepcopy
    from tempfile import mkdtemp
    from shutil import rmtree
    import dendropy

    print("Launching " + treeshrink.PROGRAM_NAME + " version " +
          treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--input", required=True, help="Input trees")
    parser.add_argument(
        "-d",
        "--outdir",
        required=False,
        help="Output directory. Default: inferred from the input trees")
    parser.add_argument(
        "-t",
        "--tempdir",
        required=False,
        help=
        "Directory to keep temporary files. If specified, the temp files will be kept"
    )
    parser.add_argument(
        "-o",
        "--output",
        required=False,
        help=
        "The name of the output trees. Default: inferred from the input trees")
    parser.add_argument(
        "-c",
        "--centroid",
        required=False,
        action='store_true',
        help=
        "Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO"
    )
    parser.add_argument(
        "-k",
        "--k",
        required=False,
        help=
        "The maximum number of leaves that can be removed. Default: auto-select based on the data"
    )
    parser.add_argument(
        "-q",
        "--quantiles",
        required=False,
        help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument(
        "-m",
        "--mode",
        required=False,
        help=
        "Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto"
    )

    # Directory of this file; the R scripts are looked up relative to it.
    wdir = dirname(realpath(__file__))

    args = vars(parser.parse_args())

    # 'auto' mode falls back to 'all-genes' when any species occurs in fewer
    # than MIN_OCC gene trees, or when there are fewer than MIN_TREE_NUM trees.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    # Quantiles stay strings: they are passed to Rscript as arguments and
    # embedded in output file names.
    quantiles = [q for q in args["quantiles"].split()
                 ] if args["quantiles"] else ["0.05"]
    #print(quantiles)

    intrees = args["input"]
    treeName, treeExt = splitext(basename(intrees))
    outtrees = args["output"] if args[
        "output"] else treeName + "_shrunk" + treeExt

    mode = args["mode"] if args["mode"] else 'auto'

    # None lets the filter choose k itself.
    k = int(args["k"]) if args["k"] else None

    outdir = args["outdir"] if args["outdir"] else splitext(
        intrees)[0] + "_treeshrink"
    # os.mkdir raises if the directory already exists.
    mkdir(outdir)
    if args["tempdir"]:
        tempdir = args["tempdir"]
        mkdir(tempdir)
    else:
        tempdir = mkdtemp()  #check_output(["mktemp","-d"]).rstrip()

    trees = TreeList.get(path=intrees,
                         schema='newick',
                         preserve_underscores=True)
    if mode == 'auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) +
              " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode = 'all-genes'

    # gene_list[t]: (species, ratio) pairs for tree t.
    gene_list = [[] for i in range(len(trees))]
    # species_map[s]: ratios observed for species s across trees.
    species_map = {}
    # occ[s]: number of trees in which species s appears as a leaf.
    occ = {}
    # removing_sets[i][t]: species to remove from tree t at quantile i.
    removing_sets = [[[] for i in range(len(trees))]
                     for j in range(len(quantiles))]

    for t, a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1, len(a_filter.min_diams)):
            # Ratio between consecutive entries of min_diams.
            r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s], r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map[s] = [
                    mapping[s]
                ] if s not in species_map else species_map[s] + [mapping[s]]
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s, mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = normpath(join(tempdir, "gene_" + str(t) + ".dat"))
            with open(filename, 'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
                #n_missing = len(list(a_tree.leaf_node_iter())) - len(mapping)
                #for i in range(n_missing):
                #    f.write("1.0")
                #    f.write("\n")
            if len(mapping) > 1:
                for i, q in enumerate(quantiles):
                    # The first 4 characters of the stripped R output are
                    # discarded before parsing the number (presumably R's
                    # "[1] " print prefix; confirm against the R script).
                    threshold = float(
                        check_output([
                            "Rscript",
                            normpath(
                                join(wdir, "R_scripts",
                                     "find_threshold_loglnorm.R")), filename, q
                        ]).lstrip().rstrip()[4:])
                    #print("Threshold: ", threshold)
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)
        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = 1 if not s in occ else occ[s] + 1

    # Report low-occupancy species and resolve 'auto' into a concrete mode.
    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print("Species " + s + " only exists in " + str(occ[s]) +
                      " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print(
                    "There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode"
                )
            else:
                print(
                    "WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode"
                )
        elif mode == 'auto':
            mode = 'per-species'
            print(
                "Finish preprocessing. TreeShrink will run in 'Per-species' mode"
            )

# fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in species_map:
            l = len(species_map[s])
            # Pad with 1 for every tree that contains the species but in
            # which it was never part of a removal set.
            for i in range(occ[s] - l):
                species_map[s].append(1)
            filename = normpath(join(tempdir, s + ".dat"))
            with open(filename, 'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [0 for i in range(len(quantiles))]
            for i, q in enumerate(quantiles):
                # Here the first 5 characters of the stripped R output are
                # discarded before parsing (prefix format not visible from
                # this code; confirm against the R script).
                thresholds[i] = float(
                    check_output([
                        "Rscript",
                        normpath(
                            join(wdir, "R_scripts",
                                 "find_threshold_lkernel.R")), wdir, filename,
                        q
                    ]).lstrip().rstrip()[5:])
            # From here on species_map[s] is a (values, thresholds) pair.
            species_map[s] = (species_map[s], thresholds)

        for t, gene in enumerate(gene_list):
            for s, r in gene:
                for i, threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

# fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = normpath(join(tempdir, "all_genes" + ".dat"))
        with open(filename, 'w') as f:
            for gene in gene_list:
                for s, r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i, q in enumerate(quantiles):
            threshold = float(
                check_output([
                    "Rscript",
                    normpath(
                        join(wdir, "R_scripts", "find_threshold_lkernel.R")),
                    wdir, filename, q
                ]).lstrip().rstrip()[5:])
            for t, gene in enumerate(gene_list):
                for s, r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

# Dendropy's filter_leaf_nodes() seems to have problem
# i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
# use home-made code to prune the tree instead

    # Both pairs below split the same output name; treeName/treeExt are
    # rebound here from the input-derived values computed earlier.
    treeName, treeExt = splitext(outtrees)
    fName, ext = splitext(outtrees)
    for i, RS in enumerate(removing_sets):
        # Prune a copy so that every quantile starts from the original trees.
        trees_shrunk = deepcopy(trees)
        outfile = normpath(join(outdir,
                                fName + "_RS_" + quantiles[i] + ".txt"))
        # One line per tree: tab-terminated names of the removed species.
        with open(outfile, 'w') as f:
            for item in RS:
                for s in item:
                    f.write(s + "\t")
                f.write("\n")
        for t, tree in enumerate(trees_shrunk):
            #filt = lambda node: False if (node.taxon is not None and node.taxon.label in RS[t]) else True
            #tree.filter_leaf_nodes(filt,update_bipartitions=True)
            prune_tree(tree, RS[t])
        trees_shrunk.write_to_path(
            normpath(join(outdir, treeName + "_" + quantiles[i] + treeExt)),
            'newick')

    # A user-supplied temp directory is kept; an auto-created one is removed.
    if not args["tempdir"]:
        rmtree(tempdir)


#    call(["rm","-r",tempdir])

    print("Output files written to " + outdir)
Пример #28
0
from argparse import ArgumentParser
from dendropy import TreeList, TaxonNamespace
from dendropy.simulate import treesim
import os

# The text describes the tool, so it is passed as ``description`` rather
# than as the first positional argument (``prog``).
parser = ArgumentParser(
    description='Generate trees of a given size with different algos')
parser.add_argument('-n', type=int, help='Tree size', default=100)
# The output directory is mandatory: it is created and entered below.
parser.add_argument('-d', type=str, required=True, help='Output directory')
args = parser.parse_args()

# makedirs also creates missing parent directories.
if not os.path.isdir(args.d):
    os.makedirs(args.d)
# All output files are written relative to the output directory.
os.chdir(args.d)


def _birth_death_trees(death_rate, replicates=100):
    """Simulate *replicates* birth-death trees with birth rate 1.0 and args.n tips."""
    return TreeList([
        treesim.birth_death_tree(birth_rate=1.0,
                                 death_rate=death_rate,
                                 num_extant_tips=args.n,
                                 repeat_until_success=True)
        for _ in range(replicates)
    ])


bd2 = _birth_death_trees(0.5)
bd2.write_to_path('birth_death2.nwk', schema='newick')
bd5 = _birth_death_trees(0.2)
bd5.write_to_path('birth_death5.nwk', schema='newick')
# Kingman trees are drawn over a shared namespace of taxa T1..Tn.
taxa = TaxonNamespace(['T{}'.format(x) for x in range(1, args.n + 1)])
king = TreeList(
    [treesim.pure_kingman_tree(taxon_namespace=taxa) for _ in range(100)])
king.write_to_path('kingman.nwk', schema='newick')
Пример #29
0
@author: smirarab
'''
import dendropy
import sys
import os
import copy
import os.path
from dendropy import TreeList

if __name__ == '__main__':
    # Usage: <count> <output|-> <treefile>...
    # Reads every nexus tree file (skipping the first 200 trees of each),
    # pools the trees, and writes a random sample of <count> of them in
    # newick format to the output file ("-" means stdout).
    import random

    if len(sys.argv) < 4:
        # Single-argument form prints the same text on Python 2 and Python 3.
        print("USAGE: count [output|-] treefile*")
        sys.exit(1)

    count = int(sys.argv[1])
    out = open(sys.argv[2], 'w') if sys.argv[2] != "-" else sys.stdout
    try:
        trees = None
        for treeName in sys.argv[3:]:
            a = dendropy.TreeList.get_from_path(treeName, 'nexus', rooted=True, tree_offset=200)
            if trees is None:
                trees = a
            else:
                # extend, not append: append would add the whole TreeList as
                # one element instead of pooling its trees.
                trees.extend(a)
        samples = TreeList(random.sample(trees, count))
        samples.write(out, 'newick', write_rooting=False)
    finally:
        # Close the output file even when reading or sampling fails, but
        # never close stdout.
        if out != sys.stdout:
            out.close()
Пример #30
0
                    metavar="input")
parser.add_argument("-c",
                    type=str,
                    help="Tree to constrain the search ala RAxML's -g",
                    metavar="constrain")
parser.add_argument("-o", type=str, help="Output file name", metavar="output")
#parser.add_argument("-s",type=int,help="Random number generator seed",metavar="seed")
args = parser.parse_args()

###Random number machinery initialization
#if args.s:
#	seed=args.s
#else:
#	seed=random.randint(0,sys.maxint)

#random.seed(seed)
#print("Seed: %d" % seed)

### Input trees
# Gene trees are read as unrooted; the constraint tree is read with the
# default rooting interpretation.
gene_trees = TreeList.get(path=args.i,
                          schema="newick",
                          rooting="force-unrooted")
constrainTree = Tree.get(path=args.c, schema="newick")
# NOTE(review): constrained_consensus is called as a TreeList method; it is
# presumably provided by a patched or forked DendroPy. Confirm which one.
consensus = gene_trees.constrained_consensus(constrainTree=constrainTree,
                                             summarize_splits=False,
                                             min_freq=0)

# Write the consensus tree (not the gene trees) without a rooting token.
consensus.write(path=args.o, schema="newick", suppress_rooting=True)
print("Done!")
Пример #31
0
def dendropy_read_treefile(treefiles, quiet=False, preserve_underscores=False, **kwargs):
    """Read trees from files (or stdin), trying nexus first and then newick.

    Args:
        treefiles: iterable of tree file paths. When empty or None the trees
            are read from stdin instead.
        quiet: when true, progress messages are suppressed. A source that can
            be parsed in neither format is then skipped silently rather than
            terminating the program.
        preserve_underscores: forwarded to the DendroPy readers.
        **kwargs: ``writer`` (default ``sys.stderr``) is the stream for
            progress messages; everything else is forwarded to the readers.

    Returns:
        A ``TreeList`` holding the trees of every source, in order.

    Exits via ``sys.exit`` when a listed file does not exist, or (unless
    ``quiet``) when a source cannot be parsed as nexus or newick.
    """
    out_stream = kwargs.pop('writer', sys.stderr)
    intrees = TreeList()

    def describe(err):
        # Keep the original ``.message`` text where the exception has one;
        # Python 3 built-in exceptions do not, so fall back to str(err).
        return getattr(err, 'message', err)

    # Errors after which the nexus attempt is abandoned in favour of newick.
    nexus_errors = (DataParseError, NexusReader.NotNexusFileError,
                    Tokenizer.UnexpectedEndOfStreamError, AttributeError)
    newick_errors = (DataParseError, Tokenizer.UnexpectedEndOfStreamError,
                     AttributeError)

    if not treefiles:
        if not quiet:
            sys.stderr.write('NOTE: reading trees from stdin\n')
        trees = sys.stdin.read()
        #try two input formats
        try:
            intrees.extend(TreeList.get_from_string(trees, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
        except nexus_errors as e:
            sys.stderr.write('%s\n' % describe(e))
            # The fallback needs its own try: an exception raised inside an
            # except clause is not caught by sibling clauses of the same try.
            try:
                intrees.extend(TreeList.get_from_string(trees, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
            except newick_errors as e:
                if not quiet:
                    sys.stderr.write('%s\n' % describe(e))
                    sys.exit('Could not read trees from stdin in nexus or newick  format ...\n')
    else:
        for tf in treefiles:
            if not os.path.isfile(tf):
                out_stream.write('TreeFile %s  does not exist\n' % tf)
                # Non-zero status: a missing input file is an error.
                sys.exit(1)

            #try two input formats
            try:
                if not quiet:
                    out_stream.write('Reading file %s in nexus format ...\n' % tf)
                intrees.extend(TreeList.get_from_path(tf, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
            except (DataParseError, NexusReader.NotNexusFileError, AttributeError) as e:
                try:
                    if not quiet:
                        out_stream.write('Reading file %s in newick format ...\n' % tf)
                    intrees.extend(TreeList.get_from_path(tf, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
                except newick_errors as e:
                    # NOTE(review): with quiet=True an unreadable file is
                    # skipped without any report; confirm this is intended.
                    if not quiet:
                        sys.stderr.write('%s\n' % describe(e))
                        sys.exit('Could not read file %s in nexus or newick  format ...\n' % tf)
    return intrees
Пример #32
0
import os, sys
from subprocess import Popen, PIPE
from MSTrees import backend as GrapeTree
from MSTrees2 import backend as GrapeTree2
from dendropy import Tree, TreeList


def appendTo(nwk, label, tlist):
    """Parse the newick string *nwk*, label the tree, and append it to *tlist*."""
    parsed = Tree.get_from_string(nwk, 'newick')
    parsed.label = label
    tlist.append(parsed)


if __name__ == '__main__':
    sim_data = sys.argv[1]
    trees = TreeList()
    tree = GrapeTree(profile=sim_data, method='MSTreeV2')
    appendTo(tree, 'MSTreeV2', trees)

    tree = GrapeTree(profile=sim_data,
                     method='MSTree',
                     missing_data='as_allele',
                     matrix_type='symmetric',
                     edge_weight='eBurst',
                     branch_recrafting='F')
    appendTo(tree, 'goeBurstA', trees)

    tree = GrapeTree(profile=sim_data,
                     method='MSTree',
                     missing_data='pair_delete',
                     matrix_type='symmetric',
Пример #33
0
def main():
    """Run the TreeShrink command-line pipeline.

    Steps, in order:

    1. Parse the command-line options.
    2. Gather the input gene trees: either the single file given with ``-t``
       or, when ``-i`` is given, the tree file found in each subdirectory
       (concatenated into one temporary file).
    3. For every tree, run the k-shrink filter and record, per species, the
       largest ratio between consecutive minimum diameters.
    4. Turn these ratios into removal thresholds by calling the R scripts
       under ``<libdir>/R_scripts``, according to the selected mode
       ('per-gene', 'per-species', 'all-genes' or 'auto').
    5. Write the removing sets, the pruned trees and, in ``-i`` mode, the
       filtered alignments to the output directory.

    The tree file name falls back to ``input.tre`` whenever ``-t`` is
    omitted, matching the default advertised in the option's help text.
    """

    print("Launching " + treeshrink.PROGRAM_NAME + " version " + treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()

    parser.add_argument("-i","--indir",required=False,help="The parent input directory where the trees (and alignments) can be found")
    parser.add_argument("-t","--tree",required=False,help="The name of the input tree/trees. If the input directory is specified (see -i option), each subdirectory under it must contain a tree with this name. Otherwise, all the trees can be included in this one file. Default: input.tre")
    parser.add_argument("-a","--alignment",required=False,help="The name of the input alignment; can only be used when the input directory is specified (see -i option). Each subdirectory under it must contain an alignment with this name. Default: input.fasta")
    parser.add_argument("-c","--centroid",required=False,action='store_true',help="Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO")
    parser.add_argument("-k","--k",required=False,help="The maximum number of leaves that can be removed. Default: auto-select based on the data; see also -s")
    parser.add_argument("-s","--kscaling",required=False,help="If -k not given, we use k=min(n/a,b*sqrt(n)) by default; using this option, you can set the a,b constants; Default: '5,2'")
    parser.add_argument("-q","--quantiles",required=False,help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument("-b","--minimpact",required=False,help="Do not remove species on the per-species test if their impact on diameter is less than MINIPACT%% where x is the given value. Default: 5")
    parser.add_argument("-m","--mode",required=False,help="Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto")
    parser.add_argument("-o","--outdir",required=False,help="Output directory. Default: the same as input directory (if it is specified) or the same as the input trees")
    parser.add_argument("-p","--tempdir",required=False,help="Directory to keep temporary files. If specified, the temp files will be kept")
    parser.add_argument("-r","--libdir",required=False,help="Directory of the R libraries and scripts. Default: 2 layers above the treeshrink package")

    args = vars(parser.parse_args())

    # Minimum occupancy / tree count below which 'auto' falls back to all-genes mode.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    libdir = args["libdir"] if args["libdir"] else dirname(dirname(realpath(treeshrink.__file__)))

    tempdir = set_tmp_dir(args["tempdir"])

    # Quantiles are kept as strings: they are passed to Rscript and embedded in file names.
    quantiles = args["quantiles"].split() if args["quantiles"] else ["0.05"]

    # A percentage such as 5 becomes the ratio 1.05.
    minimpact = (float(args["minimpact"])/100)+1 if args["minimpact"] else 1.05

    scaling = [int(x) for x in args["kscaling"].split(",")] if args["kscaling"] else [5,2]

    # Resolve the tree file name once so that every later use shares the same
    # default; using args["tree"] directly fails with a TypeError when -t is omitted.
    tree_basename = args["tree"] if args["tree"] else "input.tre"

    if args["indir"]:
        treename = splitext(tree_basename)[0]
        subdirs = [d for d in listdir(args["indir"]) if exists(normpath(join(args["indir"],d,tree_basename)))]
        # Concatenate the per-gene tree files into a single temporary file.
        intrees = get_tmp_file(treename + ".trees")
        with open(intrees,'w') as fout:
            for d in subdirs:
                treefile = normpath(join(args["indir"],d,tree_basename))
                if exists(treefile):
                    with open(treefile,'r') as fin:
                        fout.write(fin.read())
    else:
        intrees = tree_basename

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    if args["outdir"]:
        outdir = args["outdir"]
        check_dir(outdir)
    elif args["indir"]:
        outdir = args["indir"]
    else:
        outdir = splitext(intrees)[0] + "_treeshrink"
        mkdir(outdir)

    # Check to make sure output can be written before doing the expensive work.
    # Only the first quantile's removing-set file is probed in each subdirectory.
    if args["indir"]:
        fName,ext = splitext(basename(intrees))
        for sd in subdirs:
            outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[0] + ".txt"))
            with open(outfile,'w'):
                pass

    trees = TreeList.get(path=intrees,schema='newick',preserve_underscores=True)

    if mode=='auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) + " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode='all-genes'

    gene_list = [[] for _ in range(len(trees))]      # per tree: (species, ratio) pairs
    species_map = {}                                 # species -> list of ratios across trees
    occ = {}                                         # species -> number of trees containing it
    # removing_sets[quantile index][tree index] -> species to remove
    removing_sets = [ [ [ ] for _ in range(len(trees)) ] for _ in range(len(quantiles)) ]

    for t,a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree,centroid_reroot=args["centroid"],scaling=scaling)
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1,len(a_filter.min_diams)):
            if a_filter.min_diams[i] == 0:
                print("Warning: tree %d has no diameter (has only zero branch lengths) after removing %d sequences." %(t+1,i))
                break
            r = a_filter.min_diams[i-1]/a_filter.min_diams[i]
            for s in a_filter.list_removals(d=i):
                mapping[s] = max(mapping[s],r) if s in mapping else r

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode in ('per-species','auto'):
                species_map.setdefault(s,[]).append(mapping[s])
            if mode in ('per-species','all-genes','auto'):
                gene_list[t].append((s,mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = get_tmp_file("gene_%s.dat" %str(t))
            with open(filename,'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
            if len(mapping) > 1:
                for i,q in enumerate(quantiles):
                    # NOTE(review): the first 4 characters of the script output are
                    # dropped here, versus 5 for find_threshold_lkernel.R below --
                    # presumably each matches its script's output prefix; verify
                    # against the R scripts.
                    threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_loglnorm.R")),filename,q]).strip()[4:])
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)

        # update taxon occupancy (only for per-species mode)
        if mode in ('per-species','auto'):
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = occ.get(s,0) + 1

    if mode in ('auto','per-species'):
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print ("Species " + s + " only exists in " + str(occ[s]) + " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print ("There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode")
            else:
                print ("WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode")
        elif mode == 'auto':
            mode = 'per-species'
            print("Finish preprocessing. TreeShrink will run in 'Per-species' mode ...    ")

    # fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in sorted(species_map):
            # Pad with 1 for every tree that contains the species but never
            # listed it among the removals.
            l = len(species_map[s])
            for _ in range(occ[s]-l):
                species_map[s].append(1)
            filename = get_tmp_file(s + ".dat")
            with open(filename,'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [ 0 for _ in range(len(quantiles)) ]
            for i,q in enumerate(quantiles):
                # The threshold never drops below the minimum impact ratio.
                thresholds[i] = max(minimpact,float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).strip()[5:]))
                print("%s:\n\t will be cut in %d trees where its impact is above %f for quantile %s" %(s,sum(1 for x in species_map[s] if x>thresholds[i]),thresholds[i],q,))
            species_map[s] = (species_map[s],thresholds)

        for t,gene in enumerate(gene_list):
            for s,r in gene:
                for i,threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = get_tmp_file("all_genes" + ".dat")
        with open(filename,'w') as f:
            for gene in gene_list:
                for s,r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i,q in enumerate(quantiles):
            threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).strip()[5:])
            for t,gene in enumerate(gene_list):
                for s,r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    print("Writing output ...\n")
    # Dendropy's filter_leaf_nodes() seems to have problem
    # i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
    # use home-made code to prune the tree instead

    fName,ext = splitext(basename(intrees))

    for i,RS in enumerate(removing_sets):
        # Each quantile prunes its own copy so the original trees stay intact.
        trees_shrunk = deepcopy(trees)

        if args["indir"] is None:
            # Single-file mode: one removing-set file (one line per tree) and
            # one file of pruned trees per quantile.
            outfile = normpath(join(outdir,fName + "_RS_" + quantiles[i] + ".txt"))
            with open(outfile,'w') as f:
                for item in RS:
                    for s in item:
                        f.write(s + "\t")
                    f.write("\n")
            for tree,rs in zip(trees_shrunk,RS):
                prune_tree(tree,rs)
            trees_shrunk.write_to_path(normpath(join(outdir,fName + "_" + quantiles[i] + ext)),'newick')
        else:
            # Directory mode: outputs are written into each gene's subdirectory.
            for sd,item in zip(subdirs,RS):
                outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[i] + ".txt"))
                with open(outfile,'w') as f:
                    for s in item:
                        f.write(s + "\t")

            treeName,treeExt = splitext(tree_basename)
            aln_filename = args["alignment"] if args["alignment"] else "input.fasta"
            alnName,alnExt = splitext(aln_filename)

            for sd,tree,rs in zip(subdirs,trees_shrunk,RS):
                # Leaf labels are captured before pruning so they can be
                # compared with the full alignment below.
                tree_labels = set(x.taxon.label for x in tree.leaf_node_iter())
                prune_tree(tree,rs)
                treefile = normpath(join(outdir,sd, treeName + "_shrunk_" + quantiles[i] + treeExt))
                tree.write_to_path(treefile,'newick',unquoted_underscores=True,real_value_format_specifier=".16g")

                input_aln = normpath(join(args["indir"],sd,aln_filename))
                if isfile(input_aln):
                    output_aln = normpath(join(outdir,sd,alnName+"_shrunk"+quantiles[i]+alnExt))
                    alg = CompactAlignment()
                    alg.read_file_object(input_aln,'fasta')
                    aln_labels = set(alg.keys())
                    if tree_labels.difference(aln_labels) or aln_labels.difference(tree_labels):
                        print("ERROR: For gene %s, alignment names don't match tree names. Will skip it.\n\tonly in tree:\t%s\n\tonly in alignment:\t%s"%(sd,str(tree_labels.difference(aln_labels)),str(aln_labels.difference(tree_labels))))
                    else:
                        alg.remove_all(rs)
                        alg.mask_gapy_sites(1)
                        alg.write(output_aln,'fasta')

    # Temporary files are kept only when the user supplied a temp directory.
    if not args["tempdir"]:
        rmtree(tempdir)

    print("Output files written to " + outdir)
    	for ut in tree_list:
    	    sd = treecalc.symmetric_difference(tree,ut)
            #print sd ## error check
            if sd == 0:
            	redundant_count +=1
                break
        else:
            tree_list.append(tree)
    return tree_list, redundant_count


if __name__ == '__main__':
    # Collect every input interactively before doing any work.
    mle_path = raw_input("File with Maximum Likelihood tree: ")
    mcmc_trees = raw_input("File with MCMC trees: ")
    burnin = int(raw_input("Burnin: "))
    outfile = raw_input("Name of outfile: ")

    # One shared TaxonSet is passed to both the ML tree reader and unique_trees().
    taxa = dendropy.TaxonSet()
    mle_tree = dendropy.Tree.get_from_path(mle_path, 'nexus', taxon_set=taxa)

    # The ML tree seeds the list of unique topologies.
    uts = [mle_tree]
    uts, redundant_count = unique_trees(uts, mcmc_trees, 'nexus', burnin, taxonset=taxa)

    print("\nNumber of redundant trees: %d" % redundant_count)
    print("Number of unique trees: %d\n" % len(uts))

    # Only topologies are written; edge lengths are suppressed.
    TreeList(uts).write_to_path(outfile, 'newick', suppress_edge_lengths=True)
	
    	    
import dendropy
from dendropy import TreeList,Taxon,Node
import sys
import argparse

# Command-line interface: -gt and -od are optional floats whose default of 0
# disables the corresponding step below.
parser = argparse.ArgumentParser(description="Parses a Newick tree file, modifying the branch lengths from number of generations to years and adding an outgroup")
parser.add_argument("-gt",type=float,default=0,required=False,help="Generation time")
parser.add_argument("-od",type=float,default=0,required=False,help="Outgroup branch length")
parser.add_argument("-i",type=str,default="infile.tree",required=True,help="Input Newick tree file")
parser.add_argument("-o",type=str,default="outtree.tree",required=False,help="Output Newick tree file")
args = parser.parse_args()

trees=TreeList.get_from_path(args.i,schema="newick",rooting="force-rooted")
if args.gt != 0:
	# %g rather than %d: the option is a float, and %d would print 0.5 as 0.
	print("Scaling branch lengths to time with generation time %g\n" % args.gt)
	for tree in trees:
		for edge in tree.preorder_edge_iter():
			# Edges without a length are left untouched.
			# NOTE(review): lengths are divided by the generation time; confirm
			# this matches the intended unit conversion.
			if edge.length is not None:
				edge.length=edge.length/args.gt

if args.od != 0:
	print("Adding outgroup with branch length %g\n" % args.od)
	# Register the outgroup taxon in the namespace shared by all trees.
	namespace=trees.taxon_namespace
	outgroup= Taxon("outgroup")
	namespace.add_taxon(outgroup)
	ntree=0
	# Labels of every taxon except the newly added outgroup.
	labels=namespace.labels()
	labels.remove("outgroup")
	for tree in trees:
		outgroup_node=Node(taxon=outgroup,edge_length=args.od)
Пример #36
0
### Main ###
# Script body: reads a Newick tree file, rewrites it as Nexus, then groups the
# taxon labels by a prefix extracted with a regular expression.

### Argparse
# Both the input (-i) and output (-o) paths are mandatory.
parser = argparse.ArgumentParser(
    description="Converts a newick tree file in a Nexus file",
    prog="newicktonexusphylonet.py")
parser.add_argument("-i",
                    required=True,
                    type=str,
                    help="Input newick tree name")
parser.add_argument("-o", required=True, type=str, help="Output file name")
args = parser.parse_args()

###Main
# Read the Newick trees as default-rooted, keeping underscores in labels.
itrees = TreeList.get(path=args.i,
                      schema="newick",
                      rooting="default-rooted",
                      preserve_underscores=True)
# Write them out as Nexus with underscores left unquoted and rooting suppressed.
itrees.write(path=args.o,
             schema="nexus",
             unquoted_underscores=True,
             suppress_rooting=True)
namespace = itrees.taxon_namespace
labels = namespace.labels()
# The greedy first group captures everything before the last two
# underscore-separated fields of a label.
# NOTE(review): presumably that prefix is the species name -- confirm.
regex = re.compile("(.+)_.+_.+")
# Maps each captured prefix to the list of full labels sharing it.
speciesmap = defaultdict(list)

for label in labels:
    # regex.match() returns None for a label without two underscore-separated
    # trailing fields, in which case .group(1) raises AttributeError.
    match = regex.match(label).group(1)
    speciesmap[match].append(label)

textlisttrees = "(" + ",".join(str(x) for x in xrange(1,