示例#1
0
    def _reroot_helper(self, gtree, newCopy=True, returnEdge=False):
        """
        Yields rerooted trees.
        Adapted from phylo.recon_root.
        """

        # make a consistent unrooted copy of gene tree
        if newCopy:
            gtree = gtree.copy()

        if len(gtree.leaves()) == 2:
            raise StopIteration

        oldroot = gtree.root.name
        treelib.unroot(gtree, newCopy=False)
        treelib.reroot(gtree,
                       gtree.nodes[sorted(gtree.leaf_names())[0]].parent.name,
                       onBranch=False, newCopy=False)

        # make rerooting order consistent using hash ordering
        phylo.hash_order_tree(gtree, self.gene2species)

        # get list of edges to root on
        edges = []
        def walk(node):
            edges.append((node, node.parent))
            if not node.is_leaf():
                node.recurse(walk)
                edges.append((node, node.parent))
        for child in gtree.root.children:
            walk(child)

        # try initial root
        treelib.reroot(gtree, edges[0][0].name, newCopy=False)
        gtree.rename(gtree.root.name, oldroot)
        if returnEdge:
            yield gtree, edges[0]
        else:
            yield gtree
        rootedge = sorted(edges[0])

        # try rerooting on everything
        for edge in edges[1:]:
            if sorted(edge) == rootedge:
                continue
            rootedge = sorted(edge)

            node1, node2 = edge
            if node1.parent != node2:
                node1, node2 = node2, node1
            assert node1.parent == node2, "%s %s" % (node1.name, node2.name)

            # new root and cost
            treelib.reroot(gtree, node1.name, newCopy=False, keepName=True)
            if returnEdge:
                yield gtree, edge
            else:
                yield gtree
示例#2
0
def draw_raxml_tree(tr, adef):
    """
    Draw a RAxML tree on stdout as an unrooted tree.

    tr, adef -- passed unchanged to raxml.tree_to_string

    Each of the two stages (serialization, drawing) is wrapped in a
    util.tic/util.toc pair.
    """
    # stage 1: serialize the RAxML tree to a string
    util.tic("Tree to string...")
    newick = raxml.tree_to_string(tr, adef)
    util.toc()

    # stage 2: parse the string, unroot the result and draw it
    util.tic("Drawing tree...")
    unrooted = treelib.unroot(treelib.parse_newick(newick))
    treelib.draw_tree(unrooted, out=sys.stdout, minlen=5, maxlen=5)
    util.toc()
示例#3
0
def draw_raxml_tree(tr, adef):
    """
    Serialize a RAxML tree, re-parse it with treelib and draw it unrooted.

    tr, adef -- arguments handed to raxml.tree_to_string

    Output goes to sys.stdout.  Both steps are bracketed by
    util.tic/util.toc calls.
    """
    util.tic("Tree to string...")
    serialized = raxml.tree_to_string(tr, adef)
    util.toc()

    util.tic("Drawing tree...")
    parsed = treelib.parse_newick(serialized)
    # draw the unrooted form of the parsed tree with fixed branch lengths
    draw_options = dict(out=sys.stdout, minlen=5, maxlen=5)
    treelib.draw_tree(treelib.unroot(parsed), **draw_options)
    util.toc()
示例#4
0
def prob_alignment_nooptimize(alnfile,
                              partfile,
                              coal_tree,
                              rates,
                              freqs,
                              alphas,
                              threads=1,
                              seed=ALIGNMENT_SEED,
                              eps=0.1):
    """
    This function implements the pll function. It computes the log likelihood
    of alignment data given the coal_tree without optimizing parameters.
    Mathematically, it computes: P(A | T^G, t^G).

    rates, freqs, alphas  -- parameters in pll computation, indexed by
                             partition number
    alnfile               -- alignment file
    partfile              -- partition file
    coal_tree             -- coalescent tree
    threads, seed         -- passed through to the pllpy.pll constructor
    eps                   -- likelihood convergence threshold (set_epsilon)

    Returns the value of pll.get_likelihood().

    The temporary tree file is removed even if an error occurs.
    """

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # pll takes the tree as a file name, so write it to a temporary file;
    # delete=False keeps the file on disk after it is closed
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        with tree_temp:
            tree.write(tree_temp, oneline=True)

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # initialize pll with previously optimized parameters
        for i in range(pll.get_number_of_partitions()):
            pll.set_alpha(alphas[i], i, True)
            pll.set_frequencies(freqs[i], i, True)
            if pll.is_dna(i):
                pll.set_rates(rates[i], i, True)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # parameters are deliberately not optimized here
        # (pll.optimise is never called)

        # get (log) likelihood
        return pll.get_likelihood()
    finally:
        # always clean up the temporary tree file, also on failure
        os.remove(tree_filename)
示例#5
0
def prob_alignment(alnfile,
                   partfile,
                   coal_tree,
                   threads=1,
                   seed=int("0xDEADBEEF", 16),
                   eps=0.1,
                   opt_branches=False):
    """
    This function implements the pll function. It optimizes the alpha, rates
    and frequencies, and uses these parameters to compute the log likelihood
    of alignment data given the coal_tree. This function is not used because
    it is computationally inefficient to optimize the parameters.

    alnfile        -- alignment file
    partfile       -- partition file
    coal_tree      -- coalescent tree
    threads, seed  -- passed through to the pllpy.pll constructor
    eps            -- likelihood convergence threshold (set_epsilon)
    opt_branches   -- whether branches are optimized as well

    Returns the value of pll.get_likelihood() after optimization.

    The temporary tree file is removed even if an error occurs.
    """

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # pll takes the tree as a file name, so write it to a temporary file;
    # delete=False keeps the file on disk after it is closed
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        with tree_temp:
            tree.write(tree_temp, oneline=True)

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # tell pll to optimize all model parameters
        for i in range(pll.get_number_of_partitions()):
            pll.set_optimisable_alpha(i, True)
            pll.set_optimisable_frequencies(i, True)
            if pll.is_dna(i):
                pll.set_optimisable_rates(i, True)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # optimize the model
        pll.optimise(True, True, True,
                     opt_branches)  # rates, freqs, alphas, branches

        # get (log) likelihood
        return pll.get_likelihood()
    finally:
        # always clean up the temporary tree file, also on failure
        os.remove(tree_filename)
示例#6
0
def optimize_parameters(alnfile,
                        partfile,
                        coal_tree,
                        threads=1,
                        seed=ALIGNMENT_SEED,
                        eps=0.1):
    """
    The function takes in alignment file, partitions file, coal_tree,
    and returns the rates, freqs, alphas after optimization. These
    parameters are used when the alignment probability is calculated.

    alnfile        -- alignment file
    partfile       -- partition file
    coal_tree      -- coalescent tree
    threads, seed  -- passed through to the pllpy.pll constructor
    eps            -- likelihood convergence threshold (set_epsilon)

    Returns a tuple (rates, freqs, alphas) of three lists, each holding one
    entry per partition.

    The temporary tree file is removed even if an error occurs.
    """
    rates = []
    freqs = []
    alphas = []

    # unrooted tree required for ML
    tree = treelib.unroot(coal_tree, newCopy=True)

    # pll takes the tree as a file name, so write it to a temporary file;
    # delete=False keeps the file on disk after it is closed
    tree_temp = tempfile.NamedTemporaryFile(delete=False)
    tree_filename = tree_temp.name
    try:
        with tree_temp:
            tree.write(tree_temp, oneline=True)

        # initialize pll instance
        pll = pllpy.pll(alnfile, partfile, tree_filename, threads, seed)

        # set likelihood convergence
        pll.set_epsilon(eps)

        # optimize rates, freqs, alphas, and branches
        pll.optimise(True, True, True, True)

        # store optimal parameters
        for i in range(pll.get_number_of_partitions()):
            rates.append(pll.get_rates_vector(i))
            freqs.append(pll.get_frequencies_vector(i))
            alphas.append(pll.get_alpha(i))
    finally:
        # always clean up the temporary tree file, also on failure
        os.remove(tree_filename)

    return rates, freqs, alphas
示例#7
0
文件: phylo.py 项目: sarab609/scraps
def recon_root(gtree, stree, gene2species = gene2species, 
               rootby = "duploss", newCopy=True):
    """
    Reroot a tree by minimizing the number of duplications/losses/both

    gtree        -- gene tree to reroot
    stree        -- species tree used for reconciliation
    gene2species -- gene-to-species mapping passed to reconcile()
    rootby       -- cost to minimize: "dup", "loss" or "duploss"
    newCopy      -- if True, work on gtree.copy() instead of gtree itself

    Returns the rerooted tree.  NOTE: a tree with exactly two leaves makes
    the function return None without rerooting anything.

    Raises ValueError if rootby is not one of the values listed above.
    """
    
    # make a consistent unrooted copy of gene tree
    if newCopy:
        gtree = gtree.copy()
        
    if len(gtree.leaves()) == 2:
        return
        
    treelib.unroot(gtree, newCopy=False)
    # start from a fixed node (parent of the alphabetically first leaf)
    # so the search order does not depend on the input rooting
    treelib.reroot(gtree, 
                   gtree.nodes[sorted(gtree.leaf_names())[0]].parent.name, 
                   onBranch=False, newCopy=False)
    
    
    # make recon root consistent for rerooting tree of the same names
    # TODO: there is the possibility of ties, they are currently broken
    # arbitrarily.  In order to make comparison of reconRooted trees with 
    # same gene names accurate, hashOrdering must be done, for now.
    hash_order_tree(gtree, gene2species)
    
    # get list of edges to root on
    # (internal nodes are appended both before and after their subtree)
    edges = []
    def walk(node):
        edges.append((node, node.parent))
        if not node.is_leaf():
            node.recurse(walk)
            edges.append((node, node.parent))
    for child in gtree.root.children:
        walk(child)

    
    # try initial root and recon    
    treelib.reroot(gtree, edges[0][0].name, newCopy=False)
    recon = reconcile(gtree, stree, gene2species)
    events = label_events(gtree, recon)
    
    # find reconciliation that minimizes loss
    minroot = edges[0]
    rootedge = sorted(edges[0])
    if rootby == "dup": 
        cost = count_dup(gtree, events)
    elif rootby == "loss":
        cost = len(find_loss(gtree, stree, recon))
    elif rootby == "duploss":
        cost = count_dup_loss(gtree, stree, recon, events)
    else:
        # string exceptions are not valid; raise a real exception so the
        # message actually reaches the caller
        raise ValueError("unknown rootby value '%s'" % rootby)
    mincost = cost
    
    
    # try rooting on everything
    for edge in edges[1:]:
        # skip an edge identical to the one the tree is already rooted on
        if sorted(edge) == rootedge:
            continue
        rootedge = sorted(edge)
        
        # earlier reroots may have flipped the parent/child direction of
        # this edge; orient it so that node1 is the child
        node1, node2 = edge
        if node1.parent != node2:
            node1, node2 = node2, node1
        assert node1.parent == node2, "%s %s" % (node1.name, node2.name)
        
        # uncount cost
        # (only the root and node2 are re-reconciled below, so only their
        # contributions are removed here and added back afterwards)
        if rootby in ["dup", "duploss"]:
            if events[gtree.root] == "dup":
                cost -= 1
            if events[node2] == "dup":
                cost -= 1
        if rootby in ["loss", "duploss"]:
            cost -= len(find_loss_under_node(gtree.root, recon))
            cost -= len(find_loss_under_node(node2, recon))
        
        # new root and recon
        treelib.reroot(gtree, node1.name, newCopy=False)        
        
        recon[node2] = reconcile_node(node2, stree, recon)
        recon[gtree.root] = reconcile_node(gtree.root, stree, recon)
        events[node2] = label_events_node(node2, recon)
        events[gtree.root] = label_events_node(gtree.root, recon)
        
        if rootby in ["dup", "duploss"]:
            if events[node2] ==  "dup":
                cost += 1
            if events[gtree.root] ==  "dup":
                cost += 1
        if rootby in ["loss", "duploss"]:
            cost += len(find_loss_under_node(gtree.root, recon))
            cost += len(find_loss_under_node(node2, recon))
        
        # keep track of min cost
        if cost < mincost:
            mincost = cost
            minroot = edge

    
    # root tree by minroot
    # (`edge` is the last edge visited by the loop above)
    if edge != minroot:
        node1, node2 = minroot
        if node1.parent != node2:
            node1, node2 = node2, node1
        assert node1.parent == node2
        treelib.reroot(gtree, node1.name, newCopy=False)
    
    return gtree
示例#8
0
文件: phylo.py 项目: sarab609/scraps
def least_square_error(tree, distmat, genes, forcePos=True, weighting=False):
    """
    Least Squared Error algorithm for phylogenetic reconstruction

    tree      -- tree whose branch lengths are fitted; a rooted tree is
                 unrooted in place before fitting
    distmat   -- pairwise distance matrix, indexed as distmat[i][j]
    genes     -- gene list, in the order used by distmat
    forcePos  -- if True, negative fitted branch lengths are clamped to 0
    weighting -- if True, every row of the system and its target distance
                 are divided by sqrt(distance)

    Branch lengths are written back through setBranchLengths().  Returns a
    util.Bundle with fields resids, paths, edges and topmat.
    """

    # use SCIPY to perform LSE
    import scipy
    import scipy.linalg

    def as_vector(arr):
        """Return a 1xN or Nx1 two-dimensional array as its single row or
        column; anything that is not two-dimensional passes through"""
        if len(arr.shape) != 2:
            return arr
        if arr.shape[0] == 1:
            return arr[0]
        return scipy.transpose(arr)[0]

    # remember which edge held the root so it can be handed to
    # setBranchLengths after fitting on the unrooted tree
    rootedge = None
    if treelib.is_rooted(tree):
        rootedge = sorted([child.name for child in tree.root.children])
        treelib.unroot(tree, newCopy=False)

    # create pairwise dist array (upper triangle, row by row)
    ngenes = len(genes)
    dists = [distmat[a][b]
             for a in xrange(ngenes)
             for b in xrange(a + 1, ngenes)]

    # create topology matrix
    topmat, edges = makeTopologyMatrix(tree, genes)

    # setup matrix and vector
    if not weighting:
        design = scipy.array(topmat)
        paths = scipy.array(dists)
    else:
        scaled_rows = []
        for rownum, row in enumerate(topmat):
            scale = math.sqrt(dists[rownum])
            scaled_rows.append([util.safediv(entry, scale, 0)
                                for entry in row])
        design = scipy.array(scaled_rows)
        paths = scipy.array(map(math.sqrt, dists))

    # solve LSE (only the solution vector is used below)
    solution, _resid, _rank, _singular = scipy.linalg.lstsq(design, paths)

    # force non-negative branch lengths
    flat = as_vector(solution)
    if forcePos:
        edgelens = [max(float(val), 0) for val in flat]
    else:
        edgelens = [float(val) for val in flat]

    # calc path residuals (errors)
    fitted = as_vector(scipy.dot(design, edgelens))
    resids = (fitted - paths).tolist()
    paths = paths.tolist()

    # set branch lengths
    setBranchLengths(tree, edges, edgelens, paths, resids,
                     topmat=topmat, rootedge=rootedge)

    return util.Bundle(resids=resids,
                       paths=paths,
                       edges=edges,
                       topmat=topmat)
def phyml(seqs, verbose=True, args=None, 
          usertree=None, seqtype="pep", saveOutput="", bootiter=0,
          opttree=True, optbranches=True, nrates=4):
    """
    Run the `phyml` command on an alignment and return the resulting tree.

    seqs        -- alignment (mapping of sequence names to sequences)
    verbose     -- passed to phylip.exec_phylip
    args        -- full phyml argument string; if None it is built from
                   the remaining options
    usertree    -- optional starting tree; it is unrooted and written to
                   "intree".  Without it, "BIONJ" is passed to phyml.
    seqtype     -- "dna" or "pep"; only used when args is None
    saveOutput  -- if not "", the temp directory is saved under this name
                   instead of being cleaned up
    bootiter    -- number of bootstrap iterations (1 is treated as 0)
    opttree     -- whether to pass "y" for tree optimization
    optbranches -- whether to pass "y" for branch optimization
    nrates      -- number of rate categories placed in the argument string

    Returns the tree read from "infile_phyml_tree.txt"; the likelihood read
    from "infile_phyml_lk.txt" is stored in tree.data["logl"].

    Raises AssertionError if args is None and seqtype is unknown.
    """
    
    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()
    
    util.tic("phyml on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input; the files are closed explicitly so their contents are
    # on disk before phyml is launched
    with open("infile", "w") as out:
        labels = phylip.write_phylip_align(out, seqs)
    with open("labels", "w") as out:
        util.write_list(out, labels)
    
    options = "y"
    
    # only bootstrap when iterations are above 1
    if bootiter == 1:
        bootiter = 0
    
    if usertree is not None:
        usertree = treelib.unroot(usertree)
        phylip.write_in_tree("intree", usertree, labels)
        treefile = "intree"
    else:
        treefile = "BIONJ"
    
    # two "y "/"n " flags: tree optimization, then branch optimization
    optimize = ""
    if opttree:
        optimize += "y "
    else:
        optimize += "n "
    
    if optbranches:
        optimize += "y "
    else:
        optimize += "n "
    
    
    if args is None:
        if seqtype == "dna":
            args = "infile 0 s 1 %d HKY e e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        elif seqtype == "pep":
            args = "infile 1 s 1 %d JTT e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        else:
            # raised explicitly (not via `assert`) so the check is not
            # stripped when Python runs with -O
            raise AssertionError("unknown sequence type '%s'" % seqtype)
    
    
    phylip.exec_phylip("phyml %s" % args, options, verbose)
    
    # parse tree
    tree = phylip.read_out_tree("infile_phyml_tree.txt", labels)
    
    # parse likelihood
    with open("infile_phyml_lk.txt") as infile:
        tree.data["logl"] = float(infile.read())
    
    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)
    util.toc()
    
    return tree
示例#10
0
# Script fragment: optimize a RAxML model, then repeatedly propose random
# SPR moves on a tree.  `adef`, `tr`, `treefile`, `options` and `out` are
# defined outside this fragment.
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

# read the tree and strip it down: zero every branch length and drop
# any "boot" entry from the node data
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
# hash an unrooted copy of the tree; the set of hashes records which
# trees have already been seen
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until the tree hash is one that has
    # not been seen before
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)

    # open a pipe and wrap both ends in file objects
    # (their use is not visible in this fragment)
    r, w = os.pipe()
    fr, fw = os.fdopen(r, "r"), os.fdopen(w, "w")

    # write the newly proposed tree on a single line
    tree.write(out, oneline=True)
示例#11
0
def phyml(seqs,
          verbose=True,
          args=None,
          usertree=None,
          seqtype="pep",
          saveOutput="",
          bootiter=0,
          opttree=True,
          optbranches=True,
          nrates=4):
    """
    Run the `phyml` command on an alignment and return the resulting tree.

    seqs        -- alignment (mapping of sequence names to sequences)
    verbose     -- passed to phylip.exec_phylip
    args        -- full phyml argument string; if None it is built from
                   the remaining options
    usertree    -- optional starting tree; it is unrooted and written to
                   "intree".  Without it, "BIONJ" is passed to phyml.
    seqtype     -- "dna" or "pep"; only used when args is None
    saveOutput  -- if not "", the temp directory is saved under this name
                   instead of being cleaned up
    bootiter    -- number of bootstrap iterations (1 is treated as 0)
    opttree     -- whether to pass "y" for tree optimization
    optbranches -- whether to pass "y" for branch optimization
    nrates      -- number of rate categories placed in the argument string

    Returns the tree read from "infile_phyml_tree.txt"; the likelihood read
    from "infile_phyml_lk.txt" is stored in tree.data["logl"].

    Raises AssertionError if args is None and seqtype is unknown.
    """

    phylip.validate_seqs(seqs)
    cwd = phylip.create_temp_dir()

    util.tic("phyml on %d of length %d" % (len(seqs), len(seqs.values()[0])))

    # create input; the files are closed explicitly so their contents are
    # on disk before phyml is launched
    with open("infile", "w") as out:
        labels = phylip.write_phylip_align(out, seqs)
    with open("labels", "w") as out:
        util.write_list(out, labels)

    options = "y"

    # only bootstrap when iterations are above 1
    if bootiter == 1:
        bootiter = 0

    if usertree is not None:
        usertree = treelib.unroot(usertree)
        phylip.write_in_tree("intree", usertree, labels)
        treefile = "intree"
    else:
        treefile = "BIONJ"

    # two "y "/"n " flags: tree optimization, then branch optimization
    optimize = ""
    if opttree:
        optimize += "y "
    else:
        optimize += "n "

    if optbranches:
        optimize += "y "
    else:
        optimize += "n "

    if args is None:
        if seqtype == "dna":
            args = "infile 0 s 1 %d HKY e e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        elif seqtype == "pep":
            args = "infile 1 s 1 %d JTT e %d e %s %s" % \
                (bootiter, nrates, treefile, optimize)
        else:
            # raised explicitly (not via `assert`) so the check is not
            # stripped when Python runs with -O
            raise AssertionError("unknown sequence type '%s'" % seqtype)

    phylip.exec_phylip("phyml %s" % args, options, verbose)

    # parse tree
    tree = phylip.read_out_tree("infile_phyml_tree.txt", labels)

    # parse likelihood
    with open("infile_phyml_lk.txt") as infile:
        tree.data["logl"] = float(infile.read())

    if saveOutput != "":
        phylip.save_temp_dir(cwd, saveOutput)
    else:
        phylip.cleanup_temp_dir(cwd)
    util.toc()

    return tree
示例#12
0
 def draw_raxml_tree(self, *args, **kargs):
     """
     Draw the RAxML tree held in self.tr as an unrooted tree.

     self.tr and self.adef must have been set beforehand.  All positional
     and keyword arguments are forwarded unchanged to treelib.draw_tree.
     """
     newick = raxml.tree_to_string(self.tr, self.adef)
     unrooted = treelib.unroot(treelib.parse_newick(newick))
     treelib.draw_tree(unrooted, *args, **kargs)
示例#13
0
# Script fragment: optimize a RAxML model for a tree/alignment pair, then
# repeatedly propose random SPR moves and run a likelihood test on each
# new tree.  `args` and `options` are defined outside this fragment.
treefile = args[0]
# the alignment file name is derived from the tree file name by swapping
# the extension
seqfile = util.replace_ext(treefile, options.treeext, options.alignext)
out = util.open_stream(options.output, 'w')

util.tic("Initializing RAXML and optimizing...")
module = raxml.RAxML()
module.optimize_model(treefile, seqfile, options.extra)
util.toc()

# read the tree and strip it down: zero every branch length and drop
# any "boot" entry from the node data
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
# hash an unrooted copy of the tree; the set of hashes records which
# trees have already been seen
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # keep applying random SPR moves until the tree hash is one that has
    # not been seen before
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)
    # write the new tree on its own line and flush immediately
    tree.write(out, oneline=True); out.write('\n'); out.flush()

    util.tic("Computing LH...")
    # likelihood test for the new tree; the two results are logged as a
    # p-value and Dlnl
    p, Dlnl = module.compute_lik_test(tree)
    util.log("pvalue: %.3f, Dlnl: %.3f" % (p, Dlnl))