Exemplo n.º 1
0
def gnj(dists, keep=None, dkeep=0, ui=None):
    """Build a collection of candidate trees by generalised neighbour joining.

    Arguments:
        - dists: dict of (name1, name2): distance
        - keep: number of best partial trees to keep at each iteration,
          and therefore to return.  Same as Q parameter in original GNJ paper.
          Defaults to 5 times the number of names.
        - dkeep: number of diverse partial trees to keep at each iteration,
          and therefore to return.  Same as D parameter in original GNJ paper.
        - ui: optional progress reporter whose display(msg=..., progress=...)
          method is called as partial trees are collected.  When None, no
          progress is reported.
    Result:
        - a ScoredTreeCollection built from the sorted list of
          (tree length, tree) tuples
    """

    (names, d) = distanceDictTo2D(dists)

    if keep is None:
        keep = len(names) * 5
    all_keep = keep + dkeep

    # For recognising duplicate topologies, encode partitions (ie: edges) as
    # frozensets of tip names, which should be quickly comparable.
    arbitrary_anchor = names[0]
    all_tips = frozenset(names)
    def encode_partition(tips):
        # Always return the side of the split that contains the anchor tip,
        # so both descriptions of the same edge encode identically.
        included = frozenset(tips)
        if arbitrary_anchor not in included:
            included = all_tips - included
        return included
        # could also convert to long int, or cache, would be faster?

    # Start from the star tree: every tip is its own node and there are no
    # internal edges yet, hence the empty topology.
    tips = [frozenset([n]) for n in names]
    nodes = [LightweightTreeTip(name) for name in names]
    star_tree = PartialTree(d, nodes, tips, 0.0)
    star_tree.topology = frozenset([])
    trees = [star_tree]

    # Progress display auxiliary code
    template = ' size %%s/%s  trees %%%si' % (len(names), len(str(all_keep)))
    total_work = 0
    max_candidates = 1
    total_work_before = {}
    for L in range(len(names), 3, -1):
        # Estimate the number of partial trees collected at each size L
        # (never more than all_keep) so progress can be shown as a fraction.
        total_work_before[L] = total_work
        max_candidates = min(all_keep, max_candidates*L*(L-1)//2)
        total_work += max_candidates

    def _show_progress():
        # Reads L and next_trees from the enclosing scope at call time.
        if ui is None:
            # No reporter supplied (the default): nothing to display.
            return
        t = len(next_trees)
        work_done = total_work_before[L] + t
        # float() keeps the fraction meaningful under integer division.
        ui.display(msg=template % (L, t),
                   progress=float(work_done) / total_work)

    for L in range(len(names), 3, -1):
        # Generator of candidate joins, best first.
        # Note that with dkeep>0 this generator is used up a bit at a time
        # by 2 different interrupted 'for' loops below.
        candidates = uniq_neighbour_joins(trees, encode_partition)

        # First take up to 'keep' best ones
        next_trees = []
        _show_progress()
        for pair in candidates:
            next_trees.append(pair)
            if len(next_trees) == keep:
                break
        _show_progress()

        # The very best one is used as an anchor for measuring the
        # topological distance to others
        best_topology = next_trees[0].topology
        prior_td = [len(best_topology ^ tree.topology) for tree in trees]

        # Maintain a separate queue of joins for each possible
        # topological distance
        max_td = (max(prior_td) + 1) // 2
        queue = [deque() for g in range(max_td+1)]
        queued = 0

        # Now take up to dkeep joins, an equal number of the best at each
        # topological distance, while not calculating any more TDs than
        # necessary.
        prior_td = dict(zip(map(id, trees), prior_td))
        target_td = 1
        while (candidates or queued) and len(next_trees) < all_keep:
            if candidates and not queue[target_td]:
                # Pull candidates until one lands in the wanted queue; the
                # others are parked in the queue for their own distance.
                for pair in candidates:
                    # The join adds one partition to its parent's topology:
                    # the symmetric difference with best_topology grows by
                    # one if that partition is absent from it, else shrinks.
                    diff = pair.new_partition not in best_topology
                    td = (prior_td[id(pair.tree)] + [-1,+1][diff]) // 2
                    # equiv, slower: td = len(best_topology ^ topology) // 2
                    queue[td].append(pair)
                    queued += 1
                    if td == target_td:
                        break
                else:
                    # Generator exhausted; only queued joins remain.
                    candidates = None
            if queue[target_td]:
                next_trees.append(queue[target_td].popleft())
                queued -= 1
                _show_progress()

            # Cycle the target distance through 1..max_td
            target_td = target_td % max_td + 1

        # Perform the selected joins to get the trees for the next size down.
        trees = [pair.joined() for pair in next_trees]

    result = [tree.asScoreTreeTuple() for tree in trees]
    result.sort()
    return ScoredTreeCollection(result)
Exemplo n.º 2
0
def gnj(dists, keep=None, dkeep=0, ui=None):
    """Build a collection of candidate trees by generalised neighbour joining.

    Arguments:
        - dists: dict of (name1, name2): distance
        - keep: number of best partial trees to keep at each iteration,
          and therefore to return.  Same as Q parameter in original GNJ paper.
          Defaults to 5 times the number of names.
        - dkeep: number of diverse partial trees to keep at each iteration,
          and therefore to return.  Same as D parameter in original GNJ paper.
        - ui: optional progress reporter whose display(msg=..., progress=...)
          method is called as partial trees are collected.  When None, no
          progress is reported.
    Result:
        - a ScoredTreeCollection built from the sorted list of
          (tree length, tree) tuples
    """

    (names, d) = distanceDictTo2D(dists)

    if keep is None:
        keep = len(names) * 5
    all_keep = keep + dkeep

    # For recognising duplicate topologies, encode partitions (ie: edges) as
    # frozensets of tip names, which should be quickly comparable.
    arbitrary_anchor = names[0]
    all_tips = frozenset(names)
    def encode_partition(tips):
        # Always return the side of the split that contains the anchor tip,
        # so both descriptions of the same edge encode identically.
        included = frozenset(tips)
        if arbitrary_anchor not in included:
            included = all_tips - included
        return included
        # could also convert to long int, or cache, would be faster?

    # Start from the star tree: every tip is its own node and there are no
    # internal edges yet, hence the empty topology.
    tips = [frozenset([n]) for n in names]
    nodes = [LightweightTreeTip(name) for name in names]
    star_tree = PartialTree(d, nodes, tips, 0.0)
    star_tree.topology = frozenset([])
    trees = [star_tree]

    # Progress display auxiliary code
    template = ' size %%s/%s  trees %%%si' % (len(names), len(str(all_keep)))
    total_work = 0
    max_candidates = 1
    total_work_before = {}
    for L in range(len(names), 3, -1):
        # Estimate the number of partial trees collected at each size L
        # (never more than all_keep) so progress can be shown as a fraction.
        total_work_before[L] = total_work
        max_candidates = min(all_keep, max_candidates*L*(L-1)//2)
        total_work += max_candidates

    def _show_progress():
        # Reads L and next_trees from the enclosing scope at call time.
        if ui is None:
            # No reporter supplied (the default): nothing to display.
            return
        t = len(next_trees)
        work_done = total_work_before[L] + t
        # float() keeps the fraction meaningful under integer division.
        ui.display(msg=template % (L, t),
                   progress=float(work_done) / total_work)

    for L in range(len(names), 3, -1):
        # Generator of candidate joins, best first.
        # Note that with dkeep>0 this generator is used up a bit at a time
        # by 2 different interrupted 'for' loops below.
        candidates = uniq_neighbour_joins(trees, encode_partition)

        # First take up to 'keep' best ones
        next_trees = []
        _show_progress()
        for pair in candidates:
            next_trees.append(pair)
            if len(next_trees) == keep:
                break
        _show_progress()

        # The very best one is used as an anchor for measuring the
        # topological distance to others
        best_topology = next_trees[0].topology
        prior_td = [len(best_topology ^ tree.topology) for tree in trees]

        # Maintain a separate queue of joins for each possible
        # topological distance
        max_td = (max(prior_td) + 1) // 2
        queue = [deque() for g in range(max_td+1)]
        queued = 0

        # Now take up to dkeep joins, an equal number of the best at each
        # topological distance, while not calculating any more TDs than
        # necessary.
        prior_td = dict(zip(map(id, trees), prior_td))
        target_td = 1
        while (candidates or queued) and len(next_trees) < all_keep:
            if candidates and not queue[target_td]:
                # Pull candidates until one lands in the wanted queue; the
                # others are parked in the queue for their own distance.
                for pair in candidates:
                    # The join adds one partition to its parent's topology:
                    # the symmetric difference with best_topology grows by
                    # one if that partition is absent from it, else shrinks.
                    diff = pair.new_partition not in best_topology
                    td = (prior_td[id(pair.tree)] + [-1,+1][diff]) // 2
                    # equiv, slower: td = len(best_topology ^ topology) // 2
                    queue[td].append(pair)
                    queued += 1
                    if td == target_td:
                        break
                else:
                    # Generator exhausted; only queued joins remain.
                    candidates = None
            if queue[target_td]:
                next_trees.append(queue[target_td].popleft())
                queued -= 1
                _show_progress()

            # Cycle the target distance through 1..max_td
            target_td = target_td % max_td + 1

        # Perform the selected joins to get the trees for the next size down.
        trees = [pair.joined() for pair in next_trees]

    result = [tree.asScoreTreeTuple() for tree in trees]
    result.sort()
    return ScoredTreeCollection(result)
Exemplo n.º 3
0
def rnj(dists, no_negatives=True, randomize=True):
    """Computes a tree using the relaxed neighbor joining method

    Arguments:
        - dists: dict of (name1, name2): distance
        - no_negatives: negative branch lengths will be set to 0
        - randomize: the algorithm will search nodes randomly until two
        neighbors are found.
    Result:
        - a deep copy of a new edge named 'root' whose children are the
          children of the last remaining root node
    Raises:
        - Exception if an iteration finds no pair of nodes that are each
          other's closest neighbour
    """

    constructor = TreeBuilder(mutable=True).createEdge
    (names, d) = distanceDictTo2D(dists)

    nodes = [constructor([], name, {}) for name in names]

    while len(nodes) > 2:
        # Eliminate one node per iteration until 2 left
        num_nodes = len(nodes)

        # compute r (normalized), the sum of all pairwise distances
        # the normalization is over (num - 2), since later for a given i, j
        # distance(i, j) will be removed, and distance(i, i) = 0 always
        r = numpy.sum(d, 0) * 1./(num_nodes-2.)

        # find two nodes i, j that minimize each other's
        # transformed distance
        # (a real list is required: shuffle() works in place and cannot
        # reorder an immutable range object)
        node_indices = list(range(num_nodes))
        if randomize:
            shuffle(node_indices)
        chose_pair = False

        # coefficient used calculating transformed distances
        coef = num_nodes * 1./(num_nodes - 2.)
        for i in node_indices:
            # find i's closest, call it j

            # xformed_dists is a list of T_i,j for all j
            xformed_dists = coef*d[i] - r - r[i]

            # give distance to self a bogus but nonminimum value
            xformed_dists[i] = numpy.abs(xformed_dists[0])*2. +\
                numpy.abs(xformed_dists[num_nodes - 1])*2.

            j = numpy.argmin(xformed_dists)

            # now find j's closest
            xformed_dists = coef*d[j] - r - r[j]
            xformed_dists[j] = numpy.abs(xformed_dists[0])*2. +\
                numpy.abs(xformed_dists[num_nodes - 1])*2.

            # if i and j are each other's minimum, choose this (i, j) pair
            if i == numpy.argmin(xformed_dists):
                # choose these i, j
                chose_pair = True
                break

        if not chose_pair:
            raise Exception("didn't choose a pair of nodes correctly")
        assert i != j, (i, j)

        # Branch lengths from i and j to new node
        nodes[i].Length = 0.5 * (d[i,j] + r[i] - r[j])
        nodes[j].Length = 0.5 * (d[i,j] + r[j] - r[i])

        # no negative branch lengths
        if no_negatives:
            nodes[i].Length = max(0.0, nodes[i].Length)
            nodes[j].Length = max(0.0, nodes[j].Length)

        # Join i and j to make new node
        new_node = constructor([nodes[i], nodes[j]], None, {})

        # Store new node at i
        new_dists = 0.5 * (d[i] + d[j] - d[i,j])
        d[:, i] = new_dists
        d[i, :] = new_dists
        d[i, i] = 0.0
        nodes[i] = new_node

        # Eliminate j by moving the last row/column/node into its slot and
        # then dropping the last index
        d[j, :] = d[num_nodes-1, :]
        d[:, j] = d[:, num_nodes-1]
        assert d[j, j] == 0.0, d
        d = d[0:num_nodes-1, 0:num_nodes-1]
        nodes[j] = nodes[num_nodes-1]
        nodes.pop()

    # make the node with more children the root (nodes[0])
    if len(nodes[0].Children) < len(nodes[1].Children):
        nodes.reverse()

    # 2 remaining nodes will be [root, extra_child]
    nodes[1].Length = d[0,1]
    if no_negatives:
        nodes[1].Length = max(0.0, nodes[1].Length)

    # Need to replace nodes[0] with new root
    # NOTE(review): presumably assigning Parent also registers nodes[1] in
    # nodes[0].Children -- confirm against the tree node implementation.
    nodes[1].Parent = nodes[0]
    return constructor(nodes[0].Children, 'root', {}).deepcopy()