Exemplo n.º 1
0
def merge_trees(tree_a, tree_b):
    """
    Merge the newick trees by joining the branches with the highest serial numbers.
    Each tree is rerooted at its tip with the highest serial number,
    and the first child of each new root is then used as the junction node.
    The junction node of tree_a becomes the root of the merged tree
    and the junction node of tree_b is attached beneath it.
    The new branch connecting the trees will have the mean length of the joined branches.
    Both input trees are modified in place.
    @param tree_a: a newick tree with tips marked with serial numbers
    @param tree_b: another newick tree with tips marked with serial numbers
    @return: a combined newick tree
    """
    def highest_serial_tip(tree):
        # Compare serial numbers only.  Taking the max of (serial, node) pairs
        # would fall back to comparing the node objects themselves on a tie.
        return max(tree.gen_tips(), key=lambda tip: tip.serial_number)

    # for each tree find the tip with the highest serial number
    tip_a = highest_serial_tip(tree_a)
    tip_b = highest_serial_tip(tree_b)
    # reroot the trees
    tree_a.reroot(tip_a)
    tree_b.reroot(tip_b)
    # the junction nodes are the first children of the new roots
    junction_a = tree_a.root.children[0]
    junction_b = tree_b.root.children[0]
    # calculate the length of the new connecting branch
    neo_blen = (junction_a.blen + junction_b.blen) / 2.0
    # the junction of the first tree becomes the root of the merged tree
    junction_a.parent = None
    junction_a.blen = None
    # the junction of the second tree hangs off the new root
    junction_b.blen = neo_blen
    junction_a.add_child(junction_b)
    junction_b.set_parent(junction_a)
    # return the merged trees
    return Newick.NewickTree(junction_a)
Exemplo n.º 2
0
 def __init__(self, tree, epsilon):
     """
     Attempt to reconstruct the tree from the eigensystem of its Gower matrix.
     The outcome is recorded in the attributes is_negligible, is_incomplete,
     is_conflicting and reconstructed_tree; the latter stays None
     when the reconstruction raises NegligibleError or IncompleteError.
     @param tree: a newick tree in the felsenstein-inspired format
     @param epsilon: determines whether loadings are considered negligible
     """
     # clear some flags that describe events that occur during reconstruction
     self.is_negligible = False
     self.is_incomplete = False
     self.is_conflicting = False
     # define the trees
     self.tree = tree
     self.reconstructed_tree = None
     # set the threshold for loading negligibility
     self.epsilon = epsilon
     # define some arbitrary ordering of tip names
     self.ordered_names = [node.get_name() for node in tree.gen_tips()]
     # get the distance matrix with respect to this ordering
     D = tree.get_distance_matrix(self.ordered_names)
     # get the Gower doubly centered matrix
     G = MatrixUtil.double_centered(np.array(D))
     # get the eigendecomposition of the Gower matrix;
     # the eigenvectors are the columns of the second returned array
     eigenvalues, eigenvector_columns = np.linalg.eigh(G)
     eigenpairs = [
         (abs(w), v) for w, v in zip(eigenvalues, eigenvector_columns.T)]
     # Order the pairs by decreasing eigenvalue magnitude.
     # Sort on the magnitude alone: comparing whole (magnitude, vector) tuples
     # would compare the numpy vectors whenever two magnitudes are equal,
     # and that comparison raises ValueError.
     eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
     self.sorted_eigensystem = eigenpairs
     # build the tree recursively using the sorted eigensystem
     indices = set(range(len(self.ordered_names)))
     try:
         # try to reconstruct the tree
         root = self._build_tree(indices, 0)
         root.set_branch_length(None)
         output_tree = Newick.NewickTree(root)
         # convert the tree to the FelTree format
         newick_string = NewickIO.get_newick_string(output_tree)
         self.reconstructed_tree = NewickIO.parse(newick_string,
                                                  FelTree.NewickTree)
     except NegligibleError:
         self.is_negligible = True
     except IncompleteError:
         self.is_incomplete = True
     else:
         # the reconstruction succeeded, so compare the splits
         # defined by the reconstructed tree to splits in the original tree
         expected_partitions = TreeComparison.get_nontrivial_partitions(
             self.tree)
         observed_partitions = TreeComparison.get_nontrivial_partitions(
             self.reconstructed_tree)
         # any observed split that is absent from the original tree is a conflict
         if observed_partitions - expected_partitions:
             self.is_conflicting = True
Exemplo n.º 3
0
def contrast_matrix_to_tree(C, ordered_names):
    """
    Build a newick tree from a matrix whose columns are contrasts.
    @param C: contrast matrix as a numpy array
    @param ordered_names: leaf names corresponding to rows of C
    @return: a newick tree object
    """
    # each column of C is a contrast;
    # separate the contrasts that have a zero entry from those that do not
    zero_free = []
    zero_bearing = []
    for contrast in C.T.tolist():
        if 0 in contrast:
            zero_bearing.append(contrast)
        else:
            zero_free.append(contrast)
    # the root contrast is the only one without a zero entry
    assert len(zero_free) == 1
    root_contrast = zero_free[0]
    # grow the tree from a fresh root node;
    # None is passed as the negative weight
    # because the variance partition is not defined at the root
    root_node = Newick.NewickNode()
    ReconstructionInfo(root_node).build_subtree(
        root_contrast, zero_bearing, None, ordered_names)
    # wrap the root node in a newick tree
    return Newick.NewickTree(root_node)
Exemplo n.º 4
0
def get_response_content(fs):
    """
    Build the text response for two subtrees joined by a connecting branch.
    The form values are validated, the output of three methods is written,
    and finally the R matrix of the joined tree is written together with
    its entries summed within the blocks given by get_block_structure.
    @param fs: the form values; the attributes subtree_a, subtree_b,
    taxa_a1, taxa_a2, taxa_b1, taxa_b2 and blen are read
    @return: the text of the response
    @raise HandlingError: if the taxon lists or the subtrees fail validation
    """
    # read the values from the form
    subtree_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    taxa_a1 = Util.get_stripped_lines(StringIO(fs.taxa_a1))
    taxa_a2 = Util.get_stripped_lines(StringIO(fs.taxa_a2))
    subtree_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    taxa_b1 = Util.get_stripped_lines(StringIO(fs.taxa_b1))
    taxa_b2 = Util.get_stripped_lines(StringIO(fs.taxa_b2))
    connecting_branch_length = fs.blen
    # Use true division so that an integer branch length
    # is not floor-divided under Python 2.
    half_branch_length = connecting_branch_length / 2.0
    # assert that no group of taxa contains duplicates
    for taxa in (taxa_a1, taxa_a2, taxa_b1, taxa_b2):
        if len(set(taxa)) != len(taxa):
            raise HandlingError('one of the lists of taxa contains duplicates')
    # assert that each subtree has at least two tips and no duplicates
    for tree in (subtree_a, subtree_b):
        tip_names = [node.get_name() for node in tree.gen_tips()]
        if len(tip_names) < 2:
            raise HandlingError('each subtree should have at least two tips')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('a subtree has duplicate tip names')
    # assert that the partitions are valid
    first_group = ('A', subtree_a, taxa_a1, taxa_a2)
    second_group = ('B', subtree_b, taxa_b1, taxa_b2)
    for tree_name, tree, taxa_1, taxa_2 in (first_group, second_group):
        tip_names = set(node.get_name() for node in tree.gen_tips())
        # every listed taxon must be a tip of the subtree
        for group_name, taxa in (('1', taxa_1), ('2', taxa_2)):
            nonsense_names = list(set(taxa) - tip_names)
            if nonsense_names:
                msg_a = 'the following taxa in group %s ' % group_name
                msg_b = 'of subtree %s ' % tree_name
                msg_c = 'are not valid tips: %s' % str(nonsense_names)
                raise HandlingError(msg_a + msg_b + msg_c)
        # the two groups must not share a taxon
        if set(taxa_1) & set(taxa_2):
            msg_a = 'the taxon lists for subtree %s ' % tree_name
            msg_b = 'are not disjoint'
            raise HandlingError(msg_a + msg_b)
        # The listed taxa are known to be tips at this point,
        # so a proper subset means that some tip is in neither group.
        if set(taxa_1) | set(taxa_2) < tip_names:
            msg_a = 'a tip in subtree %s ' % tree_name
            msg_b = 'is not represented in either of the groups'
            raise HandlingError(msg_a + msg_b)
    # define the response
    out = StringIO()
    # get the results for the first method;
    # this must happen before the subtrees are joined below
    do_first_method(subtree_a, subtree_b, taxa_a1, taxa_a2,
            taxa_b1, taxa_b2, connecting_branch_length, out)
    # define the entire tree by connecting the subtrees
    subtree_b.get_root().set_branch_length(connecting_branch_length)
    subtree_a.get_root().add_child(subtree_b.get_root())
    tree = subtree_a
    # define the order and structure of the distance matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    # get the distance matrix
    fel_tree = NewickIO.parse(NewickIO.get_newick_string(tree),
            FelTree.NewickTree)
    D = fel_tree.get_distance_matrix(name_order)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    # get the sums of block elements of R
    block_R = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_R[block_i][block_j] += R[i][j]
    # show the results from the second method
    do_second_method(fel_tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # For the third method, reparse each subtree from the form
    # and hang an extra tip named 'special' from its root
    # at half of the connecting branch length.
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    for t in (tree_m3_a, tree_m3_b):
        neo = Newick.NewickNode()
        neo.name = 'special'
        neo.blen = half_branch_length
        t.get_root().add_child(neo)
    feltree_m3_a = NewickIO.parse(NewickIO.get_newick_string(tree_m3_a),
            FelTree.NewickTree)
    feltree_m3_b = NewickIO.parse(NewickIO.get_newick_string(tree_m3_b),
            FelTree.NewickTree)
    # Reparse the subtrees again and join them under a new root,
    # splitting the connecting branch evenly between the two sides.
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    new_root = Newick.NewickNode()
    tree_m3_a.get_root().blen = half_branch_length
    tree_m3_b.get_root().blen = half_branch_length
    new_root.add_child(tree_m3_a.get_root())
    new_root.add_child(tree_m3_b.get_root())
    tree_m3 = Newick.NewickTree(new_root)
    feltree_m3 = NewickIO.parse(NewickIO.get_newick_string(tree_m3),
            FelTree.NewickTree)
    do_third_method(feltree_m3_a, feltree_m3_b, feltree_m3,
            half_branch_length, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # Show the expected results.
    # The buffer is written to directly instead of using the print statement
    # with a chevron, which only exists in Python 2.
    out.write('M:\n')
    out.write(MatrixUtil.m_to_string(R) + '\n')
    out.write('M summed within blocks:\n')
    out.write(MatrixUtil.m_to_string(block_R) + '\n')
    # return the response
    return out.getvalue()
Exemplo n.º 5
0
def make_tree(D, ordered_states, iteration_callback=None):
    """
    Create a newick tree from a distance matrix using neighbor joining.
    Pairs of neighbors are joined repeatedly, shrinking the distance matrix
    by one row and column each time, until three nodes remain;
    these last three nodes are joined into a star around the root.
    @param D: a row major distance matrix
    @param ordered_states: state names ordered according to the distance matrix
    @param iteration_callback: called with the output of each iteration call
    @return: a newick tree
    @raise ValueError: if fewer than three states are given
    """
    # make sure that there are enough states
    if len(ordered_states) < 3:
        raise ValueError(
            'the neighbor joining algorithm needs at least three nodes')
    # create a dictionary mapping the subtree root node serial number to a subtree
    forest = {}
    # set the current state;
    # the original states get the serial numbers 0 .. n-1 in matrix order
    index_to_serial = list(range(len(ordered_states)))
    next_serial = len(ordered_states)
    # repeatedly pair off neighbors
    while True:
        # get the new vector of distances and the neighbor index pair
        result = do_iteration(D)
        if iteration_callback:
            # report the Q matrix (computed only for show)
            # and the result of the iteration
            iteration_callback(get_Q_matrix(D), result)
        v, (f, g) = result
        # when only three nodes are left they are all joined at once
        is_terminal = len(index_to_serial) == 3
        # create the subtree from the index pair
        root = Newick.NewickNode()
        root.serial_number = next_serial
        # determine the indices to use as branches
        branch_indices = range(3) if is_terminal else (f, g)
        # add branches to the tree
        for index in branch_indices:
            serial = index_to_serial[index]
            # reuse the subtree built for this serial number if there is one;
            # test against None explicitly so that the result
            # does not depend on the truthiness of a node object
            neo = forest.pop(serial, None)
            if neo is None:
                neo = Newick.NewickNode()
                neo.serial_number = serial
            root.add_child(neo)
            neo.set_parent(root)
            neo.blen = v[index]
        # handle the terminal case
        if is_terminal:
            # create the newick tree from the root node
            tree = Newick.NewickTree(root)
            # add names to the tips of the tree
            for node in tree.gen_tips():
                node.name = ordered_states[node.serial_number]
            # convert the tree to a FelTree and return it
            return NewickIO.parse(tree.get_newick_string(), FelTree.NewickTree)
        # add the subtree to the forest
        forest[next_serial] = root
        # Make the next distance matrix by dropping the joined rows and columns
        # and appending a row and column for the new subtree root.
        joined = (f, g)
        next_D = []
        for i, row in enumerate(D):
            if i in joined:
                continue
            next_row = [value for j, value in enumerate(row) if j not in joined]
            next_row.append(v[i])
            next_D.append(next_row)
        last_row = [value for j, value in enumerate(v) if j not in joined]
        last_row.append(0)
        next_D.append(last_row)
        D = next_D
        # make the next serial number map the same way
        index_to_serial = [
            serial for j, serial in enumerate(index_to_serial)
            if j not in joined
        ]
        index_to_serial.append(next_serial)
        # increment the serial number
        next_serial += 1
Exemplo n.º 6
0
 def _make_tree_helper(self, D, index_to_serial, depth=0):
     """
     Recursively build a newick tree from a distance matrix.
     The matrix is split in two using the custom splitter.
     When the split leaves fewer than two indices on one side,
     the fallback method named by self.fallback_name is used instead.
     Note that the 'halving' fallback modifies D in place.
     @param D: a row major distance matrix
     @param index_to_serial: converts an index in D to a serial number for the tree node
     @param depth: gives the recursion depth; this is for instrumentation
     @return: a newick tree with branch lengths
     @raise NeighborhoodJoiningError: if the 'halving' fallback cannot fix the split
     @raise AssertionError: if a fallback is needed but its name is not recognized
     """
     # instrumentation to notify the framework that a recursive call has been made
     if self.callback:
         self.callback(depth)
     # recursively build the newick tree
     n = len(D)
     if n == 3:
         # if there are only three nodes then return a single star tree
         v, _neighbors = NeighborJoining.do_iteration(D)
         root = Newick.NewickNode()
         for i, d in enumerate(v):
             neo = Newick.NewickNode()
             neo.serial_number = index_to_serial[i]
             neo.blen = d
             root.add_child(neo)
             neo.set_parent(root)
         return Newick.NewickTree(root)
     # try to get the selection using a custom splitter
     selection = self.splitter.get_selection(D)
     complement = set(range(n)) - selection
     # if the split was insufficient then resort to either
     # using neighbor joining or modifying the distance matrix
     fallback = min(len(selection), len(complement)) < 2
     halving_count = 0
     if fallback:
         if self.fallback_name == 'nj':
             # use an iteration of neighbor joining if this is the preferred fallback method
             _v, neighbor_pair = NeighborJoining.do_iteration(D)
             selection = set(neighbor_pair)
             complement = set(range(n)) - selection
         elif self.fallback_name == 'halving':
             # repeatedly modify the distance matrix if this is the preferred fallback method
             while min(len(selection), len(complement)) < 2:
                 # kill the loop if the halving count is ridiculous;
                 # the stem length reported is the one from the previous pass
                 if halving_count > 1000:
                     message = '\n'.join((
                         'the number of leaf stem halving iterations is ridiculous (%d);' % halving_count,
                         'the singleton leaf stem length is %s;' % leaf_stem_length,
                         'the distance matrix is:',
                         MatrixUtil.m_to_string(D),
                     ))
                     raise NeighborhoodJoiningError(message.strip())
                 # find the index of the leaf singleton
                 halving_count += 1
                 if len(selection) == 1:
                     smaller, larger = selection, complement
                 elif len(complement) == 1:
                     smaller, larger = complement, selection
                 else:
                     message = '\n'.join((
                         'in the following distance matrix,',
                         'a split was so degenerate that it did not even leave a leaf stem to work with:',
                         MatrixUtil.m_to_string(D),
                     ))
                     raise NeighborhoodJoiningError(message.strip())
                 v = get_crossing_distances(D, selection, complement)
                 # get the distance from the leaf singleton to the root of the rest of the tree
                 leaf_singleton_index = list(smaller)[0]
                 leaf_stem_length = v[leaf_singleton_index]
                 # if the leaf stem length is zero then repeatedly halving it will not help.
                 if not leaf_stem_length:
                     message = '\n'.join((
                         'the singleton leaf stem length is zero;',
                         'the number of leaf stem halving iterations performed was %d;' % halving_count,
                         'the distance matrix is:',
                         MatrixUtil.m_to_string(D),
                     ))
                     raise NeighborhoodJoiningError(message.strip())
                 # Modify the distance matrix in place.
                 # Use true division so that an integer stem length
                 # is halved instead of floor-divided under Python 2.
                 half_stem_length = leaf_stem_length / 2.0
                 for i in larger:
                     D[i][leaf_singleton_index] -= half_stem_length
                     D[leaf_singleton_index][i] -= half_stem_length
                 # get the selection and complement using the modified distance matrix
                 selection = self.splitter.get_selection(D)
                 complement = set(range(n)) - selection
         else:
             # Fail before any serial numbers are consumed.
             # An explicit raise is used because an assert statement
             # would be skipped when Python runs with -O.
             raise AssertionError('internal error: invalid fallback method')
     # define the new serial numbers for the selection and complement subtrees
     selection_serial = self.number_generator.get_next()
     complement_serial = self.number_generator.get_next()
     # for reporting purposes only,
     # store the subset of leaf serials defined by each new serial number
     for new_serial, indices in ((selection_serial, selection),
                                 (complement_serial, complement)):
         serials = set(index_to_serial[i] for i in indices)
         new_set = set()
         for serial in serials:
             new_set.update(self.serial_number_to_tip_set[serial])
         self.serial_number_to_tip_set[new_serial] = new_set
     # report the split
     flattened_selection = set(
         self.ordered_labels[serial]
         for serial in self.serial_number_to_tip_set[selection_serial])
     if not fallback:
         self.on_custom_split(flattened_selection, len(selection),
                              len(complement))
     elif self.fallback_name == 'nj':
         self.on_nj_fallback_split(flattened_selection, len(selection),
                                   len(complement))
     elif self.fallback_name == 'halving':
         self.on_halving_fallback_split(flattened_selection,
                                        len(selection), len(complement),
                                        halving_count)
     # break the distance matrix into two distance matrices,
     # then make a tree for each one.
     A = sorted(selection)
     B = sorted(complement)
     A_distance_matrix, B_distance_matrix = split_distance_matrix(
         D, selection, complement)
     # Define the serial numbers for the split distance matrices.
     # Each side gets one extra final entry
     # carrying the serial number of the other side.
     A_index_to_serial = [index_to_serial[i]
                          for i in A] + [complement_serial]
     B_index_to_serial = [index_to_serial[i]
                          for i in B] + [selection_serial]
     # make the next trees
     A_tree = self._make_tree_helper(A_distance_matrix, A_index_to_serial,
                                     depth + 1)
     B_tree = self._make_tree_helper(B_distance_matrix, B_index_to_serial,
                                     depth + 1)
     # return the merged tree
     return merge_trees(A_tree, B_tree)