def merge_trees(tree_a, tree_b):
    """
    Merge the newick trees by joining the branches with the highest serial numbers.
    The new branch connecting the trees will have the mean length
    of the joined branches.
    Both input trees are rerooted in place and their nodes are reused
    by the returned tree.
    @param tree_a: a newick tree with tips marked with serial numbers
    @param tree_b: another newick tree with tips marked with serial numbers
    @return: a combined newick tree
    """
    # For each tree find the tip with the highest serial number.
    # Selecting with a key compares only the serial numbers, so the node
    # objects themselves never have to be orderable (tuple comparison
    # would fall back to comparing nodes whenever two serials tie).
    def serial_of(tip):
        return tip.serial_number
    tip_a = max(tree_a.gen_tips(), key=serial_of)
    tip_b = max(tree_b.gen_tips(), key=serial_of)
    # reroot each tree at its selected tip
    tree_a.reroot(tip_a)
    tree_b.reroot(tip_b)
    # the first child of each new root sits at the far end of the branch
    # that is being joined
    stem_a = tree_a.root.children[0]
    stem_b = tree_b.root.children[0]
    # calculate the length of the new connecting branch
    neo_blen = (stem_a.blen + stem_b.blen) / 2.0
    # merge the trees: the stem of the first tree becomes the new root
    # and the stem of the second tree hangs from it
    neo_root = stem_a
    neo_root.parent = None
    neo_root.blen = None
    neo_sink = stem_b
    neo_sink.blen = neo_blen
    neo_root.add_child(neo_sink)
    neo_sink.set_parent(neo_root)
    # return the merged trees
    return Newick.NewickTree(neo_root)
def __init__(self, tree, epsilon):
    """
    Attempt to reconstruct the tree from the eigensystem of its
    Gower doubly centered distance matrix, and record the outcome.
    After construction exactly the following attributes describe the result:
    is_negligible is set when _build_tree raised NegligibleError,
    is_incomplete is set when _build_tree raised IncompleteError,
    is_conflicting is set when the reconstructed tree has a nontrivial
    partition that is not in the original tree,
    and reconstructed_tree is the rebuilt tree or None on failure.
    @param tree: a newick tree in the felsenstein-inspired format
    @param epsilon: determines whether loadings are considered negligible
    """
    # clear some flags that describe events that occur during reconstruction
    self.is_negligible = False
    self.is_incomplete = False
    self.is_conflicting = False
    # define the trees
    self.tree = tree
    self.reconstructed_tree = None
    # set the threshold for loading negligibility
    self.epsilon = epsilon
    # define some arbitrary ordering of tip names
    self.ordered_names = [node.get_name() for node in tree.gen_tips()]
    # get the distance matrix with respect to this ordering
    D = tree.get_distance_matrix(self.ordered_names)
    # get the Gower doubly centered matrix
    G = MatrixUtil.double_centered(np.array(D))
    # get the eigendecomposition of the Gower matrix;
    # eigh returns the eigenvectors as columns so transpose to get rows
    eigenvalues, eigenvector_transposes = np.linalg.eigh(G)
    eigenvectors = eigenvector_transposes.T
    # Order the (magnitude, eigenvector) pairs by decreasing magnitude.
    # The sort key is the magnitude alone: sorting the raw tuples would
    # compare the eigenvector arrays whenever two magnitudes are equal,
    # and comparing numpy arrays for ordering raises ValueError.
    eigenpairs = [(abs(w), v) for w, v in zip(eigenvalues, eigenvectors)]
    eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
    self.sorted_eigensystem = eigenpairs
    # build the tree recursively using the sorted eigensystem
    indices = set(range(len(self.ordered_names)))
    try:
        # try to reconstruct the tree
        root = self._build_tree(indices, 0)
        root.set_branch_length(None)
        output_tree = Newick.NewickTree(root)
        # convert the tree to the FelTree format
        newick_string = NewickIO.get_newick_string(output_tree)
        self.reconstructed_tree = NewickIO.parse(
            newick_string, FelTree.NewickTree)
    except NegligibleError:
        self.is_negligible = True
    except IncompleteError:
        self.is_incomplete = True
    else:
        # compare the splits defined by the reconstructed tree
        # to splits in the original tree
        expected_partitions = TreeComparison.get_nontrivial_partitions(
            self.tree)
        observed_partitions = TreeComparison.get_nontrivial_partitions(
            self.reconstructed_tree)
        # any observed split that the original tree lacks is a conflict
        invalid_partitions = observed_partitions - expected_partitions
        if invalid_partitions:
            self.is_conflicting = True
def contrast_matrix_to_tree(C, ordered_names):
    """
    Build a newick tree from a matrix whose columns are contrasts.
    @param C: contrast matrix as a numpy array
    @param ordered_names: leaf names corresponding to rows of C
    @return: a newick tree object
    """
    # each column of C is one contrast;
    # separate the contrasts that contain a zero entry from those that do not
    zero_free = []
    zero_bearing = []
    for contrast in C.T.tolist():
        if 0 in contrast:
            zero_bearing.append(contrast)
        else:
            zero_free.append(contrast)
    # exactly one contrast should not have any zero element
    assert len(zero_free) == 1
    # the zero-free contrast seeds the root;
    # the variance partition is not defined at the root so pass None
    root = Newick.NewickNode()
    builder = ReconstructionInfo(root)
    builder.build_subtree(zero_free[0], zero_bearing, None, ordered_names)
    # wrap the populated root node in a newick tree
    return Newick.NewickTree(root)
def get_response_content(fs):
    """
    Validate the form input, run the three methods, and report the results.
    @param fs: an object providing the attributes subtree_a, taxa_a1, taxa_a2,
    subtree_b, taxa_b1, taxa_b2 and blen
    @return: the text accumulated in the response buffer
    @raise HandlingError: when a taxon list or subtree fails validation
    """
    # read the values from the form
    subtree_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    taxa_a1 = Util.get_stripped_lines(StringIO(fs.taxa_a1))
    taxa_a2 = Util.get_stripped_lines(StringIO(fs.taxa_a2))
    subtree_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    taxa_b1 = Util.get_stripped_lines(StringIO(fs.taxa_b1))
    taxa_b2 = Util.get_stripped_lines(StringIO(fs.taxa_b2))
    connecting_branch_length = fs.blen
    # assert that no group of taxa contains duplicates
    for taxa in (taxa_a1, taxa_a2, taxa_b1, taxa_b2):
        if len(set(taxa)) != len(taxa):
            raise HandlingError('one of the lists of taxa contains duplicates')
    # assert that each subtree has at least two tips and no duplicates
    for tree in (subtree_a, subtree_b):
        tip_names = list(node.get_name() for node in tree.gen_tips())
        if len(tip_names) < 2:
            raise HandlingError('each subtree should have at least two tips')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('a subtree has duplicate tip names')
    # assert that the partitions are valid:
    # for each subtree the two taxon lists must name only real tips,
    # must not overlap, and together must cover every tip
    first_group = ('A', subtree_a, taxa_a1, taxa_a2)
    second_group = ('B', subtree_b, taxa_b1, taxa_b2)
    for tree_name, tree, taxa_1, taxa_2 in (first_group, second_group):
        tip_names = set(node.get_name() for node in tree.gen_tips())
        for group_name, taxa in (('1', taxa_1), ('2', taxa_2)):
            # names listed in the group that are not tips of the subtree
            nonsense_names = list(set(taxa) - set(tip_names))
            msg_a = 'the following taxa in group %s ' % group_name
            msg_b = 'of subtree %s ' % tree_name
            msg_c = 'are not valid tips: %s' % str(nonsense_names)
            message = msg_a + msg_b + msg_c
            if nonsense_names:
                raise HandlingError(message)
        if set(taxa_1) & set(taxa_2):
            msg_a = 'the taxon lists for subtree %s ' % tree_name
            msg_b = 'are not disjoint'
            raise HandlingError(msg_a + msg_b)
        # '|' binds tighter than '<', so this tests whether the union of
        # the two groups is a proper subset of the tip names
        if set(taxa_1) | set(taxa_2) < tip_names:
            msg_a = 'a tip in subtree %s ' % tree_name
            msg_b = 'is not represented in either of the groups'
            raise HandlingError(msg_a + msg_b)
    # define the response
    out = StringIO()
    # get the results for the first method;
    # this runs before the subtrees are connected below
    do_first_method(subtree_a, subtree_b, taxa_a1, taxa_a2, taxa_b1, taxa_b2,
                    connecting_branch_length, out)
    # define the entire tree by connecting the subtrees;
    # this modifies subtree_a and subtree_b in place
    subtree_b.get_root().set_branch_length(connecting_branch_length)
    subtree_a.get_root().add_child(subtree_b.get_root())
    tree = subtree_a
    # define the order and structure of the distance matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    # get the distance matrix
    fel_tree = NewickIO.parse(NewickIO.get_newick_string(tree),
                              FelTree.NewickTree)
    D = fel_tree.get_distance_matrix(name_order)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    # get the sums of block elements of R;
    # block_structure[i] is used as the block index (0..3) of row/column i
    block_R = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_R[block_i][block_j] += R[i][j]
    # show the results from the second method
    do_second_method(fel_tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the results from the third method;
    # the subtrees are parsed again from the form text, presumably because
    # subtree_a and subtree_b were modified when they were connected above
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    # give each subtree an extra tip named 'special' hanging from its root
    # at half of the connecting branch length
    for t in (tree_m3_a, tree_m3_b):
        neo = Newick.NewickNode()
        neo.name = 'special'
        # NOTE(review): under Python 2 this is integer division if fs.blen
        # is an int -- confirm that the form field yields a float
        neo.blen = connecting_branch_length / 2
        t.get_root().add_child(neo)
    feltree_m3_a = NewickIO.parse(NewickIO.get_newick_string(tree_m3_a),
                                  FelTree.NewickTree)
    feltree_m3_b = NewickIO.parse(NewickIO.get_newick_string(tree_m3_b),
                                  FelTree.NewickTree)
    # parse fresh copies again and join them under a new root,
    # with half of the connecting branch length on each side
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    new_root = Newick.NewickNode()
    tree_m3_a.get_root().blen = connecting_branch_length / 2
    tree_m3_b.get_root().blen = connecting_branch_length / 2
    new_root.add_child(tree_m3_a.get_root())
    new_root.add_child(tree_m3_b.get_root())
    tree_m3 = Newick.NewickTree(new_root)
    feltree_m3 = NewickIO.parse(NewickIO.get_newick_string(tree_m3),
                                FelTree.NewickTree)
    branch_d2 = connecting_branch_length / 2
    do_third_method(feltree_m3_a, feltree_m3_b, feltree_m3, branch_d2,
                    taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the expected results
    print >> out, 'M:'
    print >> out, MatrixUtil.m_to_string(R)
    print >> out, 'M summed within blocks:'
    print >> out, MatrixUtil.m_to_string(block_R)
    # return the response
    return out.getvalue()
def make_tree(D, ordered_states, iteration_callback=None):
    """
    Create a tree from a distance matrix using neighbor joining.
    The input matrix is not modified; each iteration builds a new,
    smaller matrix in which the joined pair is replaced by one row.
    @param D: a row major distance matrix
    @param ordered_states: state names ordered according to the distance matrix
    @param iteration_callback: if given, called as callback(Q, result)
    with the Q matrix and the output of each iteration call
    @return: the joined tree, parsed as a FelTree.NewickTree
    @raise ValueError: when fewer than three states are given
    """
    # make sure that there are enough states
    if len(ordered_states) < 3:
        raise ValueError(
            'the neighbor joining algorithm needs at least three nodes')
    # create a dictionary mapping the subtree root node serial number to a subtree
    forest = {}
    # set the current state;
    # tips take the serials 0..n-1 and each new internal node takes the next one
    index_to_serial = list(range(len(ordered_states)))
    next_serial = len(ordered_states)
    # repeatedly pair off neighbors
    while True:
        # get the new vector of distances and the neighbor index pair
        result = do_iteration(D)
        if iteration_callback:
            # get the Q matrix for show
            Q = get_Q_matrix(D)
            # report the Q matrix and the result of the iteration
            iteration_callback(Q, result)
        v, (f, g) = result
        # create the subtree from the index pair
        root = Newick.NewickNode()
        root.serial_number = next_serial
        # when only three nodes remain they all join at the final root,
        # otherwise only the selected neighbor pair is joined
        is_last_join = len(index_to_serial) == 3
        if is_last_join:
            branch_indices = range(3)
        else:
            branch_indices = (f, g)
        # add branches to the tree
        for index in branch_indices:
            serial = index_to_serial[index]
            # reuse the subtree built for this serial if there is one;
            # test against the None sentinel explicitly rather than
            # relying on the truthiness of a node object
            neo = forest.pop(serial, None)
            if neo is None:
                neo = Newick.NewickNode()
                neo.serial_number = serial
            root.add_child(neo)
            neo.set_parent(root)
            neo.blen = v[index]
        # handle the terminal case
        if is_last_join:
            # create the newick tree from the root node
            tree = Newick.NewickTree(root)
            # add names to the tips of the tree
            for node in tree.gen_tips():
                node.name = ordered_states[node.serial_number]
            # convert the tree to a FelTree and return it
            return NewickIO.parse(tree.get_newick_string(), FelTree.NewickTree)
        # add the subtree to the forest
        forest[next_serial] = root
        # make the next distance matrix:
        # drop the rows and columns of the joined pair
        # and append a row and a column for the new node
        joined = (f, g)
        next_D = []
        for i, row in enumerate(D):
            if i not in joined:
                next_row = [
                    value for j, value in enumerate(row) if j not in joined
                ]
                next_row.append(v[i])
                next_D.append(next_row)
        next_row = [value for j, value in enumerate(v) if j not in joined]
        next_row.append(0)
        next_D.append(next_row)
        D = next_D
        # make the next serial number map
        index_to_serial = [
            value for j, value in enumerate(index_to_serial)
            if j not in joined
        ]
        index_to_serial.append(next_serial)
        # increment the serial number
        next_serial += 1
def _make_tree_helper(self, D, index_to_serial, depth=0):
    """
    Recursively build a newick tree from a distance matrix.
    The matrix is split into a selection and its complement, a tree is built
    recursively for each part, and the two trees are merged.
    When the 'halving' fallback is used, D is modified in place.
    @param D: a row major distance matrix
    @param index_to_serial: converts an index in D to a serial number for the tree node
    @param depth: gives the recursion depth; this is for instrumentation
    @return: a newick tree with branch lengths
    @raise NeighborhoodJoiningError: when the halving fallback cannot make progress
    """
    # instrumentation to notify the framework that a recursive call has been made
    if self.callback:
        self.callback(depth)
    # recursively build the newick tree
    n = len(D)
    if n == 3:
        # if there are only three nodes then return a single star tree;
        # only the distance vector v is used here, not the pair (f, g)
        v, (f, g) = NeighborJoining.do_iteration(D)
        root = Newick.NewickNode()
        for i, d in enumerate(v):
            neo = Newick.NewickNode()
            neo.serial_number = index_to_serial[i]
            neo.blen = d
            root.add_child(neo)
            neo.set_parent(root)
        return Newick.NewickTree(root)
    # try to get the selection using a custom splitter
    selection = self.splitter.get_selection(D)
    complement = set(range(n)) - selection
    # if the split was insufficient then resort to either modifying the distance matrix or using neighbor joining
    fallback = False
    if min(len(selection), len(complement)) < 2:
        fallback = True
        if self.fallback_name == 'nj':
            # use an iteration of neighbor joining if this is the preferred fallback method
            v, (f, g) = NeighborJoining.do_iteration(D)
            selection = set((f, g))
            complement = set(range(n)) - selection
        elif self.fallback_name == 'halving':
            # repeatedly modify the distance matrix if this is the preferred fallback method
            halving_count = 0
            while min(len(selection), len(complement)) < 2:
                # kill the loop if the halving count is ridiculous;
                # leaf_stem_length is defined here because this branch is
                # only reachable after earlier iterations have assigned it
                if halving_count > 1000:
                    error_out = StringIO()
                    print >> error_out, 'the number of leaf stem halving iterations is ridiculous (%d);' % halving_count
                    print >> error_out, 'the singleton leaf stem length is %s;' % leaf_stem_length
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                # find the index of the leaf singleton
                halving_count += 1
                if len(selection) == 1:
                    smaller = selection
                    larger = complement
                elif len(complement) == 1:
                    smaller = complement
                    larger = selection
                else:
                    # neither side is a singleton, so one side is empty
                    error_out = StringIO()
                    print >> error_out, 'in the following distance matrix,'
                    print >> error_out, 'a split was so degenerate that it did not even leave a leaf stem to work with:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                v = get_crossing_distances(D, selection, complement)
                # get the distance from the leaf singleton to the root of the rest of the tree
                leaf_singleton_index = list(smaller)[0]
                leaf_stem_length = v[leaf_singleton_index]
                # if the leaf stem length is zero then repeatedly halving it will not help.
                if not leaf_stem_length:
                    error_out = StringIO()
                    print >> error_out, 'the singleton leaf stem length is zero;'
                    print >> error_out, 'the number of leaf stem halving iterations performed was %d;' % halving_count
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                # modify the distance matrix in place: bring the singleton
                # closer to every other node by half of its stem length
                # NOTE(review): under Python 2 this is integer division if
                # the distances are ints -- confirm they are floats
                for i in larger:
                    D[i][leaf_singleton_index] -= leaf_stem_length / 2
                    D[leaf_singleton_index][i] -= leaf_stem_length / 2
                # get the selection and complement using the modified distance matrix
                selection = self.splitter.get_selection(D)
                complement = set(range(n)) - selection
    # define the new serial numbers for the selection and complement subtrees
    selection_serial = self.number_generator.get_next()
    complement_serial = self.number_generator.get_next()
    # for reporting purposes only,
    # store the subset of leaf serials defined by each new serial number
    for new_serial, indices in ((selection_serial, selection),
                                (complement_serial, complement)):
        serials = set(index_to_serial[i] for i in indices)
        new_set = set()
        for serial in serials:
            new_set.update(self.serial_number_to_tip_set[serial])
        self.serial_number_to_tip_set[new_serial] = new_set
    # report the split
    flattened_selection = set(
        self.ordered_labels[serial]
        for serial in self.serial_number_to_tip_set[selection_serial])
    if fallback:
        if self.fallback_name == 'nj':
            self.on_nj_fallback_split(flattened_selection, len(selection),
                                      len(complement))
        elif self.fallback_name == 'halving':
            self.on_halving_fallback_split(flattened_selection,
                                           len(selection), len(complement),
                                           halving_count)
        else:
            assert False, 'internal error: invalid fallback method'
    else:
        self.on_custom_split(flattened_selection, len(selection),
                             len(complement))
    # break the distance matrix into two distance matrices,
    # then make a tree for each one.
    A = list(sorted(selection))
    B = list(sorted(complement))
    A_distance_matrix, B_distance_matrix = split_distance_matrix(
        D, selection, complement)
    # define the serial numbers for the split distance matrices;
    # each part gets one extra trailing entry carrying the serial of the
    # other part -- presumably a placeholder tip standing for the rest of
    # the tree, which merge_trees later joins on; verify against
    # split_distance_matrix
    A_index_to_serial = [index_to_serial[i] for i in A] + [complement_serial]
    B_index_to_serial = [index_to_serial[i] for i in B] + [selection_serial]
    # make the next trees
    A_tree = self._make_tree_helper(A_distance_matrix, A_index_to_serial,
                                    depth + 1)
    B_tree = self._make_tree_helper(B_distance_matrix, B_index_to_serial,
                                    depth + 1)
    # return the merged tree
    return merge_trees(A_tree, B_tree)