def get_stone_weights(tree):
    """
    Compute tip weights using the method proposed by Stone and Sidow.
    Each non-root node of the tree is visited in turn; a deep copy of the
    tree is rerooted at a new node inserted on the branch above that node,
    the copy is weighted by get_thompson_weights, and every tip weight is
    accumulated after being scaled by the length of the visited branch.
    As a side effect each node of the input tree gets an integer 'id'.
    @param tree: a tree object with branch lengths for all non-root nodes
    @return: a sequence of (name, weight) pairs, normalized by the total
    """
    # tag every node so that its twin can be found inside a deep copy
    for identifier, node in enumerate(tree.preorder()):
        node.id = identifier
    # accumulate the branch-length-scaled weights over all rootings
    accumulated = {}
    for branch_node in tree.gen_non_root_nodes():
        # work on a private copy so that the input topology is untouched
        rerooted = copy.deepcopy(tree)
        matches = [n for n in rerooted.preorder() if n.id == branch_node.id]
        assert len(matches) == 1
        twin = matches[0]
        # put a new root on the branch above the twin, at fraction .5
        fresh_root = Newick.NewickNode()
        rerooted.insert_node(fresh_root, twin.parent, twin, .5)
        rerooted.reroot(fresh_root)
        # the traditional weighting stores each tip weight
        # in the 'current' attribute of the tip
        get_thompson_weights(rerooted)
        for tip in rerooted.gen_tips():
            running = accumulated.get(tip.id, 0)
            accumulated[tip.id] = running + branch_node.blen * tip.current
    # normalize by the grand total and report by tip name
    grand_total = sum(accumulated.values())
    name_weight_pairs = []
    for tip in tree.gen_tips():
        name_weight_pairs.append((tip.name, accumulated[tip.id] / grand_total))
    return name_weight_pairs
def get_response_content(fs):
    """
    Repeatedly halve long branches until none exceeds the allowed length.
    The allowed length is the total tree length divided by fs.segments.
    @param fs: form values; fs.tree and fs.segments are read
    @return: the newick string of the modified tree followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # the longest branch that is allowed to remain unbroken
    length_limit = tree.get_total_length() / float(fs.segments)
    # keep sweeping over the tree until a sweep inserts nothing
    while True:
        snapshot = list(tree.preorder())
        for node in snapshot:
            at_root = node is tree.root
            if at_root and node.blen is not None:
                raise HandlingError(
                        'the root node should not have a branch length')
            if at_root:
                continue
            if node.blen is None:
                raise HandlingError(
                        'each non-root node should have a branch length')
            if node.blen > length_limit:
                # the inserted node takes the name of the node below it
                midpoint = Newick.NewickNode()
                midpoint.name = node.name
                tree.insert_node(midpoint, node.parent, node, .5)
        # an unchanged node count means that no branch was too long
        if len(list(tree.preorder())) == len(snapshot):
            break
    # return the response
    return tree.get_newick_string() + '\n'
def get_response_content(fs):
    """
    Reroot a tree at a position along the branch between two named nodes.
    @param fs: form values; fs.tree, fs.fraction, fs.parent, fs.child are read
    @return: the newick string of the rerooted tree followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # the position of the new root along the branch
    position = fs.fraction
    # look up the node objects; a failed search is reported to the user
    try:
        upper = tree.get_unique_node(fs.parent)
        lower = tree.get_unique_node(fs.child)
    except Newick.NewickSearchError as e:
        raise HandlingError(e)
    # tolerate the two nodes being named in the reverse order
    if lower.parent is not upper and upper.parent is lower:
        upper, lower = lower, upper
        position = 1 - position
    # the two nodes must be the endpoints of a single branch
    if lower.parent is not upper:
        raise HandlingError(
                'the given parent and child nodes are not adjacent')
    # pick an existing endpoint or insert a new node inside the branch
    if position == 0:
        target = upper
    elif position == 1:
        target = lower
    else:
        target = Newick.NewickNode()
        tree.insert_node(target, upper, lower, position)
    if target is tree.root:
        raise HandlingError('the new root is the same as the old root')
    if not target.parent:
        raise HandlingError('topology error')
    # reroot, remembering the previous root
    previous_root = tree.root
    tree.reroot(target)
    # a previous root left with a single child is redundant
    if len(previous_root.children) == 1:
        tree.remove_node(previous_root)
    # return the response
    return tree.get_newick_string() + '\n'
def _build_tree(self, indices, depth):
    """
    Recursively split taxa by the signs of successive eigenvector loadings.
    @param indices: a set of indices of taxa in the current subtree
    @param depth: the depth of the current subtree
    @return: the node representing the subtree
    @raise ValueError: if indices is empty
    @raise IncompleteError: if the eigensystem is exhausted before
    every taxon has been separated
    @raise NegligibleError: if a loading is within epsilon of zero
    """
    subtree_root = Newick.NewickNode()
    if not indices:
        raise ValueError('trying to build a tree from an empty set of indices')
    if len(indices) == 1:
        # a single taxon becomes a named leaf
        taxon_index, = indices
        subtree_root.set_name(self.ordered_names[taxon_index])
        return subtree_root
    if depth >= len(self.sorted_eigensystem):
        # the ordered eigenvector loading signs
        # were unable to distinguish each taxon
        raise IncompleteError()
    # only the eigenvector of the pair at this depth is used
    eigenvalue, eigenvector = self.sorted_eigensystem[depth]
    negative_side = set()
    positive_side = set()
    found_negligible = False
    for i in indices:
        loading = eigenvector[i]
        if abs(loading) < self.epsilon:
            found_negligible = True
        elif loading < 0:
            negative_side.add(i)
        else:
            positive_side.add(i)
    if found_negligible:
        # eigenvector loadings near zero are degenerate
        raise NegligibleError()
    # attach a unit-length branch for each nonempty side, negative first
    for side in (negative_side, positive_side):
        if not side:
            continue
        child = self._build_tree(side, depth + 1)
        child.set_branch_length(1)
        subtree_root.add_child(child)
        child.set_parent(subtree_root)
    return subtree_root
def get_response_content(fs):
    """
    Insert one new node on every branch of the tree, at fraction .5.
    @param fs: form values; fs.tree is read
    @return: the newick string of the modified tree followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # iterate over a snapshot because nodes are inserted during the sweep
    for node in list(tree.preorder()):
        at_root = node is tree.root
        if at_root and node.blen is not None:
            raise HandlingError(
                    'the root node should not have a branch length')
        if at_root:
            continue
        if node.blen is None:
            raise HandlingError(
                    'each non-root node should have a branch length')
        # the inserted node takes the name of the node below it
        midpoint = Newick.NewickNode()
        midpoint.name = node.name
        tree.insert_node(midpoint, node.parent, node, .5)
    # return the response
    return tree.get_newick_string() + '\n'
def contrast_matrix_to_tree(C, ordered_names):
    """
    Reconstruct a tree from the columns of a contrast matrix.
    The single column without any zero entry is used as the root contrast;
    all other columns are handed to the recursive reconstruction.
    @param C: contrast matrix as a numpy array
    @param ordered_names: leaf names corresponding to rows of C
    @return: a newick tree object
    """
    # split the columns by whether they contain a zero entry
    root_candidates = []
    contrasts_with_zero = []
    for contrast in C.T.tolist():
        if 0 in contrast:
            contrasts_with_zero.append(contrast)
        else:
            root_candidates.append(contrast)
    # exactly one contrast should not have any zero element
    assert len(root_candidates) == 1
    # build the tree downward from a new root node;
    # the variance partition is not defined at the root so None is passed
    root_node = Newick.NewickNode()
    reconstruction = ReconstructionInfo(root_node)
    reconstruction.build_subtree(
            root_candidates[0], contrasts_with_zero, None, ordered_names)
    # wrap the root node in a newick tree
    return Newick.NewickTree(root_node)
def get_response_content(fs):
    """
    Build a text report for two subtrees joined by a connecting branch.
    The form input is validated, then do_first_method, do_second_method
    and do_third_method are each called with the shared output buffer,
    and finally the R matrix and its within-block sums are appended.
    @param fs: form values; subtree_a, subtree_b, taxa_a1, taxa_a2,
    taxa_b1, taxa_b2 and blen are read
    @return: the accumulated report as a string
    @raise HandlingError: if a taxon list has duplicates, a subtree has
    fewer than two tips or duplicate tip names, or the taxon lists do not
    partition the tips of their subtree
    """
    # read the values from the form
    subtree_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    taxa_a1 = Util.get_stripped_lines(StringIO(fs.taxa_a1))
    taxa_a2 = Util.get_stripped_lines(StringIO(fs.taxa_a2))
    subtree_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    taxa_b1 = Util.get_stripped_lines(StringIO(fs.taxa_b1))
    taxa_b2 = Util.get_stripped_lines(StringIO(fs.taxa_b2))
    connecting_branch_length = fs.blen
    # assert that no group of taxa contains duplicates
    for taxa in (taxa_a1, taxa_a2, taxa_b1, taxa_b2):
        if len(set(taxa)) != len(taxa):
            raise HandlingError('one of the lists of taxa contains duplicates')
    # assert that each subtree has at least two tips and no duplicates
    for tree in (subtree_a, subtree_b):
        tip_names = [node.get_name() for node in tree.gen_tips()]
        if len(tip_names) < 2:
            raise HandlingError('each subtree should have at least two tips')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('a subtree has duplicate tip names')
    # assert that the partitions are valid
    first_group = ('A', subtree_a, taxa_a1, taxa_a2)
    second_group = ('B', subtree_b, taxa_b1, taxa_b2)
    for tree_name, tree, taxa_1, taxa_2 in (first_group, second_group):
        tip_names = set(node.get_name() for node in tree.gen_tips())
        # every listed taxon must be a tip of the subtree
        for group_name, taxa in (('1', taxa_1), ('2', taxa_2)):
            nonsense_names = list(set(taxa) - tip_names)
            if nonsense_names:
                msg_a = 'the following taxa in group %s ' % group_name
                msg_b = 'of subtree %s ' % tree_name
                msg_c = 'are not valid tips: %s' % str(nonsense_names)
                raise HandlingError(msg_a + msg_b + msg_c)
        # the two groups must not share a taxon
        if set(taxa_1) & set(taxa_2):
            msg_a = 'the taxon lists for subtree %s ' % tree_name
            msg_b = 'are not disjoint'
            raise HandlingError(msg_a + msg_b)
        # a proper subset means that some tip is in neither group
        if (set(taxa_1) | set(taxa_2)) < tip_names:
            msg_a = 'a tip in subtree %s ' % tree_name
            msg_b = 'is not represented in either of the groups'
            raise HandlingError(msg_a + msg_b)
    # define the response
    out = StringIO()
    # get the results for the first method;
    # this must happen before the subtrees are joined below
    do_first_method(subtree_a, subtree_b,
            taxa_a1, taxa_a2, taxa_b1, taxa_b2,
            connecting_branch_length, out)
    # define the entire tree by connecting the subtrees;
    # note that this modifies subtree_a and subtree_b in place
    subtree_b.get_root().set_branch_length(connecting_branch_length)
    subtree_a.get_root().add_child(subtree_b.get_root())
    tree = subtree_a
    # define the order and structure of the distance matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    # get the distance matrix
    fel_tree = NewickIO.parse(
            NewickIO.get_newick_string(tree), FelTree.NewickTree)
    D = fel_tree.get_distance_matrix(name_order)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    # get the sums of block elements of R;
    # block_structure gives the block index (0..3) of each row and column
    block_R = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_R[block_i][block_j] += R[i][j]
    # show the results from the second method
    do_second_method(fel_tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the results from the third method
    branch_d2 = connecting_branch_length / 2
    # first build each subtree with an extra tip named 'special'
    # hanging off its root on half of the connecting branch;
    # the subtrees are parsed again because the earlier ones were joined
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    for t in (tree_m3_a, tree_m3_b):
        neo = Newick.NewickNode()
        neo.name = 'special'
        neo.blen = branch_d2
        t.get_root().add_child(neo)
    feltree_m3_a = NewickIO.parse(
            NewickIO.get_newick_string(tree_m3_a), FelTree.NewickTree)
    feltree_m3_b = NewickIO.parse(
            NewickIO.get_newick_string(tree_m3_b), FelTree.NewickTree)
    # then build the full tree with a new root between the two subtrees,
    # starting again from fresh copies without the 'special' tips
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    new_root = Newick.NewickNode()
    tree_m3_a.get_root().blen = branch_d2
    tree_m3_b.get_root().blen = branch_d2
    new_root.add_child(tree_m3_a.get_root())
    new_root.add_child(tree_m3_b.get_root())
    tree_m3 = Newick.NewickTree(new_root)
    feltree_m3 = NewickIO.parse(
            NewickIO.get_newick_string(tree_m3), FelTree.NewickTree)
    do_third_method(feltree_m3_a, feltree_m3_b, feltree_m3, branch_d2,
            taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the expected results;
    # explicit writes are used instead of the print-chevron statement
    # so that the same text is produced under python 2 and python 3
    out.write('%s\n' % 'M:')
    out.write('%s\n' % MatrixUtil.m_to_string(R))
    out.write('%s\n' % 'M summed within blocks:')
    out.write('%s\n' % MatrixUtil.m_to_string(block_R))
    # return the response
    return out.getvalue()
def make_tree(D, ordered_states, iteration_callback=None):
    """
    Create a tree from a distance matrix using neighbor joining.
    @param D: a row major distance matrix
    @param ordered_states: state names ordered according to the distance matrix
    @param iteration_callback: if given, called with the Q matrix and the
    output of each iteration
    @return: the tree, re-parsed from its newick string as a FelTree.NewickTree
    @raise ValueError: if fewer than three states are given
    """
    # make sure that there are enough states
    state_count = len(ordered_states)
    if state_count < 3:
        raise ValueError(
                'the neighbor joining algorithm needs at least three nodes')
    # subtrees that have been built so far, keyed by root serial number
    forest = {}
    # the serial number of the node represented by each index of D
    index_to_serial = range(state_count)
    next_serial = state_count
    # repeatedly pair off neighbors
    while True:
        # get the new vector of distances and the neighbor index pair
        result = do_iteration(D)
        if iteration_callback:
            # the Q matrix is computed only for reporting
            iteration_callback(get_Q_matrix(D), result)
        v, (f, g) = result
        joined = (f, g)
        # with three nodes left they are all attached to a final root
        is_final = len(index_to_serial) == 3
        root = Newick.NewickNode()
        root.serial_number = next_serial
        branch_indices = range(3) if is_final else joined
        # add branches to the tree
        for index in branch_indices:
            serial = index_to_serial[index]
            # reuse an existing subtree or create a new tip
            neo = forest.pop(serial, None)
            if not neo:
                neo = Newick.NewickNode()
                neo.serial_number = serial
            root.add_child(neo)
            neo.set_parent(root)
            neo.blen = v[index]
        if is_final:
            # create the newick tree from the root node
            tree = Newick.NewickTree(root)
            # add names to the tips of the tree
            for node in tree.gen_tips():
                node.name = ordered_states[node.serial_number]
            # convert the tree to a FelTree and return it
            return NewickIO.parse(tree.get_newick_string(), FelTree.NewickTree)
        # add the subtree to the forest
        forest[next_serial] = root
        # make the next distance matrix by dropping the joined pair
        # and appending a row and column for the new node
        shrunk = []
        for i, row in enumerate(D):
            if i in joined:
                continue
            kept = [value for j, value in enumerate(row) if j not in joined]
            kept.append(v[i])
            shrunk.append(kept)
        last_row = [value for j, value in enumerate(v) if j not in joined]
        last_row.append(0)
        shrunk.append(last_row)
        D = shrunk
        # make the next serial number map in the same way
        remaining = [
                value for j, value in enumerate(index_to_serial)
                if j not in joined]
        remaining.append(next_serial)
        index_to_serial = remaining
        # increment the serial number
        next_serial += 1
def build_subtree(self, current_contrast, subtree_contrasts,
        neg_weight, ordered_names):
    """
    Recursively build the subtree below the node of this object.
    A child is created for the negative entries of the current contrast
    and then for the positive entries; afterwards the child branch lengths
    and the variance of this object are set.
    @param current_contrast: the contrast of the current node
    @param subtree_contrasts: a set of contrasts defining the subtree
    @param neg_weight: a way of partitioning the variance or None if root
    @param ordered_names: a list of names conformant with the contrast vector
    @raise ContrastError: if a side matches more than one contrast, or
    matches none while having more than one entry
    """
    # build the negative and then the positive subtree
    for sign_test in (is_negative, is_positive):
        side = frozenset(
                k for k, x in enumerate(current_contrast) if sign_test(x))
        # sort the remaining contrasts by how their support relates to the side
        exact_matches = []
        nested_contrasts = []
        for contrast in subtree_contrasts:
            support = frozenset(k for k, x in enumerate(contrast) if x)
            if support == side:
                exact_matches.append(contrast)
            elif support < side:
                nested_contrasts.append(contrast)
        if len(exact_matches) > 1:
            raise ContrastError()
        if not exact_matches and len(side) != 1:
            raise ContrastError()
        # create the child node and link it to the current node
        child_node = Newick.NewickNode()
        child_node.parent = self.node
        self.node.children.append(child_node)
        child_info = ReconstructionInfo(child_node)
        if exact_matches:
            # the child node is an internal node
            child_contrast = exact_matches[0]
            child_neg_weight = _get_neg_weight(
                    current_contrast, child_contrast)
            child_info.build_subtree(
                    child_contrast, nested_contrasts,
                    child_neg_weight, ordered_names)
        else:
            # the child node is a leaf node
            leaf_index, = side
            child_node.name = ordered_names[leaf_index]
            child_info.variance = 0
        self.child_info_pair.append(child_info)
    # define the child branch lengths using the variance partition
    a0 = self.child_info_pair[0].variance
    b0 = self.child_info_pair[1].variance
    total_variance = _get_contrast_variance(current_contrast)
    if neg_weight is None:
        # at the root the remaining variance is split evenly
        a = (total_variance - a0 - b0) / 2
        b = (total_variance - a0 - b0) / 2
    else:
        a, b = _get_branch_lengths(
                total_variance, (1 - neg_weight), a0, b0)
    # set the branch lengths of the children
    self.node.children[0].blen = a
    self.node.children[1].blen = b
    # set the variance of the current node
    alpha = a + a0
    beta = b + b0
    self.variance = (alpha * beta) / (alpha + beta)
def _make_tree_helper(self, D, index_to_serial, depth=0):
    """
    Recursively build a newick tree from a distance matrix.
    The matrix is split in two by the custom splitter; when a side of the
    split has fewer than two indices the configured fallback is used.
    Note that the 'halving' fallback modifies D in place.
    @param D: a row major distance matrix
    @param index_to_serial: converts an index in D to a serial number
    for the tree node
    @param depth: gives the recursion depth; this is for instrumentation
    @return: a newick tree with branch lengths
    @raise NeighborhoodJoiningError: if the halving fallback cannot
    produce a usable split
    """
    # instrumentation to notify the framework
    # that a recursive call has been made
    if self.callback:
        self.callback(depth)
    # recursively build the newick tree
    n = len(D)
    if n == 3:
        # if there are only three nodes then return a single star tree
        v, (f, g) = NeighborJoining.do_iteration(D)
        root = Newick.NewickNode()
        for i, d in enumerate(v):
            neo = Newick.NewickNode()
            neo.serial_number = index_to_serial[i]
            neo.blen = d
            root.add_child(neo)
            neo.set_parent(root)
        return Newick.NewickTree(root)
    # try to get the selection using a custom splitter
    selection = self.splitter.get_selection(D)
    complement = set(range(n)) - selection
    # if the split was insufficient then resort to either
    # modifying the distance matrix or using neighbor joining
    fallback = False
    if min(len(selection), len(complement)) < 2:
        fallback = True
        if self.fallback_name == 'nj':
            # use an iteration of neighbor joining
            # if this is the preferred fallback method
            v, (f, g) = NeighborJoining.do_iteration(D)
            selection = set((f, g))
            complement = set(range(n)) - selection
        elif self.fallback_name == 'halving':
            # repeatedly modify the distance matrix
            # if this is the preferred fallback method
            halving_count = 0
            while min(len(selection), len(complement)) < 2:
                # kill the loop if the halving count is ridiculous;
                # leaf_stem_length is bound here because at least one
                # earlier pass through the loop has completed
                if halving_count > 1000:
                    lines = [
                        'the number of leaf stem halving iterations '
                        'is ridiculous (%d);' % halving_count,
                        'the singleton leaf stem length is %s;' % (
                            leaf_stem_length),
                        'the distance matrix is:',
                        '%s' % MatrixUtil.m_to_string(D)]
                    raise NeighborhoodJoiningError('\n'.join(lines).strip())
                # find the index of the leaf singleton
                halving_count += 1
                if len(selection) == 1:
                    smaller = selection
                    larger = complement
                elif len(complement) == 1:
                    smaller = complement
                    larger = selection
                else:
                    lines = [
                        'in the following distance matrix,',
                        'a split was so degenerate that it did not '
                        'even leave a leaf stem to work with:',
                        '%s' % MatrixUtil.m_to_string(D)]
                    raise NeighborhoodJoiningError('\n'.join(lines).strip())
                v = get_crossing_distances(D, selection, complement)
                # get the distance from the leaf singleton
                # to the root of the rest of the tree
                leaf_singleton_index = list(smaller)[0]
                leaf_stem_length = v[leaf_singleton_index]
                # if the leaf stem length is zero
                # then repeatedly halving it will not help.
                if not leaf_stem_length:
                    lines = [
                        'the singleton leaf stem length is zero;',
                        'the number of leaf stem halving iterations '
                        'performed was %d;' % halving_count,
                        'the distance matrix is:',
                        '%s' % MatrixUtil.m_to_string(D)]
                    raise NeighborhoodJoiningError('\n'.join(lines).strip())
                # modify the distance matrix in place by pulling
                # the singleton half a stem closer to everything else
                for i in larger:
                    D[i][leaf_singleton_index] -= leaf_stem_length / 2
                    D[leaf_singleton_index][i] -= leaf_stem_length / 2
                # get the selection and complement
                # using the modified distance matrix
                selection = self.splitter.get_selection(D)
                complement = set(range(n)) - selection
    # define the new serial numbers
    # for the selection and complement subtrees
    selection_serial = self.number_generator.get_next()
    complement_serial = self.number_generator.get_next()
    # for reporting purposes only,
    # store the subset of leaf serials defined by each new serial number
    serial_index_pairs = (
            (selection_serial, selection),
            (complement_serial, complement))
    for new_serial, indices in serial_index_pairs:
        serials = set(index_to_serial[i] for i in indices)
        new_set = set()
        for serial in serials:
            new_set.update(self.serial_number_to_tip_set[serial])
        self.serial_number_to_tip_set[new_serial] = new_set
    # report the split
    flattened_selection = set(
            self.ordered_labels[serial]
            for serial in self.serial_number_to_tip_set[selection_serial])
    if fallback:
        if self.fallback_name == 'nj':
            self.on_nj_fallback_split(
                    flattened_selection, len(selection), len(complement))
        elif self.fallback_name == 'halving':
            self.on_halving_fallback_split(
                    flattened_selection, len(selection), len(complement),
                    halving_count)
        else:
            assert False, 'internal error: invalid fallback method'
    else:
        self.on_custom_split(
                flattened_selection, len(selection), len(complement))
    # break the distance matrix into two distance matrices,
    # then make a tree for each one.
    A = sorted(selection)
    B = sorted(complement)
    A_distance_matrix, B_distance_matrix = split_distance_matrix(
            D, selection, complement)
    # define the serial numbers for the split distance matrices;
    # each side gets one extra entry standing for the other side
    A_index_to_serial = [index_to_serial[i] for i in A] + [complement_serial]
    B_index_to_serial = [index_to_serial[i] for i in B] + [selection_serial]
    # make the next trees
    A_tree = self._make_tree_helper(
            A_distance_matrix, A_index_to_serial, depth + 1)
    B_tree = self._make_tree_helper(
            B_distance_matrix, B_index_to_serial, depth + 1)
    # return the merged tree
    return merge_trees(A_tree, B_tree)