def do_distance_analysis(X):
    """
    Report how well neighbor joining trees reproduce two distance matrices.
    The pairwise Euclidean distances between the points in X and their
    elementwise squares are each turned into a neighbor joining tree,
    and the distance matrix implied by each tree is computed for comparison.
    @param X: a sequence of points supporting subtraction; four labels are
    hard coded, so presumably four points are expected (TODO confirm)
    @return: a multi-line report string without surrounding whitespace
    """
    # one label per point
    labels = list("0123")

    def roundtrip(distance_matrix):
        # build a tree from the matrix, then read the tree's distances back
        nj_tree = NeighborJoining.make_tree(distance_matrix, labels)
        newick = NewickIO.get_newick_string(nj_tree)
        parsed = NewickIO.parse(newick, FelTree.NewickTree)
        return newick, np.array(parsed.get_distance_matrix(labels))

    # plain Euclidean distances and the tree built from them
    euclidean = np.array([[np.linalg.norm(q - p) for p in X] for q in X])
    euclidean_newick, euclidean_implied = roundtrip(euclidean)
    # squared Euclidean distances and the tree built from them
    squared = euclidean ** 2
    squared_newick, squared_implied = roundtrip(squared)
    # the report alternates between captions and the values they describe
    report_items = (
        "matrix of Euclidean distances between tetrahedron vertices:",
        euclidean,
        "neighbor joining tree constructed from D = non-squared Euclidean distances (unusual):",
        euclidean_newick,
        "distance matrix implied by this tree:",
        euclidean_implied,
        "matrix of squared distances between tetrahedron vertices:",
        squared,
        "neighbor joining tree constructed from D = squared Euclidean distances (normal):",
        squared_newick,
        "distance matrix implied by this tree:",
        squared_implied,
    )
    out = StringIO()
    for item in report_items:
        print >> out, item
    return out.getvalue().strip()
def do_distance_analysis(X):
    """
    Report how well neighbor joining trees reproduce two distance matrices.
    The pairwise Euclidean distances between the points in X and their
    elementwise squares are each turned into a neighbor joining tree,
    and the distance matrix implied by each tree is computed for comparison.
    @param X: a sequence of points supporting subtraction; four labels are
    hard coded, so presumably four points are expected (TODO confirm)
    @return: a multi-line report string without surrounding whitespace
    """
    # one label per point
    labels = list('0123')

    def roundtrip(distance_matrix):
        # build a tree from the matrix, then read the tree's distances back
        nj_tree = NeighborJoining.make_tree(distance_matrix, labels)
        newick = NewickIO.get_newick_string(nj_tree)
        parsed = NewickIO.parse(newick, FelTree.NewickTree)
        return newick, np.array(parsed.get_distance_matrix(labels))

    # plain Euclidean distances and the tree built from them
    euclidean = np.array([[np.linalg.norm(q - p) for p in X] for q in X])
    euclidean_newick, euclidean_implied = roundtrip(euclidean)
    # squared Euclidean distances and the tree built from them
    squared = euclidean**2
    squared_newick, squared_implied = roundtrip(squared)
    # the report alternates between captions and the values they describe
    report_items = (
        'matrix of Euclidean distances between tetrahedron vertices:',
        euclidean,
        'neighbor joining tree constructed from D = non-squared Euclidean distances (unusual):',
        euclidean_newick,
        'distance matrix implied by this tree:',
        euclidean_implied,
        'matrix of squared distances between tetrahedron vertices:',
        squared,
        'neighbor joining tree constructed from D = squared Euclidean distances (normal):',
        squared_newick,
        'distance matrix implied by this tree:',
        squared_implied,
    )
    out = StringIO()
    for item in report_items:
        print >> out, item
    return out.getvalue().strip()
def get_augmented_gower_selection(D):
    """
    Do a spectral sign split with neighbor joining fallback.
    The first choice is to return indices corresponding to
    positive elements of the dominant eigenvector of the gower matrix.
    If this defines a degenerate bipartition,
    then neighbor joining is used as a fallback.
    The dominant eigenvector is the one whose eigenvalue
    has the largest absolute value;
    if several eigenvalues tie then the first one
    in the order returned by the eigendecomposition is used.
    @param D: a distance matrix
    @return: the set of selected indices
    @raise ValueError: if the matrix has fewer than four rows
    """
    n = len(D)
    if n < 4:
        raise ValueError('expected a distance matrix with at least four rows')
    # get the gower matrix
    G = MatrixUtil.double_centered(numpy.array(D))
    # get the eigendecomposition; column i pairs with eigenvalue i
    eigenvalues, eigenvector_columns = linalg.eigh(G)
    # Pick the dominant eigenvector by index.
    # Taking the max over (abs(eigenvalue), eigenvector) tuples would compare
    # the eigenvector arrays whenever two magnitudes tie, which raises an error.
    dominant_index = numpy.argmax(numpy.abs(eigenvalues))
    dominant_vector = eigenvector_columns[:, dominant_index]
    # get the bipartition defined by the dominant eigenvector
    selection = set(i for i, x in enumerate(dominant_vector) if x > 0)
    complement = set(range(n)) - selection
    # if the bipartition is degenerate then resort to neighbor joining
    if min(len(selection), len(complement)) < 2:
        selection = set(NeighborJoining.get_neighbors(D))
    return selection
def get_augmented_gower_selection(D):
    """
    Do a spectral sign split with neighbor joining fallback.
    The first choice is to return indices corresponding to
    positive elements of the dominant eigenvector of the gower matrix.
    If this defines a degenerate bipartition,
    then neighbor joining is used as a fallback.
    The dominant eigenvector is the one whose eigenvalue
    has the largest absolute value;
    if several eigenvalues tie then the first one
    in the order returned by the eigendecomposition is used.
    @param D: a distance matrix
    @return: the set of selected indices
    @raise ValueError: if the matrix has fewer than four rows
    """
    n = len(D)
    if n < 4:
        raise ValueError('expected a distance matrix with at least four rows')
    # get the gower matrix
    G = MatrixUtil.double_centered(numpy.array(D))
    # get the eigendecomposition; column i pairs with eigenvalue i
    eigenvalues, eigenvector_columns = linalg.eigh(G)
    # Pick the dominant eigenvector by index.
    # Taking the max over (abs(eigenvalue), eigenvector) tuples would compare
    # the eigenvector arrays whenever two magnitudes tie, which raises an error.
    dominant_index = numpy.argmax(numpy.abs(eigenvalues))
    dominant_vector = eigenvector_columns[:, dominant_index]
    # get the bipartition defined by the dominant eigenvector
    selection = set(i for i, x in enumerate(dominant_vector) if x > 0)
    complement = set(range(n)) - selection
    # if the bipartition is degenerate then resort to neighbor joining
    if min(len(selection), len(complement)) < 2:
        selection = set(NeighborJoining.get_neighbors(D))
    return selection
def get_response_content(fs):
    """
    Build a neighbor joining tree from a labeled distance matrix.
    @param fs: form data; fs.matrix is the distance matrix
    (it must support tolist, so presumably a numpy array; TODO confirm)
    and fs.labels is text holding one label per line
    @return: the newick string of the tree followed by a newline
    @raise HandlingError: if the matrix has fewer than three rows
    or if the number of labels differs from the number of rows
    """
    # the matrix needs enough rows to make a tree
    distance_matrix = fs.matrix
    row_count = len(distance_matrix)
    if row_count < 3:
        raise HandlingError('the matrix should have at least three rows')
    # there must be exactly one label per row
    label_lines = fs.labels.splitlines()
    ordered_labels = Util.get_stripped_lines(label_lines)
    if len(ordered_labels) != row_count:
        raise HandlingError(
                'the number of ordered labels should be the same '
                'as the number of rows in the matrix')
    # build the tree and serialize it
    nj_tree = NeighborJoining.make_tree(
            distance_matrix.tolist(), ordered_labels)
    return NewickIO.get_newick_string(nj_tree) + '\n'
def get_response_content(fs):
    """
    Report matrices derived from the leaf distances of a newick tree.
    The leaf to leaf distance matrix is doubly centered and scaled by -1/2,
    and the pseudoinverse of the result is treated as the laplacian matrix.
    Each requested matrix becomes one paragraph of the response.
    @param fs: form data; the attributes read are
    tree, inlabels, distance, edge, affinity, laplacian, neglaplacian, Q, labels
    @return: the requested paragraphs separated by blank lines,
    followed by a newline
    @raise HandlingError: if the tree has fewer than two leaves
    or if the provided labels do not match the leaves of the tree
    """
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the names of the tips of the tree
    alphabetically_ordered_states = list(sorted(node.name for node in tree.gen_tips()))
    n = len(alphabetically_ordered_states)
    if n < 2:
        raise HandlingError('the newick tree should have at least two leaves')
    # read the ordered labels
    states = []
    if fs.inlabels:
        states = Util.get_stripped_lines(fs.inlabels.splitlines())
    if len(states) > 1:
        if set(states) != set(alphabetically_ordered_states):
            msg_a = 'if ordered labels are provided, '
            msg_b = 'each should correspond to a leaf of the newick tree'
            raise HandlingError(msg_a + msg_b)
        # Set equality does not notice repeated labels.
        # A repeated label would make the distance matrix a different size
        # than the n by n centering matrix, so reject it here
        # instead of failing later inside the matrix product.
        if len(states) != n:
            msg_a = 'if ordered labels are provided, '
            msg_b = 'each leaf of the newick tree should be listed exactly once'
            raise HandlingError(msg_a + msg_b)
    else:
        # fewer than two labels were provided so use the alphabetical order
        states = alphabetically_ordered_states
    # create the distance matrix
    D = tree.get_distance_matrix(states)
    # create the laplacian matrix
    M = np.array(D)
    P = np.eye(n) - np.ones((n,n))/n
    L_pinv = - 0.5 * np.dot(P, np.dot(M, P))
    L = linalg.pinv(L_pinv)
    # start collecting the paragraphs
    paragraphs = []

    def add_paragraph(title, body):
        # a paragraph is a title line followed by its content
        paragraph = StringIO()
        print >> paragraph, title
        print >> paragraph, body
        paragraphs.append(paragraph.getvalue().strip())

    # show the distance matrix if requested
    if fs.distance:
        add_paragraph(
                'path resistance (distance) matrix:',
                MatrixUtil.m_to_string(D))
    # show the edge matrix if requested
    if fs.edge:
        # each off diagonal entry is the negative reciprocal
        # of the corresponding laplacian entry
        edge_matrix = L.copy()
        for i in range(n):
            for j in range(n):
                if i == j:
                    edge_matrix[i][j] = 0
                else:
                    edge_matrix[i][j] = -1.0 / edge_matrix[i][j]
        add_paragraph(
                'edge resistance matrix:',
                MatrixUtil.m_to_string(edge_matrix))
    # show the affinity matrix if requested
    if fs.affinity:
        # each off diagonal entry is the negated laplacian entry
        affinity_matrix = L.copy()
        for i in range(n):
            for j in range(n):
                if i == j:
                    affinity_matrix[i][j] = 0
                else:
                    affinity_matrix[i][j] *= -1
        add_paragraph(
                'affinity matrix:',
                MatrixUtil.m_to_string(affinity_matrix))
    # show the laplacian matrix if requested
    if fs.laplacian:
        add_paragraph('laplacian matrix:', MatrixUtil.m_to_string(L))
    # show the negative laplacian matrix if requested
    if fs.neglaplacian:
        add_paragraph('negative laplacian matrix:', MatrixUtil.m_to_string(-L))
    # show the neighbor joining Q matrix
    if fs.Q:
        Q = NeighborJoining.get_Q_matrix(D)
        add_paragraph('neighbor-joining Q matrix:', MatrixUtil.m_to_string(Q))
    # show the ordered labels if requested
    if fs.labels:
        add_paragraph('ordered labels:', '\n'.join(states))
    # return the response
    return '\n\n'.join(paragraphs) + '\n'
def get_response_content(fs):
    """
    Report matrices derived from the leaf distances of a newick tree.
    The leaf to leaf distance matrix is doubly centered and scaled by -1/2,
    and the pseudoinverse of the result is treated as the laplacian matrix.
    Each requested matrix becomes one paragraph of the response.
    @param fs: form data; the attributes read are
    tree, inlabels, distance, edge, affinity, laplacian, neglaplacian, Q, labels
    @return: the requested paragraphs separated by blank lines,
    followed by a newline
    @raise HandlingError: if the tree has fewer than two leaves
    or if the provided labels do not match the leaves of the tree
    """
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the names of the tips of the tree
    alphabetically_ordered_states = list(
        sorted(node.name for node in tree.gen_tips()))
    n = len(alphabetically_ordered_states)
    if n < 2:
        raise HandlingError('the newick tree should have at least two leaves')
    # read the ordered labels
    states = []
    if fs.inlabels:
        states = Util.get_stripped_lines(fs.inlabels.splitlines())
    if len(states) > 1:
        if set(states) != set(alphabetically_ordered_states):
            msg_a = 'if ordered labels are provided, '
            msg_b = 'each should correspond to a leaf of the newick tree'
            raise HandlingError(msg_a + msg_b)
        # Set equality does not notice repeated labels.
        # A repeated label would make the distance matrix a different size
        # than the n by n centering matrix, so reject it here
        # instead of failing later inside the matrix product.
        if len(states) != n:
            msg_a = 'if ordered labels are provided, '
            msg_b = 'each leaf of the newick tree should be listed exactly once'
            raise HandlingError(msg_a + msg_b)
    else:
        # fewer than two labels were provided so use the alphabetical order
        states = alphabetically_ordered_states
    # create the distance matrix
    D = tree.get_distance_matrix(states)
    # create the laplacian matrix
    M = np.array(D)
    P = np.eye(n) - np.ones((n, n)) / n
    L_pinv = -0.5 * np.dot(P, np.dot(M, P))
    L = linalg.pinv(L_pinv)
    # start collecting the paragraphs
    paragraphs = []

    def add_paragraph(title, body):
        # a paragraph is a title line followed by its content
        paragraph = StringIO()
        print >> paragraph, title
        print >> paragraph, body
        paragraphs.append(paragraph.getvalue().strip())

    # show the distance matrix if requested
    if fs.distance:
        add_paragraph(
            'path resistance (distance) matrix:',
            MatrixUtil.m_to_string(D))
    # show the edge matrix if requested
    if fs.edge:
        # each off diagonal entry is the negative reciprocal
        # of the corresponding laplacian entry
        edge_matrix = L.copy()
        for i in range(n):
            for j in range(n):
                if i == j:
                    edge_matrix[i][j] = 0
                else:
                    edge_matrix[i][j] = -1.0 / edge_matrix[i][j]
        add_paragraph(
            'edge resistance matrix:',
            MatrixUtil.m_to_string(edge_matrix))
    # show the affinity matrix if requested
    if fs.affinity:
        # each off diagonal entry is the negated laplacian entry
        affinity_matrix = L.copy()
        for i in range(n):
            for j in range(n):
                if i == j:
                    affinity_matrix[i][j] = 0
                else:
                    affinity_matrix[i][j] *= -1
        add_paragraph(
            'affinity matrix:',
            MatrixUtil.m_to_string(affinity_matrix))
    # show the laplacian matrix if requested
    if fs.laplacian:
        add_paragraph('laplacian matrix:', MatrixUtil.m_to_string(L))
    # show the negative laplacian matrix if requested
    if fs.neglaplacian:
        add_paragraph('negative laplacian matrix:', MatrixUtil.m_to_string(-L))
    # show the neighbor joining Q matrix
    if fs.Q:
        Q = NeighborJoining.get_Q_matrix(D)
        add_paragraph('neighbor-joining Q matrix:', MatrixUtil.m_to_string(Q))
    # show the ordered labels if requested
    if fs.labels:
        add_paragraph('ordered labels:', '\n'.join(states))
    # return the response
    return '\n\n'.join(paragraphs) + '\n'
def _get_any_selection(self, distance_matrix):
    """
    Choose one side of a bipartition by asking neighbor joining for neighbors.
    @param distance_matrix: a numpy or row major distance matrix
    @return: a set of selected indices representing one of the two parts of the bipartition
    """
    neighbor_indices = NeighborJoining.get_neighbors(distance_matrix)
    return set(neighbor_indices)
def _make_tree_helper(self, D, index_to_serial, depth=0):
    """
    Recursively build a newick tree from a distance matrix.
    A matrix with three rows becomes a star tree.
    Otherwise the indices are split into a selection and its complement,
    a tree is built recursively for each side, and the two trees are merged.
    If the custom splitter leaves fewer than two indices on either side,
    then the fallback named by self.fallback_name ('nj' or 'halving') is used.
    Note that the 'halving' fallback modifies D in place.
    @param D: a row major distance matrix
    @param index_to_serial: converts an index in D to a serial number for the tree node
    @param depth: gives the recursion depth; this is for instrumentation
    @return: a newick tree with branch lengths
    @raise NeighborhoodJoiningError: if the 'halving' fallback cannot make progress
    """
    # instrumentation to notify the framework that a recursive call has been made
    if self.callback:
        self.callback(depth)
    # recursively build the newick tree
    n = len(D)
    if n == 3:
        # if there are only three nodes then return a single star tree
        # each entry of v becomes the branch length of one child of the root;
        # the pair (f, g) is not used in this base case
        v, (f, g) = NeighborJoining.do_iteration(D)
        root = Newick.NewickNode()
        for i, d in enumerate(v):
            neo = Newick.NewickNode()
            neo.serial_number = index_to_serial[i]
            neo.blen = d
            root.add_child(neo)
            neo.set_parent(root)
        return Newick.NewickTree(root)
    # try to get the selection using a custom splitter
    selection = self.splitter.get_selection(D)
    complement = set(range(n)) - selection
    # if the split was insufficient then resort to either modifying the distance matrix or using neighbor joining
    # NOTE(review): if fallback_name is neither 'nj' nor 'halving' then the
    # degenerate split is kept and the assertion in the reporting code below fails
    fallback = False
    if min(len(selection), len(complement)) < 2:
        fallback = True
        if self.fallback_name == 'nj':
            # use an iteration of neighbor joining if this is the preferred fallback method
            # the selection becomes the pair of indices returned by the iteration
            v, (f, g) = NeighborJoining.do_iteration(D)
            selection = set((f, g))
            complement = set(range(n)) - selection
        elif self.fallback_name == 'halving':
            # repeatedly modify the distance matrix if this is the preferred fallback method
            halving_count = 0
            while min(len(selection), len(complement)) < 2:
                # kill the loop if the halving count is ridiculous
                # leaf_stem_length is already bound here because
                # earlier iterations of this loop assigned it
                if halving_count > 1000:
                    error_out = StringIO()
                    print >> error_out, 'the number of leaf stem halving iterations is ridiculous (%d);' % halving_count
                    print >> error_out, 'the singleton leaf stem length is %s;' % leaf_stem_length
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(error_out.getvalue().strip())
                # find the index of the leaf singleton
                halving_count += 1
                if len(selection) == 1:
                    smaller = selection
                    larger = complement
                elif len(complement) == 1:
                    smaller = complement
                    larger = selection
                else:
                    # neither side has exactly one index, so one side is empty
                    error_out = StringIO()
                    print >> error_out, 'in the following distance matrix,'
                    print >> error_out, 'a split was so degenerate that it did not even leave a leaf stem to work with:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(error_out.getvalue().strip())
                # presumably v holds, for each index, its distance to the
                # point where the split crosses; verify against get_crossing_distances
                v = get_crossing_distances(D, selection, complement)
                # get the distance from the leaf singleton to the root of the rest of the tree
                leaf_singleton_index = list(smaller)[0]
                leaf_stem_length = v[leaf_singleton_index]
                # if the leaf stem length is zero then repeatedly halving it will not help.
                if not leaf_stem_length:
                    error_out = StringIO()
                    print >> error_out, 'the singleton leaf stem length is zero;'
                    print >> error_out, 'the number of leaf stem halving iterations performed was %d;' % halving_count
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(error_out.getvalue().strip())
                # modify the distance matrix
                # this changes D in place, keeping it symmetric, by pulling the
                # singleton closer to every index on the larger side
                # NOTE(review): under python 2 the division floors if the
                # stem length is an integer; confirm the entries are floats
                for i in larger:
                    D[i][leaf_singleton_index] -= leaf_stem_length / 2
                    D[leaf_singleton_index][i] -= leaf_stem_length / 2
                # get the selection and complement using the modified distance matrix
                selection = self.splitter.get_selection(D)
                complement = set(range(n)) - selection
    # define the new serial numbers for the selection and complement subtrees
    selection_serial = self.number_generator.get_next()
    complement_serial = self.number_generator.get_next()
    # for reporting purposes only,
    # store the subset of leaf serials defined by each new serial number
    for new_serial, indices in ((selection_serial, selection), (complement_serial, complement)):
        serials = set(index_to_serial[i] for i in indices)
        new_set = set()
        for serial in serials:
            new_set.update(self.serial_number_to_tip_set[serial])
        self.serial_number_to_tip_set[new_serial] = new_set
    # report the split
    # the selection is reported as labels rather than as serial numbers
    flattened_selection = set(self.ordered_labels[serial] for serial in self.serial_number_to_tip_set[selection_serial])
    if fallback:
        if self.fallback_name == 'nj':
            self.on_nj_fallback_split(flattened_selection, len(selection), len(complement))
        elif self.fallback_name == 'halving':
            self.on_halving_fallback_split(flattened_selection, len(selection), len(complement), halving_count)
        else:
            assert False, 'internal error: invalid fallback method'
    else:
        self.on_custom_split(flattened_selection, len(selection), len(complement))
    # break the distance matrix into two distance matrices,
    # then make a tree for each one.
    A = list(sorted(selection))
    B = list(sorted(complement))
    A_distance_matrix, B_distance_matrix = split_distance_matrix(D, selection, complement)
    # define the serial numbers for the split distance matrices
    # presumably each split matrix has one extra final row that stands for
    # the other side of the split, hence the appended serial; confirm
    # against split_distance_matrix
    A_index_to_serial = [index_to_serial[i] for i in A] + [complement_serial]
    B_index_to_serial = [index_to_serial[i] for i in B] + [selection_serial]
    # make the next trees
    A_tree = self._make_tree_helper(A_distance_matrix, A_index_to_serial, depth+1)
    B_tree = self._make_tree_helper(B_distance_matrix, B_index_to_serial, depth+1)
    # return the merged tree
    return merge_trees(A_tree, B_tree)
def _make_tree_helper(self, D, index_to_serial, depth=0):
    """
    Recursively build a newick tree from a distance matrix.
    A matrix with three rows becomes a star tree.
    Otherwise the indices are split into a selection and its complement,
    a tree is built recursively for each side, and the two trees are merged.
    If the custom splitter leaves fewer than two indices on either side,
    then the fallback named by self.fallback_name ('nj' or 'halving') is used.
    Note that the 'halving' fallback modifies D in place.
    @param D: a row major distance matrix
    @param index_to_serial: converts an index in D to a serial number for the tree node
    @param depth: gives the recursion depth; this is for instrumentation
    @return: a newick tree with branch lengths
    @raise NeighborhoodJoiningError: if the 'halving' fallback cannot make progress
    """
    # instrumentation to notify the framework that a recursive call has been made
    if self.callback:
        self.callback(depth)
    # recursively build the newick tree
    n = len(D)
    if n == 3:
        # if there are only three nodes then return a single star tree
        # each entry of v becomes the branch length of one child of the root;
        # the pair (f, g) is not used in this base case
        v, (f, g) = NeighborJoining.do_iteration(D)
        root = Newick.NewickNode()
        for i, d in enumerate(v):
            neo = Newick.NewickNode()
            neo.serial_number = index_to_serial[i]
            neo.blen = d
            root.add_child(neo)
            neo.set_parent(root)
        return Newick.NewickTree(root)
    # try to get the selection using a custom splitter
    selection = self.splitter.get_selection(D)
    complement = set(range(n)) - selection
    # if the split was insufficient then resort to either modifying the distance matrix or using neighbor joining
    # NOTE(review): if fallback_name is neither 'nj' nor 'halving' then the
    # degenerate split is kept and the assertion in the reporting code below fails
    fallback = False
    if min(len(selection), len(complement)) < 2:
        fallback = True
        if self.fallback_name == 'nj':
            # use an iteration of neighbor joining if this is the preferred fallback method
            # the selection becomes the pair of indices returned by the iteration
            v, (f, g) = NeighborJoining.do_iteration(D)
            selection = set((f, g))
            complement = set(range(n)) - selection
        elif self.fallback_name == 'halving':
            # repeatedly modify the distance matrix if this is the preferred fallback method
            halving_count = 0
            while min(len(selection), len(complement)) < 2:
                # kill the loop if the halving count is ridiculous
                # leaf_stem_length is already bound here because
                # earlier iterations of this loop assigned it
                if halving_count > 1000:
                    error_out = StringIO()
                    print >> error_out, 'the number of leaf stem halving iterations is ridiculous (%d);' % halving_count
                    print >> error_out, 'the singleton leaf stem length is %s;' % leaf_stem_length
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                # find the index of the leaf singleton
                halving_count += 1
                if len(selection) == 1:
                    smaller = selection
                    larger = complement
                elif len(complement) == 1:
                    smaller = complement
                    larger = selection
                else:
                    # neither side has exactly one index, so one side is empty
                    error_out = StringIO()
                    print >> error_out, 'in the following distance matrix,'
                    print >> error_out, 'a split was so degenerate that it did not even leave a leaf stem to work with:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                # presumably v holds, for each index, its distance to the
                # point where the split crosses; verify against get_crossing_distances
                v = get_crossing_distances(D, selection, complement)
                # get the distance from the leaf singleton to the root of the rest of the tree
                leaf_singleton_index = list(smaller)[0]
                leaf_stem_length = v[leaf_singleton_index]
                # if the leaf stem length is zero then repeatedly halving it will not help.
                if not leaf_stem_length:
                    error_out = StringIO()
                    print >> error_out, 'the singleton leaf stem length is zero;'
                    print >> error_out, 'the number of leaf stem halving iterations performed was %d;' % halving_count
                    print >> error_out, 'the distance matrix is:'
                    print >> error_out, MatrixUtil.m_to_string(D)
                    raise NeighborhoodJoiningError(
                        error_out.getvalue().strip())
                # modify the distance matrix
                # this changes D in place, keeping it symmetric, by pulling the
                # singleton closer to every index on the larger side
                # NOTE(review): under python 2 the division floors if the
                # stem length is an integer; confirm the entries are floats
                for i in larger:
                    D[i][leaf_singleton_index] -= leaf_stem_length / 2
                    D[leaf_singleton_index][i] -= leaf_stem_length / 2
                # get the selection and complement using the modified distance matrix
                selection = self.splitter.get_selection(D)
                complement = set(range(n)) - selection
    # define the new serial numbers for the selection and complement subtrees
    selection_serial = self.number_generator.get_next()
    complement_serial = self.number_generator.get_next()
    # for reporting purposes only,
    # store the subset of leaf serials defined by each new serial number
    for new_serial, indices in ((selection_serial, selection),
                                (complement_serial, complement)):
        serials = set(index_to_serial[i] for i in indices)
        new_set = set()
        for serial in serials:
            new_set.update(self.serial_number_to_tip_set[serial])
        self.serial_number_to_tip_set[new_serial] = new_set
    # report the split
    # the selection is reported as labels rather than as serial numbers
    flattened_selection = set(
        self.ordered_labels[serial]
        for serial in self.serial_number_to_tip_set[selection_serial])
    if fallback:
        if self.fallback_name == 'nj':
            self.on_nj_fallback_split(flattened_selection, len(selection),
                                      len(complement))
        elif self.fallback_name == 'halving':
            self.on_halving_fallback_split(flattened_selection,
                                           len(selection), len(complement),
                                           halving_count)
        else:
            assert False, 'internal error: invalid fallback method'
    else:
        self.on_custom_split(flattened_selection, len(selection),
                             len(complement))
    # break the distance matrix into two distance matrices,
    # then make a tree for each one.
    A = list(sorted(selection))
    B = list(sorted(complement))
    A_distance_matrix, B_distance_matrix = split_distance_matrix(
        D, selection, complement)
    # define the serial numbers for the split distance matrices
    # presumably each split matrix has one extra final row that stands for
    # the other side of the split, hence the appended serial; confirm
    # against split_distance_matrix
    A_index_to_serial = [index_to_serial[i] for i in A] + [complement_serial]
    B_index_to_serial = [index_to_serial[i] for i in B] + [selection_serial]
    # make the next trees
    A_tree = self._make_tree_helper(A_distance_matrix, A_index_to_serial,
                                    depth + 1)
    B_tree = self._make_tree_helper(B_distance_matrix, B_index_to_serial,
                                    depth + 1)
    # return the merged tree
    return merge_trees(A_tree, B_tree)