def get_response_content(fs):
    """
    Split a distance matrix by a label selection and report both halves.
    The selected labels form one side of the split and every other label
    forms the other side.  Each side gets its own reduced distance matrix
    from BuildTreeTopology.update_generalized_nj, called with the indices
    of the opposite side as the set to remove.
    @param fs: an object providing the matrix, labels, and selection fields
    @return: a multi-line string
    @raise HandlingError: when the label count differs from the row count
    """
    # parse the submitted fields
    distance_matrix = np.array(fs.matrix)
    row_count = len(distance_matrix)
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    selected_labels = Util.get_stripped_lines(StringIO(fs.selection))
    # every row of the matrix needs exactly one label
    if len(ordered_labels) != row_count:
        raise HandlingError(
            'the number of taxon labels should match the number of rows in the distance matrix')
    # partition the row indices by whether their label was selected
    selected_indices = set()
    for position, label in enumerate(ordered_labels):
        if label in selected_labels:
            selected_indices.add(position)
    unselected_indices = set(range(row_count)) - selected_indices
    # only alpha, beta, and gamma are reported; the first value is not used
    _unused_R, alpha, beta, gamma = get_R_alpha_beta_gamma(
        distance_matrix, unselected_indices)
    # reduce the matrix once per side, removing the opposite side each time
    selected_matrix = BuildTreeTopology.update_generalized_nj(
        distance_matrix, unselected_indices)
    unselected_matrix = BuildTreeTopology.update_generalized_nj(
        distance_matrix, selected_indices)
    # name the rows of each reduced matrix
    singleton_names = [set([label]) for label in ordered_labels]
    selected_row_names = [
        set_to_string(group)
        for group in SchurAlgebra.vmerge(singleton_names, unselected_indices)]
    unselected_row_names = [
        set_to_string(group)
        for group in SchurAlgebra.vmerge(singleton_names, selected_indices)]
    # both report sections share one layout, so describe them as data
    sections = (
        ('new distance matrix corresponding to the selected names:',
            selected_matrix, selected_row_names),
        ('new distance matrix corresponding to the non-selected names:',
            unselected_matrix, unselected_row_names),
    )
    out = StringIO()
    print >> out, 'alpha:', alpha
    print >> out, 'beta:', beta
    print >> out, 'gamma:', gamma
    for heading, reduced_matrix, row_names in sections:
        print >> out
        print >> out, heading
        print >> out, MatrixUtil.m_to_string(reduced_matrix)
        print >> out
        print >> out, 'ordered labels corresponding to this matrix:'
        for row_name in row_names:
            print >> out, row_name
    return out.getvalue()
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Describe each side of a split through its own merged Laplacian.
    For every side of the split, the tips on the other side are merged
    with SchurAlgebra.mmerge; the scaled result and the Fiedler vector of
    the unscaled result are written to the report.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    out = StringIO()
    tip_count = len(L)
    every_tip = set(range(tip_count))
    singleton_label_sets = [set([tip]) for tip in range(tip_count)]
    # children are numbered from one in the report
    child_number = 0
    for child in eigensplit:
        child_number += 1
        # everything outside this child gets merged
        other_side = every_tip - child
        merged_laplacian = SchurAlgebra.mmerge(L, other_side)
        print >> out, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> out, m_to_string(scaling_factor * merged_laplacian)
        print >> out
        # label sets are merged the same way so they line up with the matrix
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, other_side)
        fiedler_vector = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        print >> out, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, value in zip(merged_label_sets, fiedler_vector):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    # surrounding whitespace, including the final blank line, is dropped
    return out.getvalue().strip()
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Describe each side of a split through its own merged Laplacian.
    For every side of the split, the tips on the other side are merged
    with SchurAlgebra.mmerge; the scaled result and the Fiedler vector of
    the unscaled result are written to the report.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    out = StringIO()
    tip_count = len(L)
    every_tip = set(range(tip_count))
    singleton_label_sets = [set([tip]) for tip in range(tip_count)]
    # children are numbered from one in the report
    child_number = 0
    for child in eigensplit:
        child_number += 1
        # everything outside this child gets merged
        other_side = every_tip - child
        merged_laplacian = SchurAlgebra.mmerge(L, other_side)
        print >> out, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> out, m_to_string(scaling_factor * merged_laplacian)
        print >> out
        # label sets are merged the same way so they line up with the matrix
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, other_side)
        fiedler_vector = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        print >> out, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, value in zip(merged_label_sets, fiedler_vector):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    # surrounding whitespace, including the final blank line, is dropped
    return out.getvalue().strip()
def get_splits(initial_distance_matrix, split_function, update_function, on_label_split=None):
    """
    This is the most external of the functions in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    Matrices with fewer than four rows are not split any further, so an
    input that small yields an empty set.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of label splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # The whole splitting step stays inside the try block on purpose:
        # narrowing it would change which calls may trigger the fallback.
        try:
            # split the indices using the specified function
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets
            # and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    stack.append((next_label_sets, next_D))
        except DegenerateSplitException as e:
            # we cannot recover from a degenerate split
            # unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix
            # by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            stack.append((next_label_sets, next_D))
    # hand the accumulated splits back to the caller, as the docstring promises
    return label_split_set
def get_response_content(fs):
    """
    Split a distance matrix by a label selection and report both halves.
    The selected labels form one side of the split and every other label
    forms the other side.  Each side gets its own reduced distance matrix
    from BuildTreeTopology.update_generalized_nj, called with the indices
    of the opposite side as the set to remove.
    @param fs: an object providing the matrix, labels, and selection fields
    @return: a multi-line string
    @raise HandlingError: when the label count differs from the row count
    """
    # parse the submitted fields
    distance_matrix = np.array(fs.matrix)
    row_count = len(distance_matrix)
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    selected_labels = Util.get_stripped_lines(StringIO(fs.selection))
    # every row of the matrix needs exactly one label
    if len(ordered_labels) != row_count:
        raise HandlingError(
            "the number of taxon labels should match the number of rows in the distance matrix")
    # partition the row indices by whether their label was selected
    selected_indices = set()
    for position, label in enumerate(ordered_labels):
        if label in selected_labels:
            selected_indices.add(position)
    unselected_indices = set(range(row_count)) - selected_indices
    # only alpha, beta, and gamma are reported; the first value is not used
    _unused_R, alpha, beta, gamma = get_R_alpha_beta_gamma(
        distance_matrix, unselected_indices)
    # reduce the matrix once per side, removing the opposite side each time
    selected_matrix = BuildTreeTopology.update_generalized_nj(
        distance_matrix, unselected_indices)
    unselected_matrix = BuildTreeTopology.update_generalized_nj(
        distance_matrix, selected_indices)
    # name the rows of each reduced matrix
    singleton_names = [set([label]) for label in ordered_labels]
    selected_row_names = [
        set_to_string(group)
        for group in SchurAlgebra.vmerge(singleton_names, unselected_indices)]
    unselected_row_names = [
        set_to_string(group)
        for group in SchurAlgebra.vmerge(singleton_names, selected_indices)]
    # both report sections share one layout, so describe them as data
    sections = (
        ("new distance matrix corresponding to the selected names:",
            selected_matrix, selected_row_names),
        ("new distance matrix corresponding to the non-selected names:",
            unselected_matrix, unselected_row_names),
    )
    out = StringIO()
    print >> out, "alpha:", alpha
    print >> out, "beta:", beta
    print >> out, "gamma:", gamma
    for heading, reduced_matrix, row_names in sections:
        print >> out
        print >> out, heading
        print >> out, MatrixUtil.m_to_string(reduced_matrix)
        print >> out
        print >> out, "ordered labels corresponding to this matrix:"
        for row_name in row_names:
            print >> out, row_name
    return out.getvalue()
def get_splits(initial_distance_matrix, split_function, update_function, on_label_split=None):
    """
    This is the most external of the functions in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    Matrices with fewer than four rows are not split any further, so an
    input that small yields an empty set.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of label splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # The whole splitting step stays inside the try block on purpose:
        # narrowing it would change which calls may trigger the fallback.
        try:
            # split the indices using the specified function
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets
            # and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    stack.append((next_label_sets, next_D))
        except DegenerateSplitException as e:
            # we cannot recover from a degenerate split
            # unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix
            # by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            stack.append((next_label_sets, next_D))
    # hand the accumulated splits back to the caller, as the docstring promises
    return label_split_set
def get_verbose_summary(self):
    """
    Build a plain-text report of the primary split and its two subsplits.
    The report lists the name summary, the pruned tree, the
    eigendecomposition report, and the domains found on each side of
    every split.
    @return: a multiline string
    """
    def names_covered_by(indices, label_sets):
        # flatten the label sets at these indices, then map labels to names
        labels = set()
        for index in indices:
            labels.update(label_sets[index])
        return set(self.pruned_names[label] for label in labels)

    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    narrow_newick = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, narrow_newick
    print >> out
    # primary split: distance matrix -> laplacian -> fiedler vector -> split
    distances = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    laplacian = Euclid.edm_to_laplacian(distances)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> out, get_eigendecomposition_report(distances)
    print >> out
    # report the domains present on each side of the primary split
    primary_side_names = []
    for side in eigensplit:
        primary_side_names.append(set(self.pruned_names[i] for i in side))
    print >> out, 'domains represented by each side of the primary split:'
    print >> out, 'the left side has:\t', ', '.join(self._get_domains(primary_side_names[0]))
    print >> out, 'the right side has:\t', ', '.join(self._get_domains(primary_side_names[1]))
    print >> out
    # each secondary split merges one primary side: first right, then left
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    for merged_side in (right_indices, left_indices):
        merged_laplacian = SchurAlgebra.mmerge(laplacian, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
        left_subnames = names_covered_by(left_subindices, merged_label_sets)
        right_subnames = names_covered_by(right_subindices, merged_label_sets)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
        print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
        print >> out
    # return the multiline string without surrounding whitespace
    return out.getvalue().strip()
def _do_analysis(self, use_generalized_nj):
    """
    Do some splits of the tree.
    The first split must put all bacteria on one side; the second split,
    taken after merging the bacteria, must put all eukaryota on one side.
    On success the results are stored in self.first_split_object and
    self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: when bacteria appear on both sides or on neither
    side of the first split, or eukaryota appear on both sides of the second
    """
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # assert that the first split cleanly separates the bacteria from the rest
    left_indices, right_indices = eigensplit
    left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
    right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
    if ('bacteria' in left_domains) and ('bacteria' in right_domains):
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
    # pick the side of the first split that holds the bacteria
    if 'bacteria' in left_domains:
        bacteria_indices = left_indices
    elif 'bacteria' in right_domains:
        bacteria_indices = right_indices
    else:
        # Without this branch the code below would fail with an
        # UnboundLocalError instead of a meaningful error.
        raise HandlingError('bacteria were not found on either side of the first split')
    # get the secondary split of interest
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
    left_subindices, right_subindices = eigensplit_secondary
    # singleton label sets keep their own name;
    # any other label set is reported under the name 'all-bacteria'
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            label = list(label_set)[0]
            pruned_names_secondary.append(self.pruned_names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # assert that the second split cleanly separates the eukaryota from the rest
    left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError('eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
        pruned_names_secondary, L_secondary, v_secondary)
def get_response_content(fs):
    """
    Merge the selected taxa of a distance matrix into a single taxon.
    The report holds the merged distance matrix followed by one label per
    row; the merged row is shown as the selected labels inside braces,
    listed in the same order as the ordered labels.
    @param fs: an object providing the matrix, labels, and selection fields
    @return: a multi-line string
    @raise HandlingError: when the labels are missing or duplicated, when
    the selection is too small, covers every taxon, or names unknown taxa,
    or when the label count differs from the number of matrix rows
    """
    # read the matrix
    D = fs.matrix
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if not ordered_labels:
        raise HandlingError('no ordered taxa were provided')
    known_labels = set(ordered_labels)
    if len(ordered_labels) != len(known_labels):
        raise HandlingError('the ordered taxa should be unique')
    # get the label selection and its complement
    min_selected_labels = 2
    min_unselected_labels = 1
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    if len(selected_labels) < min_selected_labels:
        raise HandlingError('at least %d taxa should be selected to be grouped' % min_selected_labels)
    # get the set of labels in the complement
    unselected_labels = known_labels - selected_labels
    if len(unselected_labels) < min_unselected_labels:
        raise HandlingError('at least %d taxa should remain outside the selected group' % min_unselected_labels)
    # assert that no bizarre labels were selected
    weird_labels = selected_labels - known_labels
    if weird_labels:
        raise HandlingError('some selected taxa are invalid: ' + str(weird_labels))
    # assert that the size of the distance matrix is compatible with the number of ordered labels
    if len(D) != len(ordered_labels):
        raise HandlingError('the number of listed taxa does not match the number of rows in the distance matrix')
    n = len(D)
    # get the set of selected indices
    index_selection = set(i for i, label in enumerate(ordered_labels) if label in selected_labels)
    # List the selection in input order: joining the set directly would
    # emit the group members in an arbitrary order.
    selected_in_order = [label for label in ordered_labels if label in selected_labels]
    # begin the response
    out = StringIO()
    # get the ordered list of sets of indices to merge
    merged_indices = SchurAlgebra.vmerge([set([x]) for x in range(n)], index_selection)
    # calculate the new distance matrix
    L = Euclid.edm_to_laplacian(D)
    L_merged = SchurAlgebra.mmerge(L, index_selection)
    D_merged = Euclid.laplacian_to_edm(L_merged)
    # print the output distance matrix and the labels of its rows
    print >> out, 'new distance matrix:'
    print >> out, MatrixUtil.m_to_string(D_merged)
    print >> out
    print >> out, 'new taxon labels:'
    for merged_index_set in merged_indices:
        if len(merged_index_set) == 1:
            # read the single index without popping it out of the set
            (only_index,) = merged_index_set
            print >> out, ordered_labels[only_index]
        else:
            print >> out, '{' + ', '.join(selected_in_order) + '}'
    # write the response
    return out.getvalue()
def update_generalized_nj(D, index_set):
    """
    Build the successor distance matrix for one side of a split.
    The update follows a neighbor-joining-like criterion, as explained in
    our tree reconstruction manuscript.  The indices in index_set are
    collapsed into a single outgroup row, and half of the length of the
    branch defined by the split is assigned to each of the two successor
    distance matrices.
    @param D: the distance matrix
    @param index_set: the subset of indices that will be removed from the updated distance matrix
    @return: a new distance matrix
    @raise ValueError: when either side of the split has fewer than two indices
    """
    n = len(D)
    # A is the side that is kept and B is the side that is collapsed
    kept = set(range(n)).difference(index_set)
    removed = set(index_set)
    kept_count = len(kept)
    removed_count = len(removed)
    if kept_count < 2 or removed_count < 2:
        raise ValueError('expected each side of the split to have at least two elements')
    # The split of the indices into A and B defines a single internal branch.
    # The average distance from A to the branch is alpha.
    # The average distance from B to the branch is beta.
    # The length of the branch is gamma.
    # The expected distance from i to a taxon in the other group is R[i].
    R = {}
    for i in kept:
        R[i] = sum(D[i, b] for b in removed) / float(removed_count)
    for j in removed:
        R[j] = sum(D[a, j] for a in kept) / float(kept_count)

    def half_min_excess(group):
        # half of the smallest R[i] + R[j] - D[i, j] over pairs in the group
        return 0.5 * min(
            R[i] + R[j] - D[i, j]
            for i, j in itertools.combinations(group, 2))

    gamma_plus_beta = half_min_excess(kept)
    alpha_plus_gamma = half_min_excess(removed)
    cross_total = sum(D[i, j] for i, j in itertools.product(kept, removed))
    alpha_plus_gamma_plus_beta = cross_total / float(kept_count * removed_count)
    gamma = alpha_plus_gamma + gamma_plus_beta - alpha_plus_gamma_plus_beta
    beta = gamma_plus_beta - gamma
    # Initialize the new distance matrix.
    D_out = SchurAlgebra.mmerge(D, index_set)
    # The outgroup sits after every kept index below the smallest removed one.
    smallest_removed = min(removed)
    outgroup_index = len([a for a in kept if a < smallest_removed])
    D_out[outgroup_index, outgroup_index] = 0
    # Adjust one of the rows and columns to reflect distances to the outgroup.
    merged_label_sets = SchurAlgebra.vmerge([set([i]) for i in range(n)], index_set)
    for row, labels in enumerate(merged_label_sets):
        if row == outgroup_index:
            continue
        a = iterutils.get_only(labels)
        d = R[a] - beta - 0.5 * gamma
        D_out[row, outgroup_index] = d
        D_out[outgroup_index, row] = d
    return D_out
def update_generalized_nj(D, index_set):
    """
    Build the successor distance matrix for one side of a split.
    The update follows a neighbor-joining-like criterion, as explained in
    our tree reconstruction manuscript.  The indices in index_set are
    collapsed into a single outgroup row, and half of the length of the
    branch defined by the split is assigned to each of the two successor
    distance matrices.
    @param D: the distance matrix
    @param index_set: the subset of indices that will be removed from the updated distance matrix
    @return: a new distance matrix
    @raise ValueError: when either side of the split has fewer than two indices
    """
    n = len(D)
    # A is the side that is kept and B is the side that is collapsed
    kept = set(range(n)).difference(index_set)
    removed = set(index_set)
    kept_count = len(kept)
    removed_count = len(removed)
    if kept_count < 2 or removed_count < 2:
        raise ValueError("expected each side of the split to have at least two elements")
    # The split of the indices into A and B defines a single internal branch.
    # The average distance from A to the branch is alpha.
    # The average distance from B to the branch is beta.
    # The length of the branch is gamma.
    # The expected distance from i to a taxon in the other group is R[i].
    R = {}
    for i in kept:
        R[i] = sum(D[i, b] for b in removed) / float(removed_count)
    for j in removed:
        R[j] = sum(D[a, j] for a in kept) / float(kept_count)

    def half_min_excess(group):
        # half of the smallest R[i] + R[j] - D[i, j] over pairs in the group
        return 0.5 * min(
            R[i] + R[j] - D[i, j]
            for i, j in itertools.combinations(group, 2))

    gamma_plus_beta = half_min_excess(kept)
    alpha_plus_gamma = half_min_excess(removed)
    cross_total = sum(D[i, j] for i, j in itertools.product(kept, removed))
    alpha_plus_gamma_plus_beta = cross_total / float(kept_count * removed_count)
    gamma = alpha_plus_gamma + gamma_plus_beta - alpha_plus_gamma_plus_beta
    beta = gamma_plus_beta - gamma
    # Initialize the new distance matrix.
    D_out = SchurAlgebra.mmerge(D, index_set)
    # The outgroup sits after every kept index below the smallest removed one.
    smallest_removed = min(removed)
    outgroup_index = len([a for a in kept if a < smallest_removed])
    D_out[outgroup_index, outgroup_index] = 0
    # Adjust one of the rows and columns to reflect distances to the outgroup.
    merged_label_sets = SchurAlgebra.vmerge([set([i]) for i in range(n)], index_set)
    for row, labels in enumerate(merged_label_sets):
        if row == outgroup_index:
            continue
        a = iterutils.get_only(labels)
        d = R[a] - beta - 0.5 * gamma
        D_out[row, outgroup_index] = d
        D_out[outgroup_index, row] = d
    return D_out
def get_standard_response(fs):
    """
    Build the plain-text report for the full tree.
    The report covers name counts before and after pruning, the pruned
    tree, the eigendecomposition report, the clades met by each side of
    the primary split and of both subsplits, and the sorted clade names.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    def names_covered_by(indices, label_sets):
        # flatten the label sets at these indices, then map labels to names
        labels = set()
        for index in indices:
            labels.update(label_sets[index])
        return set(ordered_names[label] for label in labels)

    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree and the names of its tips
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    narrow_newick = NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out, narrow_newick
    print >> out
    # primary split: distance matrix -> laplacian -> fiedler vector -> split
    distances = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    laplacian = Euclid.edm_to_laplacian(distances)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(distances)
    # report the clade intersections of sides of the split
    primary_side_names = []
    for side in eigensplit:
        primary_side_names.append(set(ordered_names[i] for i in side))
    clade_name_pairs = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'),
    )
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(primary_side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # each secondary split merges one primary side: first right, then left
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_side in (right_indices, left_indices):
        merged_laplacian = SchurAlgebra.mmerge(laplacian, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
        left_subnames = names_covered_by(left_subindices, merged_label_sets)
        right_subnames = names_covered_by(right_subindices, merged_label_sets)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info: the sorted names of every clade
    debug_sections = (
        ('archaea names:', archaea_names),
        ('bacteria names:', bacteria_names),
        ('eukaryota names:', eukaryota_names),
    )
    for heading, clade_names in debug_sections:
        print >> out, heading
        print >> out, '\n'.join(sorted(clade_names))
        print >> out
    # return the headers together with the stripped report
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text
def get_verbose_summary(self):
    """
    Build a plain-text report of the primary split and its two subsplits.
    The report lists the name summary, the pruned tree, the
    eigendecomposition report, and the domains found on each side of
    every split.
    @return: a multiline string
    """
    def names_covered_by(indices, label_sets):
        # flatten the label sets at these indices, then map labels to names
        labels = set()
        for index in indices:
            labels.update(label_sets[index])
        return set(self.pruned_names[label] for label in labels)

    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    narrow_newick = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, narrow_newick
    print >> out
    # primary split: distance matrix -> laplacian -> fiedler vector -> split
    distances = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    laplacian = Euclid.edm_to_laplacian(distances)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> out, get_eigendecomposition_report(distances)
    print >> out
    # report the domains present on each side of the primary split
    primary_side_names = []
    for side in eigensplit:
        primary_side_names.append(set(self.pruned_names[i] for i in side))
    print >> out, 'domains represented by each side of the primary split:'
    print >> out, 'the left side has:\t', ', '.join(self._get_domains(primary_side_names[0]))
    print >> out, 'the right side has:\t', ', '.join(self._get_domains(primary_side_names[1]))
    print >> out
    # each secondary split merges one primary side: first right, then left
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    for merged_side in (right_indices, left_indices):
        merged_laplacian = SchurAlgebra.mmerge(laplacian, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
        left_subnames = names_covered_by(left_subindices, merged_label_sets)
        right_subnames = names_covered_by(right_subindices, merged_label_sets)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
        print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
        print >> out
    # return the multiline string without surrounding whitespace
    return out.getvalue().strip()
def _do_analysis(self, use_generalized_nj):
    """
    Do some splits of the tree.
    The first split must put all bacteria on one side; the second split,
    taken after merging the bacteria, must put all eukaryota on one side.
    On success the results are stored in self.first_split_object and
    self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: when bacteria appear on both sides or on neither
    side of the first split, or eukaryota appear on both sides of the second
    """
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(v)
    # assert that the first split cleanly separates the bacteria from the rest
    left_indices, right_indices = eigensplit
    left_domains = self._get_domains(
        [self.pruned_names[x] for x in left_indices])
    right_domains = self._get_domains(
        [self.pruned_names[x] for x in right_indices])
    if ('bacteria' in left_domains) and ('bacteria' in right_domains):
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(
        self.pruned_names, L, v)
    # pick the side of the first split that holds the bacteria
    if 'bacteria' in left_domains:
        bacteria_indices = left_indices
    elif 'bacteria' in right_domains:
        bacteria_indices = right_indices
    else:
        # Without this branch the code below would fail with an
        # UnboundLocalError instead of a meaningful error.
        raise HandlingError(
            'bacteria were not found on either side of the first split')
    # get the secondary split of interest
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(
            D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
        v_secondary)
    left_subindices, right_subindices = eigensplit_secondary
    # singleton label sets keep their own name;
    # any other label set is reported under the name 'all-bacteria'
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            label = list(label_set)[0]
            pruned_names_secondary.append(self.pruned_names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # assert that the second split cleanly separates the eukaryota from the rest
    left_subdomains = self._get_domains(
        [pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains(
        [pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError(
            'eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
        pruned_names_secondary, L_secondary, v_secondary)
def get_standard_response(fs):
    """
    Build the plain-text report for the full tree.
    The report covers name counts before and after pruning, the pruned
    tree, the eigendecomposition report, the clades met by each side of
    the primary split and of both subsplits, and the sorted clade names.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    def names_covered_by(indices, label_sets):
        # flatten the label sets at these indices, then map labels to names
        labels = set()
        for index in indices:
            labels.update(label_sets[index])
        return set(ordered_names[label] for label in labels)

    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree and the names of its tips
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    narrow_newick = NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out, narrow_newick
    print >> out
    # primary split: distance matrix -> laplacian -> fiedler vector -> split
    distances = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    laplacian = Euclid.edm_to_laplacian(distances)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(distances)
    # report the clade intersections of sides of the split
    primary_side_names = []
    for side in eigensplit:
        primary_side_names.append(set(ordered_names[i] for i in side))
    clade_name_pairs = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'),
    )
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(primary_side_names, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # each secondary split merges one primary side: first right, then left
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_side in (right_indices, left_indices):
        merged_laplacian = SchurAlgebra.mmerge(laplacian, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        sub_fiedler = BuildTreeTopology.laplacian_to_fiedler(merged_laplacian)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(sub_fiedler)
        left_subnames = names_covered_by(left_subindices, merged_label_sets)
        right_subnames = names_covered_by(right_subindices, merged_label_sets)
        print >> out, 'clade intersections with a subsplit:'
        for clade, clade_name in clade_name_pairs:
            if clade & left_subnames:
                print >> out, 'the left side intersects', clade_name
        for clade, clade_name in clade_name_pairs:
            if clade & right_subnames:
                print >> out, 'the right side intersects', clade_name
        print >> out
    # show debug info: the sorted names of every clade
    debug_sections = (
        ('archaea names:', archaea_names),
        ('bacteria names:', bacteria_names),
        ('eukaryota names:', eukaryota_names),
    )
    for heading, clade_names in debug_sections:
        print >> out, heading
        print >> out, '\n'.join(sorted(clade_names))
        print >> out
    # return the headers together with the stripped report
    response_text = out.getvalue().strip()
    return [('Content-Type', 'text/plain')], response_text