def _do_analysis(self, use_generalized_nj):
    """
    Do the primary and the secondary spectral split of the pruned tree.
    The results are stored in self.first_split_object and
    self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: if bacteria appear on both sides or on neither
    side of the first split, or if eukaryota appear on both sides of the
    second split
    """
    names = self.pruned_names
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    left_indices, right_indices = BuildTreeTopology.eigenvector_to_split(v)
    # the first split must cleanly separate the bacteria from the rest
    left_domains = self._get_domains([names[x] for x in left_indices])
    right_domains = self._get_domains([names[x] for x in right_indices])
    bacteria_on_left = 'bacteria' in left_domains
    bacteria_on_right = 'bacteria' in right_domains
    if bacteria_on_left and bacteria_on_right:
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(names, L, v)
    # pick the side of the first split that holds the bacteria
    if bacteria_on_left:
        bacteria_indices = left_indices
    elif bacteria_on_right:
        bacteria_indices = right_indices
    else:
        # without this guard the code below would fail with an
        # UnboundLocalError that says nothing about the real problem
        raise HandlingError('no bacteria were found on either side of the first split')
    # get the secondary split of interest
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    # track which original indices end up in each row of the merged matrix
    full_label_sets = [set([i]) for i in range(len(names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v_secondary)
    # a singleton label set keeps its original name;
    # any larger label set is reported under the name 'all-bacteria'
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            (label,) = label_set
            pruned_names_secondary.append(names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # the second split must cleanly separate the eukaryota from the rest
    left_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError('eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
            pruned_names_secondary, L_secondary, v_secondary)
def get_verbose_summary(self):
    """
    Report the taxa, the pruned tree, the primary split and both subsplits.
    @return: a multiline string
    """
    names = self.pruned_names
    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    newick_text = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, newick_text
    print >> out
    # do the primary split of the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    print >> out
    # report the domains found on each side of the primary split
    primary_sides = [set(names[i] for i in side) for side in eigensplit]
    print >> out, 'domains represented by each side of the primary split:'
    print >> out, 'the left side has:\t', ', '.join(self._get_domains(primary_sides[0]))
    print >> out, 'the right side has:\t', ', '.join(self._get_domains(primary_sides[1]))
    print >> out
    # do one secondary split per side; the opposite side is the one merged
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(names))]
    for merged_side in (right_indices, left_indices):
        L_secondary = SchurAlgebra.mmerge(L, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        fiedler_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(
                fiedler_secondary)
        # map the indices of the merged matrix back to original indices
        left_sublabels = set().union(*[merged_label_sets[i] for i in left_subindices])
        right_sublabels = set().union(*[merged_label_sets[i] for i in right_subindices])
        left_subnames = set(names[i] for i in left_sublabels)
        right_subnames = set(names[i] for i in right_sublabels)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
        print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
        print >> out
    # return the multiline string
    return out.getvalue().strip()
def get_eigendecomposition_report(D):
    """
    Compare the spectra and Fiedler vectors derived two ways from D.
    @param D: a distance matrix
    @return: a multi-line string
    """
    out = StringIO()
    # get the Fiedler vector by way of the Laplacian and directly from D
    laplacian = Euclid.edm_to_laplacian(D)
    fiedler_from_laplacian = BuildTreeTopology.laplacian_to_fiedler(laplacian)
    fiedler_from_distance = BuildTreeTopology.edm_to_fiedler(D)
    # the split itself is not reported but the call is kept
    BuildTreeTopology.eigenvector_to_split(fiedler_from_laplacian)
    # report the two eigenvalue lists that should be the same
    neg_half_centered = -0.5 * MatrixUtil.double_centered(D)
    distance_eigenvalues = np.linalg.eigh(neg_half_centered)[0]
    print >> out, 'the laplacian-derived and distance-derived eigenvalues:'
    laplacian_eigenvalues = np.linalg.eigh(laplacian)[0]
    eigenvalue_pairs = zip(sorted(laplacian_eigenvalues), sorted(distance_eigenvalues))
    for from_laplacian, from_distance in eigenvalue_pairs:
        print >> out, from_laplacian, '\t', from_distance
    print >> out
    # report the two fiedler vectors that should be the same
    print >> out, 'the laplacian-derived and distance-derived fiedler vectors:'
    for from_laplacian, from_distance in zip(fiedler_from_laplacian, fiedler_from_distance):
        print >> out, from_laplacian, '\t', from_distance
    return out.getvalue().strip()
def get_full_tree_message(tree, m_to_string):
    """
    Describe the Fiedler split of the full tree, internal nodes included.
    @param tree: each node in this tree must have a name
    @param m_to_string: a function that converts a matrix to a string
    @return: a message about the split of the nodes of the tree
    induced by the fiedler vector
    """
    out = StringIO()
    nodes = list(tree.preorder())
    # order the nodes alphabetically by name and look up the matching ids
    ordered_names = sorted(node.get_name() for node in nodes)
    id_by_name = dict((node.get_name(), id(node)) for node in nodes)
    ordered_ids = [id_by_name[name] for name in ordered_names]
    # get the full weighted adjacency matrix
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    print >> out, 'the weighted reciprocal adjacency matrix of the full tree:'
    print >> out, m_to_string(get_reciprocal_matrix(A))
    print >> out
    # get the fiedler vector of the full Laplacian matrix
    fiedler = BuildTreeTopology.laplacian_to_fiedler(Euclid.adjacency_to_laplacian(A))
    print >> out, 'the Fiedler split of the full tree:'
    for name, loading in zip(ordered_names, fiedler):
        print >> out, name, ':', loading
    return out.getvalue().strip()
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Report the merged Laplacian and its Fiedler vector for each child tree.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    out = StringIO()
    ntips = len(L)
    every_label = set(range(ntips))
    singleton_label_sets = [set([label]) for label in range(ntips)]
    for child_index, child in enumerate(eigensplit):
        child_number = child_index + 1
        # everything outside of the child is merged
        outside = every_label - child
        L_child = SchurAlgebra.mmerge(L, outside)
        print >> out, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> out, m_to_string(scaling_factor * L_child)
        print >> out
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, outside)
        fiedler_child = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> out, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, loading in zip(merged_label_sets, fiedler_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', loading
        print >> out
    return out.getvalue().strip()
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Report the merged Laplacian and its Fiedler vector for each child tree.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    out = StringIO()
    ntips = len(L)
    every_label = set(range(ntips))
    singleton_label_sets = [set([label]) for label in range(ntips)]
    for child_index, child in enumerate(eigensplit):
        child_number = child_index + 1
        # everything outside of the child is merged
        outside = every_label - child
        L_child = SchurAlgebra.mmerge(L, outside)
        print >> out, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> out, m_to_string(scaling_factor * L_child)
        print >> out
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, outside)
        fiedler_child = BuildTreeTopology.laplacian_to_fiedler(L_child)
        print >> out, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for label_set, loading in zip(merged_label_sets, fiedler_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', loading
        print >> out
    return out.getvalue().strip()
def get_response_content(fs):
    """
    Check cuts of single-vertex Schur complements of random weighted graphs.
    For each connected random graph and each vertex, the vertex is removed
    by Schur complementation, the reduced graph is cut, and the removed
    vertex is tried on each side of the cut.
    @param fs: provides ngraphs, nvertices, pedge, fiedler_cut, random_cut
    @return: a multi-line string
    """
    out = StringIO()
    nvertices = fs.nvertices
    # try to make some graphs
    unconnected_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    for graph_index in range(fs.ngraphs):
        G = erdos_renyi(nvertices, fs.pedge)
        if not is_connected(G):
            unconnected_count += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(nvertices):
            # map each index of the reduced graph to its index in the full graph
            small_index_to_big_index = dict(
                    enumerate(i for i in range(nvertices) if i != v))
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try the removed vertex with a negative and a positive valuation
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # get the augmented split vector for this assignment
                Y_full = [0] * len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                # the split is valid if both subgraphs are connected
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            # if a valid split was not found then show the matrix
            if not found_valid_split:
                print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
                print >> out, 'matrix:'
                print >> out, MatrixUtil.m_to_string(G)
                print >> out, 'index that was removed:', v
                invalid_split_count += 1
            else:
                valid_split_count += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - unconnected_count
    print >> out, 'this many random graphs were not connected:', unconnected_count
    print >> out, 'this many splits were valid:', valid_split_count
    print >> out, 'this many splits were invalid:', invalid_split_count
    # return the result
    return out.getvalue()
def get_response_content(fs):
    """
    Report Fiedler splits of the full tree, its tips, child trees and subtrees.
    @param fs: provides tree, plain_matrix, latex_matrix, scaling_factor
    @return: a multi-line string
    @raise HandlingError: if some node of the tree is not named
    """
    out = StringIO()
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # each node must be named
    if any(not node.name for node in tree.preorder()):
        raise HandlingError('each node in the tree must have a name')
    # get the function that converts a matrix to a string
    if fs.plain_matrix:
        m_to_string = MatrixUtil.m_to_string
    elif fs.latex_matrix:
        m_to_string = latexutil.m_to_latex_string
    # print the results for the split of the full tree
    print >> out, get_full_tree_message(tree, m_to_string)
    print >> out
    # order the tips alphabetically by name and look up the matching ids
    tips = list(tree.gen_tips())
    ordered_tip_names = sorted(tip.get_name() for tip in tips)
    id_by_tip_name = dict((tip.get_name(), id(tip)) for tip in tips)
    ordered_tip_ids = [id_by_tip_name[name] for name in ordered_tip_names]
    # get the Laplacian from the distance matrix defined by the tips
    D = np.array(tree.get_partial_distance_matrix(ordered_tip_ids))
    L = Euclid.edm_to_laplacian(D)
    print >> out, 'the Schur complement in the Laplacian of the full tree scaled by', fs.scaling_factor
    print >> out, m_to_string(fs.scaling_factor * L)
    print >> out
    # get the Fiedler cut of the Schur Laplacian
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> out, 'the Fiedler split of the Schur complement of the full tree:'
    for tip_name, loading in zip(ordered_tip_names, fiedler):
        print >> out, tip_name, ':', loading
    print >> out
    # get the Fiedler cuts of Schur complements of child trees
    child_report = get_child_messages(
            L, eigensplit, ordered_tip_names, m_to_string, fs.scaling_factor)
    print >> out, child_report
    print >> out
    # get the Fiedler cuts of Schur complements of subtrees
    print >> out, get_subtree_messages(D, eigensplit, ordered_tip_names)
    # return the response
    return out.getvalue()
def do_search(self, nseconds, sampling_function):
    """
    Repeatedly sample branch lengths and examine the resulting splits.
    @param nseconds: allowed search time or None
    @param sampling_function: a function that samples a branch length
    @return: True if a tree was found that met the criteria
    @raise RuntimeError: if the search is not initialized
    """
    # NOTE(review): in the code visible here every path through the loop
    # either returns False or starts the next iteration, so nothing
    # returns True and true_splits and eigensplit are never read.
    if not self.is_initialized():
        raise RuntimeError("the search was not sufficiently initialized")
    true_splits = self.tree.get_nontrivial_splits()
    start_time = time.time()
    while True:
        # stop when the time budget, if there is one, has run out
        elapsed_time = time.time() - start_time
        if nseconds and elapsed_time > nseconds:
            return False
        # assign new sampled branch lengths
        for branch in self.tree.get_branches():
            branch.length = sampling_function()
        # get the distance matrix so we can use a library function to get the split
        D = np.array(self.tree.get_distance_matrix())
        ntips = len(D)
        # get the Fiedler split of the full tree Laplacian restricted to the leaves
        if self.force_difference or self.informative_full_split:
            A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
            v_aug = BuildTreeTopology.laplacian_to_fiedler(
                    Euclid.adjacency_to_laplacian(A_aug))
            left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
            # keep only the indices that are less than the number of tips
            tip_indices = range(ntips)
            left = [x for x in left_aug if x in tip_indices]
            right = [x for x in right_aug if x in tip_indices]
            leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
            if self.force_difference and leaf_eigensplit_aug == self.desired_primary_split:
                self.aug_split_collision_count += 1
                continue
            if self.informative_full_split and min(len(s) for s in leaf_eigensplit_aug) < 2:
                self.aug_split_degenerate_count += 1
                continue
        # get the eigensplit
        try:
            eigensplit = BuildTreeTopology.split_using_eigenvector(D)
        except BuildTreeTopology.DegenerateSplitException:
            self.degenerate_primary_split_count += 1
            continue
        except BuildTreeTopology.InvalidSpectralSplitException:
            self.error_primary_split_count += 1
            continue
def do_search(self, nseconds, sampling_function):
    """
    Repeatedly sample branch lengths and examine the resulting splits.
    @param nseconds: allowed search time or None
    @param sampling_function: a function that samples a branch length
    @return: True if a tree was found that met the criteria
    @raise RuntimeError: if the search is not initialized
    """
    # NOTE(review): in the code visible here every path through the loop
    # either returns False or starts the next iteration, so nothing
    # returns True and true_splits and eigensplit are never read.
    if not self.is_initialized():
        raise RuntimeError('the search was not sufficiently initialized')
    true_splits = self.tree.get_nontrivial_splits()
    start_time = time.time()
    while True:
        # stop when the time budget, if there is one, has run out
        elapsed_time = time.time() - start_time
        if nseconds and elapsed_time > nseconds:
            return False
        # assign new sampled branch lengths
        for branch in self.tree.get_branches():
            branch.length = sampling_function()
        # get the distance matrix so we can use a library function to get the split
        D = np.array(self.tree.get_distance_matrix())
        ntips = len(D)
        # get the Fiedler split of the full tree Laplacian restricted to the leaves
        if self.force_difference or self.informative_full_split:
            A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
            v_aug = BuildTreeTopology.laplacian_to_fiedler(
                    Euclid.adjacency_to_laplacian(A_aug))
            left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
            # keep only the indices that are less than the number of tips
            tip_indices = range(ntips)
            left = [x for x in left_aug if x in tip_indices]
            right = [x for x in right_aug if x in tip_indices]
            leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
            if self.force_difference and leaf_eigensplit_aug == self.desired_primary_split:
                self.aug_split_collision_count += 1
                continue
            if self.informative_full_split and min(len(s) for s in leaf_eigensplit_aug) < 2:
                self.aug_split_degenerate_count += 1
                continue
        # get the eigensplit
        try:
            eigensplit = BuildTreeTopology.split_using_eigenvector(D)
        except BuildTreeTopology.DegenerateSplitException:
            self.degenerate_primary_split_count += 1
            continue
        except BuildTreeTopology.InvalidSpectralSplitException:
            self.error_primary_split_count += 1
            continue
def get_response_content(fs):
    """
    Draw the graph with its vertices colored by the selected valuation.
    @param fs: provides the graph data, image options and coloring options
    @return: the value returned by get_image_string
    @raise HandlingError: if the drawable area is too small or if
    drawing fails with a CairoUtilError
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area is what remains inside of the border
    drawable_width = fs.total_width - 2 * fs.border
    drawable_height = fs.total_height - 2 * fs.border
    if drawable_width < 1 or drawable_height < 1:
        raise HandlingError('the image dimensions do not allow for enough drawable area')
    # read the image info
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        valuations = [point[0] for point in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # weight each edge by the reciprocal of its Euclidean length
            X = [np.array(point) for point in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for _ in edges]
        valuations = BuildTreeTopology.laplacian_to_fiedler(
                edges_to_laplacian(edges, weights))
    else:
        valuations = [0 for _ in points]
    if fs.flip:
        valuations = [-value for value in valuations]
    else:
        valuations = list(valuations)
    colors = valuations_to_colors(valuations)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    info = ImageInfo(
            fs.total_width, fs.total_height,
            fs.black, show_labels, fs.border, ext)
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    """
    Draw the graph with its vertices colored by the selected valuation.
    @param fs: provides the graph data, image options and coloring options
    @return: the value returned by get_image_string
    @raise HandlingError: if the drawable area is too small or if
    drawing fails with a CairoUtilError
    """
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # the drawable area is what remains inside of the border
    drawable_width = fs.total_width - 2 * fs.border
    drawable_height = fs.total_height - 2 * fs.border
    if drawable_width < 1 or drawable_height < 1:
        raise HandlingError('the image dimensions do not allow for enough drawable area')
    # read the image info
    if fs.label_from_0:
        show_labels = 0
    elif fs.label_from_1:
        show_labels = 1
    else:
        show_labels = None
    # define the valuations which will define the node colors
    if fs.color_x:
        valuations = [point[0] for point in points]
    elif fs.color_fiedler_weighted or fs.color_fiedler_unweighted:
        if fs.color_fiedler_weighted:
            # weight each edge by the reciprocal of its Euclidean length
            X = [np.array(point) for point in points]
            weights = [1.0 / np.linalg.norm(X[j] - X[i]) for i, j in edges]
        else:
            weights = [1.0 for _ in edges]
        valuations = BuildTreeTopology.laplacian_to_fiedler(
                edges_to_laplacian(edges, weights))
    else:
        valuations = [0 for _ in points]
    if fs.flip:
        valuations = [-value for value in valuations]
    else:
        valuations = list(valuations)
    colors = valuations_to_colors(valuations)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    info = ImageInfo(
            fs.total_width, fs.total_height,
            fs.black, show_labels, fs.border, ext)
    try:
        return get_image_string(points, edges, colors, info)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_standard_response(fs):
    """
    Report how the primary split and both subsplits intersect the clades.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    primary_sides = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = (
            (archaea_names, 'archaea'),
            (bacteria_names, 'bacteria'),
            (eukaryota_names, 'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(primary_sides, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # do one secondary split per side; the opposite side is the one merged
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_side in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        fiedler_s1 = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(fiedler_s1)
        # map the indices of the merged matrix back to original indices
        left_sublabels = set().union(*[merged_label_sets[i] for i in left_subindices])
        right_sublabels = set().union(*[merged_label_sets[i] for i in right_subindices])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        message_subnames_pairs = (
                ('the left side intersects', left_subnames),
                ('the right side intersects', right_subnames))
        for message, subnames in message_subnames_pairs:
            for clade, clade_name in clade_name_pairs:
                if clade & subnames:
                    print >> out, message, clade_name
        print >> out
    # show debug info
    debug_sections = (
            ('archaea names:', archaea_names),
            ('bacteria names:', bacteria_names),
            ('eukaryota names:', eukaryota_names))
    for heading, clade in debug_sections:
        print >> out, heading
        print >> out, '\n'.join(sorted(clade))
        print >> out
    # return the response
    return [('Content-Type', 'text/plain')], out.getvalue().strip()
def get_verbose_summary(self):
    """
    Report the taxa, the pruned tree, the primary split and both subsplits.
    @return: a multiline string
    """
    names = self.pruned_names
    out = StringIO()
    # show the number of taxa in various domains
    print >> out, self._get_name_summary()
    print >> out
    # show the pruned full tree
    newick_text = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, newick_text
    print >> out
    # do the primary split of the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    print >> out
    # report the domains found on each side of the primary split
    primary_sides = [set(names[i] for i in side) for side in eigensplit]
    print >> out, 'domains represented by each side of the primary split:'
    print >> out, 'the left side has:\t', ', '.join(self._get_domains(primary_sides[0]))
    print >> out, 'the right side has:\t', ', '.join(self._get_domains(primary_sides[1]))
    print >> out
    # do one secondary split per side; the opposite side is the one merged
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(names))]
    for merged_side in (right_indices, left_indices):
        L_secondary = SchurAlgebra.mmerge(L, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        fiedler_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(
                fiedler_secondary)
        # map the indices of the merged matrix back to original indices
        left_sublabels = set().union(*[merged_label_sets[i] for i in left_subindices])
        right_sublabels = set().union(*[merged_label_sets[i] for i in right_subindices])
        left_subnames = set(names[i] for i in left_sublabels)
        right_subnames = set(names[i] for i in right_sublabels)
        print >> out, 'domains represented by a subsplit:'
        print >> out, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
        print >> out, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
        print >> out
    # return the multiline string
    return out.getvalue().strip()
def _do_analysis(self, use_generalized_nj):
    """
    Do the primary and the secondary spectral split of the pruned tree.
    The results are stored in self.first_split_object and
    self.second_split_object.
    @param use_generalized_nj: True if we use an old method of outgrouping
    @raise HandlingError: if bacteria appear on both sides or on neither
    side of the first split, or if eukaryota appear on both sides of the
    second split
    """
    names = self.pruned_names
    # define the distance matrix
    D = np.array(self.pruned_tree.get_distance_matrix(names))
    # get the primary split of the criterion matrix
    L = Euclid.edm_to_laplacian(D)
    v = BuildTreeTopology.laplacian_to_fiedler(L)
    left_indices, right_indices = BuildTreeTopology.eigenvector_to_split(v)
    # the first split must cleanly separate the bacteria from the rest
    left_domains = self._get_domains([names[x] for x in left_indices])
    right_domains = self._get_domains([names[x] for x in right_indices])
    bacteria_on_left = 'bacteria' in left_domains
    bacteria_on_right = 'bacteria' in right_domains
    if bacteria_on_left and bacteria_on_right:
        raise HandlingError('bacteria were not defined by the first split')
    # now we have enough info to define the first supplementary csv file
    self.first_split_object = SupplementarySpreadsheetObject(names, L, v)
    # pick the side of the first split that holds the bacteria
    if bacteria_on_left:
        bacteria_indices = left_indices
    elif bacteria_on_right:
        bacteria_indices = right_indices
    else:
        # without this guard the code below would fail with an
        # UnboundLocalError that says nothing about the real problem
        raise HandlingError('no bacteria were found on either side of the first split')
    # get the secondary split of interest
    if use_generalized_nj:
        D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
        L_secondary = Euclid.edm_to_laplacian(D_secondary)
    else:
        L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
    # track which original indices end up in each row of the merged matrix
    full_label_sets = [set([i]) for i in range(len(names))]
    next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
    v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
    left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(v_secondary)
    # a singleton label set keeps its original name;
    # any larger label set is reported under the name 'all-bacteria'
    pruned_names_secondary = []
    for label_set in next_label_sets:
        if len(label_set) == 1:
            (label,) = label_set
            pruned_names_secondary.append(names[label])
        else:
            pruned_names_secondary.append('all-bacteria')
    # the second split must cleanly separate the eukaryota from the rest
    left_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in left_subindices])
    right_subdomains = self._get_domains(
            [pruned_names_secondary[x] for x in right_subindices])
    if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
        raise HandlingError('eukaryota were not defined by the second split')
    # now we have enough info to define the second supplementary csv file
    self.second_split_object = SupplementarySpreadsheetObject(
            pruned_names_secondary, L_secondary, v_secondary)
def get_standard_response(fs):
    """
    Report how the primary split and both subsplits intersect the clades.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # begin the response
    out = StringIO()
    # show a summary of the original data
    print >> out, 'data summary before removing branches with zero length:'
    print >> out, len(archaea_names), 'archaea names in the original tree'
    print >> out, len(bacteria_names), 'bacteria names in the original tree'
    print >> out, len(eukaryota_names), 'eukaryota names in the original tree'
    print >> out, len(all_names), 'total names in the original tree'
    print >> out
    # get the pruned full tree
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    # show a summary of the processed data
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # draw the pruned full tree
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # split the distance matrix
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    # report the eigendecomposition
    print >> out, get_eigendecomposition_report(D)
    # report the clade intersections of sides of the split
    primary_sides = [set(ordered_names[i] for i in side) for side in eigensplit]
    clade_name_pairs = (
            (archaea_names, 'archaea'),
            (bacteria_names, 'bacteria'),
            (eukaryota_names, 'eukaryota'))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(primary_sides, ('left', 'right')):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # do one secondary split per side; the opposite side is the one merged
    left_indices, right_indices = eigensplit
    singleton_label_sets = [set([i]) for i in range(len(ordered_names))]
    for merged_side in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, merged_side)
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, merged_side)
        fiedler_s1 = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(fiedler_s1)
        # map the indices of the merged matrix back to original indices
        left_sublabels = set().union(*[merged_label_sets[i] for i in left_subindices])
        right_sublabels = set().union(*[merged_label_sets[i] for i in right_subindices])
        left_subnames = set(ordered_names[i] for i in left_sublabels)
        right_subnames = set(ordered_names[i] for i in right_sublabels)
        print >> out, 'clade intersections with a subsplit:'
        message_subnames_pairs = (
                ('the left side intersects', left_subnames),
                ('the right side intersects', right_subnames))
        for message, subnames in message_subnames_pairs:
            for clade, clade_name in clade_name_pairs:
                if clade & subnames:
                    print >> out, message, clade_name
        print >> out
    # show debug info
    debug_sections = (
            ('archaea names:', archaea_names),
            ('bacteria names:', bacteria_names),
            ('eukaryota names:', eukaryota_names))
    for heading, clade in debug_sections:
        print >> out, heading
        print >> out, '\n'.join(sorted(clade))
        print >> out
    # return the response
    return [('Content-Type', 'text/plain')], out.getvalue().strip()
class TreeSearch:
    """
    This is a virtual base class.
    The boolean requirements are defined by the user and the search options
    are defined by the subclass; do_search refuses to run until all of
    them have been set to something other than None.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        # this count is reported but never incremented in this class;
        # do_search raises a RuntimeError for an invalid primary split
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        """
        @return: True if no requirement or search option is still None
        """
        required_data = [
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index]
        return None not in required_data

    def get_result_text(self):
        """
        Summarize the bookkeeping counts.
        Sections that correspond to a disabled requirement are omitted.
        @return: a multi-line string of text
        """
        lines = []
        if self.force_difference or self.informative_full_split:
            lines.extend([
                'full graph split stats:',
                '%s full graph splits collided with the desired primary split' % (
                    self.aug_split_collision_count,),
                '%s full graph splits were degenerate' % (
                    self.aug_split_degenerate_count,),
                ''])
        lines.extend([
            'primary split stats:',
            '%s errors in finding the primary split (should be 0)' % (
                self.error_primary_split_count,),
            '%s invalid primary splits (should be 0)' % (
                self.invalid_primary_split_count,),
            '%s degenerate primary splits' % (
                self.degenerate_primary_split_count,),
            '%s primary splits were not the target split' % (
                self.undesired_primary_split_count,),
            '%s primary splits were the target split' % (
                self.desired_primary_split_count,),
            ''])
        if self.informative_children:
            lines.extend([
                'secondary split stats:',
                '%s samples had at least one uninformative child tree' % (
                    self.uninformative_child_count,),
                '%s samples had two informative child trees' % (
                    self.informative_child_count,),
                ''])
        if self.invalid_dendrogram:
            lines.extend([
                'naive dendrogram stats:',
                '%s naive dendrograms were valid' % (
                    self.valid_dendrogram_count,),
                ''])
        return '\n'.join(lines).strip()

    def do_search(self, nseconds, sampling_function):
        """
        Repeatedly resample the branch lengths of the tree until a sample
        meets every enabled requirement or the time limit is exceeded.
        The branch lengths of self.tree are overwritten on every attempt.
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        @raise RuntimeError: if the search is not initialized, or if the
        primary split is not one of the nontrivial splits of the tree
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function
            # to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree and the
            # corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(
                        self.tree.get_weighted_adjacency_matrix(
                            self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(
                        v_aug)
                # keep only the indices below ntips on each side;
                # the set is built once instead of once per element
                tip_indices = frozenset(range(ntips))
                left = [x for x in left_aug if x in tip_indices]
                right = [x for x in right_aug if x in tip_indices]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                # report the tree owned by this search object; the bare
                # name 'tree' is not bound inside this method
                raise RuntimeError(
                        'INVALID SPLIT:' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children and degenerate_subsplit_count:
                continue
            # check the dendrogram; a sample is rejected when the naive
            # dendrogram reproduces the true splits
            if self.invalid_dendrogram:
                labels = list(range(ntips))
                hierarchy = Dendrogram.get_hierarchy(
                        D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(
                        Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
def get_response_content(fs):
    """
    Sample random graphs and check cuts of their Schur complements.
    For every connected sample and every vertex, the vertex is removed by a
    Schur complement, the reduced graph is cut, and the cut is extended to
    the full graph by giving the removed vertex a loading of -1 or +1.
    The cut counts as valid if at least one of the two extensions splits
    the full graph into two connected subgraphs.
    @param fs: an object providing ngraphs, nvertices, pedge, fiedler_cut
    and random_cut
    @return: the report text
    """
    out = StringIO()
    n_unconnected = 0
    n_invalid = 0
    n_valid = 0
    for graph_index in range(fs.ngraphs):
        G = erdos_renyi(fs.nvertices, fs.pedge)
        # unconnected samples are only counted
        if not is_connected(G):
            n_unconnected += 1
            continue
        # add interesting edge weights
        add_exponential_weights(G)
        # turn the adjacency matrix into a laplacian matrix
        L = Euclid.adjacency_to_laplacian(G)
        for v in range(fs.nvertices):
            # map each index of the reduced graph to its index in the full graph
            kept_indices = [i for i in range(fs.nvertices) if i != v]
            small_index_to_big_index = dict(enumerate(kept_indices))
            # take the schur complement with respect to the given vertex
            L_reduced = get_single_element_schur_complement(L, v)
            assert len(L_reduced) == len(L) - 1
            # get the loadings of the vertices of the reduced graph;
            # if neither option is set then Y_reduced is not assigned here
            if fs.fiedler_cut:
                Y_reduced = BuildTreeTopology.laplacian_to_fiedler(L_reduced)
            elif fs.random_cut:
                Y_reduced = get_random_vector(L_reduced)
            assert len(Y_reduced) == len(L_reduced)
            # try a negative and a positive valuation for the removed vertex
            found_valid_split = False
            for augmented_loading in (-1.0, 1.0):
                # build the full split vector for this valuation
                Y_full = [0] * len(G)
                for i_reduced, loading in enumerate(Y_reduced):
                    Y_full[small_index_to_big_index[i_reduced]] = loading
                Y_full[v] = augmented_loading
                assert len(Y_full) == len(G)
                # get the two graphs defined by the split
                subgraph_a, subgraph_b = list(gen_subgraphs(G, Y_full))
                # the split is valid when both subgraphs are connected
                if is_connected(subgraph_a) and is_connected(subgraph_b):
                    found_valid_split = True
            if found_valid_split:
                n_valid += 1
                continue
            # no valid split was found so show the matrix
            print >> out, 'Found a matrix that was split incompatibly by a cut of its schur complement!'
            print >> out, 'matrix:'
            print >> out, MatrixUtil.m_to_string(G)
            print >> out, 'index that was removed:', v
            n_invalid += 1
    # show the number of connected and of unconnected graphs
    print >> out, 'this many random graphs were connected:', fs.ngraphs - n_unconnected
    print >> out, 'this many random graphs were not connected:', n_unconnected
    print >> out, 'this many splits were valid:', n_valid
    print >> out, 'this many splits were invalid:', n_invalid
    # return the result
    return out.getvalue()