def clique_to_node_set(self):
    '''
    Repairs the edges of every clique so that they match the real alignment
    graph, because cliques may contain more edges than the original graph(s).

    Returns a list with one set of freshly built nodes per clique returned
    by self.get_coopt().
    '''
    repaired_cliques = []
    for clique in self.get_coopt():
        # Drop every edge that leads out of the clique. A snapshot of the
        # neighbours is iterated because remove_neighbour is called meanwhile.
        for member in clique:
            for adjacent in list(member.neighbours):
                if adjacent not in clique:
                    member.remove_neighbour(adjacent)

        rebuilt = set()
        for member in clique:
            counterpart = self.get_corr_node(member)
            kept = set()
            # A neighbour is kept when the dot-separated mult_id parts of some
            # neighbour of the corresponding node are all contained in the
            # neighbour's own mult_id parts.
            for adjacent in member.neighbours:
                for corr_adjacent in counterpart.neighbours:
                    corr_parts = set(corr_adjacent.mult_id.split("."))
                    if corr_parts.issubset(set(adjacent.mult_id.split("."))):
                        kept.add(adjacent)
            replacement = Node(member.id, member.label, kept)
            replacement.mult_id = member.mult_id
            rebuilt.add(replacement)
        repaired_cliques.append(rebuilt)
    return repaired_cliques
def apply_algorithm(self, graph1, graph2):
    '''
    Performs the pairwise alignment of graph1 and graph2 with the algorithm
    named by self.algorithm.

    "BK":  builds the modular product, runs bk_pivot and returns the clique
           (as produced by clique_to_node_set) whose nodes have the largest
           total number of neighbours. The first such clique wins on ties.
           Raises IndexError when no clique was found.
    "VF2": runs the matcher and returns max(vf2.results); when nothing was
           matched, a list holding the single node Node("null", "") is
           stored as the only result and returned.
    Any other value of self.algorithm returns None.
    '''
    if self.algorithm == "BK":
        mp = MP(graph1, graph2)
        bk = BK(graph1, graph2)
        x = set()
        r = set()
        p = list(mp.modp)
        bk.bk_pivot(r, p, x)
        results = bk.clique_to_node_set()

        def neighbour_total(clique):
            return sum(len(node.neighbours) for node in clique)

        # The running maximum is computed once for the initial candidate and
        # afterwards only replaced by a strictly better sum. Re-adding the
        # current maximum on every iteration would inflate the threshold.
        max_res = results[0]
        max_res_neighbour_sum = neighbour_total(max_res)
        for res in results[1:]:
            neighbour_sum = neighbour_total(res)
            if neighbour_sum > max_res_neighbour_sum:
                max_res = res
                max_res_neighbour_sum = neighbour_sum
        return max_res
    elif self.algorithm == "VF2":
        vf2 = VF2(graph1, graph2)
        vf2.match()
        if not vf2.results:
            vf2.results.append([Node("null", "")])
        # NOTE(review): max() relies on the ordering of the result
        # containers themselves - confirm this is the intended criterion.
        return max(vf2.results)
def match(self, last_mapped=(Node("-1", ""), Node("-1", "")), depth=0):
    '''
    Recursive matching step.

    last_mapped: the pair that was mapped by the calling level; its first two
                 elements are handed to set_inout and restore_ds.
    depth:       current recursion depth.

    When self.s_in_small_g() holds, the current mapping self.core_s is
    recorded via append_result_graph. In every case the data structures are
    restored for this depth before returning.
    '''
    previous_first = last_mapped[0]
    previous_second = last_mapped[1]

    if not self.s_in_small_g():
        td = self.set_inout(previous_first, previous_second, depth)
        for candidate in self.compute_p(td):
            first, second = candidate[0], candidate[1]
            if self.is_feasible(first, second, depth, td):
                self.compute_s_(first, second)
                self.match(candidate, depth + 1)
    else:
        self.append_result_graph(self.core_s)

    # Both paths end by undoing the changes made for this depth.
    self.restore_ds(previous_first, previous_second, depth)
def find_max_pivot(self, p, x):
    '''
    Picks the pivot among the nodes of p followed by those of x: the node
    with the most neighbours that are themselves contained in p + x.

    The first node wins on ties. When no candidate has at least one such
    neighbour, a placeholder Node('-1', '') is returned.
    '''
    candidates = p + list(x)
    pivot = Node('-1', '')
    best_count = 0
    for candidate in candidates:
        inside = sum(1 for nb in candidate.neighbours if nb in candidates)
        if inside > best_count:
            pivot, best_count = candidate, inside
    return pivot
def append_result_graph(self, result):
    '''
    Builds a graph whose nodes are the concatenated pairs of mapped nodes in
    result and connects them following the neighbourhood of the original
    (key) nodes.

    The graph is appended to self.result_graphs and its node set to
    self.results.
    '''
    graph_id = "({},{})#{}".format(self.small_g.id, self.large_g.id,
                                   len(self.result_graphs) + 1)
    merged_graph = Graph(graph_id, set())

    for source, target in result.items():
        merged = Node("{}.{}".format(source.id, target.id),
                      "{}".format(source.label))
        merged.mult_id = "{}.{}".format(source.mult_id, target.mult_id)

        for placed in merged_graph.nodes:  # ids look like "1.2"
            # Find the original key node whose id equals the first part of
            # the already placed node's id. The fallback is an empty node.
            origin = Node("")
            for original in result.keys():
                if placed.id.split(".")[:-1][0] == original.id:
                    origin = original
                    break
            if source in origin.neighbours:
                placed.add_neighbour(merged)
                merged.add_neighbour(placed)

        merged_graph.nodes.add(merged)

    self.result_graphs.append(merged_graph)
    self.results.append(merged_graph.nodes)
def match(self, last_mapped=(Node("0", ""), Node("0", "")), depth=0):
    '''
    Recursive matching step that also dumps its working state to stdout.

    last_mapped: the pair that was mapped by the calling level; its first two
                 elements are handed to set_inout and restore_ds.
    depth:       current recursion depth.

    When self.s_in_small_g() holds, the current mapping self.core_s is
    recorded via append_result_graph, the data structures are restored and
    the call returns. Otherwise every feasible candidate pair is applied and
    matched one level deeper before the data structures are restored.
    '''
    prev_a = last_mapped[0]
    prev_b = last_mapped[1]

    if self.s_in_small_g():
        self.append_result_graph(self.core_s)
        self.restore_ds(prev_a, prev_b, depth)
        return

    td = self.set_inout(prev_a, prev_b, depth)
    candidates = self.compute_p(td)

    # State dump: mapping, td, terminal sets of both graphs, depth, pairs.
    pprint.pprint(self.core_s)
    print("")
    print("td")
    print(td)
    print("")
    for terminal in (self.in_l, self.out_l):
        pprint.pprint(terminal)
    print("")
    for terminal in (self.in_s, self.out_s):
        pprint.pprint(terminal)
    print("")
    print("\ndepth {}\n".format(depth))
    print("p")
    pprint.pprint(candidates)

    for pair in candidates:
        if not self.is_feasible(pair[0], pair[1], depth, td):
            continue
        print("feasible")
        self.compute_s_(pair[0], pair[1], depth)
        self.match(pair, depth + 1)

    self.restore_ds(prev_a, prev_b, depth)
def apply_algorithm(self, graph1, graph2):
    '''
    Performs pairwise alignment using the algorithm provided. At the moment,
    SUBVF2 is the only working algorithm for multiple alignment.

    Returns the results of the subVF2 matcher for "SUBVF2" and None for an
    unknown algorithm name.

    Raises Exception for "BK" and "VF2", which are not usable for multiple
    alignment. The code that used to follow those raise statements could
    never run and has been removed.
    '''
    if self.algorithm == "BK":
        raise Exception(
            "BK algorithm is not usable for multiple alignment at the moment. But it is slow as hell anyway. Please use 'subVF2', which is also the default algorithm."
        )
    elif self.algorithm == "VF2":
        raise Exception(
            "VF2 algorithm is not really what you want for multiple alignment. Trust me. Really. Use 'subVF2' instead, please. Thanks."
        )
    elif self.algorithm == "SUBVF2":
        subvf2 = subVF2(graph1, graph2, self.scoring_matrix)
        subvf2.match()
        return subvf2.results
def __init__(self, g, h):
    '''
    Prepares the matcher state for the two graphs g and h.

    The graph with the smaller int_size() becomes self.small_g, the other one
    self.large_g (g is the small graph when the sizes are equal). self.type
    is 'isomorphism' when h == g and 'subgraph' otherwise.
    '''
    self.null_n = Node("-1", "")
    self.g, self.h = g, h
    self.type = 'isomorphism' if h == g else 'subgraph'

    # makes sure that small_g is the smaller graph
    if g.int_size() > h.int_size():
        smaller, larger = h, g
    else:
        smaller, larger = g, h
    self.small_g, self.large_g = smaller, larger

    # If a graph is undirected, the inverse edges (1,2 -> 2,1) are
    # constructed to work with the original VF2 algorithm.
    for graph in (self.large_g, self.small_g):
        if not graph.is_directed:
            graph.create_fake_directions()
    for graph in (self.large_g, self.small_g):
        graph.get_inout_neighbours()

    # Core dictionaries, one per graph, created by gen_dict with self.null_n
    # as the initial value that stands for "not mapped yet".
    self.core_s = self.small_g.gen_dict(self.null_n)
    self.core_l = self.large_g.gen_dict(self.null_n)

    # Terminal sets of each graph, created by gen_dict with 0 as the initial
    # value; they are meant to record the recursion depth at which a node
    # entered the corresponding set.
    self.in_s = self.small_g.gen_dict(0)
    self.out_s = self.small_g.gen_dict(0)
    self.in_l = self.large_g.gen_dict(0)
    self.out_l = self.large_g.gen_dict(0)

    self.result_graphs = []
    self.results = []
def greedy(self):
    '''
    Performs multiple alignment following a greedy approach: every pairwise
    alignment is calculated (or taken from self.already_done), the best
    scoring pair is merged into one alignment graph that replaces its two
    inputs in self.graph_list, and the procedure recurses until at most one
    graph is left.

    On termination the remaining graph gets fresh undirected edges and is
    stored in self.result, its newick string in self.newick. Nothing happens
    when self.graph_list is empty.

    Raises Exception when the chosen alignment contains the null node and
    self.algorithm is "VF2".
    '''
    if len(self.graph_list) <= 1:
        # Only a missing element means "nothing to align"; any other error
        # is a real problem and must not be swallowed.
        try:
            res = self.graph_list[0]
        except IndexError:
            return
        res.edges = set()
        res.create_undirected_edges()
        self.result = res
        self.newick = res.newick
        return

    maximum_score = float('-inf')  # best alignment score seen so far
    # Starting the inner slice behind the current graph makes sure that every
    # graph couple is only processed once.
    for offset, g1 in enumerate(self.graph_list[:-1], start=1):
        for g2 in self.graph_list[offset:]:
            if g1.id == g2.id:
                continue
            pair_key = (g1.id, g2.id)
            if pair_key in self.already_done:
                max_alignment = self.already_done[pair_key]
            else:
                max_alignment = self.apply_algorithm(g1, g2)[0]
                self.already_done[pair_key] = max_alignment
            if max_alignment[1] > maximum_score:  # subVF2
                alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                                  max_alignment[0])
                alignment.abbrev = alignment.id
                alignment.newick = "({},{})".format(g1.newick, g2.newick)
                maximum_score = max_alignment[1]
                alig_one = g1
                alig_two = g2

    # NOTE(review): if no pair qualified above (e.g. all ids are equal),
    # alig_one is unbound here and an UnboundLocalError is raised.
    self.graph_list.remove(alig_one)
    self.graph_list.remove(alig_two)
    self.remove_element(self.already_done, (alig_one.id, alig_two.id))
    alignment_graph = self.make_graph_real(alignment)
    alignment_graph = self.generate_graph_bools(alignment_graph)
    self.graph_list.append(alignment_graph)
    self.intermediates.append(alignment_graph)
    if Node("null", []) in alignment.nodes and self.algorithm == "VF2":
        raise Exception(
            "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using subVF2 algorithm instead."
        )
    self.greedy()
def progressive_alignment(self):
    '''
    Performs a progressive multiple alignment that is guided by a matrix of
    pairwise alignment scores.

    First the score of every pair of graphs in self.graph_list is written
    into a square matrix (taken from self.already_done or computed with
    apply_algorithm). Then, while more than one graph is left, the pair with
    the highest score is merged into one alignment graph that replaces its
    two inputs; the scores of the new graph are the averages of the scores
    of the two merged graphs.

    On termination the remaining graph gets fresh undirected edges and is
    stored in self.result, its newick string in self.newick. Nothing is
    stored when self.graph_list is empty.

    Raises AttributeError when the smallest and largest entry of a user
    supplied scoring matrix are equal, and Exception when the chosen
    alignment contains the null node and self.algorithm is "VF2".
    '''
    # TODO: consider renaming this to UPGMA or give the user a choice of
    # additional linking methods (NJ, SL, CL, WPGMA).
    # Warn the user about the assumptions of UPGMA.
    print(
        '"Warning: executing UPGMA algorithm with graph distances. Be aware of the strong assumptions of UPGMA, namely, that for any thwo input pairs, a "parent" graph may be constructed with equivalent distance to both input graphs and preserving distance to all other graphs. Since this may not be a reasonable assumption for our present way of aligning and scoring graphs, be careful with the output of UPGMA clustering.'
    )
    graph_count = len(self.graph_list)
    dist = np.zeros((graph_count, graph_count))  # square score matrix

    # check scoring matrix here
    print("\nScoring Matrix:\n{}\n".format(self.scoring_matrix))
    if self.scoring_matrix == "-1":  # no scoring given
        pass  # TODO: create some reasonable default values here
    else:  # user specified scoring
        # NOTE(review): min/max run over items(), i.e. over (key, value)
        # pairs, and the pairs are compared with 0 below - presumably
        # values() was intended; confirm the shape of the scoring matrix.
        scoremin = min(self.scoring_matrix.items())
        scoremax = max(self.scoring_matrix.items())
        if scoremin == scoremax:  # should not be possible
            raise AttributeError("Scoring Matrix seems broken: {}".format(
                self.scoring_matrix))
        elif scoremax <= 0 and scoremin < scoremax:
            # negative valued distance measure assumed, zero for identity
            pass  # TODO: negative should be a metric, warn for conversion
        elif scoremin >= 0 and scoremax > scoremin:
            # gap value decides between similarity and distance measure
            pass  # TODO: inform user about conversion
        else:
            # mixed scoring, most likely positive matches, negative
            # mismatches, neutral gap
            pass  # TODO: convert to a proper metric

    # Fill the matrix initially. Every ordered pair has to be visited;
    # zipping the two index ranges would only ever reach the diagonal.
    for i1, g1 in enumerate(self.graph_list):
        for i2, g2 in enumerate(self.graph_list):
            if i1 == i2:
                dist[i1, i2] = -1
            elif (g1, g2) in self.already_done:
                dist[i1, i2] = self.already_done[(g1, g2)][1]
            else:
                max_alignment = self.apply_algorithm(g1, g2)[0]
                self.already_done[(g1, g2)] = max_alignment
                self.already_done[(g2, g1)] = max_alignment
                dist[i1, i2] = max_alignment[1]

    while len(self.graph_list) > 1:
        # Find the indices of the highest score. If the highest score occurs
        # more than once, the first occurrence is taken.
        i1, i2 = np.unravel_index(np.argmax(dist, axis=None), dist.shape)
        g1 = self.graph_list[i1]
        g2 = self.graph_list[i2]

        # Compute the alignment if it is not already present. This is the
        # case for pairs that involve an earlier merged alignment graph.
        if (g1, g2) not in self.already_done:
            max_alignment = self.apply_algorithm(g1, g2)[0]
            self.already_done[(g1, g2)] = max_alignment
            self.already_done[(g2, g1)] = max_alignment

        alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                          self.already_done[(g1, g2)][0])
        alignment.abbrev = alignment.id
        alignment.newick = "({},{})".format(g1.newick, g2.newick)

        if Node("null", []) in alignment.nodes and self.algorithm == "VF2":
            raise Exception(
                "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using subVF2 algorithm instead."
            )

        # Remove the old graphs.
        self.graph_list.remove(g1)
        self.graph_list.remove(g2)
        self.remove_element(self.already_done, (g1, g2))
        self.remove_element(self.already_done, (g2, g1))

        # Generate the alignment graph and append it to the list, so its
        # index matches the row/column appended to dist below.
        alignment_graph = self.make_graph_real(alignment)
        alignment_graph = self.generate_graph_bools(alignment_graph)
        self.graph_list.append(alignment_graph)
        self.intermediates.append(alignment_graph)

        # Scores of the merged graph: plain average of the two merged
        # rows. This is actually WPGMA.
        # TODO: normalize by the alignment size of the graphs for UPGMA.
        computed_distances = [(dist[k, i1] + dist[k, i2]) / 2
                              for k in range(dist.shape[0])
                              if k not in (i1, i2)]
        # Drop the rows/columns of the merged graphs and append a new
        # row and column (with -1 on the diagonal) for the alignment graph.
        dist = np.delete(dist, [i1, i2], axis=0)
        dist = np.delete(dist, [i1, i2], axis=1)
        dist = np.append(dist, [computed_distances], axis=0)
        dist = np.append(dist,
                         np.append(computed_distances, -1).reshape((-1, 1)),
                         axis=1)

    # The alignment should be done now.
    try:
        res = self.graph_list[0]
    except IndexError:
        return
    res.edges = set()
    res.create_undirected_edges()
    self.result = res
    self.newick = res.newick
    return
def t2node(t):
    '''
    Turns a two-element tuple into a Node: the first element is converted to
    a string and used as first argument, the second one is passed on as is.
    '''
    identifier, payload = t
    node_id = str(identifier)
    return Node(node_id, payload)
def append_result_subgraph(self, result):
    '''
    Creates a graph which contains the concatenated mapped nodes from the
    subgraph matching and the padded unmapped nodes of both graphs. Then it
    adds the neighbours to the new nodes following the original neighbours.

    result[0] is the mapping that is iterated; result[1] is stored unchanged
    next to the node set in self.results (presumably the score - confirm
    against the caller).
    '''
    merged_of = {}  # original node -> new node, used to rebuild neighbours
    all_new_nodes = set()
    mapped_large_nodes = set()
    # label length of nodes from smaller / larger graph
    small_width = len(next(iter(self.core_s)).mult_id)
    large_width = len(next(iter(self.core_l)).mult_id)

    for small_node, partner in result[0].items():
        if partner:
            # nodes that were actually mapped
            merged = Node("{}.{}".format(small_node.id, partner.id),
                          small_node.label + partner.label)
            merged.mult_id = small_node.mult_id + partner.mult_id
            mapped_large_nodes.add(partner)
            merged_of[partner] = merged
            merged_of[small_node] = merged
        else:
            # nodes from the small graph without a partner in the larger
            # graph: pad label and mult_id on the right
            padded_label = small_node.get_label()
            padded_label.extend(["-"] * large_width)
            merged = Node("{}.".format(small_node.id), padded_label)
            merged.mult_id = small_node.get_mult_id()
            merged.mult_id.extend(["_____"] * large_width)
            merged_of[small_node] = merged
        all_new_nodes.add(merged)

    for large_node in self.core_l:
        if large_node in mapped_large_nodes:
            continue
        # nodes from the large graph without a partner in the smaller graph:
        # pad label and mult_id on the left
        merged = Node(".{}".format(large_node.id), large_node.get_label())
        merged.label[:0] = ["-"] * small_width
        merged.mult_id = large_node.get_mult_id()
        merged.mult_id[:0] = ["_____"] * small_width
        merged_of[large_node] = merged
        all_new_nodes.add(merged)

    # reconstructing the neighbours: every unordered pair is visited once
    originals = list(merged_of.keys())
    for start, first in enumerate(originals[:-1], start=1):
        for second in originals[start:]:
            if second in first.neighbours:
                merged_of[first].neighbours.add(merged_of[second])
                merged_of[second].neighbours.add(merged_of[first])

    graph_id = "{}-{}#{}".format(self.small_g.id, self.large_g.id,
                                 len(self.result_graphs) + 1)
    result_graph = Graph(graph_id, all_new_nodes)
    self.result_graphs.append(result_graph)
    self.results.append((result_graph.nodes, result[1]))
def parse_graph(doc):
    '''
    Parses a graph written in the custom .graph file format and returns it
    as a Graph.

    The file consists of three paragraphs separated by single empty lines:
    the header (number of nodes, number of edges, nodes labelled?, edges
    labelled?, directed?), the nodes and the edges. Fields are separated
    by ";".

    doc: path of the file to read.

    Raises Exception when the header cannot be read, is incomplete, or the
    file contains too many empty lines. When the content contradicts the
    header, the issues are printed and the process exits with status 1.
    '''
    check_list = []  # number of nodes, number of edges, labelled?, directed?
    nodes = set()
    nodes_by_id = {}  # node id -> node, used to resolve edge endpoints
    edges = set()
    # graphs with an edge amount above this number trigger additional print
    # statements to indicate progress
    limit = 5000

    with open(doc) as d:
        indicator = 0  # counts empty lines (0: header, 1: nodes, 2: edges)
        edge_counter = 0  # processed edges, for completion feedback
        for line in d:
            line = line.replace("\n", "")
            split_list = line.split(";")

            if not line:
                indicator += 1
                continue

            # building check_list
            elif indicator == 0:
                arg = split_list[-1]  # last element in row is interpreted
                if arg.upper() == "COMMENT":
                    continue
                elif line.startswith("//"):
                    continue
                elif arg.upper() in ("TRUE", "FALSE"):
                    check_list.append(arg.upper() == "TRUE")
                elif line.upper().startswith("AUTHOR"):
                    print("Reading {} from {}".format(os.path.basename(doc),
                                                      line.split(" ", 1)[1]))
                    continue
                else:
                    try:
                        # indicates number of nodes/edges
                        check_list.append(int(arg))
                    except ValueError:
                        print(
                            "Something's wrong with the first paragraph. Please check and try again."
                        )
                        print("Aborting...")
                        raise Exception(
                            "Parsing one of your graphs was not successful.")

            # building nodes
            elif indicator == 1:
                cur_node = Node(*split_list)
                if cur_node.label:
                    cur_node.label = cur_node.label.split(labelsep)
                else:
                    cur_node.label = [no_label_dummy]
                nodes.add(cur_node)
                nodes_by_id.setdefault(cur_node.id, cur_node)

            # building labelled and/or directed edges
            elif indicator == 2:
                # Both endpoints are resolved independently, so an edge from
                # a node to itself gets the node object on both ends.
                # Unknown ids are passed on unchanged.
                for position in (0, 1):
                    if position < len(split_list):
                        endpoint = split_list[position]
                        split_list[position] = nodes_by_id.get(endpoint,
                                                               endpoint)
                cur_edge = Edge(*split_list)
                edges.add(cur_edge)
                edge_counter += 1
                if edge_counter % limit == 0:
                    print("Already processed {} edges".format(edge_counter))

            else:
                print(
                    "Wrong input file format. File contains too many empty lines."
                )
                print("Aborting...")
                raise Exception(
                    "Parsing one of your graphs was not successful.")

    # All five header entries are read below; fail with a clear message
    # instead of an IndexError when some of them are missing.
    if len(check_list) < 5:
        print(
            "Something's wrong with the first paragraph. Please check and try again."
        )
        print("Aborting...")
        raise Exception("Parsing one of your graphs was not successful.")

    print_if_big(limit, edges, "Some illegalities are tested...")
    issues = ""
    if check_list[0] != len(nodes):
        issues += "Indicated number of nodes ({}) doesn't fit actual number of nodes ({}). \n".format(
            check_list[0], len(nodes))
    if check_list[1] != len(edges):
        issues += "Indicated number of edges ({}) doesn't fit actual number of edges ({}). \n".format(
            check_list[1], len(edges))
    if not check_list[2]:  # if nodes are not labelled
        for node in nodes:
            if node.label != [no_label_dummy]:
                issues += "One or more nodes are labelled. If this is intended, please indicate this at the beginning of the graph file \n"
                break
    if not check_list[3]:  # if edges are not labelled
        for edge in edges:
            if edge.label != "":
                issues += "One or more edges are labelled. If this is intended, please indicate this at the beginning of the graph file \n"
                break
    # This test is not suitable for big graphs and must therefore be skipped
    if not check_list[4]:  # if graph is undirected
        if len(edges) < limit:
            if edges_contain_doubles(edges):  # (a,b) and (b,a)
                issues += "Undirected graph can contain any edge only once. \n"
        else:
            print(
                "Warning: Due to the graph size (number of edges exceeding " +
                str(limit) +
                "), it is not controlled whether there are doubled edges. Please make sure your undirected graph does not contain edges as in (n1,n2) and (n2,n1)"
            )
    print_if_big(limit, edges, "Done.")

    # Evaluates if any issues have been detected. If not, parsing continues.
    if issues != "":
        print("There are some issues with the input file: \n")
        print(issues)
        print("Aborting...")
        # Abort with a failure status so calling scripts can notice it.
        raise SystemExit(1)

    print_if_big(limit, edges, "Getting node neighbours...")
    get_node_neighbours(limit, nodes, edges)
    print_if_big(limit, edges, "Done.")
    # [:-6] removes the '.graph' extension from the file name
    graph_name = os.path.basename(doc)[:-6]
    g = Graph(graph_name, nodes, edges, check_list[2], check_list[3],
              check_list[4])
    print("Successfully parsed " + graph_name + "\n")
    return g
def upgma(self):
    '''
    Performs multiple alignment by repeatedly merging the pair of graphs
    whose pairwise alignment (from apply_algorithm) is the largest, until at
    most one graph is left in self.graph_list.

    On termination the remaining graph gets fresh undirected edges, is
    stored in self.result (its newick string in self.newick) and printed
    with print_alignment. Nothing happens when self.graph_list is empty.

    Raises Exception when the chosen alignment contains the null node and
    self.save_all is not set. With self.save_all set, the failed alignment
    graph is removed again and the alignment continues.
    '''
    if len(self.graph_list) <= 1:
        # Only a missing element means "nothing to align"; any other error
        # is a real problem and must not be swallowed.
        try:
            res = self.graph_list[0]
        except IndexError:
            return
        res.edges = set()
        res.create_undirected_edges()
        self.result = res
        self.newick = res.newick
        self.print_alignment(self.result)
        return

    maximum = 0  # is used to save the maximum number of mapped nodes
    # Starting the inner slice behind the current graph makes sure that every
    # graph couple is only processed once.
    for offset, g1 in enumerate(self.graph_list[:-1], start=1):
        for g2 in self.graph_list[offset:]:
            if g1.id == g2.id:
                continue
            max_alignment = self.apply_algorithm(g1, g2)
            if len(max_alignment) > maximum:
                alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                                  max_alignment)
                alignment.abbrev = alignment.id
                alignment.newick = "({},{})".format(g1.newick, g2.newick)
                maximum = len(max_alignment)
                alig_one = g1
                alig_two = g2

    # NOTE(review): if no pair qualified above (e.g. all alignments are
    # empty), alig_one is unbound here and an UnboundLocalError is raised.
    self.graph_list.remove(alig_one)
    self.graph_list.remove(alig_two)
    alignment_graph = self.make_graph_real(alignment)
    alignment_graph = self.generate_graph_bools(alignment_graph)
    self.graph_list.append(alignment_graph)
    self.intermediates.append(alignment_graph)

    if Node("null", "") in alignment.nodes:
        if not self.save_all:
            raise Exception(
                "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using BK algorithm or -s to save all intermediate graphs until the error occurrs."
            )
        print(
            "Multiple alignment was not successful. VF2 could not align the graphs {} and {} properly. \n Maybe BK is more appropriate for this alignment."
            .format(alig_one.id, alig_two.id))
        print("Removing last graph. Continuing alignment...")
        self.graph_list.remove(alignment_graph)
        self.intermediates.remove(alignment_graph)
    self.upgma()