Example #1
    def clique_to_node_set(self):
        '''Repairs the edges from the cliques to the real alignment graph,
        because cliques may contain more edges than the original graph(s).'''

        results = self.get_coopt()
        res_list = []

        for clique in results:

            for node in clique:
                for neighbour in list(node.neighbours):
                    if neighbour not in clique:
                        node.remove_neighbour(neighbour)

            curr_node_set = set()
            for node in clique:

                corr_n = self.get_corr_node(node)
                new_neighbours = set()

                for neighbour in node.neighbours:

                    for corr_neighbour in corr_n.neighbours:
                        if set(corr_neighbour.mult_id.split(".")).issubset(set(neighbour.mult_id.split("."))):
                            new_neighbours.add(neighbour)
                curr_node = Node(node.id, node.label, new_neighbours)
                curr_node.mult_id = node.mult_id
                curr_node_set.add(curr_node)

            res_list.append(curr_node_set)

        return res_list
Example #2
    def apply_algorithm(self, graph1, graph2):
        if self.algorithm == "BK":
            mp = MP(graph1, graph2)
            bk = BK(graph1, graph2)
            x = set()
            r = set()
            p = list(mp.modp)
            bk.bk_pivot(r, p, x)
            results = bk.clique_to_node_set()

            # keep the co-optimal result whose nodes have the most neighbours in total
            max_res = results[0]
            max_res_neighbour_sum = sum(len(node.neighbours) for node in max_res)
            for res in results:
                neighbour_sum = sum(len(node.neighbours) for node in res)
                if neighbour_sum > max_res_neighbour_sum:
                    max_res = res
                    max_res_neighbour_sum = neighbour_sum
            return max_res

        elif self.algorithm == "VF2":
            vf2 = VF2(graph1, graph2)
            vf2.match()
            if not vf2.results:
                vf2.results.append([Node("null", "")])
            return max(vf2.results)
Example #3
    def match(self, last_mapped=(Node("-1", ""), Node("-1", "")), depth=0):

        if self.s_in_small_g():
            self.append_result_graph(self.core_s)
            self.restore_ds(last_mapped[0], last_mapped[1], depth)
            return

        td = self.set_inout(last_mapped[0], last_mapped[1], depth)
        p = self.compute_p(td)

        for tup in p:

            if self.is_feasible(tup[0], tup[1], depth, td):
                self.compute_s_(tup[0], tup[1])

                self.match(tup, depth + 1)

        self.restore_ds(last_mapped[0], last_mapped[1], depth)
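
The recursion above only needs to be kicked off once with its default arguments; every mapping that covers the small graph is collected via append_result_graph(). A minimal driver sketch, assuming two already-parsed Graph objects and the VF2/Node classes used in the other examples:

# Minimal driver sketch for the recursive matcher above. graph1 and graph2 are
# assumed to be Graph objects that were already parsed (e.g. via parse_graph()).
vf2 = VF2(graph1, graph2)
vf2.match()                          # starts at depth 0 with the null-node pair default

if not vf2.results:                  # no mapping of the small graph was found
    vf2.results.append([Node("null", "")])

best = max(vf2.results)              # same selection as apply_algorithm() in Example #2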
Example #4
    def find_max_pivot(self, p, x):
        p_union_x = p + list(x)
        helper = 0
        piv = Node('-1', '')

        for v in p_union_x:
            cur_len_intersection = len([n for n in v.neighbours if n in p_union_x])
            if cur_len_intersection > helper:
                piv = v
                helper = cur_len_intersection

        return piv
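
find_max_pivot() picks, from P ∪ X, the vertex with the most neighbours inside P ∪ X, which is the standard pivot choice for Bron-Kerbosch: branching only on candidates outside the pivot's neighbourhood keeps the recursion tree small. The repository's bk_pivot() is not shown in these examples, but a sketch of the classical pivoted recursion such a helper plugs into looks roughly like this (the method name and the self.cliques attribute are hypothetical; only find_max_pivot() and the bk_pivot(r, p, x) argument order are taken from the code above):

    def bk_pivot_sketch(self, r, p, x):
        # Sketch of classical Bron-Kerbosch with pivoting: r is the growing clique
        # (set), p the candidate list, x the exclusion set, as in bk_pivot(r, p, x).
        if not p and not x:
            self.cliques.append(set(r))  # r is a maximal clique
            return
        piv = self.find_max_pivot(p, x)
        # branch only on candidates that are not neighbours of the pivot
        for v in [n for n in p if n not in piv.neighbours]:
            self.bk_pivot_sketch(r | {v},
                                 [n for n in p if n in v.neighbours],
                                 {n for n in x if n in v.neighbours})
            p.remove(v)
            x.add(v)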
Example #5
    def append_result_graph(self, result):
        '''Creates a graph which contains the concatenated mapped nodes,
        then adds neighbours to the new nodes according to the
        original neighbours.'''

        result_graph = Graph(
            "({},{})#{}".format(self.small_g.id, self.large_g.id,
                                len(self.result_graphs) + 1), set())
        for key, value in result.items():
            cur_node = Node("{}.{}".format(key.id, value.id),
                            "{}".format(key.label))
            cur_node.mult_id = "{}.{}".format(key.mult_id, value.mult_id)

            for node in result_graph.nodes:  # e.g. node "1.2"
                orig_node = Node("", "")
                for n in result.keys():  # original nodes from small graph
                    # compare the first part of the already mapped node id
                    # with the original node id
                    if node.id.split(".")[:-1][0] == n.id:
                        orig_node = n
                        break
                if key in orig_node.neighbours:
                    node.add_neighbour(cur_node)
                    cur_node.add_neighbour(node)
            result_graph.nodes.add(cur_node)

        self.result_graphs.append(result_graph)
        self.results.append(result_graph.nodes)
Example #6
    def match(self, last_mapped=(Node("0", ""), Node("0", "")), depth=0):

        if self.s_in_small_g():
            self.append_result_graph(self.core_s)
            self.restore_ds(last_mapped[0], last_mapped[1], depth)
            return

        td = self.set_inout(last_mapped[0], last_mapped[1], depth)
        p = self.compute_p(td)

        pprint.pprint(self.core_s)

        print("")
        print("td")
        print(td)
        print("")
        pprint.pprint(self.in_l)
        pprint.pprint(self.out_l)
        print("")
        pprint.pprint(self.in_s)
        pprint.pprint(self.out_s)
        print("")
        print("\ndepth {}\n".format(depth))

        print("p")
        pprint.pprint(p)

        for tup in p:

            if self.is_feasible(tup[0], tup[1], depth, td):
                print("feasible")
                self.compute_s_(tup[0], tup[1], depth)

                self.match(tup, depth + 1)

        self.restore_ds(last_mapped[0], last_mapped[1], depth)
Example #7
    def apply_algorithm(self, graph1, graph2):
        '''
        Performs pairwise alignment using the provided algorithm. At the moment,
        the subVF2 algorithm is the only working algorithm for multiple alignment.
        '''

        if self.algorithm == "BK":
            raise Exception(
                "BK algorithm is not usable for multiple alignment at the moment. But it is slow as hell anyway. Please use 'subVF2', which is also the default algorithm."
            )

            mp = MP(graph1, graph2)

            bk = BK(graph1, graph2)
            x = set()
            r = set()
            p = list(mp.modp)
            bk.bk_pivot(r, p, x)
            results = bk.clique_to_node_set()

            # keep the co-optimal result whose nodes have the most neighbours in total
            max_res = results[0]
            max_res_neighbour_sum = sum(len(node.neighbours) for node in max_res)
            for res in results:
                neighbour_sum = sum(len(node.neighbours) for node in res)
                if neighbour_sum > max_res_neighbour_sum:
                    max_res = res
                    max_res_neighbour_sum = neighbour_sum
            return max_res

        elif self.algorithm == "VF2":
            raise Exception(
                "VF2 algorithm is not really what you want for multiple alignment. Trust me. Really. Use 'subVF2' instead, please. Thanks."
            )

            vf2 = VF2(graph1, graph2)
            vf2.match()
            if not vf2.results:
                vf2.results.append([Node("null", "")])
            return max(vf2.results)

        elif self.algorithm == "SUBVF2":
            subvf2 = subVF2(graph1, graph2, self.scoring_matrix)
            subvf2.match()
            return subvf2.results
Example #8
    def __init__(self, g, h):

        self.null_n = Node("-1", "")

        self.g = g
        self.h = h

        self.type = 'subgraph'
        if h == g:
            self.type = 'isomorphism'

        # makes sure that small_g is the smaller graph
        self.small_g, self.large_g = g, h
        if g.int_size() > h.int_size():
            self.small_g = h
            self.large_g = g

        # if the graph is undirected, the inverse edges (1,2 -> 2,1) are
        # constructed to work with the original VF2 algorithm
        if not self.large_g.is_directed:
            self.large_g.create_fake_directions()
        if not self.small_g.is_directed:
            self.small_g.create_fake_directions()

        self.large_g.get_inout_neighbours()
        self.small_g.get_inout_neighbours()

        # Initializing the two core dictionaries that store each node of the
        # corresponding graph as key and the node of the other graph it is
        # mapped to as value. Unmapped nodes start with self.null_n.
        self.core_s = self.small_g.gen_dict(self.null_n)
        self.core_l = self.large_g.gen_dict(self.null_n)

        # Initializing the terminal sets for each graph. These are dictionaries
        # that store each node as key and, as value, the recursion depth at which
        # the node entered the corresponding set. They are initialized with 0.
        self.in_s = self.small_g.gen_dict(0)
        self.out_s = self.small_g.gen_dict(0)
        self.in_l = self.large_g.gen_dict(0)
        self.out_l = self.large_g.gen_dict(0)

        self.result_graphs = []
        self.results = []
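
gen_dict() itself is not shown in these examples; judging from its use here (one entry per node, all starting at self.null_n or 0), it is presumably little more than a dictionary comprehension over the graph's nodes. A hypothetical reconstruction, purely for illustration:

    def gen_dict(self, init_value):
        # hypothetical sketch of the Graph helper used above: one entry per node,
        # every node starting at the same initial value (self.null_n or 0)
        return {node: init_value for node in self.nodes}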
Example #9
    def greedy(self):
        '''
        Performs multiple alignment following a greedy approach: every pairwise
        alignment is calculated and the best-scoring co-optimal is chosen for each
        pair. The best-scoring pairwise alignment then becomes a new graph, all
        pairwise alignments with this new graph are calculated and scored, and so on.
        '''

        if len(self.graph_list) <= 1:
            try:
                res = self.graph_list[0]
            except IndexError:
                return

            res.edges = set()
            res.create_undirected_edges()
            self.result = res
            self.newick = self.graph_list[0].newick
            return

        maximum_score = float(
            '-inf')  # saves the best pairwise alignment score seen so far
        counter = 1  # makes sure that every graph couple is only processed once

        for g1 in self.graph_list[:-1]:
            for g2 in self.graph_list[counter:]:
                if g1.id == g2.id:
                    continue

                if (g1.id, g2.id) in self.already_done:
                    max_alignment = self.already_done[(g1.id, g2.id)]
                else:
                    max_alignment = self.apply_algorithm(g1, g2)[0]
                    self.already_done[(g1.id, g2.id)] = max_alignment

                if max_alignment[1] > maximum_score:  #subVF2

                    alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                                      max_alignment[0])
                    alignment.abbrev = alignment.id
                    alignment.newick = "({},{})".format(g1.newick, g2.newick)

                    maximum_score = max_alignment[1]
                    alig_one = g1
                    alig_two = g2

            counter += 1

        self.graph_list.remove(alig_one)
        self.graph_list.remove(alig_two)

        self.remove_element(self.already_done, (alig_one.id, alig_two.id))

        alignment_graph = self.make_graph_real(alignment)
        alignment_graph = self.generate_graph_bools(alignment_graph)

        self.graph_list.append(alignment_graph)
        self.intermediates.append(alignment_graph)

        if Node("null", []) in alignment.nodes and self.algorithm == "VF2":
            raise Exception(
                "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using subVF2 algorithm instead."
            )

        self.greedy()
Example #10
    def progressive_alignment(
        self
    ):  #consider renaming this to UPGMA or give user choice of additional linking methods (NJ, SL, CL, WPGMA)

        #warn user of assumptions of UPGMA
        print(
            'Warning: executing UPGMA algorithm with graph distances. Be aware of the strong assumptions of UPGMA, namely that for any two input pairs, a "parent" graph may be constructed with equivalent distance to both input graphs while preserving distance to all other graphs. Since this may not be a reasonable assumption for our present way of aligning and scoring graphs, be careful with the output of UPGMA clustering.'
        )

        dist = np.zeros((len(self.graph_list),
                         len(self.graph_list)))  #square distance matrix

        #check scoring matrix here
        print("\nScoring Matrix:\n{}\n".format(self.scoring_matrix))
        if self.scoring_matrix == "-1":  #no scoring given

            pass  #create some reasonable default values here
        else:  #user specified scoring
            scoremin = min(self.scoring_matrix.values())
            scoremax = max(self.scoring_matrix.values())
            if scoremin == scoremax:  #should not be possible
                raise (AttributeError("Scoring Matrix seems broken: {}".format(
                    self.scoring_matrix)))
            elif scoremax <= 0 and scoremin < scoremax:  #negative valued distance measure assumed, zero for identity assumed
                pass  #negative should be a metric, warn for conversion
            elif scoremin >= 0 and scoremax > scoremin:  #gotta check for gap value to determine similarity or distance measure
                pass  #inform user about conversion
            else:  #mixed scoring, most likely positive matches, negative mismatches, neutral gap
                pass  #convert to a proper metric!

        #fill matrix initially
        for i1 in range(len(self.graph_list)):
            for i2 in range(len(self.graph_list)):  #assuming that graph_list is indeed a list
                g1 = self.graph_list[i1]
                g2 = self.graph_list[i2]
                if i1 == i2:
                    dist[i1, i2] = -1
                elif (g1, g2) in self.already_done:
                    #note to michel: abuse of tuple indexing is almost criminal here
                    dist[i1, i2] = self.already_done[(g1, g2)][1]
                else:
                    max_alignment = self.apply_algorithm(g1, g2)[0]
                    self.already_done[(g1, g2)] = self.already_done[(g2, g1)] = max_alignment
                    dist[i1, i2] = self.already_done[(g1, g2)][1]

        while len(self.graph_list) > 1:
            #find indices of highest score, if the highest score occurs twice, choice is arbitrary
            i1, i2 = np.unravel_index(np.argmax(dist, axis=None),
                                      dist.shape)  #(g1,g2)
            g1 = self.graph_list[i1]
            g2 = self.graph_list[i2]

            #compute alignment if not already present
            if (g1, g2) not in self.already_done:
                max_alignment = self.apply_algorithm(g1, g2)[0]

                self.already_done[(g1, g2)] = max_alignment
                self.already_done[(g2, g1)] = max_alignment

            #if max_alignment[1] > maximum_score: #subVF2 #necessary?
            alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                              self.already_done[(g1, g2)][0])
            alignment.abbrev = alignment.id
            alignment.newick = "({},{})".format(g1.newick, g2.newick)

            if Node("null", []) in alignment.nodes and self.algorithm == "VF2":
                raise Exception(
                    "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using subVF2 algorithm instead."
                )

            #remove old graphs
            self.graph_list.remove(g1)
            self.graph_list.remove(g2)
            self.remove_element(self.already_done, (g1, g2))
            self.remove_element(self.already_done, (g2, g1))

            #generate alignment graph and add to list
            alignment_graph = self.make_graph_real(alignment)
            alignment_graph = self.generate_graph_bools(alignment_graph)
            self.graph_list.append(alignment_graph)  #appends at the end?
            self.intermediates.append(alignment_graph)

            #calculate distances according to algo (UPGMA for now), drop rows/columns from dist table and append new row for alignment graph
            len_g1 = len(g1.id)
            len_g2 = len(g2.id)
            computed_distances = [
                (dist[k, i1] + dist[k, i2]) / 2 for k in range(dist.shape[0])
                if not any((k == i1, k == i2))
            ]  #this is actually WPGMA, find and normalize by alignment size of graphs
            dist = np.delete(dist, [i1, i2], axis=0)
            dist = np.delete(dist, [i1, i2], axis=1)
            #probably need to transpose one of those
            dist = np.append(dist, [computed_distances], axis=0)

            dist = np.append(dist,
                             np.append(computed_distances, -1).reshape(
                                 (-1, 1)),
                             axis=1)

        #alignment should be done now
        try:
            res = self.graph_list[0]
        except IndexError:
            return

        res.edges = set()
        res.create_undirected_edges()
        self.result = res
        self.newick = self.graph_list[0].newick
        return
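
The distance-matrix update inside the while loop is the (W)PGMA rule already flagged in the inline comment: the new row for the merged graph is the plain average of the two merged rows, the old rows and columns are dropped, and -1 marks the diagonal. A small standalone sketch with made-up numbers shows one merge step:

import numpy as np

# hypothetical 3x3 score matrix for graphs A, B, C (-1 on the diagonal)
dist = np.array([[-1., 8., 3.],
                 [ 8., -1., 5.],
                 [ 3., 5., -1.]])

# highest score decides which pair is merged first: here (0, 1), i.e. A and B
i1, i2 = np.unravel_index(np.argmax(dist, axis=None), dist.shape)

# WPGMA update: distance of every remaining graph k to the merged graph A-B
computed = [(dist[k, i1] + dist[k, i2]) / 2
            for k in range(dist.shape[0]) if k not in (i1, i2)]  # [4.0] for C

dist = np.delete(dist, [i1, i2], axis=0)
dist = np.delete(dist, [i1, i2], axis=1)
dist = np.append(dist, [computed], axis=0)                       # new row for A-B
dist = np.append(dist, np.append(computed, -1).reshape((-1, 1)), axis=1)

print(dist)  # [[-1.  4.]
             #  [ 4. -1.]]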
Example #11
def t2node(t):
    i, ls = t
    return Node(str(i), ls)
Example #12
    def append_result_subgraph(self, result):
        '''
        creates a graph which contains the concatenated mapped nodes from
        subgraph. Then, it adds the neighbours to the new nodes following the
        original neighbours.
        '''

        node_dict = {}  #used to reconstruct the neighbours
        final_node_set = set()
        in_l_and_mapped = set()

        node_label_len = len(next(iter(
            self.core_s)).mult_id)  # label length of nodes from smaller graph
        mapping_label_len = len(next(iter(
            self.core_l)).mult_id)  # label length of nodes from larger graph

        for node, mapping in result[0].items():

            if mapping:  # nodes that were actually mapped
                cur_node = Node(
                    "{}.{}".format(node.id, mapping.id),  #id
                    node.label + mapping.label,  #label
                )
                cur_node.mult_id = node.mult_id + mapping.mult_id
                in_l_and_mapped.add(mapping)
                node_dict[mapping] = cur_node
                node_dict[node] = cur_node
            else:  # nodes from small graph that were not mapped against a node from larger graph
                new_label = node.get_label()
                for i in range(mapping_label_len):
                    new_label.append("-")
                cur_node = Node("{}.".format(node.id), new_label)
                cur_node.mult_id = node.get_mult_id()
                for i in range(mapping_label_len):
                    cur_node.mult_id.append("_____")

                node_dict[node] = cur_node

            final_node_set.add(cur_node)

        for node, mapping in self.core_l.items():
            if node not in in_l_and_mapped:  # nodes from large graph that were not mapped against nodes from smaller graph
                cur_node = Node(".{}".format(node.id), node.get_label())
                for i in range(node_label_len):
                    cur_node.label.insert(0, "-")

                cur_node.mult_id = node.get_mult_id()
                for i in range(node_label_len):
                    cur_node.mult_id.insert(0, "_____")

                node_dict[node] = cur_node
                final_node_set.add(cur_node)

        # reconstructing the neighbours
        i = 1
        for node1 in list(node_dict.keys())[:-1]:
            for node2 in list(node_dict.keys())[i:]:
                if node2 in node1.neighbours:
                    node_dict[node1].neighbours.add(node_dict[node2])
                    node_dict[node2].neighbours.add(node_dict[node1])

            i += 1

        result_graph = Graph(
            "{}-{}#{}".format(self.small_g.id, self.large_g.id,
                              len(self.result_graphs) + 1), final_node_set)

        self.result_graphs.append(result_graph)
        self.results.append((result_graph.nodes, result[1]))
Example #13
def parse_graph(doc):
    '''
    parses a graph written in a custom .graph file format
    '''

    # will contain the number of nodes and edges, whether nodes/edges are
    # labelled, and whether the graph is directed
    check_list = []
    nodes = set()
    edges = set()

    limit = 5000  # graphs with an edge amount above this number will trigger additional print statements to indicate progress

    with open(doc) as d:
        indicator = 0  #counts the empty lines (0: check_list, 1: nodes, 2: edges)
        edge_counter = 0  #counts the processed edges to give some completion feedback

        for line in d:
            line = line.replace("\n", "")
            split_list = line.split(";")

            #if line is empty
            if not line:
                indicator += 1
                continue

            #building check_list
            elif indicator == 0:

                arg = split_list[-1]  #last element in row is interpreted

                if arg.upper() == "COMMENT":
                    continue

                elif line.startswith("//"):
                    continue

                elif arg.upper() in ("TRUE", "FALSE"):
                    check_list.append(arg.upper() == "TRUE")

                elif line.upper().startswith("AUTHOR"):
                    print("Reading {} from {}".format(os.path.basename(doc),
                                                      line.split(" ", 1)[1]))
                    continue

                else:
                    try:
                        check_list.append(
                            int(arg))  #indicates number of nodes/edges
                    except ValueError:
                        print(
                            "Something's wrong with the first paragraph. Please check and try again."
                        )
                        print("Aborting...")
                        raise Exception(
                            "Parsing one of your graphs was not successful.")

            #building nodes
            elif indicator == 1:

                cur_node = Node(*split_list)
                if cur_node.label:
                    cur_node.label = cur_node.label.split(labelsep)
                else:
                    cur_node.label = [no_label_dummy]

                nodes.add(cur_node)

            #building labeled and/or directed edges
            elif indicator == 2:

                for node in nodes:
                    if node.id == split_list[0]:
                        split_list[0] = node

                    elif node.id == split_list[1]:
                        split_list[1] = node

                cur_edge = Edge(*split_list)
                edges.add(cur_edge)
                edge_counter += 1

                if edge_counter % limit == 0:
                    print("Already processed {} edges".format(edge_counter))

            else:
                print(
                    "Wrong input file format. File contains too many empty lines."
                )
                print("Aborting...")
                raise Exception(
                    "Parsing one of your graphs was not successful.")

    print_if_big(limit, edges, "Some illegalities are tested...")
    issues = ""

    if check_list[0] != len(nodes):
        issues += "Indicated number of nodes ({}) doesn't fit actual number of nodes ({}). \n".format(
            check_list[0], len(nodes))

    if check_list[1] != len(edges):
        issues += "Indicated number of edges ({}) doesn't fit actual number of edges ({}). \n".format(
            check_list[1], len(edges))

    if not check_list[2]:
        for node in nodes:
            if node.label != [no_label_dummy]:
                issues += "One or more nodes are labelled. If this is intended, please indicate this at the beginning of the graph file \n"
                break

    if not check_list[3]:  #if edges are not labelled
        for edge in edges:
            if edge.label != "":
                issues += "One or more edges are labelled. If this is intended, please indicate this at the beginning of the graph file \n"
                break

    #This test is not suitable for big graphs and must therefore be skipped
    if not check_list[4]:  #if graph is undirected
        if len(edges) < limit:
            if edges_contain_doubles(edges):  #(a,b) and (b,a)
                issues += "Undirected graph can contain any edge only once. \n"
        else:
            print(
                "Warning: Due to the graph size (number of edges exceeding " +
                str(limit) +
                "), it is not controlled whether there are doubled edges. Please make sure your undirected graph does not contain edges as in (n1,n2) and (n2,n1)"
            )

    print_if_big(limit, edges, "Done.")

    #evaluates if any issues have been detected. If not, parsing continues.
    if issues == "":
        print_if_big(limit, edges, "Getting node neighbours...")
        get_node_neighbours(limit, nodes, edges)
        print_if_big(limit, edges, "Done.")
        g = Graph(
            os.path.basename(doc)[:-6],
            # doc.split("/")[-1][:-6], # removes path and '.graph' extension
            nodes,
            edges,
            check_list[2],
            check_list[3],
            check_list[4])

        print("Successfully parsed " + os.path.basename(doc)[:-6] + "\n")
        return g

    else:
        print("There are some issues with the input file: \n")
        print(issues)
        print("Aborting...")
        exit()
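
From the parser logic one can reconstruct roughly what a .graph file has to look like: a first paragraph with the node count, the edge count and three TRUE/FALSE flags (nodes labelled, edges labelled, directed), then a blank line, the ';'-separated node lines, another blank line, and the ';'-separated edge lines. A hedged toy example (the field order is inferred from how check_list is consumed; the project's real sample files may differ):

// toy.graph -- layout inferred from parse_graph() above
3
2
TRUE
FALSE
FALSE

1;A
2;B
3;C

1;2
2;3

With this layout, check_list becomes [3, 2, True, False, False], three labelled nodes and two (presumably unlabelled) undirected edges are built, and the consistency checks at the end pass.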
Example #14
    def upgma(self):
        if len(self.graph_list) <= 1:
            try:
                res = self.graph_list[0]
            except IndexError:
                return

            res.edges = set()
            res.create_undirected_edges()

            self.result = res

            self.newick = self.graph_list[0].newick
            self.print_alignment(self.result)

            return

        maximum = 0  # is used to save the maximum number of mapped nodes
        counter = 1  # makes sure that every graph couple is only processed once

        for g1 in self.graph_list[:-1]:

            for g2 in self.graph_list[counter:]:

                if g1.id == g2.id:
                    continue

                max_alignment = self.apply_algorithm(g1, g2)

                if len(max_alignment) > maximum:

                    alignment = Graph("{}-{}".format(g1.abbrev, g2.abbrev),
                                      max_alignment)
                    alignment.abbrev = alignment.id
                    alignment.newick = "({},{})".format(g1.newick, g2.newick)

                    maximum = len(max_alignment)
                    alig_one = g1
                    alig_two = g2

            counter += 1

        self.graph_list.remove(alig_one)
        self.graph_list.remove(alig_two)

        alignment_graph = self.make_graph_real(alignment)
        alignment_graph = self.generate_graph_bools(alignment_graph)

        self.graph_list.append(alignment_graph)
        self.intermediates.append(alignment_graph)

        if Node("null", "") in alignment.nodes and not self.save_all:
            raise Exception(
                "VF2 could not produce a multiple alignment of all the given graphs. \n The classical VF2 algorithm can only process *graph-subgraph*-isomorphism. \n Please consider using BK algorithm or -s to save all intermediate graphs until the error occurrs."
            )
        elif Node("null", "") in alignment.nodes and self.save_all:
            print(
                "Multiple alignment was not successful. VF2 could not align the graphs {} and {} properly. \n Maybe BK is more appropriate for this alignment."
                .format(alig_one.id, alig_two.id))
            print("Removing last graph. Continuing alignment...")
            self.graph_list.remove(alignment_graph)
            self.intermediates.remove(alignment_graph)

        self.upgma()