def pair_same_node_groups(dmrs1, dmrs2):
    """
    Finds which nodes in dmrs1 are equivalent to which nodes in dmrs2.

    The nodes of each DMRS are first grouped by group_same_nodes, which
    returns groups ordered by str(pred). The two group lists are then walked
    in parallel (merge style); the representative (first nodeid) of each
    group is compared with are_equal_nodes.

    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :return A list of tuples (pred, nodeids from dmrs1, nodeids from dmrs2).
            All nodes listed in one tuple are equivalent; pred is their
            common predicate. Tuples appear in the order of the group lists,
            i.e. sorted by str(pred).

    NOTE(review): when two groups share a predicate but their representatives
    are not equal, only the dmrs1 cursor advances, so a later dmrs2 group with
    the same predicate can be skipped. Confirm whether that is intended.
    """
    grouped_nodes1 = group_same_nodes(dmrs1.nodes)
    grouped_nodes2 = group_same_nodes(dmrs2.nodes)
    grouped_nodes = []
    i = 0
    j = 0
    while i < len(grouped_nodes1) and j < len(grouped_nodes2):
        pred1, group1 = grouped_nodes1[i]
        pred2, group2 = grouped_nodes2[j]
        if pred1 == pred2 and are_equal_nodes(dmrs1[group1[0]],
                                              dmrs2[group2[0]]):
            grouped_nodes.append((pred1, group1, group2))
            i += 1
            j += 1
        # The group lists are ordered by str(pred), so the cursors must be
        # advanced using the same key; comparing raw preds could walk the
        # lists in a different order than they were sorted in.
        elif str(pred1) > str(pred2):
            j += 1
        else:
            i += 1
    return grouped_nodes
def group_same_nodes(nodes):
    """
    Groups nodeids of equivalent nodes into sublists, using are_equal_nodes
    as the equivalency criterion.

    Nodes are sorted by str(node.pred) and scanned once. Each node is
    compared with the first node of the current group; if they are not equal
    a new group is started.

    :param nodes An iterable of nodes (objects with .pred and .nodeid).
    :return A list of tuples (pred, id_list) ordered by str(pred). The pred
            is the predicate of the group's first node; id_list is the list
            of nodeids of the nodes in that group. An empty input yields an
            empty list.
    """
    grouped_nodes = []
    representative = None  # first node of the group currently being built
    current_group = []
    for node in sorted(nodes, key=lambda n: str(n.pred)):
        # Compare against None explicitly so that a node object which
        # happens to be falsy cannot be mistaken for "no group started".
        if representative is not None and are_equal_nodes(node,
                                                          representative):
            current_group.append(node.nodeid)
            continue
        if representative is not None:
            grouped_nodes.append((representative.pred, current_group))
        representative = node
        current_group = [node.nodeid]
    # Flush the last group; with no nodes at all there is nothing to flush.
    if representative is not None:
        grouped_nodes.append((representative.pred, current_group))
    return grouped_nodes
def pair_same_node_groups(dmrs1, dmrs2, underspecified=None):
    """
    Finds which nodes in dmrs1 are equivalent to which nodes in dmrs2.

    The nodes of each DMRS are first grouped by group_same_nodes, which
    returns groups ordered by str(pred). The two group lists are then walked
    in parallel (merge style); the representative (first nodeid) of each
    group is compared with are_equal_nodes.

    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :param underspecified Optional flag. When given (not None) it is passed
            to are_equal_nodes as its third argument for the comparison of
            group representatives. When omitted, are_equal_nodes is called
            with two arguments exactly as before, so existing two-argument
            callers are unaffected.
    :return A list of tuples (pred, nodeids from dmrs1, nodeids from dmrs2).
            All nodes listed in one tuple are equivalent; pred is their
            common predicate. Tuples appear in the order of the group lists,
            i.e. sorted by str(pred).

    NOTE(review): when two groups share a predicate but their representatives
    are not equal, only the dmrs1 cursor advances, so a later dmrs2 group with
    the same predicate can be skipped. Confirm whether that is intended.
    """
    def representatives_match(nodeid1, nodeid2):
        # Forward the flag only when the caller supplied one, so the default
        # behaviour of are_equal_nodes is left untouched otherwise.
        if underspecified is None:
            return are_equal_nodes(dmrs1[nodeid1], dmrs2[nodeid2])
        return are_equal_nodes(dmrs1[nodeid1], dmrs2[nodeid2],
                               underspecified)

    grouped_nodes1 = group_same_nodes(dmrs1.nodes)
    grouped_nodes2 = group_same_nodes(dmrs2.nodes)
    grouped_nodes = []
    i = 0
    j = 0
    while i < len(grouped_nodes1) and j < len(grouped_nodes2):
        pred1, group1 = grouped_nodes1[i]
        pred2, group2 = grouped_nodes2[j]
        if pred1 == pred2 and representatives_match(group1[0], group2[0]):
            grouped_nodes.append((pred1, group1, group2))
            i += 1
            j += 1
        # Advance using the same key (str(pred)) the group lists are
        # ordered by.
        elif str(pred1) > str(pred2):
            j += 1
        else:
            i += 1
    return grouped_nodes
def group_same_nodes(nodes):
    """
    Groups nodeids of equivalent nodes into sublists, using are_equal_nodes
    as the equivalency criterion.

    Nodes are sorted by str(node.pred) and scanned once. Each node is
    compared with the first node of the current group; if they are not equal
    a new group is started.

    :param nodes An iterable of nodes (objects with .pred and .nodeid).
    :return A list of tuples (pred, id_list) ordered by str(pred). The pred
            is the predicate of the group's first node; id_list is the list
            of nodeids of the nodes in that group. An empty input yields an
            empty list.
    """
    grouped_nodes = []
    representative = None  # first node of the group currently being built
    current_group = []
    for node in sorted(nodes, key=lambda n: str(n.pred)):
        # Compare against None explicitly so that a node object which
        # happens to be falsy cannot be mistaken for "no group started".
        if representative is not None and are_equal_nodes(node,
                                                          representative):
            current_group.append(node.nodeid)
            continue
        if representative is not None:
            grouped_nodes.append((representative.pred, current_group))
        representative = node
        current_group = [node.nodeid]
    # Flush the last group; with no nodes at all there is nothing to flush.
    if representative is not None:
        grouped_nodes.append((representative.pred, current_group))
    return grouped_nodes
def extend_match(match, start_nodeids, dmrs1, dmrs2, underspecified=True):
    """
    Grows an existing match outwards from a pair of matching nodes.

    The start pair is recorded in match.nodeid_pairs. All links touching the
    two start nodes (outgoing, incoming and eq links) are then compared with
    are_equal_links; each newly paired link is recorded in match.link_pairs
    and the nodes at its far ends are queued. Finally the function recurses
    on every queued pair that is not yet in the match and satisfies
    are_equal_nodes.

    :param match: A Match object to be extended. It is modified in place.
    :param start_nodeids: A tuple (nodeid in dmrs1, nodeid in dmrs2) from
                          which to start the extension.
    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :param underspecified: Passed on to are_equal_nodes when queued node
                           pairs are checked (default True).
    :return None. The result is the updated match argument.
    """
    match.nodeid_pairs.append(start_nodeids)

    # Snapshot of what is already matched when this call starts.
    matched_first = {pair[0] for pair in match.nodeid_pairs}
    matched_links1 = {link_pair[0] for link_pair in match.link_pairs}
    matched_links2 = {link_pair[1] for link_pair in match.link_pairs}

    pending = []
    start_id1, start_id2 = start_nodeids

    links1 = dmrs1.get_out(start_id1)
    links1.update(dmrs1.get_in(start_id1))
    links1.update(dmrs1.get_eq(start_id1))
    links2 = dmrs2.get_out(start_id2)
    links2.update(dmrs2.get_in(start_id2))
    links2.update(dmrs2.get_eq(start_id2))

    for link1 in links1:
        if link1 in matched_links1:
            continue
        for link2 in links2:
            if link2 in matched_links2:
                continue
            if not are_equal_links(link1, link2, dmrs1, dmrs2):
                continue
            # An endpoint of link1 that is already matched must be matched
            # to the corresponding endpoint of link2; otherwise try the
            # next candidate.
            if (link1.start in matched_first
                    and match.get_second(link1.start) != link2.start):
                continue
            if (link1.end in matched_first
                    and match.get_second(link1.end) != link2.end):
                continue

            match.link_pairs.append((link1, link2))
            matched_links1.add(link1)
            matched_links2.add(link2)

            # Queue the nodes at the far end of each link, seen from the
            # start nodes.
            if link1.end == start_id1:
                far1 = link1.start
            else:
                far1 = link1.end
            if link2.end == start_id2:
                far2 = link2.start
            else:
                far2 = link2.end
            pending.append((far1, far2))
            break  # link1 is paired; move on to the next link of dmrs1

    for nodeid1, nodeid2 in pending:
        if (nodeid1, nodeid2) in match.nodeid_pairs:
            continue
        if are_equal_nodes(dmrs1[nodeid1], dmrs2[nodeid2], underspecified):
            extend_match(match, (nodeid1, nodeid2), dmrs1, dmrs2,
                         underspecified)
def find_match(start_id1, start_id2, dmrs1, dmrs2, matched_nodes,
               matched_links):
    """
    Finds a match between dmrs1 and dmrs2 by traversing both graphs in
    parallel from a pair of start nodes.

    The start pair is appended to matched_nodes. Outgoing links of the two
    start nodes are paired up with are_equal_links, then incoming links are
    paired the same way. For every paired link, the nodes at its other end
    are queued, and the function recurses on each queued pair that is not
    already in matched_nodes.

    :param start_id1 A nodeid of a node from dmrs1 from which the graph
                     traversal should be started.
    :param start_id2 A nodeid of a node from dmrs2 from which the graph
                     traversal should be started.
    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :param matched_nodes Node pairs matched so far. Updated in place during
                         recursion. Use an empty list for the top call.
    :param matched_links Link pairs matched so far. Updated in place during
                         recursion. Use an empty list for the top call.
    :return A Match built from the updated matched_nodes and matched_links.

    The two start nodes must be equal according to are_equal_nodes; this is
    checked with an assert.
    """
    assert(are_equal_nodes(dmrs1[start_id1], dmrs2[start_id2]))
    matched_nodes.append((start_id1, start_id2))
    to_visit = []

    def pair_links(candidates1, candidates2, far_end):
        # Pair each unmatched link of dmrs1 with the first unmatched, equal
        # link of dmrs2 and queue the nodes found at `far_end` of both.
        for link1 in candidates1:
            if link1 in [pair[0] for pair in matched_links]:
                continue
            for link2 in candidates2:
                if link2 in [pair[1] for pair in matched_links]:
                    continue
                if are_equal_links(link1, link2, dmrs1, dmrs2):
                    matched_links.append((link1, link2))
                    to_visit.append((getattr(link1, far_end),
                                     getattr(link2, far_end)))
                    break

    # Outgoing links lead to the node at their end ...
    pair_links(dmrs1.get_out(start_id1), dmrs2.get_out(start_id2), "end")
    # ... incoming links come from the node at their start.
    pair_links(dmrs1.get_in(start_id1), dmrs2.get_in(start_id2), "start")

    for next_pair in to_visit:
        if next_pair in matched_nodes:
            continue
        find_match(next_pair[0], next_pair[1], dmrs1, dmrs2, matched_nodes,
                   matched_links)
    return Match(matched_nodes, matched_links)
def fill_tree(root, sorted_nodes1, sorted_nodes2):
    """
    Recursively attaches to root a tree of order-preserving node pairings
    between sorted_nodes1 and sorted_nodes2.

    Every pair of positions (one in each list) that lies after the positions
    stored in root and whose nodes satisfy are_equal_nodes becomes a child
    TreeNode; the child's own subtree is filled before it is added to root.

    :param root A TreeNode root of the constructed tree. Its id1 and id2
                attributes give the positions after which to search.
    :param sorted_nodes1 A list of nodes (from the smaller DMRS) sorted by
                         sort_nodes.
    :param sorted_nodes2 A list of nodes (from the larger DMRS) sorted by
                         sort_nodes.
    :return None. The tree is built by adding children to root.
    """
    total1 = len(sorted_nodes1)
    for pos1 in range(root.id1 + 1, total1):
        for pos2 in range(root.id2 + 1, len(sorted_nodes2)):
            if not are_equal_nodes(sorted_nodes1[pos1], sorted_nodes2[pos2]):
                continue
            subtree = TreeNode(pos1, pos2)
            fill_tree(subtree, sorted_nodes1, sorted_nodes2)
            root.add_child(subtree)
def find_match(start_id1, start_id2, dmrs1, dmrs2, matched_nodes,
               matched_links):
    """
    Finds a match between dmrs1 and dmrs2 by traversing both graphs in
    parallel from a pair of start nodes.

    The start pair is appended to matched_nodes. All links touching the two
    start nodes (outgoing and incoming) are paired up with are_equal_links.
    For every paired link, the nodes at its other end are queued, and the
    function recurses on each queued pair that is not already in
    matched_nodes.

    :param start_id1 A nodeid of a node from dmrs1 from which the graph
                     traversal should be started.
    :param start_id2 A nodeid of a node from dmrs2 from which the graph
                     traversal should be started.
    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :param matched_nodes Node pairs matched so far. Updated in place during
                         recursion. Use an empty list for the top call.
    :param matched_links Link pairs matched so far. Updated in place during
                         recursion. Use an empty list for the top call.
    :return A Match built from the updated matched_nodes and matched_links.

    The two start nodes must be equal according to are_equal_nodes; this is
    checked with an assert.
    """
    assert are_equal_nodes(dmrs1[start_id1], dmrs2[start_id2])
    matched_nodes.append((start_id1, start_id2))

    links1 = dmrs1.get_out(start_id1)
    links1.update(dmrs1.get_in(start_id1))
    links2 = dmrs2.get_out(start_id2)
    links2.update(dmrs2.get_in(start_id2))

    # Build the "already used" lookups once and keep them in step with
    # matched_links, instead of rebuilding a list for every comparison.
    # Links are set members above, so they are hashable.
    used_links1 = {pair[0] for pair in matched_links}
    used_links2 = {pair[1] for pair in matched_links}

    node_queue = []
    for link1 in links1:
        if link1 in used_links1:
            continue
        for link2 in links2:
            if link2 in used_links2:
                continue
            if are_equal_links(link1, link2, dmrs1, dmrs2):
                matched_links.append((link1, link2))
                used_links1.add(link1)
                used_links2.add(link2)
                # Queue the nodes at the far end of each link, seen from
                # the start nodes.
                paired1 = link1.start if link1.end == start_id1 else link1.end
                paired2 = link2.start if link2.end == start_id2 else link2.end
                node_queue.append((paired1, paired2))
                break

    # matched_nodes grows during the recursive calls, so membership has to
    # be checked against the live list at each step.
    for nodeid1, nodeid2 in node_queue:
        if (nodeid1, nodeid2) not in matched_nodes:
            find_match(nodeid1, nodeid2, dmrs1, dmrs2, matched_nodes,
                       matched_links)
    return Match(matched_nodes, matched_links)
def find_all_matches(dmrs1, dmrs2, underspecified=False):
    """
    Finds all regions with potential matches between two DMRS graphs.

    Equivalent node groups are obtained from pair_same_node_groups. Every
    (dmrs1 nodeid, dmrs2 nodeid) combination inside a paired group that has
    not yet been covered by an earlier match, and that satisfies
    are_equal_nodes, is used as the start of a new extend_match call.

    Groups whose predicate is not a RealPred, or whose lemma is 'a' or
    'the', are not used as start nodes, to narrow down the search space;
    if that leaves nothing, all paired groups are used instead. The
    remaining groups are tried in order of increasing number of possible
    pairings.

    :param dmrs1 A DMRS object. For matching, the small dmrs.
    :param dmrs2 A DMRS object. For matching, the large dmrs.
    :param underspecified: Passed on to pair_same_node_groups,
            are_equal_nodes and extend_match. If True, the underspecified
            nodes in dmrs1 will be matched to more specific ones in dmrs2.
    :return A list of Match objects where pairs come from (dmrs1, dmrs2).

    NOTE(review): pair_same_node_groups is called with three arguments
    here; confirm that its signature accepts the underspecified flag.
    """
    node_pairings = pair_same_node_groups(dmrs1, dmrs2, underspecified)

    def is_start_candidate(pairing):
        # Exclude GPreds and some quantifiers from the pool of start nodes.
        pred = pairing[0]
        return isinstance(pred, RealPred) and pred.lemma not in ('a', 'the')

    # Sort pairings so that the ones with fewer matching combinations are
    # considered first.
    sorted_pairings = sorted(
        (pairing for pairing in node_pairings if is_start_candidate(pairing)),
        key=lambda pairing: len(pairing[1]) * len(pairing[2]))
    if not sorted_pairings:
        sorted_pairings = node_pairings

    matches = []
    # Node pairs already covered by some match; a set gives constant-time
    # membership tests (the pairs are tuples of nodeids).
    checked_node_pairs = set()
    for _pred, group1, group2 in sorted_pairings:
        for nodeid1, nodeid2 in product(group1, group2):
            if (nodeid1, nodeid2) in checked_node_pairs:
                continue
            if not are_equal_nodes(dmrs1[nodeid1], dmrs2[nodeid2],
                                   underspecified=underspecified):
                continue
            match = Match([], [])
            extend_match(match, (nodeid1, nodeid2), dmrs1, dmrs2,
                         underspecified)
            checked_node_pairs.update(match.nodeid_pairs)
            matches.append(match)
    return matches