def generate_derivation(hg  # type: HyperGraph
                        ):
    """Yield ``(node_rename_map, hrg_rule)`` for every CFG rule with semantics.

    The rules produced by ``cfg.generate_rules()`` are visited in order.  For
    each rule, ``detect_func(hg, rule)`` is asked for the sub-graph that the
    rule covers; when it returns ``None`` the rule is flagged with
    ``has_semantics = False`` and skipped.  Otherwise the detected edges are
    replaced in the working graph by a single non-terminal edge labelled
    with ``rule.tag``, an HRG rule is extracted, and the pair is yielded.

    NOTE(review): ``cfg``, ``spans``, ``detect_func``, ``draw``,
    ``draw_format``, ``sent_id``, ``pics`` and ``cls`` are not defined in
    this block; presumably they are supplied by an enclosing scope.

    :param hg: the graph to be reduced step by step
    """
    lexicons = list(cfg.generate_words())
    assert len(lexicons) == len(spans)
    rules = list(cfg.generate_rules())

    # Give every lexicon its span so that rule spans can be derived from
    # the first and last child below.
    for span, lexicon in zip(spans, lexicons):
        lexicon.span = span

    step = 1
    previous_edge = None
    for rule in rules:
        rule_span = (rule.child[0].span[0], rule.child[-1].span[1])
        rule.span = rule_span

        detected = detect_func(hg, rule)
        if detected is None:
            rule.has_semantics = False
            continue
        rule.has_semantics = True

        all_edges, internal_nodes, external_nodes = detected
        # Collapse the detected sub-graph into one non-terminal edge.
        merged_edge = HyperEdge(external_nodes, rule.tag, False, rule_span)
        reduced_graph = HyperGraph(
            hg.nodes - internal_nodes,
            (hg.edges - all_edges) | {merged_edge})
        node_rename_map, hrg_rule = HRGRule.extract(
            all_edges, internal_nodes, external_nodes, rule.tag)

        if draw:
            # The picture shows the graph *before* this reduction step.
            target = "/tmp/a3/{}/{}".format(sent_id, step)
            pics.append(
                cls.draw(hg, target, all_edges, internal_nodes,
                         external_nodes, previous_edge,
                         draw_format=draw_format))

        hg, previous_edge = reduced_graph, merged_edge
        step += 1

        hrg_rule.cfg = cls.convert_cfg_node(rule)
        yield node_rename_map, hrg_rule

    if draw:
        # One last picture of the fully reduced graph.
        target = "/tmp/a3/{}/{}".format(sent_id, step)
        pics.append(
            cls.draw(hg, target, last_new_edge=previous_edge,
                     draw_format=draw_format))
def transform_edge(self, edge, lexicon):
    """Fill the ``NEWLEMMA`` placeholder of an edge label from a lexicon.

    Edges whose label does not contain ``"NEWLEMMA"`` are returned unchanged
    (the very same object).  Otherwise a new ``HyperEdge`` is built whose
    label is ``edge.label.format(NEWLEMMA=item)``, where ``item`` is:

    * the word itself (underscores replaced by ``+``) when the label
      contains ``"_u_unknown"``;
    * otherwise the lemma returned by ``self.lemmatizer.lemmatize``.

    :param edge: edge whose label may contain the placeholder
    :param lexicon: object providing the surface form via ``.string``
    :return: ``edge`` itself, or a new ``HyperEdge`` with the filled label
    """
    template = edge.label
    if "NEWLEMMA" not in template:
        return edge

    word = lexicon.string.replace("_", "+")
    if "_u_unknown" in template:
        item = word
    else:
        # Character 10 positions past the start of "NEWLEMMA"; presumably
        # the part-of-speech letter in labels shaped like "_{NEWLEMMA}_n_1".
        pos = template[template.find("NEWLEMMA") + 10]
        if pos in ("n", "v", "a"):
            item = self.lemmatizer.lemmatize(word, pos)
        else:
            item = self.lemmatizer.lemmatize(word)

    return HyperEdge(edge.nodes, template.format(NEWLEMMA=item),
                     edge.is_terminal, edge.span)
def generate_derivation(hg  # type: HyperGraph
                        ):
    """Yield one ``CFGRule`` per CFG rule, paired with an HRG rule if possible.

    Rules from ``cfg.generate_rules()`` are visited in order.  For each one,
    ``detect_func(hg, rule)`` is asked for the matching sub-graph:

    * ``None`` means the rule has no semantics; a ``CFGRule`` without an
      HRG part is yielded.
    * Otherwise the sub-graph is collapsed into a single non-terminal edge
      and an HRG rule is extracted.  Each right-hand-side item of the CFG
      rule is paired with its (renamed) hyperedge, or with ``None`` for
      children without semantics.  If a child's tag disagrees with the
      label of its hyperedge, a message is printed to stderr and a
      ``CFGRule`` with ``None`` as both right-hand side and HRG part is
      yielded.

    NOTE(review): ``cfg``, ``detect_func``, ``lexicalize_null_semantic``,
    ``draw``, ``draw_format``, ``sent_id`` and ``pics`` are not defined in
    this block; presumably they are supplied by an enclosing scope.

    :param hg: the graph to be reduced step by step
    """
    rules = list(cfg.generate_rules())  # root last
    step = 1
    previous_edge = None

    for rule in rules:
        rule_span = (rule.child[0].span[0], rule.child[-1].span[1])
        rule.span = rule_span
        detected = detect_func(hg, rule)

        if detected is None:
            # Null-semantic node: emit a CFG-only rule.
            rule.has_semantics = False
            if lexicalize_null_semantic:
                # type: Tuple[Tuple[Lexicon, None]]
                plain_rhs = tuple(
                    (word, None) for word in rule.generate_words())
            else:
                plain_rhs = tuple(
                    (child if isinstance(child, Lexicon) else child.tag, None)
                    for child in rule.child)
            yield CFGRule(rule.tag, plain_rhs, None)
            continue
        rule.has_semantics = True

        all_edges, internal_nodes, external_nodes = detected
        # Collapse the detected sub-graph into one non-terminal edge.
        merged_edge = HyperEdge(external_nodes, rule.tag, False, rule_span)
        reduced_graph = HyperGraph(
            hg.nodes - internal_nodes,
            (hg.edges - all_edges) | {merged_edge})
        node_rename_map, hrg_rule = HRGRule.extract(
            all_edges, internal_nodes, external_nodes, rule.tag, rule)

        if draw:
            # The picture shows the graph *before* this reduction step.
            target = "/tmp/a3/{}/{}".format(sent_id, step)
            pics.append(
                HRGDerivation.draw(hg, target, all_edges, internal_nodes,
                                   external_nodes, previous_edge,
                                   draw_format=draw_format))

        hg, previous_edge = reduced_graph, merged_edge
        step += 1

        if isinstance(rule.child[0], Lexicon):
            # Leaf node: exactly one lexical child, no hyperedge attached.
            assert len(rule.child) == 1
            rhs_items = ((rule.child[0], None), )
        else:
            # Internal node: pair each child with its hyperedge.
            assert all(isinstance(child, ConstTree) for child in rule.child)
            rhs_items = []
            for child in rule.child:
                if child.has_semantics:
                    # The hyperedge of this child is the one with its span.
                    matches = [
                        edge for edge in all_edges if edge.span == child.span
                    ]
                    assert len(matches) == 1
                    matched = matches[0]
                    if matched.label != child.tag:
                        print("Non-consistent CFG and HRG: ",
                              " ".join(word.string
                                       for word in rule.generate_words()),
                              file=sys.stderr)
                        rhs_items = None
                        break
                    renamed_edge = HyperEdge(
                        (node_rename_map[node] for node in matched.nodes),
                        matched.label, matched.is_terminal)
                    rhs_items.append((child.tag, renamed_edge))
                elif lexicalize_null_semantic:
                    rhs_items.extend(
                        (word, None) for word in child.generate_words())
                else:
                    rhs_items.append((child.tag, None))

        if rhs_items is None:
            # Inconsistent rule: neither a right-hand side nor an HRG part.
            yield CFGRule(rule.tag, rhs_items, None)
        else:
            yield CFGRule(rule.tag, tuple(rhs_items), hrg_rule)

    if draw:
        # One last picture of the fully reduced graph.
        target = "/tmp/a3/{}/{}".format(sent_id, step)
        pics.append(
            HRGDerivation.draw(hg, target, last_new_edge=previous_edge,
                               draw_format=draw_format))
def extract(
        cls,
        edges,  # type: Set[HyperEdge]
        internal_nodes,  # type: Set[GraphNode]
        external_nodes,  # type: Set[GraphNode]
        label,  # type: str
        cfg_rule=None):
    """Build a rule from a sub-graph, renaming its nodes canonically.

    Every node gets an MD5-based hash that is refined for ten rounds from
    the labels of its incident edges, its position inside them, the hashes
    of the neighbouring nodes and whether the node is external.  Nodes are
    then renamed ``"0"``, ``"1"``, ... in ascending hash order.

    :param edges: edges of the sub-graph (they become the right-hand side)
    :param internal_nodes: nodes that only occur inside the sub-graph
    :param external_nodes: nodes attached to the left-hand-side edge
    :param label: label of the left-hand-side edge
    :param cfg_rule: optional tree node; when it has exactly two children
        their spans may be used to order two external nodes
    :return: ``(node_rename_map, cls(lhs, rhs))``
    """
    nodes = internal_nodes.union(external_nodes)

    # node -> [(edge, index of this node in this edge), ...]
    edge_by_node = defaultdict(list)
    for edge in edges:
        for position, node in enumerate(edge.nodes):
            edge_by_node[node].append((edge, position))

    def edge_digest(
            hashes,  # type: Dict[GraphNode, bytes]
            edge,  # type: HyperEdge
            position  # type: int
    ):
        # Label and position first, then every adjacent node hash, each
        # followed by a "#" separator.
        payload = (edge.label + "#" + str(position)).encode()
        payload += b"".join(hashes[adj] + b"#" for adj in edge.nodes)
        return hashlib.md5(payload).digest()

    def neighbourhood_digest(
            hashes,  # type: Dict[GraphNode, bytes]
            node  # type: GraphNode
    ):
        # Sorting makes the digest independent of edge iteration order.
        incident = sorted(
            edge_digest(hashes, edge, position)
            for edge, position in edge_by_node[node])
        return hashlib.md5(b"".join(incident)).digest()

    seed = hashlib.md5(b"13").digest()
    node_hashes = {node: seed for node in nodes}  # node -> hash
    for _ in range(10):
        # Recompute every hash from the previous round's hashes.
        node_hashes = {
            node: hashlib.md5(
                neighbourhood_digest(node_hashes, node) +
                (b'\x01' if node in external_nodes else b'\x00')).digest()
            for node in nodes
        }

    nodes_in_order = sorted(node_hashes.items(), key=itemgetter(1))
    node_rename_map = {
        node: GraphNode(str(rank))
        for rank, (node, _) in enumerate(nodes_in_order)
    }

    # Right-hand side: the same edges over the renamed nodes.
    renamed_edges = [
        HyperEdge((node_rename_map[node] for node in edge.nodes),
                  edge.label, edge.is_terminal) for edge in edges
    ]
    rhs = HyperGraph(frozenset(node_rename_map.values()),
                     frozenset(renamed_edges))

    def ordered_external_nodes():
        """Return the renamed external nodes in left-hand-side order."""
        if len(external_nodes) == 2:
            # Rule 1: follow an edge that links exactly the two nodes.
            for candidate in permutations(external_nodes):
                if any(edge.nodes == candidate for edge in edges):
                    return [node_rename_map[node] for node in candidate]

            # Rule 2: follow the left/right order of the two CFG children.
            if cfg_rule is not None and len(cfg_rule.child) == 2:
                left_span = cfg_rule.child[0].span
                right_span = cfg_rule.child[1].span
                left_node = [
                    edge.nodes[0] for edge in edges
                    if len(edge.nodes) == 1 and edge.span == left_span
                ]
                right_node = [
                    edge.nodes[0] for edge in edges
                    if len(edge.nodes) == 1 and edge.span == right_span
                ]
                if (left_node and right_node and
                        {left_node[0], right_node[0]} == external_nodes):
                    return [
                        node_rename_map[left_node[0]],
                        node_rename_map[right_node[0]]
                    ]

        # Fallback: ascending order of the new numeric names.
        return sorted((node_rename_map[node] for node in external_nodes),
                      key=lambda renamed: int(renamed.name))

    lhs = HyperEdge(ordered_external_nodes(), label=label, is_terminal=False)
    return node_rename_map, cls(lhs, rhs)
def transform_edge(mapping, edge, span):
    """Return a copy of ``edge`` with mapped nodes and the given ``span``.

    Every node of ``edge`` is looked up in ``mapping`` (a missing node
    raises ``KeyError`` once the node sequence is consumed); label and
    terminal flag are carried over unchanged.
    """
    mapped_nodes = (mapping[node] for node in edge.nodes)
    return HyperEdge(mapped_nodes, edge.label, edge.is_terminal, span)
def transform_edge(mapping, edge):
    """Turn an edge of a rule into an edge of the concrete graph.

    Every node of ``edge`` is replaced by ``mapping[node]``; label and
    terminal flag are carried over, and the new edge gets no span
    (``None``).
    """
    concrete_nodes = (mapping[node] for node in edge.nodes)
    return HyperEdge(concrete_nodes, edge.label, edge.is_terminal, None)
def transform_edge_2(mapping, edge):
    """Turn an edge of a rule into an edge of the concrete graph.

    Nodes are replaced through ``mapping`` where possible.  A node that is
    absent from ``mapping`` (or mapped to a falsy value) is kept as it is.
    Label, terminal flag and span are carried over unchanged.
    """
    concrete_nodes = ((mapping.get(node) or node) for node in edge.nodes)
    return HyperEdge(concrete_nodes, edge.label, edge.is_terminal, edge.span)
def sync_grammar_fallback_2(self, tree_node):
    """Produce fallback rule candidates for a pre-terminal tree node.

    ``tree_node.tag`` is expected to look like ``"<rule name>#<n>"`` where
    ``n`` is the number of nodes attached to the left-hand-side edge.

    * ``n != 1``: if ``self.terminal_mapping`` has a non-empty entry for
      the tag, its single most common item is returned.  Otherwise a rule
      is synthesised that links every attached node to one extra node
      through edges labelled ``"???"``.
    * ``n == 1``: a one-edge rule is synthesised whose label is guessed
      from the surface form and the rule name.  When no guess applies,
      a non-empty ``self.lexicon_mapping`` entry is returned as it is,
      and ``"named"`` is used as the label if that entry is empty.

    :param tree_node: node with ``tag`` and a single lexical child
    :return: a ``Counter`` of candidates (or the ``lexicon_mapping`` entry)
    """
    rule_name, main_node_count = tree_node.tag.rsplit("#", 1)
    leaf = tree_node.children[0]
    word = leaf.string
    main_node_count = int(main_node_count)

    if main_node_count != 1:
        observed = self.terminal_mapping.get(tree_node.tag)
        if observed:
            return Counter([observed.most_common(1)[0][0]])

        connected_nodes = [GraphNode(str(i)) for i in range(main_node_count)]
        hub_node = GraphNode(str(main_node_count + 1))
        lhs_edge = HyperEdge(
            nodes=connected_nodes,
            label=rule_name,
            is_terminal=False
        )
        # Placeholder edges from the extra node to every attached node.
        spokes = [
            HyperEdge(nodes=[hub_node, node], label="???", is_terminal=True)
            for node in connected_nodes
        ]
        rhs_graph = HyperGraph(
            nodes=frozenset(connected_nodes + [hub_node]),
            edges=frozenset(spokes)
        )
    else:
        main_node = GraphNode("0")
        if self.pattern_number.match(word):
            label = "card"
        elif "generic_proper" in rule_name:
            label = "named"
        else:
            lemma = self.lemmatizer.lemmatize(word)
            if "n_-_c-pl-unk_le" in rule_name:
                label = "_{}/nns_u_unknown".format(lemma)
            elif "n_-_mc_le" in rule_name or "n_-_c_le" in rule_name:
                label = "_{}_n_1".format(lemma)
            elif "generic_mass_count_noun" in rule_name:
                label = "_{}/nn_u_unknown".format(lemma)
            else:
                candidates = self.lexicon_mapping[HLexicon(word),
                                                  main_node_count]
                if candidates:
                    return candidates
                label = "named"

        lhs_edge = HyperEdge(
            nodes=[main_node],
            label=rule_name,
            is_terminal=False
        )
        main_edge = HyperEdge(
            nodes=[main_node],
            label=label,
            is_terminal=True
        )
        rhs_graph = HyperGraph(
            nodes=frozenset([main_node]),
            edges=frozenset({main_edge})
        )

    fallback = CFGRule(lhs=rule_name,
                       rhs=((leaf, None),),
                       hrg=HRGRule(lhs=lhs_edge, rhs=rhs_graph))
    return Counter([fallback])