def parse_bitext(self, obj1, obj2):
    """
    Parse a single pair of objects (two strings, two graphs, or string/graph).
    """
    rhs1type, rhs2type = self.grammar.rhs1_type, self.grammar.rhs2_type
    assert rhs1type in ["string", "hypergraph"] and rhs2type in ["string", "hypergraph"]

    # Remember the size of the input objects and figure out the Item subclass.
    if rhs1type == "string":
        obj1size = len(obj1)
    elif rhs1type == "hypergraph":
        obj1size = len(obj1.triples())
    if rhs2type == "string":
        obj2size = len(obj2)
    elif rhs2type == "hypergraph":
        obj2size = len(obj2.triples())

    grammar = self.grammar

    start_time = time.clock()
    log.chatter("parse...")

    # Initialize data structures and lookups.
    # We use various tables to provide constant-time lookup of fragments available
    # for shifting, completion, etc.
    chart = ddict(set)

    # TODO: command line option to switch the rule filter on/off
    pgrammar = [grammar[r] for r in grammar.reachable_rules(obj1, obj2)]  # grammar.values()
    queue = deque()      # the items left to be visited
    pending = set()      # a copy of queue with constant-time lookup
    attempted = set()    # a cache of previously-attempted item combinations
    visited = set()      # a cache of already-visited items
    nonterminal_lookup = ddict(set)  # a mapping from nonterminal symbols to closed items
    reverse_lookup = ddict(set)      # a mapping from outside symbols to open items

    # Mapping from words to string indices for each string.
    word_terminal_lookup1 = ddict(set)
    word_terminal_lookup2 = ddict(set)
    if rhs1type == "string":
        for i in range(len(obj1)):
            word_terminal_lookup1[obj1[i]].add(i)
    if rhs2type == "string":
        for i in range(len(obj2)):
            word_terminal_lookup2[obj2[i]].add(i)

    # Mapping from edge labels to graph edges for each graph.
    edge_terminal_lookup1 = ddict(set)
    edge_terminal_lookup2 = ddict(set)
    if rhs1type == "hypergraph":
        for edge in obj1.triples(nodelabels=self.nodelabels):
            edge_terminal_lookup1[edge[1]].add(edge)
    if rhs2type == "hypergraph":
        for edge in obj2.triples(nodelabels=self.nodelabels):
            edge_terminal_lookup2[edge[1]].add(edge)

    for rule in pgrammar:
        item1class = CfgItem if rhs1type == "string" else HergItem
        item2class = CfgItem if rhs2type == "string" else HergItem
        axiom = SynchronousItem(rule, item1class, item2class, nodelabels=self.nodelabels)
        queue.append(axiom)
        pending.add(axiom)
        if axiom.outside_is_nonterminal:
            reverse_lookup[axiom.outside_symbol].add(axiom)

    # Keep track of whether we found any complete derivation.
    success = False

    # parse
    while queue:
        item = queue.popleft()
        pending.remove(item)
        visited.add(item)
        log.debug("handling", item)

        if item.closed:
            log.debug(" is closed.")
            # Check whether it is a complete derivation.
            if self.successful_biparse(obj1, obj2, item, obj1size, obj2size):
                chart["START"].add((item,))
                success = True
            # Add to the nonterminal lookup.
            nonterminal_lookup[item.rule.symbol].add(item)

            # Wake up any containing rules.
            # Unlike in ordinary state-space search, it's possible that we will have
            # to re-visit items which couldn't be merged with anything the first time
            # we saw them, and are waiting for the current item. The reverse_lookup
            # indexes all items by their outside symbol, so we re-append to the queue
            # all items looking for something with the current item's symbol.
            for ritem in reverse_lookup[item.rule.symbol]:
                if ritem not in pending:
                    queue.append(ritem)
                    pending.add(ritem)

        else:
            if item.outside_is_nonterminal:
                # complete
                reverse_lookup[item.outside_symbol].add(item)

                for oitem in nonterminal_lookup[item.outside_symbol]:
                    log.debug(" oitem:", oitem)
                    if (item, oitem) in attempted:
                        # Don't repeat combinations we've tried before.
                        continue
                    attempted.add((item, oitem))
                    if not item.can_complete(oitem):
                        log.debug(" fail")
                        continue
                    log.debug(" ok")
                    nitem = item.complete(oitem)
                    chart[nitem].add((item, oitem))
                    if nitem not in pending and nitem not in visited:
                        queue.append(nitem)
                        pending.add(nitem)

            else:
                # shift; this depends on the configuration (string/graph -> string/graph)
                if not item.outside1_is_nonterminal and not item.item1.closed:
                    if rhs1type == "string":
                        new_items = [item.shift_word1(item.outside_object1, index)
                                     for index in word_terminal_lookup1[item.outside_object1]
                                     if item.can_shift_word1(item.outside_object1, index)]
                    else:
                        assert rhs1type == "hypergraph"
                        new_items = [item.shift_edge1(edge)
                                     for edge in edge_terminal_lookup1[item.outside_object1]
                                     if item.can_shift_edge1(edge)]
                else:
                    assert not item.outside2_is_nonterminal  # otherwise shift would not be called
                    if rhs2type == "string":
                        new_items = [item.shift_word2(item.outside_object2, index)
                                     for index in word_terminal_lookup2[item.outside_object2]
                                     if item.can_shift_word2(item.outside_object2, index)]
                    else:
                        assert rhs2type == "hypergraph"
                        new_items = [item.shift_edge2(edge)
                                     for edge in edge_terminal_lookup2[item.outside_object2]
                                     if item.can_shift_edge2(edge)]

                for nitem in new_items:
                    log.debug(" shift", nitem, nitem.shifted)
                    chart[nitem].add((item,))
                    if nitem not in pending and nitem not in visited:
                        queue.append(nitem)
                        pending.add(nitem)

    if success:
        log.chatter(" success!")
    etime = time.clock() - start_time
    log.chatter("done in %.2fs" % etime)

    # TODO return partial chart
    return chart
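

# Illustrative sketch (not part of the parser above): the queue/pending bookkeeping
# and the reverse_lookup "wake-up" step used in parse_bitext, reduced to the simplest
# possible setting. Here an "item" is just a rule (lhs, needed_symbols), and it is
# "closed" once every needed symbol has been derived; open items are indexed by one
# outstanding symbol, mirroring how open chart items wait on their outside symbol.
# All names below are hypothetical and exist only for this example.
def _derivable_symbols_sketch(rules, terminals):
    from collections import defaultdict, deque

    derived = set(terminals)    # symbols already proven (analogous to closed items)
    waiting = defaultdict(set)  # reverse lookup: needed symbol -> rules waiting for it
    queue = deque((lhs, tuple(needed)) for lhs, needed in rules)

    while queue:
        lhs, needed = queue.popleft()
        still_needed = set(needed) - derived
        if still_needed:
            # Open item: park it under one outstanding symbol so it can be woken up later.
            waiting[next(iter(still_needed))].add((lhs, needed))
            continue
        if lhs not in derived:
            # Closed item: record the new symbol and wake up everything waiting for it.
            derived.add(lhs)
            for ritem in waiting.pop(lhs, set()):
                queue.append(ritem)
    return derived

# Example: with rules S -> A B, A -> a, B -> b and terminals {"a", "b"},
# _derivable_symbols_sketch([("S", ["A", "B"]), ("A", ["a"]), ("B", ["b"])], {"a", "b"})
# returns {"a", "b", "A", "B", "S"}; the S rule is re-queued once A and B are derived.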
def parse(self, string, graph):
    """
    Parses the given string and/or graph.
    """
    # This is a long function, so let's start with a high-level overview. This is
    # a "deductive-proof-style" parser: We begin with one "axiomatic" chart item
    # for each rule, and combine these items with each other and with fragments of
    # the object(s) being parsed to deduce new items. We can think of these items
    # as defining a search space in which we need to find a path to the goal item.
    # The parser implemented here performs a BFS of this search space.

    grammar = self.grammar

    # Remember when we started.
    start_time = time.clock()
    log.chatter("parse...")

    # Specify what kind of items we're working with.
    if string and graph:
        axiom_class = CfgHergItem
    elif string:
        axiom_class = CfgItem
    else:
        axiom_class = HergItem

    # Remember the size of the example.
    if string:
        string_size = len(string)
    else:
        string_size = -1
    if graph:
        graph_size = len(graph.triples(nodelabels=self.nodelabels))
    else:
        graph_size = -1

    # Initialize data structures and lookups.
    # We use various tables to provide constant-time lookup of fragments available
    # for shifting, completion, etc.
    chart = ddict(set)

    # TODO: command line option to switch the grammar filter on/off
    if string:
        pgrammar = [grammar[r] for r in grammar.reachable_rules(string, None)]  # grammar.values()
    if graph:
        pgrammar = [grammar[r] for r in grammar.reachable_rules(graph, None)]  # grammar.values()
    queue = deque()      # the items left to be visited
    pending = set()      # a copy of queue with constant-time lookup
    attempted = set()    # a cache of previously-attempted item combinations
    visited = set()      # a cache of already-visited items
    word_terminal_lookup = ddict(set)  # a mapping from words to string indices
    nonterminal_lookup = ddict(set)    # a mapping from nonterminal symbols to closed items
    reverse_lookup = ddict(set)        # a mapping from outside symbols to open items

    if string:
        for i in range(len(string)):
            word_terminal_lookup[string[i]].add(i)
    if graph:
        edge_terminal_lookup = ddict(set)  # a mapping from edge labels to graph edges
        for edge in graph.triples(nodelabels=self.nodelabels):
            edge_terminal_lookup[edge[1]].add(edge)

    for rule in pgrammar:
        axiom = axiom_class(rule, nodelabels=self.nodelabels)
        queue.append(axiom)
        pending.add(axiom)
        if axiom.outside_is_nonterminal:
            reverse_lookup[axiom.outside_symbol].add(axiom)

    # Keep track of whether we found any complete derivation.
    success = False

    # parse
    while queue:
        item = queue.popleft()
        pending.remove(item)
        visited.add(item)
        log.debug("handling", item)

        if item.closed:
            log.debug(" is closed.")
            # Check whether it is a complete derivation.
            if self.successful_parse(string, graph, item, string_size, graph_size):
                chart["START"].add((item,))
                success = True
            # Add to the nonterminal lookup.
            nonterminal_lookup[item.rule.symbol].add(item)

            # Wake up any containing rules.
            # Unlike in ordinary state-space search, it's possible that we will have
            # to re-visit items which couldn't be merged with anything the first time
            # we saw them, and are waiting for the current item. The reverse_lookup
            # indexes all items by their outside symbol, so we re-append to the queue
            # all items looking for something with the current item's symbol.
            for ritem in reverse_lookup[item.rule.symbol]:
                if ritem not in pending:
                    queue.append(ritem)
                    pending.add(ritem)

        else:
            if item.outside_is_nonterminal:
                # complete
                reverse_lookup[item.outside_symbol].add(item)

                for oitem in nonterminal_lookup[item.outside_symbol]:
                    log.debug(" oitem:", oitem)
                    if (item, oitem) in attempted:
                        # Don't repeat combinations we've tried before.
                        continue
                    attempted.add((item, oitem))
                    if not item.can_complete(oitem):
                        log.debug(" fail")
                        continue
                    log.debug(" ok")
                    nitem = item.complete(oitem)
                    chart[nitem].add((item, oitem))
                    if nitem not in pending and nitem not in visited:
                        queue.append(nitem)
                        pending.add(nitem)

            else:
                # shift
                if string and graph:
                    if not item.outside_word_is_nonterminal:
                        new_items = [item.shift_word(item.outside_word, index)
                                     for index in word_terminal_lookup[item.outside_word]
                                     if item.can_shift_word(item.outside_word, index)]
                    else:
                        assert not item.outside_edge_is_nonterminal
                        new_items = [item.shift_edge(edge)
                                     for edge in edge_terminal_lookup[item.outside_edge]
                                     if item.can_shift_edge(edge)]
                elif string:
                    new_items = [item.shift(item.outside_word, index)
                                 for index in word_terminal_lookup[item.outside_word]
                                 if item.can_shift(item.outside_word, index)]
                else:
                    assert graph
                    new_items = [item.shift(edge)
                                 for edge in edge_terminal_lookup[item.outside_edge]
                                 if item.can_shift(edge)]

                for nitem in new_items:
                    log.debug(" shift", nitem, nitem.shifted)
                    chart[nitem].add((item,))
                    if nitem not in pending and nitem not in visited:
                        queue.append(nitem)
                        pending.add(nitem)

    if success:
        log.chatter(" success!")
    etime = time.clock() - start_time
    log.chatter("done in %.2fs" % etime)

    # TODO return partial chart
    return chart
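

# Illustrative sketch (not part of the parser above): how the chart returned by
# parse() can be unwound. The chart maps each derived item to a set of backpointer
# tuples (the items it was built from), and chart["START"] holds 1-tuples of goal
# items, so one derivation can be read off by following backpointers until items
# with no chart entry (the axioms) are reached. The function name and the
# nested-tuple output format are assumptions made for this example; it also assumes
# the backpointer structure is acyclic, which holds for charts built as above.
def _extract_one_derivation_sketch(chart, key="START"):
    if key not in chart or not chart[key]:
        return key                       # an axiom: no backpointers recorded
    children = next(iter(chart[key]))    # pick an arbitrary backpointer tuple
    return (key, tuple(_extract_one_derivation_sketch(chart, c) for c in children))

# Usage (hypothetical): _extract_one_derivation_sketch(parser.parse(string, graph))
# returns ("START", (<goal item derivation>,)) if parsing succeeded.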
def parse(self, graph):
    """
    Parses the given graph with the provided grammar.
    """
    # This function is very similar to its counterpart in the regular
    # (non-tree-decomposing) parser. Read the comments there to understand how it
    # works.

    start_time = time.clock()
    log.chatter('parse...')

    # Ensure that the input graph has its shortest-path table precomputed.
    graph.compute_fw_table()

    chart = ddict(set)
    # TODO command line option to switch rule filtering on/off
    pgrammar = [self.grammar[r] for r in self.grammar.reachable_rules(graph, None)]
    queue = deque()
    pending = set()
    attempted = set()
    visited = set()
    terminal_lookup = ddict(set)
    passive_item_lookup = ddict(set)
    tree_node_lookup = ddict(set)
    passive_item_rev_lookup = ddict(set)
    tree_node_rev_lookup = ddict(set)

    for edge in graph.triples(nodelabels=self.nodelabels):
        terminal_lookup[edge[1]].add(edge)

    for rule in pgrammar:
        for leaf in rule.tree_leaves:
            axiom = self.item_class(rule, leaf, graph, nodelabels=self.nodelabels)
            queue.append(axiom)
            pending.add(axiom)
            assert leaf not in rule.tree_to_edge

    success = False

    while queue:
        item = queue.popleft()
        pending.remove(item)
        visited.add(item)
        log.debug('handling', item, item.subgraph)

        if item.target == Item.NONE:
            log.debug(' none')
            tree_node_lookup[item.self_key].add(item)
            for ritem in tree_node_rev_lookup[item.self_key]:
                if ritem not in pending:
                    queue.append(ritem)
                    pending.add(ritem)

        elif item.target == Item.ROOT:
            log.debug(' root')
            if self.is_goal(item):
                chart['START'].add((item,))
                success = True
                log.debug("success!")
            passive_item_lookup[item.self_key].add(item)
            for ritem in passive_item_rev_lookup[item.self_key]:
                if ritem not in pending:
                    log.debug(' retrieving', ritem)
                    queue.append(ritem)
                    pending.add(ritem)

        elif item.target == Item.TERMINAL:
            log.debug(' terminal')
            new_items = [item.terminal(edge) for edge in terminal_lookup[item.next_key]]
            new_items = [i for i in new_items if i]
            for nitem in new_items:
                chart[nitem].add((item,))
                if nitem not in pending and nitem not in visited:
                    log.debug(' new item!', nitem)
                    queue.append(nitem)
                    pending.add(nitem)

        else:
            if item.target == Item.BINARY:
                log.debug(' binary')
                rev_lookup = tree_node_rev_lookup
                lookup = tree_node_lookup
                action = self.item_class.binary
            elif item.target == Item.NONTERMINAL:
                log.debug(' nonterminal')
                rev_lookup = passive_item_rev_lookup
                lookup = passive_item_lookup
                action = self.item_class.nonterminal
            else:
                assert False

            rev_lookup[item.next_key].add(item)
            for oitem in lookup[item.next_key]:
                if (item, oitem) in attempted:
                    continue
                attempted.add((item, oitem))
                log.debug(' try', oitem, oitem.subgraph)
                nitem = action(item, oitem)
                if not nitem:
                    continue
                log.debug(' new item!', nitem)
                chart[nitem].add((item, oitem))
                if nitem not in pending and nitem not in visited:
                    queue.append(nitem)
                    pending.add(nitem)

    if success:
        log.chatter(' success!')
    etime = time.clock() - start_time
    log.chatter('done in %.2fs' % etime)
    return chart
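

# Illustrative sketch (not part of the parser above): the terminal_lookup built at
# the top of parse() simply groups the graph's triples by their edge label, so the
# TERMINAL case can fetch all candidate edges for item.next_key in constant time
# instead of scanning every edge. The triples below are made-up examples in the
# (source, label, targets) layout that the code indexes by edge[1].
def _index_edges_by_label_sketch(triples):
    from collections import defaultdict
    lookup = defaultdict(set)
    for edge in triples:
        lookup[edge[1]].add(edge)  # edge[1] is the edge label
    return lookup

# Example:
# _index_edges_by_label_sketch([("n0", "want", ("n1", "n2")),
#                               ("n1", "boy", ()),
#                               ("n0", "want", ("n3",))])["want"]
# -> {("n0", "want", ("n1", "n2")), ("n0", "want", ("n3",))}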