class SlicedEarley(object): """ """ def __init__(self, wcfg, wfsa, slice_vars): """ """ self._wcfg = wcfg self._wfsa = wfsa self._agenda = Agenda(active_container_type=ActiveQueue) self._predictions = set() # (LHS, start) self._item_factory = ItemFactory() self.slice_vars = slice_vars def get_item(self, rule, dot, inner=[]): return self._item_factory.get_item(rule, dot, inner) def advance(self, item, dot): """returns a new item whose dot has been advanced""" return self.get_item(item.rule, dot, item.inner + (item.dot,)) def axioms(self, symbol, start): rules = self._wcfg.get(symbol, None) if rules is None: # impossible to rewrite the symbol return False if (symbol, start) in self._predictions: # already predicted return True # otherwise add rewritings to the agenda self._predictions.add((symbol, start)) self._agenda.extend(self.get_item(rule, start) for rule in rules) return True def prediction(self, item): """ This operation tris to create items from the rules associated with the nonterminal ahead of the dot. It returns True when prediction happens, and False if it already happened before. """ if (item.next, item.dot) in self._predictions: # prediction already happened return False self._predictions.add((item.next, item.dot)) new_items = [self.get_item(rule, item.dot) for rule in self._wcfg.get(item.next, frozenset())] self._agenda.extend(new_items) return True def scan(self, item): """ This operation tries to scan over as many terminals as possible, but we only go as far as determinism allows. If we get to a nondeterminism, we stop scanning and add the relevant items to the agenda. """ states = [item.dot] for sym in item.nextsymbols(): if is_terminal(sym): arcs = self._wfsa.get_arcs(origin=states[-1], symbol=sym) if len(arcs) == 0: # cannot scan the symbol return False elif len(arcs) == 1: # symbol is scanned deterministically sto, _ = arcs[0] states.append(sto) # we do not create intermediate items, instead we scan as much as we can else: # here we found a nondeterminism, we create all relevant items and add them to the agenda # create items for sto, w in arcs: self._agenda.add(self.get_item(item.rule, sto, item.inner + tuple(states))) return True else: # that's it, scan bumped into a nonterminal symbol, time to wrap up break # here we should have scanned at least one terminal symbol # and we defined a deterministic path self._agenda.add(self.get_item(item.rule, states[-1], item.inner + tuple(states[:-1]))) return True def complete_others(self, item): """ This operation creates new item by advancing the dot of passive items that are waiting for a certain given complete item. It returns whether or not at least one passive item awaited for the given complete item. """ if self._agenda.is_generating(item.rule.lhs, item.start, item.dot): return True new_items = [self.advance(incomplete, item.dot) for incomplete in self._agenda.iterwaiting(item.rule.lhs, item.start)] self._agenda.extend(new_items) return len(new_items) > 0 # was there any item waiting for the complete one? def complete_itself(self, item): """ This operation tries to merge a given incomplete item with a previosly completed one. """ new_items = [self.advance(item, sto) for sto in self._agenda.itercompletions(item.next, item.dot)] self._agenda.extend(new_items) return len(new_items) > 0 def do(self, root='[S]', goal='[GOAL]'): wfsa = self._wfsa wcfg = self._wcfg agenda = self._agenda # start items of the kind # GOAL -> * ROOT, where * is an intial state of the wfsa if not any(self.axioms(root, start) for start in wfsa.iterinitial()): raise ValueError('No rule for the start symbol %s' % root) new_roots = set() while agenda: item = agenda.pop() # always returns an active item if item.is_complete(): # get slice variable for the current completed item u = self.slice_vars.get(item.rule.lhs, item.start, item.dot) # check whether the probability of the current completed item is above the threshold determined by # the slice variable if item.rule.log_prob > u: # complete root item spanning from a start wfsa state to a final wfsa state if item.rule.lhs == root and wfsa.is_initial(item.start) and wfsa.is_final(item.dot): agenda.make_complete(item) new_roots.add((root, item.start, item.dot)) agenda.make_passive(item) else: if self.complete_others(item): agenda.make_complete(item) agenda.make_passive(item) else: # a complete state is only kept in case it could potentially complete others agenda.discard(item) else: if is_terminal(item.next): # fire the operation 'scan' self.scan(item) agenda.discard(item) # scanning renders incomplete items of this kind useless else: if not wcfg.can_rewrite(item.next): # if the NT does not exist this item is useless agenda.discard(item) else: if not self.prediction(item): # try to predict, otherwise try to complete itself self.complete_itself(item) agenda.make_passive(item) # converts complete items into rules logging.debug('Making forest...') return self.get_cfg(goal, root) def get_intersected_rule(self, item): lhs = make_symbol(item.rule.lhs, item.start, item.dot) positions = item.inner + (item.dot,) rhs = [make_symbol(sym, positions[i], positions[i + 1]) for i, sym in enumerate(item.rule.rhs)] return Rule(lhs, rhs, item.rule.log_prob) def get_cfg(self, goal, root): """ Constructs the CFG by visiting complete items in a top-down fashion. This is effectively a reachability test and it serves the purpose of filtering nonterminal symbols that could never be reached from the root. Note that bottom-up intersection typically does enumerate a lot of useless (unreachable) items. This is the recursive procedure described in the paper (Nederhof and Satta, 2008). """ G = WCFG() processed = set() fsa = self._wfsa itergenerating = self._agenda.itergenerating itercomplete = self._agenda.itercomplete def make_rules(lhs, start, end): if (start, lhs, end) in processed: return processed.add((lhs, start, end)) for item in itercomplete(lhs, start, end): G.add(self.get_intersected_rule(item)) fsa_states = item.inner + (item.dot,) for i, sym in itertools.ifilter(lambda (_, s): is_nonterminal(s), enumerate(item.rule.rhs)): if (sym, fsa_states[i], fsa_states[ i + 1]) not in processed: # Nederhof does not perform this test, but in python it turned out crucial make_rules(sym, fsa_states[i], fsa_states[i + 1]) # create goal items for start, ends in itergenerating(root): if not fsa.is_initial(start): continue for end in itertools.ifilter(lambda q: fsa.is_final(q), ends): make_rules(root, start, end) G.add(Rule(make_symbol(goal, None, None), [make_symbol(root, start, end)], 0.0)) return G
class Nederhof(object): """ This is an implementation of the CKY-inspired intersection due to Nederhof and Satta (2008). """ def __init__(self, wcfg, wfsa): self._wcfg = wcfg self._wfsa = wfsa self._agenda = Agenda(active_container_type=ActiveQueue) self._firstsym = defaultdict( set) # index rules by their first RHS symbol self._item_factory = ItemFactory() def get_item(self, rule, dot, inner=[]): return self._item_factory.get_item(rule, dot, inner) def advance(self, item, dot): """returns a new item whose dot has been advanced""" return self.get_item(item.rule, dot, item.inner + (item.dot, )) def add_symbol(self, sym, sfrom, sto): """ This operation: 1) completes items waiting for `sym` from `sfrom` 2) instantiate delayed axioms Returns False if the annotated symbol had already been added, True otherwise """ if self._agenda.is_generating(sym, sfrom, sto): return False # every item waiting for `sym` from `sfrom` for item in self._agenda.iterwaiting(sym, sfrom): self._agenda.add(self.advance(item, sto)) # you may interpret this as a delayed axiom # every compatible rule in the grammar for r in self._firstsym.get(sym, set()): self._agenda.add(self.get_item( r, sto, inner=(sfrom, ))) # can be interpreted as a lazy axiom return True def axioms(self): """ The axioms of the program are based on the FSA transitions. """ # you may interpret the following as a sort of lazy axiom (based on grammar rules) for r in self._wcfg: self._firstsym[r.rhs[0]].add(r) # these are axioms based on the transitions of the automaton for sfrom, sto, sym, w in self._wfsa.iterarcs(): self.add_symbol(sym, sfrom, sto) def inference(self): """Exhausts the queue of active items""" agenda = self._agenda while agenda: item = agenda.pop() # always returns an ACTIVE item # complete other items (by calling add_symbol), in case the input item is complete if item.is_complete(): self.add_symbol(item.rule.lhs, item.start, item.dot) # prove the symbol agenda.make_complete(item) # mark the item as complete else: # merges the input item with previously completed items effectively moving the input item's dot forward agenda.make_passive(item) for sto in agenda.itercompletions(item.next, item.dot): agenda.add(self.advance(item, sto)) # move the dot forward def do(self, root='[S]', goal='[GOAL]'): """Runs the program and returns the intersected CFG""" self.axioms() self.inference() return get_cfg(goal, root, self._wfsa, self._agenda)
class SlicedEarley(object): """ """ def __init__(self, wcfg, wfsa, slice_vars): """ """ self._wcfg = wcfg self._wfsa = wfsa self._agenda = Agenda(active_container_type=ActiveQueue) self._predictions = set() # (LHS, start) self._item_factory = ItemFactory() self.slice_vars = slice_vars def get_item(self, rule, dot, inner=[]): return self._item_factory.get_item(rule, dot, inner) def advance(self, item, dot): """returns a new item whose dot has been advanced""" return self.get_item(item.rule, dot, item.inner + (item.dot, )) def axioms(self, symbol, start): rules = self._wcfg.get(symbol, None) if rules is None: # impossible to rewrite the symbol return False if (symbol, start) in self._predictions: # already predicted return True # otherwise add rewritings to the agenda self._predictions.add((symbol, start)) self._agenda.extend(self.get_item(rule, start) for rule in rules) return True def prediction(self, item): """ This operation tris to create items from the rules associated with the nonterminal ahead of the dot. It returns True when prediction happens, and False if it already happened before. """ if (item.next, item.dot) in self._predictions: # prediction already happened return False self._predictions.add((item.next, item.dot)) new_items = [ self.get_item(rule, item.dot) for rule in self._wcfg.get(item.next, frozenset()) ] self._agenda.extend(new_items) return True def scan(self, item): """ This operation tries to scan over as many terminals as possible, but we only go as far as determinism allows. If we get to a nondeterminism, we stop scanning and add the relevant items to the agenda. """ states = [item.dot] for sym in item.nextsymbols(): if is_terminal(sym): arcs = self._wfsa.get_arcs(origin=states[-1], symbol=sym) if len(arcs) == 0: # cannot scan the symbol return False elif len(arcs) == 1: # symbol is scanned deterministically sto, _ = arcs[0] states.append( sto ) # we do not create intermediate items, instead we scan as much as we can else: # here we found a nondeterminism, we create all relevant items and add them to the agenda # create items for sto, w in arcs: self._agenda.add( self.get_item(item.rule, sto, item.inner + tuple(states))) return True else: # that's it, scan bumped into a nonterminal symbol, time to wrap up break # here we should have scanned at least one terminal symbol # and we defined a deterministic path self._agenda.add( self.get_item(item.rule, states[-1], item.inner + tuple(states[:-1]))) return True def complete_others(self, item): """ This operation creates new item by advancing the dot of passive items that are waiting for a certain given complete item. It returns whether or not at least one passive item awaited for the given complete item. """ if self._agenda.is_generating(item.rule.lhs, item.start, item.dot): return True new_items = [ self.advance(incomplete, item.dot) for incomplete in self._agenda.iterwaiting( item.rule.lhs, item.start) ] self._agenda.extend(new_items) return len( new_items) > 0 # was there any item waiting for the complete one? def complete_itself(self, item): """ This operation tries to merge a given incomplete item with a previosly completed one. """ new_items = [ self.advance(item, sto) for sto in self._agenda.itercompletions(item.next, item.dot) ] self._agenda.extend(new_items) return len(new_items) > 0 def do(self, root='[S]', goal='[GOAL]'): wfsa = self._wfsa wcfg = self._wcfg agenda = self._agenda # start items of the kind # GOAL -> * ROOT, where * is an intial state of the wfsa if not any(self.axioms(root, start) for start in wfsa.iterinitial()): raise ValueError('No rule for the start symbol %s' % root) new_roots = set() while agenda: item = agenda.pop() # always returns an active item if item.is_complete(): # get slice variable for the current completed item u = self.slice_vars.get(item.rule.lhs, item.start, item.dot) # check whether the probability of the current completed item is above the threshold determined by # the slice variable if item.rule.log_prob > u: # complete root item spanning from a start wfsa state to a final wfsa state if item.rule.lhs == root and wfsa.is_initial( item.start) and wfsa.is_final(item.dot): agenda.make_complete(item) new_roots.add((root, item.start, item.dot)) agenda.make_passive(item) else: if self.complete_others(item): agenda.make_complete(item) agenda.make_passive(item) else: # a complete state is only kept in case it could potentially complete others agenda.discard(item) else: if is_terminal(item.next): # fire the operation 'scan' self.scan(item) agenda.discard( item ) # scanning renders incomplete items of this kind useless else: if not wcfg.can_rewrite( item.next ): # if the NT does not exist this item is useless agenda.discard(item) else: if not self.prediction( item ): # try to predict, otherwise try to complete itself self.complete_itself(item) agenda.make_passive(item) # converts complete items into rules logging.debug('Making forest...') return self.get_cfg(goal, root) def get_intersected_rule(self, item): lhs = make_symbol(item.rule.lhs, item.start, item.dot) positions = item.inner + (item.dot, ) rhs = [ make_symbol(sym, positions[i], positions[i + 1]) for i, sym in enumerate(item.rule.rhs) ] return Rule(lhs, rhs, item.rule.log_prob) def get_cfg(self, goal, root): """ Constructs the CFG by visiting complete items in a top-down fashion. This is effectively a reachability test and it serves the purpose of filtering nonterminal symbols that could never be reached from the root. Note that bottom-up intersection typically does enumerate a lot of useless (unreachable) items. This is the recursive procedure described in the paper (Nederhof and Satta, 2008). """ G = WCFG() processed = set() fsa = self._wfsa itergenerating = self._agenda.itergenerating itercomplete = self._agenda.itercomplete def make_rules(lhs, start, end): if (start, lhs, end) in processed: return processed.add((lhs, start, end)) for item in itercomplete(lhs, start, end): G.add(self.get_intersected_rule(item)) fsa_states = item.inner + (item.dot, ) for i, sym in itertools.ifilter( lambda (_, s): is_nonterminal(s), enumerate(item.rule.rhs)): if ( sym, fsa_states[i], fsa_states[i + 1] ) not in processed: # Nederhof does not perform this test, but in python it turned out crucial make_rules(sym, fsa_states[i], fsa_states[i + 1]) # create goal items for start, ends in itergenerating(root): if not fsa.is_initial(start): continue for end in itertools.ifilter(lambda q: fsa.is_final(q), ends): make_rules(root, start, end) G.add( Rule(make_symbol(goal, None, None), [make_symbol(root, start, end)], 0.0)) return G
class SlicedNederhof(object): """ This is an implementation of the CKY-inspired intersection due to Nederhof and Satta (2008). """ def __init__(self, wcfg, wfsa, slice_vars): self._wcfg = wcfg self._wfsa = wfsa self._agenda = Agenda(active_container_type=ActiveQueue) self._firstsym = defaultdict(set) # index rules by their first RHS symbol self._item_factory = ItemFactory() self.slice_vars = slice_vars def get_item(self, rule, dot, inner=[]): return self._item_factory.get_item(rule, dot, inner) def advance(self, item, dot): """returns a new item whose dot has been advanced""" return self.get_item(item.rule, dot, item.inner + (item.dot,)) def add_symbol(self, sym, sfrom, sto): """ This operation: 1) completes items waiting for `sym` from `sfrom` 2) instantiate delayed axioms Returns False if the annotated symbol had already been added, True otherwise """ if self._agenda.is_generating(sym, sfrom, sto): return False # every item waiting for `sym` from `sfrom` for item in self._agenda.iterwaiting(sym, sfrom): self._agenda.add(self.advance(item, sto)) # you may interpret this as a delayed axiom # every compatible rule in the grammar for r in self._firstsym.get(sym, set()): self._agenda.add(self.get_item(r, sto, inner=(sfrom,))) # can be interpreted as a lazy axiom return True def axioms(self): """ The axioms of the program are based on the FSA transitions. """ # you may interpret the following as a sort of lazy axiom (based on grammar rules) for r in self._wcfg: self._firstsym[r.rhs[0]].add(r) # these are axioms based on the transitions of the automaton for sfrom, sto, sym, w in self._wfsa.iterarcs(): self.add_symbol(sym, sfrom, sto) def inference(self): """Exhausts the queue of active items""" agenda = self._agenda while agenda: item = agenda.pop() # always returns an ACTIVE item # complete other items (by calling add_symbol), in case the input item is complete if item.is_complete(): u = self.slice_vars.get(item.rule.lhs, item.start, item.dot) # check whether the probability of the current completed item is above the threshold determined by # the slice variable if item.rule.log_prob > u: self.add_symbol(item.rule.lhs, item.start, item.dot) # prove the symbol agenda.make_complete(item) # mark the item as complete else: # merges the input item with previously completed items effectively moving the input item's dot forward agenda.make_passive(item) for sto in agenda.itercompletions(item.next, item.dot): agenda.add(self.advance(item, sto)) # move the dot forward def do(self, root='[S]', goal='[GOAL]'): """Runs the program and returns the intersected CFG""" self.axioms() self.inference() return get_cfg(goal, root, self._wfsa, self._agenda)