def scan(fsa: FSA, item: Item, eps_symbol: Terminal) -> list:
    r"""
    Scan a terminal (compatible with CKY and Earley).

    Inference rule:

        [X -> alpha * x beta, [q, ..., r]]
        ------------------------------------    where (r, x, s) \in FSA and x != \epsilon
        [X -> alpha x * beta, [q, ..., r, s]]

    If x == \epsilon, we have a different rule

        [X -> alpha * \epsilon beta, [q, ..., r]]
        ---------------------------------------------
        [X -> alpha \epsilon * beta, [q, ..., r, r]]

    that is, the dot moves over the empty string and we loop into the same FSA state (r).

    :param fsa: the automaton whose arcs leaving `item.dot` are matched against the next symbol
    :param item: an active Item whose next symbol is a terminal
    :param eps_symbol: the Terminal standing for epsilon (set to None to disable epsilon rules)
    :returns: scanned items (empty if no arc carries the terminal's label)
    :raises AssertionError: if the next symbol of `item` is not a terminal
    """
    assert item.next.is_terminal(), 'Only terminal symbols can be scanned, got %s' % item.next
    # root() strips annotations, leaving the bare terminal to compare and look up
    next_terminal = item.next.root()
    if eps_symbol is not None and next_terminal == eps_symbol:
        # epsilon consumes no input: advance the dot but stay in the same FSA state
        return [item.advance(item.dot)]
    # we call .obj() because labels are strings, not Terminals
    return [
        item.advance(destination)
        for destination in fsa.destinations(origin=item.dot, label=next_terminal.obj())
    ]
def get_source_word(fsa: FSA, origin: int, destination: int, eps_str: str = '-EPS-') -> str:
    """
    Return the python string representing a source word from origin to destination
    (assuming there's a single one).

    :param fsa: automaton queried through `fsa.labels(origin, destination)`
    :param origin: state the arc leaves from
    :param destination: state the arc arrives at
    :param eps_str: string returned when no label connects the two states
    :returns: the single label between the two states, or `eps_str` if there is none
    :raises AssertionError: if more than one label connects the two states
    """
    # materialise first: labels() may hand back an iterator and we need its length
    labels = list(fsa.labels(origin, destination))
    if not labels:
        return eps_str
    assert len(labels) == 1, \
        'Use this function only when you know the path is unambiguous, found %d labels %s for (%d, %d)' % (
            len(labels), labels, origin, destination)
    return labels[0]
def axioms(cfg: CFG, fsa: FSA, s: Symbol) -> list:
    r"""
    Axioms for Earley.

    Inference rule:

        --------------------    (S -> alpha) \in R and q0 \in I
        [S -> * alpha, [q0]]

    R is the rule set of the grammar.
    I is the set of initial states of the automaton.

    :param cfg: a CFG
    :param fsa: an FSA
    :param s: the CFG's start symbol (S)
    :returns: a list of items that are Earley axioms, one per (initial state, rule of S) pair
    """
    # each item gets its own fresh [q0] list so items never share state sequences
    return [
        Item(rule, [q0])
        for q0 in fsa.iterinitial()
        for rule in cfg.get(s)
    ]
def language_of_fsa(fsa: FSA, eps_str='-EPS-') -> set:
    """
    Return the set of strings in the FSA.

    This runs in exponential time, use with very small FSA only. Paths are followed
    recursively without remembering visited states, so the FSA is presumably expected
    to be acyclic.

    :param fsa: automaton to enumerate, starting from each of its initial states
    :param eps_str: arc label treated as the empty string (adds no token to the output)
    :returns: set of strings, each the space-joined labels along a path that reaches a final state
    """
    accepted = set()

    def walk(node, prefix: tuple):
        # every final state reached emits the labels collected so far
        if fsa.is_final(node):
            accepted.add(' '.join(prefix))
        for arc_label, targets in fsa.iterarcs(node, group_by='label'):
            # epsilon arcs move to the next state without extending the string
            extended = prefix if arc_label == eps_str else prefix + (arc_label, )
            for target in targets:
                walk(target, extended)

    for start in fsa.iterinitial():
        walk(start, ())
    return accepted
def forest_to_fsa(forest: CFG, start_symbol: Symbol) -> FSA:
    """
    Unfold a forest into an FSA whose arcs carry the forest's terminals.

    Note that this algorithm only works with acyclic forests.
    Even for such forests, this runs in exponential time, so make sure to only try it
    with very small forests.

    :param forest: acyclic forest
    :param start_symbol: symbol from which the expansion starts
    :return: FSA with one initial and one final state, referred to below as 0 and 1
        (assumes a fresh FSA hands out state ids in that order)
    """
    automaton = FSA()

    def expand(node: Symbol, left, right):
        """Lay out `node` between the FSA states `left` and `right`."""
        if node.is_terminal():
            # a terminal becomes a single arc labelled with its bare object
            automaton.add_arc(left, right, node.root().obj())
            return
        for production in forest.get(node):
            # one fresh state between each pair of adjacent children,
            # all allocated before any child is expanded
            inner = [automaton.add_state() for _ in range(production.arity - 1)]
            boundaries = [left, *inner, right]
            for position, child in enumerate(production.rhs):
                expand(child, boundaries[position], boundaries[position + 1])

    automaton.add_state(initial=True)  # state 0
    automaton.add_state(final=True)  # state 1
    expand(start_symbol, 0, 1)
    return automaton
def simple_features(
        edge: Rule,
        src_fsa: FSA,
        eps=Terminal('-EPS-'),
        tgt_sent='',
        sparse_del=False,
        sparse_ins=False,
        sparse_trans=False,
        src_tgt=None,
        tgt_src=None
) -> tuple:
    """
    Featurises an edge given
        * rule and spans
        * src sentence as an FSA
        * tgt sentence as a string (only its number of whitespace-separated tokens is used)
        * TODO: extract IBM1 dense features

    :param edge: rule to featurise; a 2-symbol RHS is treated as a binary rule, anything
        else as a unary rule whose first RHS symbol is inspected
    :param src_fsa: source sentence as an FSA
    :param eps: terminal compared with the root of a terminal RHS symbol to detect deletions
    :param tgt_sent: target sentence
    :param sparse_del: also fire the lexicalised 'del:<src word>' feature
    :param sparse_ins: also fire the lexicalised 'ins:<tgt word>' feature
    :param sparse_trans: also fire the lexicalised 'trans:<src word>/<tgt word>' feature
    :param src_tgt: not read by this function; accepted so existing callers keep working
    :param tgt_src: not read by this function; accepted so existing callers keep working
    :returns: pair (fmap, fset): fmap is a defaultdict(float) from feature name to value,
        fset is the set of feature names that were fired
    """
    fmap = defaultdict(float)
    fset = set()  # stores the features we've added

    def fire(name):
        """Add 1.0 to feature `name` and remember that it was fired."""
        fmap[name] += 1.0
        fset.add(name)

    def fire_span(prefix, start, end):
        """Fire '<prefix>:<length>' and then '<prefix>:<start>-<end>'."""
        fire('{}:{}'.format(prefix, end - start))
        fire('{0}:{1}-{2}'.format(prefix, start, end))

    if len(edge.rhs) == 2:  # binary rule
        fire('type:binary')
        # here we could have sparse features of the source string as a function of spans being concatenated
        (ls1, ls2), (lt1, lt2) = get_bispans(edge.rhs[0])  # left of RHS
        (rs1, rs2), (rt1, rt2) = get_bispans(edge.rhs[1])  # right of RHS
        # TODO: double check these, assign features, add some more
        if ls1 == ls2:  # deletion of source left child
            fire('type:deletion-slc')
        if rs1 == rs2:  # deletion of source right child
            fire('type:deletion-src')
        if ls2 == rs1:  # monotone
            fire('type:monotone')
        if ls1 == rs2:  # inverted
            fire('type:inverted')
        # source span features of rhs
        fire_span('span:rhs:src-lc', ls1, ls2)
        fire_span('span:rhs:src-rc', rs1, rs2)
        # target span features of rhs
        fire_span('span:rhs:tgt-lc', lt1, lt2)
        fire_span('span:rhs:tgt-rc', rt1, rt2)
    else:  # unary
        symbol = edge.rhs[0]
        if symbol.is_terminal():  # terminal rule
            fire('type:terminal')
            (s1, s2), (t1, t2) = get_bispans(symbol)
            if symbol.root() == eps:  # symbol.root() gives us a Terminal free of annotation
                # for sure there is a source word
                src_word = get_source_word(src_fsa, s1, s2)
                fire('type:deletion')
                if sparse_del:  # sparse version
                    fire('del:%s' % src_word)
            else:
                # for sure there's a target word
                tgt_word = get_target_word(symbol)
                if s1 == s2:  # has not consumed any source word, must be an eps rule
                    fire('type:insertion')
                    if sparse_ins:  # sparse version
                        fire('ins:%s' % tgt_word)
                else:
                    # for sure there's a source word
                    src_word = get_source_word(src_fsa, s1, s2)
                    fire('type:translation')
                    if sparse_trans:  # sparse version
                        fire('trans:%s/%s' % (src_word, tgt_word))
            # source skip-bigram: the words just left and just right of the source span
            # NOTE(review): fired for every terminal rule (deletion, insertion and
            # translation) -- confirm this scope is the intended one
            if s1 == 0:
                l_word = '-START-'
            else:
                l_word = get_source_word(src_fsa, s1 - 1, s1)
            if s2 + 1 == src_fsa.nb_states():
                r_word = '-END-'
            else:
                r_word = get_source_word(src_fsa, s2, s2 + 1)
            fire('skip-bigram:{0}*{1}'.format(l_word, r_word))
            # source / target span length of rhs
            fire('span:rhs:src:{}'.format(s2 - s1))
            fire('span:rhs:tgt:{}'.format(t2 - t1))
        else:  # S -> X
            fire('top')

    # bispans of lhs of edge for source and target
    # NOTE(review): applied to every kind of edge -- confirm against the callers
    if isinstance(edge.lhs.obj()[0], Span):  # exclude the (Nonterminal('D(x)'), 0, 2) rules
        (s1, s2), (t1, t2) = get_bispans(edge.lhs)
        fire_span('span:lhs:src', s1, s2)
        fire_span('span:lhs:tgt', t1, t2)

    # finally add source sentence length, measured as the number of FSA states
    # (the 'scr' spelling is part of the feature name and is kept as is)
    fire('scr-sent:length:{}'.format(src_fsa.nb_states()))
    # and target sentence length
    fire('tgt-sent:length:{}'.format(len(tgt_sent.split())))
    return fmap, fset