Exemplo n.º 1
0
def scan(fsa: FSA, item: Item, eps_symbol: Terminal) -> list:
    r"""
    Scan a terminal (compatible with CKY and Earley).

    Inference rule:

        [X -> alpha * x beta, [q, ..., r]]
        ------------------------------------    where (r, x, s) \in FSA and x != \epsilon
        [X -> alpha x * beta, [q, ..., r, s]]

    If x == \epsilon, we have a different rule

        [X -> alpha * \epsilon beta, [q, ..., r]]
        ---------------------------------------------
        [X -> alpha \epsilon * beta, [q, ..., r, r]]

    that is, the dot moves over the empty string and we loop into the same FSA state (r).

    :param fsa: automaton queried through ``fsa.destinations(origin=..., label=...)``
    :param item: an active Item whose next symbol is a terminal
    :param eps_symbol: the Terminal standing for the empty string; it is compared
        against ``item.next.root()`` (set to None to disable epsilon rules)
    :returns: a list with one advanced item per destination (a single item for an
        epsilon scan)
    """
    # NOTE: this docstring is a raw string because the rule notation uses backslashes
    assert item.next.is_terminal(), \
        'Only terminal symbols can be scanned, got %s' % item.next

    terminal = item.next.root()

    # epsilon rule: the dot moves, but we stay in the same FSA state
    if eps_symbol is not None and terminal == eps_symbol:
        return [item.advance(item.dot)]

    # we call .obj() because labels are strings, not Terminals
    reachable = fsa.destinations(origin=item.dot, label=terminal.obj())
    return [item.advance(destination) for destination in reachable]
def get_source_word(fsa: FSA, origin: int, destination: int) -> str:
    """
    Look up the label found between two states of the FSA.

    Use this only when the path from origin to destination is known to be
    unambiguous: finding more than one label trips an assertion.

    :param fsa: automaton queried through ``fsa.labels(origin, destination)``
    :param origin: state where the arc starts
    :param destination: state where the arc ends
    :returns: the single label found, or '-EPS-' when no label is found
    """
    found = list(fsa.labels(origin, destination))
    if not found:
        # nothing labels this pair of states: report it as the empty string
        return '-EPS-'
    count = len(found)
    assert count == 1, 'Use this function only when you know the path is unambiguous, found %d labels %s for (%d, %d)' % (
        count, found, origin, destination)
    return found[0]
Exemplo n.º 3
0
def axioms(cfg: CFG, fsa: FSA, s: Symbol) -> list:
    r"""
    Axioms for Earley.

    Inference rule:
        -------------------- (S -> alpha) \in R and q0 \in I
        [S -> * alpha, [q0]]

    R is the rule set of the grammar.
    I is the set of initial states of the automaton.

    :param cfg: a CFG
    :param fsa: an FSA
    :param s: the CFG's start symbol (S)
    :returns: a list of items that are Earley axioms, one per pair of
        initial state and rule returned by ``cfg.get(s)``
    """
    # NOTE: this docstring is a raw string because the rule notation uses backslashes
    # each item gets its own fresh [q0] list
    return [
        Item(rule, [q0])
        for q0 in fsa.iterinitial()
        for rule in cfg.get(s)
    ]
def language_of_fsa(fsa: FSA, eps_str='-EPS-') -> set:
    """
    Enumerate the set of strings in the FSA.

    Every path leaving an initial state is followed recursively, so this runs
    in exponential time: use with very small FSA only.

    :param fsa: automaton whose paths are enumerated
    :param eps_str: arc label treated as the empty string (it adds no token)
    :returns: set of strings, each one the space-joined labels read along a
        path up to a final state
    """
    accepted = set()

    def walk(state, prefix: tuple):
        # reaching a final state means the labels read so far form a string
        if fsa.is_final(state):
            accepted.add(' '.join(prefix))
        for label, destinations in fsa.iterarcs(state, group_by='label'):
            # epsilon arcs change state without contributing a token
            extended = prefix if label == eps_str else prefix + (label, )
            for destination in destinations:
                walk(destination, extended)

    for start in fsa.iterinitial():
        walk(start, ())

    return accepted
def forest_to_fsa(forest: CFG, start_symbol: Symbol) -> FSA:
    """
    Build an FSA by recursively expanding the forest from its start symbol.

    Note that this algorithm only works with acyclic forests.
    Even for such forests, this runs in exponential time, so make sure to only
    try it with very small forests.

    :param forest: acyclic forest
    :param start_symbol: symbol from which the expansion starts
    :returns: the FSA built from the forest
    """
    automaton = FSA()

    def expand(symbol: Symbol, bos, eos):
        """Expand a symbol spanning from state bos to state eos."""
        if symbol.is_terminal():
            # a terminal becomes a single arc labelled with its bare object
            automaton.add_arc(bos, eos, symbol.root().obj())
            return
        for rule in forest.get(symbol):
            # one fresh internal state between each pair of adjacent children
            inner = [automaton.add_state() for _ in range(rule.arity - 1)]
            boundaries = [bos] + inner + [eos]
            # child i spans from boundary i to boundary i + 1
            for position, child in enumerate(rule.rhs):
                expand(child, boundaries[position], boundaries[position + 1])

    # the first two states created are referred to below as 0 and 1
    automaton.add_state(initial=True)  # state 0
    automaton.add_state(final=True)  # state 1
    expand(start_symbol, 0, 1)

    return automaton
def simple_features(
    edge: Rule,
    src_fsa: FSA,
    eps=Terminal('-EPS-'),
    tgt_sent='',
    sparse_del=False,
    sparse_ins=False,
    sparse_trans=False,
    src_tgt=None,
    tgt_src=None
) -> tuple:
    """
    Featurise an edge given
        * rule and spans
        * src sentence as an FSA
        * the target sentence as a whitespace-separated string (only its length is used)
        * TODO: extract IBM1 dense features

    :param edge: the rule to featurise; its RHS symbols (and its LHS) carry bispans
    :param src_fsa: source sentence as an FSA
    :param eps: Terminal compared against ``symbol.root()`` to detect deletions
    :param tgt_sent: target sentence; only ``len(tgt_sent.split())`` is used
    :param sparse_del: also fire a lexicalised 'del:<src_word>' feature
    :param sparse_ins: also fire a lexicalised 'ins:<tgt_word>' feature
    :param sparse_trans: also fire a lexicalised 'trans:<src_word>/<tgt_word>' feature
    :param src_tgt: currently unused by this function
    :param tgt_src: currently unused by this function
    :returns: a pair (fmap, fset) where fmap maps each fired feature name to its
        count and fset is the set of fired feature names
    """
    fmap = defaultdict(float)
    fset = set()  # stores the features we've added

    def fire(name):
        # count the feature and remember that it was fired
        fmap[name] += 1.0
        fset.add(name)

    if len(edge.rhs) == 2:  # binary rule
        fire('type:binary')
        # here we could have sparse features of the source string as a function of spans being concatenated
        (ls1, ls2), (lt1, lt2) = get_bispans(edge.rhs[0])  # left of RHS
        (rs1, rs2), (rt1, rt2) = get_bispans(edge.rhs[1])  # right of RHS

        # TODO: double check these, assign features, add some more
        if ls1 == ls2:  # deletion of source left child
            fire('type:deletion-slc')
        if rs1 == rs2:  # deletion of source right child
            fire('type:deletion-src')
        if ls2 == rs1:  # monotone
            fire('type:monotone')
        if ls1 == rs2:  # inverted
            fire('type:inverted')

        # source span features of rhs: span length, then exact boundaries
        fire('span:rhs:src-lc:{}'.format(ls2 - ls1))
        fire('span:rhs:src-lc:{0}-{1}'.format(ls1, ls2))
        fire('span:rhs:src-rc:{}'.format(rs2 - rs1))
        fire('span:rhs:src-rc:{0}-{1}'.format(rs1, rs2))
        # target span features of rhs: span length, then exact boundaries
        fire('span:rhs:tgt-lc:{}'.format(lt2 - lt1))
        fire('span:rhs:tgt-lc:{0}-{1}'.format(lt1, lt2))
        fire('span:rhs:tgt-rc:{}'.format(rt2 - rt1))
        fire('span:rhs:tgt-rc:{0}-{1}'.format(rt1, rt2))

    else:  # unary
        symbol = edge.rhs[0]
        if symbol.is_terminal():  # terminal rule
            fire('type:terminal')
            (s1, s2), (t1, t2) = get_bispans(symbol)
            if symbol.root() == eps:  # symbol.root() gives us a Terminal free of annotation
                # for sure there is a source word
                src_word = get_source_word(src_fsa, s1, s2)
                fire('type:deletion')
                if sparse_del:  # sparse version
                    fire('del:%s' % src_word)
            else:
                # for sure there's a target word
                tgt_word = get_target_word(symbol)
                if s1 == s2:  # has not consumed any source word, must be an eps rule
                    fire('type:insertion')
                    if sparse_ins:  # sparse version
                        fire('ins:%s' % tgt_word)
                else:
                    # for sure there's a source word
                    src_word = get_source_word(src_fsa, s1, s2)
                    fire('type:translation')
                    if sparse_trans:  # sparse version
                        fire('trans:%s/%s' % (src_word, tgt_word))

                    # source skip-bigram: the words right before and right after
                    # the translated source word, with sentence-boundary markers
                    if s1 == 0:
                        l_word = '-START-'
                    else:
                        l_word = get_source_word(src_fsa, s1 - 1, s1)
                    if s2 + 1 == src_fsa.nb_states():
                        r_word = '-END-'
                    else:
                        r_word = get_source_word(src_fsa, s2, s2 + 1)
                    fire('skip-bigram:{0}*{1}'.format(l_word, r_word))

            # source and target span features of rhs
            fire('span:rhs:src:{}'.format(s2 - s1))
            fire('span:rhs:tgt:{}'.format(t2 - t1))

        else:  # S -> X
            fire('top')

        # NOTE(review): the LHS-span and sentence-length features below sit inside
        # the unary branch, so binary edges do not get them -- confirm this is intended.

        # bispans of lhs of edge for source and target
        if isinstance(edge.lhs.obj()[0], Span):  # exclude the (Nonterminal('D(x)'), 0, 2) rules
            (s1, s2), (t1, t2) = get_bispans(edge.lhs)
            # source span features of lhs
            fire('span:lhs:src:{}'.format(s2 - s1))
            fire('span:lhs:src:{0}-{1}'.format(s1, s2))
            # target span features of lhs
            fire('span:lhs:tgt:{}'.format(t2 - t1))
            fire('span:lhs:tgt:{0}-{1}'.format(t1, t2))

        # finally add source sentence length (measured as the FSA's number of states);
        # the 'scr' spelling (sic) is kept because the name is a runtime feature key
        fire('scr-sent:length:{}'.format(src_fsa.nb_states()))
        # and target sentence length
        fire('tgt-sent:length:{}'.format(len(tgt_sent.split())))

    return fmap, fset