示例#1
0
def make_tm(t, testfile):
    """Build a one-loop translation-model FST, keeping the best translations.

    Args:
        t: Mapping whose keys are ``(source_word, target_word)`` pairs and
            whose values are the weights to attach. A target of ``'∅'`` is
            replaced by ``fst.EPSILON``.
        testfile: Path of a whitespace-tokenised text file. Every token that
            never appears as a source word in ``t`` receives a
            ``(token, fst.EPSILON)`` self-loop with weight ``10 ** -100`` so
            that unseen words do not block the machine.

    Returns:
        The ``fst.FST`` with start state ``'q0'``, accept state ``'q1'``, a
        ``(fst.STOP, fst.STOP)`` transition from ``'q0'`` to ``'q1'``, and
        self-loops on ``'q0'`` for the retained translations.
    """
    max_translations = 10  # number of candidates kept per source word

    tm = fst.FST()
    tm.set_start('q0')
    tm.set_accept('q1')
    tm.add_transition(fst.Transition('q0', (fst.STOP, fst.STOP), 'q1'))

    # Regroup the flat table as source word -> {target word: weight}.
    known_words = set()
    top_trans = defaultdict(dict)
    for trans, prob in t.items():
        fw, ew = trans[0], trans[1]
        if ew == '∅':
            ew = fst.EPSILON
        top_trans[fw][ew] = prob
        known_words.add(fw)

    # Insert only the highest-weighted translations of each source word.
    # Slicing keeps exactly `max_translations` entries; the previous
    # `i > 10` guard let an eleventh candidate through.
    for fw, candidates in top_trans.items():
        ranked = sorted(candidates.items(), key=operator.itemgetter(1),
                        reverse=True)
        for ew, prob in ranked[:max_translations]:
            tm.add_transition(fst.Transition('q0', (fw, ew), 'q0'), prob)

    # Add unknown words from the test data
    unknown_prob = math.pow(10, -100)
    with open(testfile) as f:
        for line in f:
            for w in line.rstrip().split():
                if w not in known_words:
                    tm.add_transition(
                        fst.Transition('q0', (w, fst.EPSILON), 'q0'),
                        unknown_prob)
    return tm
示例#2
0
def make_fm(f):
    """Build a linear-chain FST that accepts exactly the sequence ``f``.

    Args:
        f: Sequence of symbols (must support ``len``). Symbol ``i`` labels
            the transition from state ``i`` to state ``i + 1`` as the
            identity pair ``(symbol, symbol)``.

    Returns:
        The ``fst.FST`` with start state ``0``; after the last symbol a
        ``(fst.STOP, fst.STOP)`` transition leads from state ``len(f)`` to
        the accept state ``len(f) + 1``.
    """
    fm = fst.FST()
    fm.set_start(0)
    for i, fw in enumerate(f):
        fm.add_transition(fst.Transition(i, (fw, fw), i + 1))
    # The original referenced an undefined name `fs` here; the length of
    # the parameter `f` is what the accept state below is based on.
    end = len(f)
    fm.add_transition(fst.Transition(end, (fst.STOP, fst.STOP), end + 1))
    fm.set_accept(end + 1)
    return fm
示例#3
0
def get_fst_mw(word):
    """Return a linear-chain FST that maps ``word`` to itself.

    States are named ``"q0"``, ``"q1"``, ...; the i-th element of ``word``
    labels the identity transition from ``"q<i>"`` to ``"q<i+1>"``. A final
    ``(fst.STOP, fst.STOP)`` transition leads to the accept state.
    """
    state_name = "q{}".format

    machine = fst.FST()
    machine.set_start(state_name(0))

    consumed = 0  # stays 0 when `word` is empty
    for consumed, symbol in enumerate(word, start=1):
        machine.add_transition(
            fst.Transition(state_name(consumed - 1), (symbol, symbol),
                           state_name(consumed)))

    last, final = state_name(consumed), state_name(consumed + 1)
    machine.add_transition(
        fst.Transition(last, (fst.STOP, fst.STOP), final))
    machine.set_accept(final)
    return machine
示例#4
0
def make_f(f):
    """Return a linear-chain FST over the whitespace-separated tokens of ``f``.

    States are the integers ``0 .. len(tokens) + 1``. Each token labels an
    identity transition to the next state, and a ``(fst.STOP, fst.STOP)``
    transition leads from the last token state to the accept state.
    """
    # Adapted from Homework 2 Solutions
    tokens = f.split()
    machine = fst.FST()
    machine.set_start(0)

    position = 0
    for token in tokens:
        machine.add_transition(
            fst.Transition(position, (token, token), position + 1))
        position += 1

    # `position` now equals the number of tokens.
    accept = position + 1
    machine.add_transition(
        fst.Transition(position, (fst.STOP, fst.STOP), accept))
    machine.set_accept(accept)
    return machine
示例#5
0
def make_kneserney(data, n):
    """Create a Kneser-Ney smoothed language model of order `n`,
    trained on `data`, as a `FST`.

    Note that the returned FST has epsilon transitions. To iterate
    over states in topological order, sort them using `lambda q:
    -len(q)` as the key.

    Args:
        data: Training data, passed unchanged to `KneserNey(data, i)`
            for every order `i` in 1..n.
        n: Order of the language model.

    Returns:
        The `fst.FST` whose states are tuples of words (contexts). The
        start state is `("<s>",) * (n - 1)` and the accept state is
        `("</s>",)`.

    NOTE(review): `KneserNey` is defined outside this block. This code
    reads its `_prob` attribute as `context -> {word: probability}` and
    its `_bow` attribute as `context -> back-off weight`; confirm
    against its definition.
    """

    # Estimate KN-smoothed models for orders 1, ..., n
    kn = {}
    for i in range(1, n + 1):
        kn[i] = KneserNey(data, i)

    # Create the FST. It has a state for every possible k-gram for k = 0, ..., n-1.
    m = fst.FST()
    m.set_start(("<s>", ) * (n - 1))
    m.set_accept(("</s>", ))

    for i in range(1, n + 1):
        # Each context `u` of the order-i model becomes a source state.
        for u in kn[i]._prob:
            if i > 1:
                # Add an epsilon transition that backs off from the i-gram model to the (i-1)-gram model
                m.add_transition(
                    fst.Transition(u, (fst.EPSILON, fst.EPSILON), u[1:]),
                    kn[i]._bow[u])
            else:
                # Smooth 1-gram model with uniform distribution
                # The +1 reserves one equal share for the "<unk>" symbol.
                types = len(kn[i]._prob[u]) + 1
                for w in kn[i]._prob[u]:
                    m.add_transition(fst.Transition(u, (w, w), (w, )),
                                     1 / types)
                m.add_transition(fst.Transition(u, ("<unk>", "<unk>"), ()),
                                 1 / types)

            # Create transitions for word probabilities
            # NOTE(review): for the 1-gram context these are added on top
            # of the uniform transitions above and may repeat the same
            # (source, label, target) triple; how `add_transition` combines
            # repeated transitions is not visible here.
            for w in kn[i]._prob[u]:
                # If we are in state u and read w, then v is the new state.
                # This should be the longest suffix of uw that is observed
                # in the training data.
                if w == "</s>":
                    v = ("</s>", )
                else:
                    v = u + (w, )
                    # `len(v) >= n` is tested first, so `kn[len(v) + 1]` is
                    # only indexed for orders that exist (at most n).
                    while len(v) > 0 and (len(v) >= n
                                          or v not in kn[len(v) + 1]._prob):
                        v = v[1:]
                m.add_transition(fst.Transition(u, (w, w), v),
                                 kn[i]._prob[u][w])
    return m
示例#6
0
def make_tm(t, testfile):
    """Build a one-loop translation-model FST from every entry of ``t``.

    Each key of ``t`` is a ``(source_word, target_word)`` pair and becomes a
    self-loop on ``'q0'`` weighted by its value; a target of ``'∅'`` is
    replaced by ``fst.EPSILON``. Tokens of ``testfile`` that never occur as
    a source word get a ``(token, fst.EPSILON)`` self-loop with weight
    ``10 ** -100``. The start state is ``'q0'`` and the accept state
    ``'q1'`` is reached through ``(fst.STOP, fst.STOP)``.
    """
    loop, final = 'q0', 'q1'

    machine = fst.FST()
    machine.set_start(loop)
    machine.set_accept(final)
    machine.add_transition(
        fst.Transition(loop, (fst.STOP, fst.STOP), final))

    seen_sources = set()
    for pair, weight in t.items():
        seen_sources.add(pair[0])
        # The table spells "no output" as '∅'; the FST wants its epsilon.
        label = (pair[0], fst.EPSILON) if pair[1] == '∅' else pair
        machine.add_transition(fst.Transition(loop, label, loop), weight)

    # Add unknown words from the test data
    with open(testfile) as handle:
        tiny = math.pow(10, -100)
        for raw_line in handle:
            for token in raw_line.rstrip().split():
                if token in seen_sources:
                    continue
                machine.add_transition(
                    fst.Transition(loop, (token, fst.EPSILON), loop), tiny)
    return machine
示例#7
0
def get_fst_mtm(old_data, new_data, initialize=True):
    """Build the typo-model FST between input and output symbols.

    Symbols found in ``old_data`` are added to ``output_alphabet`` and
    symbols found in ``new_data`` to ``input_alphabet``; both names come
    from outside this function (presumably module-level sets -- confirm).

    The machine has insertion loops on ``"q0"``, deletion transitions from
    ``"q0"`` to ``"q1"``, substitution transitions from both states back
    to ``"q0"``, and ``(fst.STOP, fst.STOP)`` transitions into the accept
    state ``"q2"``. When ``initialize`` is true, transitions whose input
    and output labels are equal are reweighted to 100 and all others to 1.
    ``normalize_cond()`` is always called before returning.
    """
    typo = fst.FST()
    typo.set_start("q0")

    # Collect the old and modern alphabets from the training data.
    for symbol in (s for line in old_data for s in line):
        output_alphabet.add(symbol)
    for symbol in (s for line in new_data for s in line):
        input_alphabet.add(symbol)

    # Insertions: produce an output symbol without reading any input.
    for out_sym in output_alphabet:
        typo.add_transition(
            fst.Transition("q0", (fst.EPSILON, out_sym), "q0"))

    for in_sym in input_alphabet:
        # Deletion: read an input symbol, produce nothing, move to q1.
        typo.add_transition(
            fst.Transition("q0", (in_sym, fst.EPSILON), "q1"))
        # Substitutions from either state lead back to q0.
        for out_sym in output_alphabet:
            for source in ("q1", "q0"):
                typo.add_transition(
                    fst.Transition(source, (in_sym, out_sym), "q0"))

    # Terminal transitions into the accept state.
    for source in ("q0", "q1"):
        typo.add_transition(
            fst.Transition(source, (fst.STOP, fst.STOP), "q2"))
    typo.set_accept("q2")

    if initialize:
        for state in typo.states:
            for arc in typo.transitions_from[state].keys():
                # Favour transitions whose input and output labels match.
                labels_match = arc.a[0] == arc.a[1]
                typo.reweight_transition(arc, 100 if labels_match else 1)

    typo.normalize_cond()
    return typo
示例#8
0
def make_TM(test_path='../data/final_data/test.tr'):
    """Build the translation-model FST from the stored translation table.

    Args:
        test_path: Path of the whitespace-tokenised test file whose tokens
            each receive a ``(token, 'ε')`` self-loop with weight
            ``1.0e-100``. Defaults to the location that was previously
            hard-coded, so existing ``make_TM()`` calls are unaffected.

    Returns:
        The ``fst.FST`` with start state ``0`` and accept state ``1``,
        reached through a ``("</s>", "</s>")`` transition of weight 1.

    NOTE(review): ``read_translations`` is defined outside this block; it
    is used here as a mapping from a source word to an iterable of entries
    whose item 0 is the target word and item 1 its weight -- confirm.
    """
    # Code adapted from Homework 2 Solution
    translations = read_translations()

    tm = fst.FST()
    tm.set_start(0)
    tm.set_accept(1)

    tm.add_transition(fst.Transition(0, ("</s>", "</s>"), 1), wt=1)

    for source, entries in translations.items():
        for entry in entries:
            tm.add_transition(
                fst.Transition(0, (source, entry[0]), 0),
                wt=float(entry[1]))

    # Read the test vocabulary inside a context manager so the file handle
    # is closed deterministically (the bare `open()` was never closed).
    with open(test_path) as test_file:
        test_words = {
            word for line in test_file for word in line.strip().split()
        }

    # Every test token may be dropped at a negligible weight.
    for word in test_words:
        tm.add_transition(fst.Transition(0, (word, 'ε'), 0),
                          wt=float('1.0e-100'))

    return tm