Exemplo n.º 1
0
def read_fst(inp):
    """Yield ``(key, fst)`` pairs parsed from a text stream.

    The stream is consumed record by record: one key line, followed by the
    record's lines up to the next blank line (or end of input).  A blank key
    line, or end of input, ends the generator.

    Every record line is split on whitespace before it is handed to the
    compiler: at most the first four fields are kept, and a two-field line
    is reduced to its first field.

    Args:
        inp: object with a ``readline()`` method returning text lines.

    Yields:
        Tuples of the stripped key line and the result of
        ``compiler.compile()`` for that record.
    """
    compiler = fst.Compiler()
    key = inp.readline().strip()
    while key:
        line = inp.readline()
        while line.strip():
            fields = line.split()[:4]
            if len(fields) == 2:
                fields = fields[:1]
            print(" ".join(fields), file=compiler)
            line = inp.readline()
        yield key, compiler.compile()
        key = inp.readline().strip()
Exemplo n.º 2
0
 def _fst1():
     """Compile and return a small hard-coded weighted FST.

     The machine has states 0 to 4.  Each pair of consecutive states is
     joined by three parallel arcs whose input and output labels are both
     1, 2 and 3; state 4 is the only final state.
     """
     # One "src dst ilabel olabel weight" line per arc.
     arcs = (
         "0 1 1 1 -1",
         "0 1 2 2 -2",
         "0 1 3 3 -3",
         "1 2 1 1  1",
         "1 2 2 2  0",
         "1 2 3 3 -2",
         "2 3 1 1 -2",
         "2 3 2 2  0",
         "2 3 3 3  0",
         "3 4 1 1  1",
         "3 4 2 2  2",
         "3 4 3 3  3",
     )
     compiler = fst.Compiler()
     for arc in arcs:
         print(arc, file=compiler)
     # A line holding only a state number marks that state as final.
     print("4", file=compiler)
     return compiler.compile()
Exemplo n.º 3
0
 def general():
     """Build the general-case transducer over the ``fststr.EN_SYMB`` alphabet.

     State 0 (final) copies ``<other>`` and ``<#>`` unchanged.  A ``<^>``
     symbol is rewritten to ``<epsilon>`` on the way to state 1, from which
     ``<other>`` or ``<#>`` leads back to state 0.  The ``<other>``
     placeholders are expanded by ``fststr.expand_other_symbols`` before
     the FST is returned.
     """
     symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=symbols,
                             osymbols=symbols,
                             keep_isymbols=True,
                             keep_osymbols=True)
     # NOTE: special-case arcs spelling "i n g <^>" (states 0 -> 4) were
     # drafted at this point in an earlier revision and are disabled.
     rules = (
         '0 0 <other> <other>\n',
         '0 0 <#> <#>\n',
         '0\n',
         '0 1 <^> <epsilon>\n',
         '1 0 <other> <other>\n',
         '1 0 <#> <#>\n',
     )
     for rule in rules:
         compiler.write(rule)
     transducer = compiler.compile()
     fststr.expand_other_symbols(transducer)
     return transducer
Exemplo n.º 4
0
 def write_hypos(self, all_hypos):
     """Writes FST files with standard arcs for each
     sentence in ``all_hypos``. The created lattices are not
     optimized in any way: We create a distinct path for each entry
     in ``all_hypos``. We advise you to determinize/minimize them if
     you are planning to use them for further processing.

     Files are named ``self.file_pattern % idx`` where ``idx`` starts at
     ``self.start_sen_id + 1`` and increases by one per n-best list.

     Args:
         all_hypos (list): list of nbest lists of hypotheses

     Raises:
         OSError. If the directory could not be created
         IOError. If something goes wrong while writing to the disk
     """
     try:
         os.makedirs(self.path)
     except OSError as exception:
         if exception.errno != errno.EEXIST:
             raise
         # ``logging.warn`` is a deprecated alias (removed in Python 3.13).
         logging.warning("Output FST directory %s already exists.",
                         self.path)
     for fst_idx, hypos in enumerate(all_hypos, self.start_sen_id + 1):
         c = fst.Compiler()
         # state ID 0 is start, 1 is final state
         next_free_id = 2
         for hypo in hypos:
             # Connect with start node; the whole (negated) hypothesis
             # score sits on this first arc.
             c.write("0\t%d\t%d\t%d\t%f\n" %
                     (next_free_id, utils.GO_ID, utils.GO_ID,
                      -hypo.total_score))
             next_free_id += 1
             # One fresh state per target symbol, chained linearly.
             for sym in hypo.trgt_sentence:
                 c.write("%d\t%d\t%d\t%d\n" %
                         (next_free_id - 1, next_free_id, sym, sym))
                 next_free_id += 1
             # Connect with final node
             c.write("%d\t1\t%d\t%d\n" %
                     (next_free_id - 1, utils.EOS_ID, utils.EOS_ID))
         c.write("1\n")
         f = c.compile()
         f.write(self.file_pattern % fst_idx)
Exemplo n.º 5
0
 def write_hypos(self, all_hypos, sen_indices):
     """Writes FST files with sparse tuples for each sentence in
     ``all_hypos``. The created lattices are not optimized in any
     way: We create a distinct path for each entry in
     ``all_hypos``. We advise you to determinize/minimize them if
     you are planning to use them for further processing.

     Each arc carries the weight string produced by
     ``self.write_weight`` for the matching ``score_breakdown`` entry;
     the last entry is placed on the arc into the final state.

     Args:
         all_hypos (list): list of nbest lists of hypotheses
         sen_indices (list): List of sentence indices (0-indexed)

     Raises:
         OSError. If the directory could not be created
         IOError. If something goes wrong while writing to the disk
     """
     _mkdir(self.path, "FST")
     for fst_idx, hypos in zip(sen_indices, all_hypos):
         fst_idx += 1  # file names are 1-indexed
         c = fst.Compiler(arc_type="tropicalsparsetuple")
         # state ID 0 is start, 1 is final state
         next_free_id = 2
         for hypo in hypos:
             syms = hypo.trgt_sentence
             # Connect with start node
             c.write("0\t%d\t%d\t%d\n" %
                     (next_free_id, utils.GO_ID, utils.GO_ID))
             next_free_id += 1
             # ``range`` instead of the Python-2-only ``xrange`` so the
             # method runs on both interpreter lines.
             for pos in range(len(hypo.score_breakdown) - 1):
                 c.write("%d\t%d\t%d\t%d\t%s\n" % (
                     next_free_id - 1,  # last state id
                     next_free_id,  # next state id
                     syms[pos],
                     syms[pos],  # arc labels
                     self.write_weight(hypo.score_breakdown[pos])))
                 next_free_id += 1
             # Connect with final node
             c.write("%d\t1\t%d\t%d\t%s\n" %
                     (next_free_id - 1, utils.EOS_ID, utils.EOS_ID,
                      self.write_weight(hypo.score_breakdown[-1])))
         c.write("1\n")  # Add final node
         f = c.compile()
         f.write(self.file_pattern % fst_idx)
Exemplo n.º 6
0
    def _get_basic_word_fst(self, text_arr):
        """Compile ``text_arr`` into an FST, one arc per word alternative.

        Args:
            text_arr: iterable of word collections.  Empty entries are
                skipped; an entry with exactly one word adds a single arc;
                an entry with several words adds parallel arcs that share
                the same source and target state (alternative
                verbalizations).

        Returns:
            The FST produced by ``compiler.compile()``.

        Side effects:
            Resets ``self.current_oov_queue`` to a fresh ``queue.Queue``.
        """
        self.current_oov_queue = queue.Queue()
        compiler = fst.Compiler()
        current = 0
        # Non-zero only right after a block of alternatives: the target
        # state the next single word has to connect to.
        pending_target = 0

        for words in text_arr:
            if not words:
                continue

            if len(words) != 1:
                # Several verbalizations: all arcs run source -> target.
                source, target = current, current + 1
                for word in words:
                    label = self._get_int_value_word(word)
                    self._compile_entry(compiler, source, label, target)
                pending_target = target + 1
                current = target
                continue

            # Single word, single expansion.
            for word in words:
                label = self._get_int_value_word(word)
                source = current
                if pending_target:
                    target = pending_target
                    current = pending_target - 1
                    pending_target = 0
                else:
                    target = current + 1
                self._compile_entry(compiler, source, label, target)
                current += 1

        # The last state reached is the final state.
        compiler.write("{}\n\n".format(current))
        return compiler.compile()
Exemplo n.º 7
0
def make_mapping_loop_fst(mapping):
    """Build a three-state FST that applies ``mapping`` zero or more times.

    State 0 leads to state 1 over a ``0 0`` arc, state 1 carries one
    self-loop per ``(fro, to)`` pair, and a second ``0 0`` arc leads to the
    final state 2.  Labels are written as ``fro + 1`` / ``to + 1``
    (presumably to keep label 0 free for epsilon -- confirm with callers).

    Args:
        mapping: iterable of ``(fro, to)`` integer pairs.

    Returns:
        The compiled FST, arc-sorted on output labels.
    """
    start, mid, end = '0', '1', '2'

    lines = ['%s %s 0 0' % (start, mid)]
    lines.extend('%s %s %s %s' % (mid, mid, fro + 1, to + 1)
                 for (fro, to) in mapping)
    lines.append('%s %s 0 0' % (mid, end))
    lines.append(end)

    compiler = openfst.Compiler()
    for line in lines:
        # ``print >> compiler`` only works on Python 2; an explicit write
        # with the newline appended behaves the same on 2 and 3.
        compiler.write(line + '\n')
    f = compiler.compile()
    f.arcsort(st="olabel")
    return f
Exemplo n.º 8
0
def linear_fst(elements, automata_op, keep_isymbols=True, **kwargs):
    """Produce a linear automaton.

    Based on code from
    https://stackoverflow.com/questions/9390536/how-do-you-even-give-an-openfst-made-fst-input-where-does-the-output-go.

    Args:
        elements (list): ordered list of input symbols
        automata_op (Fst): automaton whose input symbol table (a copy of
            it) is used to compile the linear automaton
        keep_isymbols (bool): whether to keep the input symbols.  The same
            value is also passed as the compiler's ``acceptor`` flag.
        **kwargs: forwarded unchanged to ``fst.Compiler``

    Returns:
        The compiled automaton with one arc ``i -> i + 1`` per element and
        a single final state after the last element.  For an empty
        ``elements`` the result has only the final state 0.
    """
    compiler = fst.Compiler(isymbols=automata_op.input_symbols().copy(),
                            acceptor=keep_isymbols,
                            keep_isymbols=keep_isymbols,
                            **kwargs)

    # Tracked separately from the loop variable so that an empty input
    # does not hit an unbound ``i`` when the final state is written.
    num_elements = 0
    for i, el in enumerate(elements):
        print("{} {} {}".format(i, i + 1, el), file=compiler)
        num_elements = i + 1
    print(str(num_elements), file=compiler)

    return compiler.compile()
Exemplo n.º 9
0
    def fst_stringcompile(self, text):
        """Compile ``text`` into a linear pynini FST, one arc per character.

        Each character is looked up in ``self.utf8_symbols``; when the
        lookup returns -1 the character's ``0x%04x`` code-point string is
        looked up instead.  The resulting value is used as both input and
        output label of the arc.

        Args:
            text: the string to compile.

        Returns:
            A ``pn.Fst`` converted from the compiled pywrapfst machine.
        """
        compiler = fst.Compiler()
        final_state = 0
        for position, char in enumerate(text):
            label = self.utf8_symbols.find(char)
            if label == -1:
                label = self.utf8_symbols.find('0x%04x' % ord(char))
            final_state = position + 1
            compiler.write(
                "{} {} {} {}\n".format(position, final_state, label, label))

        compiler.write("{}\n\n".format(final_state))

        # need to convert to pynini-fst for the combination with the grammar fst
        return pn.Fst.from_pywrapfst(compiler.compile())
    def build_automa(self, features_file, lex_in, lex_out):
        """Build a one-state weighted transducer mapping tokens to labels.

        For every observed ``(token, label)`` pair a self-loop on state 0 is
        created with weight ``-log(count(token, label) / count(label))``.
        For every label without an observed ``<unk>`` pair, an ``<unk>``
        arc with weight ``-log(1 / number_of_distinct_labels)`` is added.

        The textual arc list is also written to ``bin/automa.txt``.

        Args:
            features_file: passed to ``self.load_data``, which is expected
                to return ``(labels, tokens)``.
            lex_in: input symbol table; also passed to the token filter.
            lex_out: output symbol table.

        Returns:
            The compiled automaton.
        """
        labels, tokens = self.load_data(features_file)
        token_label_pairs = [
            self.__filter(token, lex_in) + " " + labels[idx]
            for idx, token in enumerate(tokens)
        ]
        labels_counter = Counter(labels)
        pair_counter = Counter(token_label_pairs)

        # Negative log of the relative frequency of the token given the label.
        costs = {}
        for pair, count in pair_counter.items():
            _token, label = pair.split(" ")
            costs[pair] = -math.log(
                float(count) / float(labels_counter[label]))

        # Uniform fallback for unknown tokens, one entry per label.
        for label in labels_counter:
            key = "<unk> " + label
            if key not in costs:
                costs[key] = -math.log(1 / float(len(labels_counter)))

        arc_lines = []
        for pair, prob in costs.items():
            token, label = pair.split(" ")
            arc_lines.append("0 0 {0} {1} {2}\n".format(token, label, str(prob)))
        corpus = "".join(arc_lines)

        compiler = fst.Compiler(isymbols=lex_in, osymbols=lex_out)
        # Explicit writes instead of the Python-2-only ``print >>`` form.
        compiler.write(corpus)
        # Final-state line; the previous literal "0\0" embedded a NUL byte.
        compiler.write("0\n")
        with open("bin/automa.txt", "w") as out_file:
            out_file.write(corpus)
        return compiler.compile()
Exemplo n.º 11
0
def buildNgramCounter(wmap,maxngram=1):
    ''' Build an n-gram counting transducer
    @wmap: file containing the vocabulary, one label per line (blank lines
           are ignored).  The labels are compiled without a symbol table,
           so they are presumably integer ids -- confirm with the caller.
    @maxngram: maximum order of the grams to be counted (>= 1)

    For every order ``k`` in ``1..maxngram`` the text description is written
    to ``counter<k>``, compiled, and stored as ``counter<k>.fst``.  The
    union of all orders is epsilon-removed, arc-sorted, mapped with
    ``to_log64``, written to ``ngramCounter.fst`` and returned.

    Raises ValueError if ``maxngram`` is smaller than 1.
    '''
    if maxngram < 1:
        raise ValueError("maxngram must be >= 1, got %r" % (maxngram,))

    with open(wmap, 'r') as infile:
        stripped = (raw.strip() for raw in infile)
        vocab = [word for word in stripped if word]

    compiler = fst.Compiler()
    counters = []
    for order in range(1, maxngram + 1):
        filename = 'counter' + str(order)
        lines = []
        for word in vocab:
            # Skip any prefix: self-loop on the initial state, output 0.
            lines.append("0 0 %s 0\n" % word)
            # Copy ``order`` consecutive symbols to the output.
            for state in range(order):
                lines.append("%d %d %s %s\n" % (state, state + 1, word, word))
            # Skip any suffix: self-loop on the last state, output 0.
            lines.append("%d %d %s 0\n" % (order, order, word))
        lines.append("%d\n" % order)  # state ``order`` is final

        with open(filename, 'w') as outfile:
            outfile.writelines(lines)
        for line in lines:
            compiler.write(line)
        counter = compiler.compile()
        counter.write(filename + ".fst")
        counters.append(counter)

    # Union every order; the previous code indexed counters[0..3] and thus
    # only worked for maxngram == 4.
    merged = counters[0]
    for counter in counters[1:]:
        merged.union(counter)
    tmp = merged.rmepsilon().arcsort()
    # tmp = fst.determinize(tmp).minimize()
    ngramCounter = fst.arcmap(tmp,map_type='to_log64',delta=0.0000001)
    ngramCounter.write("ngramCounter.fst")
    return ngramCounter
Exemplo n.º 12
0
def linear_fst(
    elements: List[str],
    automata_op: fst.Fst,
    keep_isymbols: bool = True,
    **kwargs: Mapping[Any, Any],
) -> fst.Fst:
    """Compile ``elements`` into a linear automaton.

    One arc ``i -> i + 1`` is emitted per element, and the state after the
    last element is final.  The compiler uses a copy of
    ``automata_op``'s input symbol table; ``keep_isymbols`` is passed both
    as ``keep_isymbols`` and as ``acceptor``, and ``kwargs`` are forwarded
    to ``fst.Compiler``.

    Raises:
        AssertionError: if ``elements`` is empty.
    """
    assert len(elements) > 0, "No elements"
    compiler = fst.Compiler(
        isymbols=automata_op.input_symbols().copy(),
        acceptor=keep_isymbols,
        keep_isymbols=keep_isymbols,
        **kwargs,
    )

    # After the loop ``final_state`` equals the number of elements written.
    final_state = 0
    for final_state, symbol in enumerate(elements, start=1):
        arc = "{} {} {}".format(final_state - 1, final_state, symbol)
        print(arc, file=compiler)

    print(str(final_state), file=compiler)
    return compiler.compile()
Exemplo n.º 13
0
def alternatives(sequence):
    """Return the n-best alternatives for a word sequence and their scores.

    ``sequence`` is compiled into a linear FST with one arc per character
    and a ``</w>`` arc after every word.  It is then composed with the
    module-level ``grapheme_confusion`` and ``fst_vocab`` machines, and the
    ``n_best`` shortest paths are read back with ``printstrings``.

    Args:
        sequence: list of words (each word an iterable of symbols from
            ``printable_ST``).

    Returns:
        ``(alters, scores)``.  When ``printstrings`` returns a non-empty
        result, ``alters`` is a list of word lists and ``scores`` the
        matching list of floats; otherwise ``alters`` is that (empty)
        result and ``scores`` is ``[]``.
    """
    # Build FST
    compiler_sequence = fst.Compiler(isymbols=printable_ST,
                                     osymbols=printable_ST,
                                     keep_isymbols=True,
                                     keep_osymbols=True)
    # Explicit ``write`` calls replace the Python-2-only ``print >>`` form.
    c = 0
    for word in sequence:
        for char in word:
            compiler_sequence.write('%d %d %s %s\n' % (c, c + 1, char, char))
            c += 1
        compiler_sequence.write('%d %d </w> </w>\n' % (c, c + 1))
        c += 1
    compiler_sequence.write('%d\n' % c)
    fst_sequence = compiler_sequence.compile()
    fst_sequence = fst_sequence.set_input_symbols(printable_ST)
    fst_sequence = fst_sequence.set_output_symbols(printable_ST)

    composition = fst.compose(fst_vocab,
                              fst.compose(grapheme_confusion,
                                          fst_sequence)).rmepsilon().arcsort()
    # composition.prune(weight = 3)
    alters = printstrings(composition,
                          nshortest=n_best,
                          syms=printable_ST,
                          weight=True)
    scores = []
    if alters:
        # Function-call form prints the same text on Python 2 and 3.
        print(alters)
        scores = [float(alt[1]) for alt in alters]
        # Split each path at the word-boundary marker, then glue the
        # space-separated symbols of every word back together.
        alters = [alt[0].split(' </w>')[:-1] for alt in alters]
        alters = [[''.join(word.split(' ')) for word in alt] for alt in alters]
    return alters, scores
Exemplo n.º 14
0
def make_t_lattice_SIMP2(dist, ind):
    '''
    Build a "sausage" lattice with one segment per frame.

    For frame ``i`` and candidate ``j`` an arc ``i -> i + 1`` is created
    with input and output label ``ind[i, j] + 1`` and weight ``dist[i, j]``.
    The state after the last frame is final (state 0 if there are no
    frames).

    dist: 2-D array (frames x candidates) of arc weights
    ind:  2-D array of the same shape holding the candidate indexes

    Returns the compiled FST, arc-sorted on output labels.
    '''
    # The arc descriptions are collected as text lines.  (The previous
    # revision called ``.append`` on an ``openfst.Fst`` object here.)
    lines = []
    start = 0

    frames, cands = np.shape(dist)

    for i in range(frames):
        end = start + 1
        for j in range(cands):
            frame_ix = ind[i, j]
            weight = dist[i, j]
            lines.append('%s %s %s %s %s' %
                         (start, end, frame_ix + 1, frame_ix + 1, weight))
        start = end
    # ``start`` now names the last state reached; it is defined even when
    # the loop body never ran.
    lines.append('%s' % (start))

    compiler = openfst.Compiler()
    for line in lines:
        # Portable replacement for the Python-2-only ``print >> compiler``.
        compiler.write(line + '\n')
    f = compiler.compile()
    f.arcsort(st="olabel")
    return f
Exemplo n.º 15
0
 def write_hypos(self, all_hypos, sen_indices):
     """Write one standard-arc FST file per n-best list.

     Every hypothesis becomes its own path from state 0 to state 1, so
     the lattices are not optimized in any way.  Determinize/minimize
     them if you plan to process them further.

     Args:
         all_hypos (list): list of nbest lists of hypotheses
         sen_indices (list): List of sentence indices (0-indexed)

     Raises:
         OSError. If the directory could not be created
         IOError. If something goes wrong while writing to the disk
     """
     _mkdir(self.path, "FST")
     for sen_idx, hypos in zip(sen_indices, all_hypos):
         c = fst.Compiler()
         # States 0 (start) and 1 (final) are reserved.
         next_state = 2
         for hypo in hypos:
             tail = next_state
             next_state += 1
             # Arc out of the start state carries the negated total score.
             c.write("0\t%d\t%d\t%d\t%f\n" %
                     (tail, utils.GO_ID, utils.GO_ID, -hypo.total_score))
             for sym in hypo.trgt_sentence:
                 c.write("%d\t%d\t%d\t%d\n" % (tail, next_state, sym, sym))
                 tail = next_state
                 next_state += 1
             # Close the path into the shared final state.
             c.write("%d\t1\t%d\t%d\n" %
                     (tail, utils.EOS_ID, utils.EOS_ID))
         c.write("1\n")
         # File names use 1-based sentence numbers.
         c.compile().write(self.file_pattern % (sen_idx + 1))
Exemplo n.º 16
0
def process_line(line):
    """Decode one whitespace-tokenised line through ``tm`` and ``lm``.

    The tokens (plus a trailing ``</s>``) are mapped through the
    module-level ``isym`` table, with unseen tokens mapped to ``<unk>``,
    and compiled into a linear FST.  That FST is composed with the
    module-level ``tm`` and ``lm`` machines (presumably a translation model
    and a language model) and the shortest path is read back through
    ``osym``.

    Args:
        line: input sentence as a single string.

    Returns:
        The output tokens joined by spaces.  ``<unk>`` outputs are replaced
        by the unknown input words in their original order.
    """
    # Read input
    compiler = fst.Compiler()
    arr = line.strip().split() + ["</s>"]
    unks = []
    for i, x in enumerate(arr):
        if x in isym:
            xsym = isym[x]
        else:
            unks.append(x)
            xsym = isym["<unk>"]
        # ``write`` replaces the Python-2-only ``print >> compiler`` form.
        compiler.write("%d %d %s %s\n" % (i, i + 1, xsym, xsym))
    compiler.write("%s\n" % (len(arr)))
    ifst = compiler.compile()

    # Create the search graph and do search
    graph = fst.compose(ifst, tm)
    graph = fst.compose(graph, lm)
    graph = fst.shortestpath(graph)

    # Read off the output
    out = []
    unkspot = 0
    for state in graph.states():
        for arc in graph.arcs(state):
            if arc.olabel != 0:
                tok = osym[arc.olabel]
                # unk substitution (original words in same order)
                if unkspot < len(unks) and tok == "<unk>":
                    out.append(unks[unkspot])
                    unkspot += 1
                else:
                    out.append(tok)
    # The tokens are collected in reverse order; the first collected token
    # is dropped before reversing.
    return " ".join(reversed(out[1:]))
Exemplo n.º 17
0
        c.write("%d\t%d\t%d\t%d\t0,1,%f\n" % (
            out_root,
            arc.nextstate,
            arc.ilabel,
            arc.olabel,
            w2f(arc.weight)))
        dfs(arc.nextstate, hist + [str(arc.ilabel)])

idx = 0
while True:
    idx += 1
    input_path = get_path(args.input, idx)
    if not input_path or not os.path.isfile(input_path):
        break
    lat = fst.Fst.read(input_path)
    c = fst.Compiler(arc_type="tropicalsparsetuple")
    hist2node = {}
    visited = {}
    dfs(lat.start(), [])
    # Add context states
    next_context_id = 20000000
    for key,cluster in hist2node.iteritems():
        hist_len = len(key.split())
        if len(cluster) < 2:
            continue # We don't need this context
        elif len(cluster) == 2: # Directly connect both nodes
            c.write("%d\t%d\t%d\t%d\t0,%d,1.0\n" % (
                cluster[0], out_state(cluster[1]), 0, 0, hist_len+1))
            c.write("%d\t%d\t%d\t%d\t0,%d,1.0\n" % (
                cluster[1], out_state(cluster[0]), 0, 0, hist_len+1))
        else: # Introduce a context node
Exemplo n.º 18
0
from fststr import fststr
import pywrapfst as fst

# Init FST: compile the e-insertion rules over the English symbol table.
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st,
                        osymbols=st,
                        keep_isymbols=True,
                        keep_osymbols=True)
# Read the rule file through a context manager so the handle is closed.
with open('e-insertion.txt') as rule_file:
    fst_file = rule_file.read()
print(fst_file, file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Test FST
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
Exemplo n.º 19
0
    def _get_basic_tag_fst(self, text_arr):
        """Compile ``text_arr`` into an FST, using tags where present.

        A token of the form ``word_tag`` contributes its tag (the part
        after the first underscore) to the FST; the word is stored in
        ``self.replacement_dict[position][tag]`` so the utterance can be
        reconstructed later.  Tokens without an underscore are used as-is.

        Args:
            text_arr: iterable of token collections.  Empty entries are
                skipped; an entry with one token adds a single arc; an
                entry with several tokens adds parallel arcs sharing the
                same source and target state.

        Returns:
            The FST produced by ``compiler.compile()``.

        Side effects:
            Resets ``self.oov_dict`` and ``self.replacement_dict``.
        """
        self.oov_dict = {}
        self.replacement_dict = {}
        compiler = fst.Compiler()
        current = 0
        # Non-zero only right after a block of alternatives: the target
        # state the next single token has to connect to.
        pending_target = 0

        def tag_of(position, token):
            # "word_tag" -> remember tag -> word for this position and
            # continue with the tag; anything else passes through.
            if '_' not in token:
                return token
            word, _, tag = token.partition('_')
            self.replacement_dict.setdefault(position, {})[tag] = word
            return tag

        for position, tokens in enumerate(text_arr):
            if not tokens:
                continue

            if len(tokens) != 1:
                # Several verbalizations: all arcs run source -> target.
                source, target = current, current + 1
                for token in tokens:
                    token = tag_of(position, token)
                    label = self._get_int_value_word(token, position)
                    self._compile_entry(compiler, source, label, target)
                pending_target = target + 1
                current = target
                continue

            # Single token, single expansion.
            # TODO: an earlier draft special-cased 'og' (tag 'c') here; a
            # better solution would be to tag it in the grammar.
            for token in tokens:
                token = tag_of(position, token)
                label = self._get_int_value_word(token, position)
                source = current
                if pending_target:
                    target = pending_target
                    current = pending_target - 1
                    pending_target = 0
                else:
                    target = current + 1
                self._compile_entry(compiler, source, label, target)
                current += 1

        # The last state reached is the final state.
        compiler.write("{}\n\n".format(current))
        return compiler.compile()
    def buildpostProcessFST(self, input_str):
        """Build the post-processing FST for ``input_str``.

        The result is the union of
          * an FST whose only content is the final state 0,
          * every ``FST_post_*`` text file found in the current directory,
          * a linear FST that copies ``input_str`` up to (not including)
            the first ``+`` and rewrites ``<#>`` to ``+Guess``,
        composed with the FST described in ``FST_finalclearance.txt``.

        Args:
            input_str: the word to post-process.

        Returns:
            The composed FST.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        def compile_text(text):
            # Compile an AT&T-format description over ``st`` and expand
            # its <other> placeholders.
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            print(text, file=compiler)
            compiled = compiler.compile()
            fststr.expand_other_symbols(compiled)
            return compiled

        def read_text(path):
            # Context manager so the file handle is closed promptly.
            with open(path) as handle:
                return handle.read()

        # initialize a FSTpost
        initFSTpost = compile_text('0\n')

        # compile the FST_post_* txt files and union them into initFSTpost
        post_files = [
            filename for filename in os.listdir('.')
            if filename.startswith("FST_post_")
        ]
        for f in post_files:
            initFSTpost = initFSTpost.union(compile_text(read_text(f)))

        # FST that takes care of the input in its original form: copy the
        # characters one by one, stopping at the first '+'.
        lines = []
        tracker = 0
        guess_arc = '{} {} <#> +Guess\n{}\n'
        for char in input_str:
            if char == '+':
                lines.append(guess_arc.format(tracker, tracker + 1,
                                              tracker + 1))
                tracker += 1
                break
            lines.append('{} {} {} {}\n'.format(tracker, tracker + 1,
                                                char, char))
            tracker += 1
        # take care of <#> in the end, change it to +Guess
        lines.append(guess_arc.format(tracker, tracker + 1, tracker + 1))
        original_case_FST = compile_text(''.join(lines))
        initFSTpost = initFSTpost.union(original_case_FST)

        # Last FST, clear out any word ends with <#>, output words ends
        # with +Guess and +Known
        clear_FST = compile_text(read_text('FST_finalclearance.txt'))
        lastFST = fst.compose(initFSTpost.arcsort(sort_type="olabel"),
                              clear_FST.arcsort(sort_type="ilabel"))

        return lastFST
Exemplo n.º 21
0
 def __init__(self):
     """Create the instance's private ``pywrapfst.Compiler``."""
     self._compiler = pywrapfst.Compiler()
Exemplo n.º 22
0
    def get_in_vocab_fst(self):
        """Build the FST that maps known verb forms to ``lemma+Known``.

        ``in_vocab_dictionary_verbs.txt`` is read as comma-separated lines
        whose first field is the lemma and whose second field is the
        inflected form.  Lines with fewer than two fields are skipped.
        For every entry a transducer is built that rewrites the form
        character by character into the lemma (padding the shorter side
        with ``<epsilon>``) and appends ``+Known``; it is composed with a
        pre-filter that drops the trailing ``<#>`` and the result is
        unioned into the returned FST.

        Returns:
            The union FST over the ``fststr.EN_SYMB`` alphabet.
        """
        alphabet = fststr.EN_SYMB
        st = fststr.symbols_table_from_alphabet(
            alphabet)  # <class 'pywrapfst.SymbolTable'>

        def new_compiler():
            return fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)

        in_vocab_fst = new_compiler().compile()

        entries = []
        with open('in_vocab_dictionary_verbs.txt', 'r') as f:
            for line in f:
                # Drop only the line terminator so that a form in the last
                # column does not carry a newline into the arc description.
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                entries.append((fields[0], fields[1]))

        # Pre-filter: copy everything and delete the word-end marker.  It
        # is the same for every entry, so it is built and sorted once.
        compiler = new_compiler()
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        a_fst = compiler.compile()
        fststr.expand_other_symbols(a_fst)
        a_fst = a_fst.arcsort(sort_type="olabel")

        for lemma_word, form_word in entries:
            form_len = len(form_word)
            lemma_len = len(lemma_word)
            length = max(form_len, lemma_len)

            compiler = new_compiler()
            for i in range(length):
                in_sym = form_word[i] if i < form_len else '<epsilon>'
                out_sym = lemma_word[i] if i < lemma_len else '<epsilon>'
                compiler.write('{} {} {} {}\n'.format(i, i + 1,
                                                      in_sym, out_sym))
            compiler.write('{} {} <epsilon> +Known\n'.format(length,
                                                             length + 1))
            compiler.write(str(length + 1))
            b_fst = compiler.compile()
            fststr.expand_other_symbols(b_fst)

            c_fst = fst.compose(a_fst, b_fst.arcsort(sort_type="ilabel"))
            in_vocab_fst.union(c_fst)

        return in_vocab_fst
Exemplo n.º 23
0
 def generateFst(self, data, st):
     """Build the union of one small transducer per ``stem,inflection`` line.

     Each line of ``data`` is split on commas; the first field is the
     stem and the second the inflected form.  For every usable line a
     transducer is compiled that reads the inflected form followed by
     ``<#>`` and writes the stem followed by ``+Known`` (surplus
     characters of the inflected form are mapped to ``<epsilon>``).

     Processing stops at the first line whose stem is empty.  Lines
     without a second field, and lines whose stem is longer than the
     inflected form, are skipped.

     Args:
         data: newline-separated ``stem,inflection[,...]`` records.
         st: symbol table used for both input and output symbols.

     Returns:
         The union of all compiled transducers, or ``None`` when no line
         produced one.
     """
     rootFst = None
     for line in data.split("\n"):
         fields = line.split(",")
         stem = fields[0]
         if stem == "":
             # An empty stem (e.g. the trailing blank line) ends the data.
             break
         if len(fields) < 2:
             continue
         inf = fields[1]
         if len(stem) > len(inf):
             continue

         entry = [
             "0\n",  # 0 as final state
             "0 0 <other> <other>\n",
         ]
         # Characters shared position-wise: inflection in, stem out.
         for i, stem_char in enumerate(stem):
             entry.append("%d %d %s %s\n" % (i, i + 1, inf[i], stem_char))
         # Remaining characters of the inflection are deleted.
         state = len(stem)
         for extra in inf[len(stem):]:
             entry.append("%d %d %s <epsilon>\n" % (state, state + 1, extra))
             state += 1
         # ``state`` is the last state reached, so the closing arc is
         # connected even when stem and inflection have equal length.
         entry.append("%d 0 <#> +Known" % state)

         compiler = fst.Compiler(isymbols=st,
                                 osymbols=st,
                                 keep_isymbols=True,
                                 keep_osymbols=True)
         compiler.write("".join(entry))
         other = compiler.compile()
         fststr.expand_other_symbols(other)
         if rootFst is None:
             rootFst = other
         else:
             rootFst = rootFst.union(other)
     return rootFst
Exemplo n.º 24
0
def get_morph_fst(name):
    """Load a stored morphology FST together with a matching compiler.

    The symbol table is read from ``MORPH_PATH + 'symbols.txt'`` and the FST
    from ``MORPH_PATH + name.lower() + '.fst'``.

    Returns a 3-tuple ``(fst, compiler, symbols)`` where ``compiler`` is an
    acceptor compiler that uses the symbol table for both input and output.
    """
    symbol_table = ofst.SymbolTable.read_text(MORPH_PATH + 'symbols.txt')
    fst_file = MORPH_PATH + name.lower() + '.fst'
    morph_fst = ofst.Fst.read(fst_file)
    acceptor_compiler = ofst.Compiler(isymbols=symbol_table,
                                      osymbols=symbol_table,
                                      acceptor=True)
    return morph_fst, acceptor_compiler, symbol_table
Exemplo n.º 25
0
    def getAlloFST(self):
        """Build the allomorphy FST from the hard-coded spelling rules.

        Each rule (y-replacement, k-insertion, e-deletion, e-insertion after
        ch / s, sh / x, z, and consonant doubling) is written in the textual
        FST format, compiled against ``self.st`` and has its ``<other>`` arcs
        expanded.  The compiled rules are then merged with nested ``union``
        calls into an FST compiled from empty text, which is returned.
        """

        def compile_rule(text):
            # Compile one textual rule and expand its <other> placeholder arcs.
            rule_compiler = fst.Compiler(isymbols=self.st,
                                         osymbols=self.st,
                                         keep_isymbols=True,
                                         keep_osymbols=True)
            rule_compiler.write(text)
            machine = rule_compiler.compile()
            fststr.expand_other_symbols(machine)
            return machine

        # Seed FST compiled from empty text; the rules are unioned into it.
        seed_compiler = fst.Compiler(isymbols=self.st,
                                     osymbols=self.st,
                                     keep_isymbols=True,
                                     keep_osymbols=True)
        seed_compiler.write('')
        combined = seed_compiler.compile()

        # ---- fixed rule texts (kept verbatim) -------------------------------
        y_repl_text = '0 \n0 0 <other> <other> \n0 1 i <epsilon> \n1 2 e <epsilon> \n1 12 <^> <epsilon> \n1 0 <epsilon> i \n2 3 <^> <epsilon> \n2 9 <epsilon> i \n3 4 s <epsilon> \n3 10 <epsilon> i \n4 5 <#> y \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n12 19 <epsilon> i \n13 14 d <epsilon> \n13 20 <epsilon> i \n14 15 <#> y \n15 16 <epsilon> <^> \n16 17 <epsilon> e \n17 18 <epsilon> d \n18 8 <epsilon> <^> \n19 0 <epsilon> <^> \n20 21 <epsilon> <^> \n21 0 <epsilon> e'
        k_ins_text = '\n0 \n0 0 <other> <other> \n0 1 c c \n1 0 <other> <other> \n1 2 k <epsilon> \n2 0 <epsilon> k \n2 3 <^> <epsilon> \n3 4 i <epsilon> \n 3 10 e <epsilon> \n3 14 <epsilon> k \n4 5 n <^> \n5 6 g i \n6 7 <#> n \n7 8 <epsilon> g \n8 9 <epsilon> <#> \n9 \n10 11 d <^> \n11 12 <#> e \n12 8 <epsilon> d \n14 0 <epsilon> <^>'
        e_del_text = '0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n1 2 i <epsilon> \n1 11 e <epsilon> \n2 3 n <epsilon> \n3 4 g <epsilon> \n4 5 <#> e \n5 6 <epsilon> <^> \n6 7 <epsilon> i \n7 8 <epsilon> n \n8 9 <epsilon> g \n9 10 <epsilon> <#> \n10 \n11 12 d <epsilon> \n11 16 <epsilon> e \n12 13 <#> e \n13 14 <epsilon> <^> \n14 15 <epsilon> e \n15 9 <epsilon> d \n16 0 <epsilon> <^>'
        e_ins_ch_text = (
            '0 \n0 0 <other> <other> \n0 1 c <epsilon> \n1 2 h <epsilon> \n1 0 <epsilon> c \n2 3 e <epsilon> \n2 11 <epsilon> c \n3 4 <^> <epsilon> '
            '\n3 12 <epsilon> c \n4 5 s <epsilon> \n4 12 <epsilon> c \n5 6 <#> c \n6 7 <epsilon> h \n7 8 <epsilon> <^> \n8 9 <epsilon> s \n9 10 <epsilon> <#> \n10 \n10 10 <other> <other> \n11 0 <epsilon> h '
            '\n12 13 <epsilon> h \n13 0 <epsilon> e \n14 15 <epsilon> h \n15 16 <epsilon> e \n16 0 <epsilon> <^>'
        )
        e_ins_s_text = '\n0 \n0 0 <other> <other> \n0 1 s <epsilon> \n1 2 e <epsilon> \n1 12 h <epsilon> \n1 0 <epsilon> s \n2 3 <^> <epsilon> \n2 9 <epsilon> s \n3 4 s <epsilon> \n3 10 <epsilon> s \n4 5 <#> s \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n 12 20 <epsilon> s \n13 14 <^> <epsilon> \n13 21 <epsilon> s \n14 15 s <epsilon> \n14 23 <epsilon> s \n15 16 <#> s \n16 17 <epsilon> h \n17 18 <epsilon> <^> \n18 19 <epsilon> s \n19 8 <epsilon> <#> \n20 0 <epsilon> h \n21 22 <epsilon> h \n22 0 <epsilon> e \n23 24 <epsilon> h \n24 25 <epsilon> e \n25 0 <epsilon> <^>'

        # ---- e-insertion after x / z ----------------------------------------
        # One copy of the sub-machine per letter; the copy for the letter at
        # position ``pos`` uses states 11 * pos + 1 .. 11 * pos + 11.
        xz_template = (
            '\n0 {s[1]} {c} <epsilon> '
            '\n{s[1]} {s[2]} e <epsilon> '
            '\n{s[1]} 0 <epsilon> {c}'
            '\n{s[2]} {s[3]} <^> <epsilon> '
            '\n{s[2]} {s[9]} <epsilon> {c} '
            '\n{s[3]} {s[4]} s <epsilon> '
            '\n{s[4]} {s[5]} <#> {c} '
            '\n{s[3]} {s[10]} <epsilon> {c} '
            '\n{s[5]} {s[6]} <epsilon> <^> '
            '\n{s[6]} {s[7]} <epsilon> s '
            '\n{s[7]} {s[8]} <epsilon> <#> '
            '\n{s[8]} '
            '\n{s[8]} {s[8]} <other> <other> '
            '\n{s[9]} 0 <epsilon> e '
            '\n{s[10]} {s[11]} <epsilon> e '
            '\n{s[11]} 0 <epsilon> <^>'
        )
        xz_pieces = ['\n0 \n0 0 <other> <other> ']
        for pos, letter in enumerate('xz'):
            states = [11 * pos + offset for offset in range(12)]
            xz_pieces.append(xz_template.format(s=states, c=letter))
        e_ins_xz_text = ''.join(xz_pieces)

        # Compile the three e-insertion variants and merge them.
        ins_ch = compile_rule(e_ins_ch_text)
        ins_s = compile_rule(e_ins_s_text)
        ins_xz = compile_rule(e_ins_xz_text)
        e_ins = ins_ch.union(ins_s.union(ins_xz))

        # ---- consonant doubling ---------------------------------------------
        # States 0-2 are shared; the consonant at position ``pos`` gets its
        # own branch on states 8 * pos + 3 .. 8 * pos + 10.
        doubling_template = (
            '\n1 {s[3]} {c} {c}'
            '\n{s[3]} {s[4]} {c} <epsilon>'
            '\n{s[3]} 0 <other> <other>'
            '\n{s[4]} {s[5]} <^> <^>'
            '\n{s[5]} {s[6]} i i'
            '\n{s[5]} {s[9]} e e'
            '\n{s[5]} 0 <other> <other>'
            '\n{s[6]} {s[7]} n n'
            '\n{s[7]} {s[8]} g g'
            '\n{s[8]} 2 <#> <#>'
            '\n{s[9]} {s[10]} d d'
            '\n{s[9]} 0 <other> <other>'
            '\n{s[10]} 2 <#> <#>'
        )
        doubling_pieces = [
            '\n0 \n0 1 a a \n0 1 e e \n0 1 i i \n0 1 o o \n0 1 u u \n0 1 y y \n0 0 <other> <other>',
            '\n1 0 a a \n1 0 e e \n1 0 i i \n1 0 o o \n1 0 u u \n1 0 y y',
            '\n2 \n2 2 <other> <other>',
        ]
        for pos, letter in enumerate('bcdfghjklmnpqrstvwxz'):
            states = [8 * pos + offset for offset in range(11)]
            doubling_pieces.append(
                doubling_template.format(s=states, c=letter))
        cons_doub_text = ''.join(doubling_pieces)

        # Compile the remaining rules.
        y_repl = compile_rule(y_repl_text)
        k_ins = compile_rule(k_ins_text)
        e_del = compile_rule(e_del_text)
        cons_doub = compile_rule(cons_doub_text)

        # NOTE(review): the result of this union call is discarded, so the
        # returned FST only contains the rules if ``union`` updates its
        # receiver in place -- confirm against the FST library in use.
        combined.union(
            y_repl.union(
                k_ins.union(e_ins.union(e_del.union(cons_doub)))))
        fststr.expand_other_symbols(combined)
        return combined
Exemplo n.º 26
0
  offset = 10000 # Something larger than the longest sequence
  offset2 = 2 * offset
  for idx, t in enumerate(terminals):
    state_id = idx + 2
    fst_arc(c, state_id - 1, state_id, t)
    fst_arc(c, state_id - 1 + offset2, state_id, t)
    for nt in closing_non_terminals:  # Self loop
      fst_arc(c, state_id, state_id + offset, nt)
      fst_arc(c, state_id + offset, state_id + offset, nt)
    if idx < len(terminals) - 1: # No opening at last position
      for nt in opening_non_terminals:
        fst_arc(c, state_id + offset, state_id + offset2, nt)
        fst_arc(c, state_id + offset2, state_id + offset2, nt)
  fst_arc(c, state_id, state_id + 1, args.eos_id)
  fst_arc(c, state_id + offset, state_id + 1, args.eos_id)
  c.write("%d\n" % (state_id + 1,))
  

# Driver loop: every line read from stdin is parsed as a whitespace-separated
# list of integers (``terminals``) and turned into one FST file.
for line_idx, line in enumerate(sys.stdin):
  terminals = [int(i) for i in line.strip().split()]
  # A new compiler is created for each input line.
  c = fst.Compiler()
  # Debug with sys.stdout rather than c
  # ``args.format`` picks the construction routine; any value other than the
  # two layer-by-layer variants falls through to the flat construction.
  if args.format == 'layerbylayer':
    construct_layerbylayer_fst(c, non_terminals, terminals)
  elif args.format == 'layerbylayer_pop':
    construct_layerbylayer_fst(c, non_terminals, terminals, pop_id)
  else: # flat_*
    construct_flat_fst(c, closing_non_terminals, opening_non_terminals, terminals)
  f = c.compile()
  # Output files are numbered from 1: <output_dir>/<line_idx + 1>.fst
  f.write("%s/%d.fst" % (args.output_dir, line_idx + 1))
Exemplo n.º 27
0
# Rename sil by <eps> in those tables
index['<eps>'] = index['sil']
del index['sil']

# Create the symbol table: '<eps>' is added first, then every other key of
# ``index`` in iteration order.
printable_ST = fst.SymbolTable()
printable_ST.add_symbol('<eps>')
for c in index.keys():
    if c != '<eps>':
        printable_ST.add_symbol(c)
# Save the symbol table
printable_ST.write_text('FSTs/symbol_table.txt')

# Build a sigma FST: a single state (0, also final) with one self-loop per
# symbol that reads the symbol and outputs <eps>.
compiler = fst.Compiler(isymbols=printable_ST,
                        osymbols=printable_ST,
                        keep_isymbols=True,
                        keep_osymbols=True)
# The arc text is sent through compiler.write() rather than the Python 2
# ``print >> compiler, ...`` statement: that form parses on Python 3 but
# raises TypeError at run time. The explicit '\n' reproduces the newline
# that ``print`` appended, so the text given to the compiler is unchanged.
for c in index.keys():
    compiler.write('0 0 %s <eps>\n' % c)
compiler.write('0\n')
compiler.compile().write('FSTs/sigma.fst')

# Build an FST which corrects the errors: maps <estim> to <truth> with probability confusion_matrix[<truth>][<estim>]
compiler = fst.Compiler(isymbols=printable_ST,
                        osymbols=printable_ST,
                        keep_isymbols=True,
                        keep_osymbols=True)
for truth in index.keys():  #[0:4]:
    for estim in index.keys():  #[0:4]:
        score = confusion_matrix[index[truth]][index[estim]]
        # score = confusion_matrix[index[estim]][index[truth]]
Exemplo n.º 28
0
class Lemmatizer:
    """FST-based verb lemmatizer / delemmatizer.

    All transducers are built once, while the class body executes:

    * ``rule`` starts as the union of one path per dictionary entry read from
      ``in_vocab_dictionary_verbs.txt`` (second field on the input side,
      first field followed by ``+Known`` on the output side) and is then
      unioned with ``oov_rule``, the inverse of the guessing pipeline
      (morphotactics composed with the spelling rules and ``del_sharp``).
    * ``de_rule`` is the union of the inverted dictionary FST and the
      un-inverted guessing pipeline.

    NOTE: the builder functions below deliberately take no ``self``; they are
    plain functions called from the class body, not instance methods.  They
    cannot call one another because class-level names are not visible from
    inside a function body, which is why each one sets up its own compiler.
    """

    def e_insertion():
        """Compile the rule text in ``e-insertion.txt`` and expand ``<other>``."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        # ``with`` closes the rule file instead of leaking the handle.
        with open('e-insertion.txt') as rule_file:
            print(rule_file.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def k_insertion():
        """Compile the rule text in ``k-insertion.txt`` and expand ``<other>``."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        with open('k-insertion.txt') as rule_file:
            print(rule_file.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def get_morphotactics():
        """Union one FST per suffix in ``['', 's', 'ed', 'en', 'ing']``.

        Each suffix FST copies ``<other>`` symbols, rewrites ``+Guess`` to
        ``<^>``, emits the suffix letters on the output side and finally
        emits ``<#>``.
        """
        suffix = ['', 's', 'ed', 'en', 'ing']
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        # Nothing is written to this compiler; it only provides the FST the
        # per-suffix machines are unioned into.
        c = compiler.compile()

        for s in suffix:
            compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
            compiler.write('0 0 <other> <other>\n')
            compiler.write('0 1 +Guess <^>\n')
            for i, letter in enumerate(s):
                compiler.write('%d %d <epsilon> %s\n' % (i + 1, i + 2, letter))
            l = len(s)
            compiler.write('%d %d <epsilon> <#>\n' % (l + 1, l + 2))
            # Final state (written without a trailing newline, as before).
            compiler.write(str(l + 2))
            suffix_rule = compiler.compile()
            c = c.union(suffix_rule)
        fststr.expand_other_symbols(c)
        return c

    def general():
        """Build the default rule: copy everything, dropping ``<^>`` markers."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        compiler.write('0\n')
        # for special cases
        # compiler.write('0 1 i i\n')
        # compiler.write('1 2 n n\n')
        # compiler.write('2 3 g g\n')
        # compiler.write('3 4 <^> <^>\n')

        compiler.write('0 1 <^> <epsilon>\n')
        compiler.write('1 0 <other> <other>\n')
        compiler.write('1 0 <#> <#>\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def e_deletion():
        """Compile the rule text in ``silent-e-deletion.txt`` and expand ``<other>``."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        with open('silent-e-deletion.txt') as rule_file:
            print(rule_file.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def ch_sh_e_insertion():
        """Compile the rule text in ``ch_sh_e_insertion.txt`` and expand ``<other>``."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        with open('ch_sh_e_insertion.txt') as rule_file:
            print(rule_file.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def y_replacement():
        """Compile the rule text in ``y_replacement.txt`` and expand ``<other>``."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        with open('y_replacement.txt') as rule_file:
            print(rule_file.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def del_sharp():
        """Build an FST that copies its input and deletes a ``<#>`` marker."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def consonant_doubling():
        """Build the consonant-doubling rule for the ``ed`` / ``ing`` endings.

        State layout: 0 start/final, 1 after a consonant, 2 after a vowel,
        ``i + 3`` after the i-th consonant following a vowel, and the states
        from ``len(consonant) + 3`` upwards spell out ``ed<#>`` and
        ``ing<#>``.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z']
        vowel = ['a', 'e', 'i', 'o', 'u']
        n = len(consonant)
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        for v in vowel:
            compiler.write('0 2 %s %s\n' % (v, v))
        for c in consonant:
            compiler.write('0 1 %s %s\n' % (c, c))
            compiler.write('1 1 %s %s\n' % (c, c))
        for v in vowel:
            compiler.write('1 2 %s %s\n' % (v, v))
        compiler.write('2 2 i i\n')
        compiler.write('2 2 u u\n')
        for i, doubled in enumerate(consonant):
            state = i + 3
            compiler.write('2 %d %s %s\n' % (state, doubled, doubled))
            # The <^> boundary is rewritten to a second copy of the consonant
            # on the way to the "ed" branch (n + 3) and the "ing" branch
            # (n + 6).
            compiler.write('%d %d <^> %s\n' % (state, n + 3, doubled))
            compiler.write('%d %d <^> %s\n' % (state, n + 6, doubled))
            for c in consonant:
                compiler.write('%d 1 %s %s\n' % (state, c, c))
            for v in vowel:
                compiler.write('%d 2 %s %s\n' % (state, v, v))
        compiler.write('%d %d e e\n' % (n + 3, n + 4))
        compiler.write('%d %d d d\n' % (n + 4, n + 5))
        compiler.write('%d 0 <#> <#>\n' % (n + 5))
        compiler.write('%d %d i i\n' % (n + 6, n + 7))
        compiler.write('%d %d n n\n' % (n + 7, n + 8))
        compiler.write('%d %d g g\n' % (n + 8, n + 5))
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    lemma = []
    allomorphy = []

    # Comma-separated dictionary: field 0 goes to ``lemma`` and field 1 to
    # ``allomorphy``.
    with open("in_vocab_dictionary_verbs.txt", "r") as f:
        for line in f:
            # Drop the line terminator so that it cannot end up inside the
            # second field of a two-column line, and skip blank or malformed
            # lines instead of failing with IndexError.
            fields = line.rstrip('\n').split(',')
            if len(fields) < 2:
                continue
            lemma.append(fields[0])
            allomorphy.append(fields[1])

    compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
    rule = compiler.compile()

    # One linear path per dictionary entry.  The two strings are aligned
    # character by character and the shorter one is padded with <epsilon>;
    # a final arc emits the +Known tag.
    for lem, allo in zip(lemma, allomorphy):
        compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
        l = max(len(allo), len(lem))
        for i in range(l):
            in_sym = allo[i] if i < len(allo) else '<epsilon>'
            out_sym = lem[i] if i < len(lem) else '<epsilon>'
            compiler.write('%d %d %s %s\n' % (i, i + 1, in_sym, out_sym))
        compiler.write('%d %d <epsilon> +Known\n' % (l, l + 1))
        compiler.write(str(l + 1))
        # NOTE(review): the return value is discarded, so this relies on
        # ``union`` updating ``rule`` in place -- confirm for the FST
        # library in use.
        rule.union(compiler.compile())

    de_iv_rule = rule.copy().invert()

    morphotactics_rule = get_morphotactics()
    e_insertion_rule = e_insertion()
    k_insertion_rule = k_insertion()
    e_deletion_rule = e_deletion()
    general_rule = general()
    ch_sh_e_insertion_rule = ch_sh_e_insertion()
    y_replacement_rule = y_replacement()
    del_sharp_rule = del_sharp()
    consonant_doubling_rule = consonant_doubling()

    new_rule = (k_insertion_rule.union(e_insertion_rule)
                .union(general_rule)
                .union(e_deletion_rule)
                .union(ch_sh_e_insertion_rule)
                .union(y_replacement_rule)
                .union(consonant_doubling_rule))
    de_oov = fst.compose(morphotactics_rule.arcsort(sort_type="olabel"), new_rule.arcsort(sort_type="ilabel"))
    de_oov_rule = fst.compose(de_oov.arcsort(sort_type="olabel"), del_sharp_rule.arcsort(sort_type="ilabel"))

    oov_rule = de_oov_rule.copy().invert()
    # The final rules for lemmatizer
    rule = rule.union(oov_rule)
    # The final rules for delemmatizer
    de_rule = de_iv_rule.union(de_oov_rule)

    def lemmatize(self, in_str):
        """Return the set of analyses ``self.rule`` produces for ``in_str``.

        When ``in_str`` ends in ``ing``, ``ed``, ``en`` or a single ``s``
        (i.e. ``s`` not preceded by another ``s``), the trivial guess
        ``in_str + '+Guess'`` is removed from the result.

        Empty and one-character inputs are handled without raising, and a
        guess that is absent from the result set is ignored rather than
        reported as ``KeyError``.
        """
        out_set = set(fststr.apply(in_str, self.rule))
        has_suffix = in_str.endswith(('ing', 'ed', 'en'))
        has_single_s = in_str.endswith('s') and not in_str.endswith('ss')
        if has_suffix or has_single_s:
            out_set.discard(in_str + '+Guess')
        return out_set

    def delemmatize(self, in_str):
        """Return the set of strings ``self.de_rule`` produces for ``in_str``."""
        return set(fststr.apply(in_str, self.de_rule))
def get_fst(name):
    """Load the FST stored under ``FST_PATH + name`` with a matching compiler.

    The symbol table is read from ``<FST_PATH><name>/symbols.txt`` and the
    FST from ``<FST_PATH><name>/<name lower-cased>.fst``.

    Returns a 3-tuple ``(fst, compiler, symbols)`` where ``compiler`` is an
    acceptor compiler that uses the symbol table for both input and output.
    """
    symbol_table = ofst.SymbolTable.read_text(FST_PATH + name + '/symbols.txt')
    fst_file = FST_PATH + name + '/' + name.lower() + '.fst'
    loaded_fst = ofst.Fst.read(fst_file)
    acceptor_compiler = ofst.Compiler(isymbols=symbol_table,
                                      osymbols=symbol_table,
                                      acceptor=True)
    return loaded_fst, acceptor_compiler, symbol_table
Exemplo n.º 30
0
def create_spelling_fst(word_table, alphabet_table, repeat_char,
                        self_transition_prob):
    """Build a transducer that spells every word of ``word_table``.

    For each word a chain of states is created from the shared start state
    (0) to the shared final state (1), with one arc per letter.  Every state
    on the chain also gets an epsilon self-loop.  Letter arcs are weighted
    ``-log(1 - self_transition_prob)`` and self-loops
    ``-log(self_transition_prob)``.  A letter equal to the one before it is
    replaced by ``repeat_char``.

    Args:
        word_table: ``openfst.SymbolTable`` of words (output alphabet).
        alphabet_table: ``openfst.SymbolTable`` of letters (input alphabet).
        repeat_char: symbol used in place of a letter that repeats the
            previous letter of the word.
        self_transition_prob: self-loop probability, strictly between 0
            and 1.

    Returns:
        The compiled FST after ``determinize``, ``minimize`` and
        ``arcsort("olabel")``.

    Raises:
        ValueError: if a word is not in ``word_table`` or one of its letters
            is not in ``alphabet_table``.
    """
    assert isinstance(word_table, openfst.SymbolTable)
    assert isinstance(alphabet_table, openfst.SymbolTable)
    assert 0. < self_transition_prob < 1.

    WORD_EPSILON_STR = word_table.Find(EPSILON_INT)
    ALPHABET_EPSILON_STR = alphabet_table.Find(EPSILON_INT)
    next_transition_prob = 1. - self_transition_prob
    self_transition_cost = -math.log(self_transition_prob)
    next_transition_cost = -math.log(next_transition_prob)

    compiler = openfst.Compiler(fst_type="const",
                                arc_type="log",
                                isymbols=alphabet_table,
                                osymbols=word_table,
                                keep_isymbols=False,
                                keep_osymbols=False)
    build_fst = partial(print, file=compiler)

    start_state_index = 0
    final_state_index = 1

    # Next unused id for an intermediate (non-start, non-final) state.
    state_index = 2
    edge_format_str = "{start} {end} {char} {word} {weight:.5f}"
    for word in openfst.SymbolTableIterator(word_table):
        if word not in word_table:
            raise ValueError("Word {0} not in vocabulary {1}".format(
                word, word_table))
        last_position = len(word) - 1
        # State the next letter arc leaves from.  Tracking it explicitly keeps
        # the chain connected: previously the first-letter arc ended in
        # ``state_index`` while the second-letter arc started from
        # ``state_index + 1``, leaving every multi-letter path broken.
        source = start_state_index
        for i, char in enumerate(word):
            if char not in alphabet_table:
                raise ValueError("Character {0} not in alphabet {1}".format(
                    char, alphabet_table.name()))
            if i == 0:
                # NOTE(review): the first arc outputs epsilon, so a
                # single-letter word never emits its word label -- confirm
                # whether that is intended.
                label = char
                output = WORD_EPSILON_STR
            else:
                # It is possible that this letter could be a repeat. Warning:
                # This doesn't check if a letter happens 3 times in a row.
                label = repeat_char if char == word[i - 1] else char
                # NOTE(review): every arc after the first one outputs the
                # word, not only the last arc -- kept as in the original,
                # confirm whether intermediate arcs should output epsilon.
                output = word

            if i == last_position:
                target = final_state_index
            else:
                # Allocate a fresh intermediate state only when one is needed.
                target = state_index
                state_index += 1

            build_fst(
                edge_format_str.format(start=source,
                                       end=target,
                                       char=label,
                                       word=output,
                                       weight=next_transition_cost))
            build_fst(
                edge_format_str.format(start=source,
                                       end=source,
                                       char=ALPHABET_EPSILON_STR,
                                       word=WORD_EPSILON_STR,
                                       weight=self_transition_cost))
            source = target
    # Add final state
    build_fst(str(final_state_index))

    S = compiler.compile().determinize()
    S.minimize()
    S = S.arcsort("olabel")
    return S