def buildpreProcessFST_delemmatize(self):
        """Build the pre-processing FST used for delemmatization.

        A seed FST is compiled from the single text line ``1``. Every file in
        the current working directory whose name starts with ``FST_pre_`` is
        then compiled as an FST text description and unioned into the seed.

        Returns:
            The FST obtained by unioning the seed with all ``FST_pre_*`` FSTs.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # Seed FST that the file-based FSTs are unioned into.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        initpres = '1\n'
        print(initpres, file=compiler)
        initFSTpre = compiler.compile()
        fststr.expand_other_symbols(initFSTpre)

        pre_files = [
            filename for filename in os.listdir('.')
            if filename.startswith("FST_pre_")
        ]

        # Compile each text file into an FST and union it into initFSTpre.
        for f in pre_files:
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            # The context manager closes the rule file even if reading fails.
            with open(f) as pre_file:
                pre = pre_file.read()
            print(pre, file=compiler)
            pre_FST = compiler.compile()
            fststr.expand_other_symbols(pre_FST)
            initFSTpre = initFSTpre.union(pre_FST)

        return initFSTpre
Пример #2
0
 def consonant_doubling():
     """Build the consonant-doubling FST.

     State layout of the generated text description:
       * 0  - start and only final state; loops on ``<other>`` and ``<#>``.
       * 1  - inside a run of consonants.
       * 2  - just read a vowel.
       * 3 .. 3+N-1 - just read vowel + consonant ``k`` (one state per consonant).
       * the next three states spell ``e d`` followed by ``<#>``; the last
         three spell ``i n g`` and rejoin the ``<#>`` state.
     From a "vowel + consonant k" state, reading ``<^>`` outputs ``k`` and
     enters either suffix chain.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=symbols, osymbols=symbols, keep_isymbols=True, keep_osymbols=True)
     consonants = list('bcdfghjklmnpqrstvwxyz')
     vowels = list('aeiou')

     # Named states for the two suffix chains that follow the per-consonant states.
     ed_e = len(consonants) + 3
     ed_d = ed_e + 1
     word_end = ed_e + 2
     ing_i = ed_e + 3
     ing_n = ed_e + 4
     ing_g = ed_e + 5

     lines = ['0\n', '0 0 <other> <other>\n', '0 0 <#> <#>\n']
     lines.extend('0 2 {0} {0}\n'.format(v) for v in vowels)
     for cons in consonants:
         lines.append('0 1 {0} {0}\n'.format(cons))
         lines.append('1 1 {0} {0}\n'.format(cons))
     lines.extend('1 2 {0} {0}\n'.format(v) for v in vowels)
     lines.append('2 2 i i\n')
     lines.append('2 2 u u\n')

     for state, doubled in enumerate(consonants, start=3):
         lines.append('2 {0} {1} {1}\n'.format(state, doubled))
         # The morpheme boundary is rewritten to a second copy of the consonant.
         lines.append('{0} {1} <^> {2}\n'.format(state, ed_e, doubled))
         lines.append('{0} {1} <^> {2}\n'.format(state, ing_i, doubled))
         lines.extend('{0} 1 {1} {1}\n'.format(state, cons) for cons in consonants)
         lines.extend('{0} 2 {1} {1}\n'.format(state, v) for v in vowels)

     lines.append('{0} {1} e e\n'.format(ed_e, ed_d))
     lines.append('{0} {1} d d\n'.format(ed_d, word_end))
     lines.append('{0} 0 <#> <#>\n'.format(word_end))
     lines.append('{0} {1} i i\n'.format(ing_i, ing_n))
     lines.append('{0} {1} n n\n'.format(ing_n, ing_g))
     lines.append('{0} {1} g g\n'.format(ing_g, word_end))

     compiler.write(''.join(lines))
     doubling_fst = compiler.compile()
     fststr.expand_other_symbols(doubling_fst)
     return doubling_fst
    def buildpreProcessFST(self, curr_str):
        """Build a linear FST from the leading characters of ``curr_str``.

        Each character before the first ``+`` or ``<`` becomes an identity arc.
        At the first ``+`` or ``<`` an ``<epsilon>`` arc is added whose output
        label is the whole remainder of the string, and scanning stops. A
        closing ``<epsilon>`` -> ``<#>`` arc is always appended from the state
        reached so far, and its target is marked final.

        Args:
            curr_str: Input string, optionally ending in a ``+...``/``<...`` tag.

        Returns:
            The compiled FST with ``<other>`` symbols expanded.
        """
        pieces = ['0\n']
        state = 0
        for ch in curr_str:
            if ch in ('+', '<'):
                # The state counter is intentionally not advanced here, so this
                # arc is parallel to the closing <#> arc added below.
                pieces.append('{} {} <epsilon> {}\n'.format(
                    state, state + 1, curr_str[state:]))
                break
            pieces.append('{} {} {} {}\n'.format(state, state + 1, ch, ch))
            state += 1

        pieces.append('{} {} <epsilon> <#>\n{}\n'.format(
            state, state + 1, state + 1))

        symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=symbols,
                                osymbols=symbols,
                                keep_isymbols=True,
                                keep_osymbols=True)
        print(''.join(pieces), file=compiler)
        pre_fst = compiler.compile()
        fststr.expand_other_symbols(pre_fst)

        return pre_fst
Пример #4
0
 def ch_sh_e_insertion():
     """Compile the FST described in ``ch_sh_e_insertion.txt``.

     The file is read from the current working directory and fed to the
     compiler as an FST text description.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
     # The context manager closes the rule file even if reading fails.
     with open('ch_sh_e_insertion.txt') as rule_file:
         fst_file = rule_file.read()
     print(fst_file, file=compiler)
     c = compiler.compile()
     fststr.expand_other_symbols(c)
     return c
Пример #5
0
 def y_replacement():
     """Compile the FST described in ``y_replacement.txt``.

     The file is read from the current working directory and fed to the
     compiler as an FST text description.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
     # The context manager closes the rule file even if reading fails.
     with open('y_replacement.txt') as rule_file:
         fst_file = rule_file.read()
     print(fst_file, file=compiler)
     c = compiler.compile()
     fststr.expand_other_symbols(c)
     return c
Пример #6
0
 def del_sharp():
     """Build an FST that copies symbols and rewrites ``<#>`` to epsilon.

     State 0 (final) loops on ``<other>``; reading ``<#>`` emits ``<epsilon>``
     and moves to state 1 (final), which has no outgoing arcs.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=symbols, osymbols=symbols, keep_isymbols=True, keep_osymbols=True)
     description = (
         '0\n',
         '0 0 <other> <other>\n',
         '0 1 <#> <epsilon>\n',
         '1\n',
     )
     for text_line in description:
         compiler.write(text_line)
     sharp_fst = compiler.compile()
     fststr.expand_other_symbols(sharp_fst)
     return sharp_fst
Пример #7
0
 def get_compiler_from_file_name(self, file_name):
     """Compile the FST text description stored in ``file_name``.

     Despite the name, this returns the compiled FST, not the compiler.

     Args:
         file_name: Path of the text file holding the FST description.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
     # ``with`` guarantees the file is closed even when reading raises; the
     # original only closed it on the success path, after compiling.
     with open(file_name) as in_file:
         fst_file = in_file.read()
     print(fst_file, file=compiler)
     c = compiler.compile()
     fststr.expand_other_symbols(c)
     return c
    def buildInVocabFST(self):
        """Build the in-vocabulary FST from ``in_vocab_dictionary_verbs.txt``.

        Each dictionary line is comma separated: the first field is the lemma
        and the second the surface form (further fields are ignored). For every
        entry a linear FST is built that reads the surface form followed by
        ``<#>`` and writes the lemma followed by ``+Known``; whichever of the
        two words is shorter is padded with ``<epsilon>``. All entry FSTs are
        unioned into a seed FST compiled from the single line ``0``.

        Lines with fewer than two fields (e.g. blank lines) are skipped.

        Returns:
            The union of the seed FST and one FST per dictionary entry.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # Seed FST that the per-word FSTs are unioned into.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        inits1 = '0\n'
        print(inits1, file=compiler)
        initFST1 = compiler.compile()
        fststr.expand_other_symbols(initFST1)

        # Read the dictionary up front so the handle is closed before compiling.
        with open('in_vocab_dictionary_verbs.txt', 'r') as dict_file:
            dict_lines = dict_file.readlines()

        for line in dict_lines:
            lineList = line.strip().rstrip(',').split(',')
            if len(lineList) < 2:
                # Blank or malformed line: there is no surface form to map.
                continue
            lemma, surface = lineList[0], lineList[1]

            # Walk both words in lockstep. Covering the longer of the two keeps
            # the trailing characters of a lemma that is longer than its
            # surface form instead of silently dropping them.
            length = max(len(lemma), len(surface))
            arcs = []
            for i in range(length):
                in_sym = surface[i] if i < len(surface) else '<epsilon>'
                out_sym = lemma[i] if i < len(lemma) else '<epsilon>'
                arcs.append('{} {} {} {}\n'.format(i, i + 1, in_sym, out_sym))
            arcs.append('{} {} <#> <epsilon>\n'.format(length, length + 1))
            arcs.append('{} {} <epsilon> +Known\n{}\n'.format(
                length + 1, length + 2, length + 2))

            # Compile this entry and union it into the result.
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            print(''.join(arcs), file=compiler)
            currFST = compiler.compile()
            fststr.expand_other_symbols(currFST)
            initFST1 = initFST1.union(currFST)

        return initFST1
Пример #9
0
 def general():
     """Build an FST that removes the ``<^>`` morpheme-boundary symbol.

     State 0 (final) copies ``<other>`` and ``<#>``. Reading ``<^>`` emits
     ``<epsilon>`` and moves to state 1, from which the next ``<other>`` or
     ``<#>`` symbol is copied on the way back to state 0.

     Returns:
         The compiled FST with ``<other>`` symbols expanded.
     """
     symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     compiler = fst.Compiler(isymbols=symbols, osymbols=symbols, keep_isymbols=True, keep_osymbols=True)
     description = [
         '0 0 <other> <other>\n',
         '0 0 <#> <#>\n',
         '0\n',
         # Drop the boundary marker, then copy exactly one following symbol.
         '0 1 <^> <epsilon>\n',
         '1 0 <other> <other>\n',
         '1 0 <#> <#>\n',
     ]
     for text_line in description:
         compiler.write(text_line)
     boundary_fst = compiler.compile()
     fststr.expand_other_symbols(boundary_fst)
     return boundary_fst
Пример #10
0
 def get_morphotactics():
     """Build the morphotactics FST for the suffixes '', s, ed, en and ing.

     For each suffix a small FST is compiled that copies ``<other>`` symbols,
     rewrites ``+Guess`` to ``<^>``, then emits the suffix letters and a final
     ``<#>`` on ``<epsilon>`` input. The per-suffix FSTs are unioned into an
     FST compiled from an empty description, and ``<other>`` is expanded once
     on the union.

     Returns:
         The unioned FST with ``<other>`` symbols expanded.
     """
     suffixes = ['', 's', 'ed', 'en', 'ing']
     symbols = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
     combined = fst.Compiler(isymbols=symbols, osymbols=symbols, keep_isymbols=True, keep_osymbols=True).compile()

     for suffix in suffixes:
         compiler = fst.Compiler(isymbols=symbols, osymbols=symbols, keep_isymbols=True, keep_osymbols=True)
         compiler.write('0 0 <other> <other>\n')
         compiler.write('0 1 +Guess <^>\n')
         # States 1..len(suffix) spell out the suffix one letter at a time.
         for state, letter in enumerate(suffix, start=1):
             compiler.write('{} {} <epsilon> {}\n'.format(state, state + 1, letter))
         after_suffix = len(suffix) + 1
         compiler.write('{} {} <epsilon> <#>\n'.format(after_suffix, after_suffix + 1))
         compiler.write(str(after_suffix + 1))
         combined = combined.union(compiler.compile())

     fststr.expand_other_symbols(combined)
     return combined
    def buildAllomFST(self):
        """Build the allomorph FST from the ``FST_allom_*`` rule files.

        A seed FST is compiled from the single text line ``0``. Every file in
        the current working directory whose name starts with ``FST_allom_`` is
        then compiled as an FST text description and unioned into the seed.

        Returns:
            The FST obtained by unioning the seed with all ``FST_allom_*`` FSTs.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # Seed FST that the file-based FSTs are unioned into.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        inits3 = '0\n'
        print(inits3, file=compiler)
        initFST3 = compiler.compile()
        fststr.expand_other_symbols(initFST3)

        allom_files = [
            filename for filename in os.listdir('.')
            if filename.startswith("FST_allom_")
        ]

        # Compile each text file into an FST and union it into initFST3.
        for f in allom_files:
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            # The context manager closes the rule file even if reading fails.
            with open(f) as allom_file:
                allom = allom_file.read()
            print(allom, file=compiler)
            allom_FST = compiler.compile()
            fststr.expand_other_symbols(allom_FST)
            initFST3 = initFST3.union(allom_FST)

        return initFST3
    def buildMorphFST(self):
        """Build the morphotactics FST from the ``FST_morph_*`` rule files.

        A seed FST is compiled from the single text line ``0``. Every file in
        the current working directory whose name starts with ``FST_morph_`` is
        then compiled as an FST text description and unioned into the seed.

        Returns:
            The FST obtained by unioning the seed with all ``FST_morph_*`` FSTs.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # Seed FST that the file-based FSTs are unioned into.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        inits2 = '0\n'
        print(inits2, file=compiler)
        initFST2 = compiler.compile()
        fststr.expand_other_symbols(initFST2)

        morph_files = [
            filename for filename in os.listdir('.')
            if filename.startswith("FST_morph_")
        ]

        # Compile each text file into an FST and union it into initFST2.
        for f in morph_files:
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            # The context manager closes the rule file even if reading fails.
            with open(f) as morph_file:
                morph = morph_file.read()
            print(morph, file=compiler)
            morph_FST = compiler.compile()
            fststr.expand_other_symbols(morph_FST)
            initFST2 = initFST2.union(morph_FST)

        return initFST2
Пример #13
0
    def __init__(self):
        """Assemble the overall FST and store it in ``self.fstOverall``.

        Steps:
          1. Compile a preprocessing FST that copies ``<other>`` symbols and can
             emit ``<#>`` on ``<epsilon>`` input.
          2. Obtain the in-vocabulary, morphotactics and allomorph FSTs from
             ``getInVocabFST``, ``getMorphoFST`` and ``getAlloFST``.
          3. Compile an out-of-vocabulary post-processing FST, union it into
             the allomorph FST, and compose
             morphotactics -> (allomorph | post) -> post.
          4. Compose the preprocessing FST with the union of the in-vocabulary
             and out-of-vocabulary FSTs.

        Side effects:
            Sets ``self.st``, ``self.inVocabFile`` and ``self.fstOverall``.
            ``self.inVocabFile`` is closed by the time ``__init__`` returns.
        """
        self.st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # create preprocessing FST
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write('0 \n 0 0 <other> <other> \n0 0 <epsilon> <#>')
        preprocessFST = compiler.compile()
        fststr.expand_other_symbols(preprocessFST)

        # gets subFSTs for in-vocab FST
        # The dictionary handle is only needed while getInVocabFST() runs, so
        # it is closed right afterwards instead of leaking for the lifetime of
        # the object.
        with open("in_vocab_dictionary_verbs.txt") as vocab_file:
            self.inVocabFile = vocab_file
            inVocabFST = self.getInVocabFST()
        morphoFST = self.getMorphoFST()
        alloFST = self.getAlloFST()

        # creates out-of-vocab FST
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write(
            '\n0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n0 2 <#> +Guess \n1 1 <other> <epsilon> \n1 2 <#> +Guess \n2'
        )
        oovPostFST = compiler.compile()
        fststr.expand_other_symbols(oovPostFST)
        temp = alloFST.union(oovPostFST)
        oovFST = fst.compose(morphoFST.arcsort(sort_type="olabel"),
                             temp.arcsort(sort_type="ilabel"))
        oovFST = fst.compose(oovFST.arcsort(sort_type="olabel"),
                             oovPostFST.arcsort(sort_type="ilabel"))

        # creates overall FST as union of each of the sub-FSTs
        self.fstOverall = fst.compose(
            preprocessFST.arcsort(sort_type="olabel"),
            inVocabFST.union(oovFST).arcsort(sort_type="ilabel"))
Пример #14
0
    def getInVocabFST(self):
        """Build the in-vocabulary FST from the lines of ``self.inVocabFile``.

        Each line is comma separated: the first field is the lemma and the
        second the conjugated form. For every entry a linear FST is compiled
        that reads the conjugated form followed by ``<#>`` and writes the lemma
        followed by ``+Known``; the shorter word is padded with ``<epsilon>``.
        Every entry FST is unioned (in place) into an initially empty FST.

        Lines with fewer than two fields (e.g. blank lines) are skipped rather
        than raising ``IndexError``.

        Returns:
            The unioned in-vocabulary FST with ``<other>`` symbols expanded.
        """
        st = self.st
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write('')
        inVocabFST = compiler.compile()

        # takes each lemma, conjugated pair from the in-vocab file and adds it to the FST
        for raw_line in self.inVocabFile:
            parts = raw_line.rstrip('\r\n').split(',')
            if len(parts) < 2:
                # Blank or malformed line: no conjugated form to map.
                continue
            lemma = list(parts[0])
            conjugated = list(parts[1])

            # Pad the shorter word so both can be walked in lockstep.
            width = max(len(lemma), len(conjugated))
            lemma += ['<epsilon>'] * (width - len(lemma))
            conjugated += ['<epsilon>'] * (width - len(conjugated))

            fstword = ''.join(
                '\n{} {} {} {}'.format(i, i + 1, in_sym, out_sym)
                for i, (in_sym, out_sym) in enumerate(zip(conjugated, lemma)))
            fstword += '\n' + str(width + 1)  # final state
            fstword += '\n{} {} <#> +Known'.format(width, width + 1)

            ncompiler = fst.Compiler(isymbols=self.st,
                                     osymbols=self.st,
                                     keep_isymbols=True,
                                     keep_osymbols=True)
            ncompiler.write(fstword)
            fstNew = ncompiler.compile()
            fststr.expand_other_symbols(fstNew)
            inVocabFST.union(fstNew)

        fststr.expand_other_symbols(inVocabFST)
        return inVocabFST
Пример #15
0
    def getMorphoFST(self):
        """Build the morphotactics FST for the -ed, -ing, -s, -en and bare forms.

        Five hard-coded FST text descriptions are compiled separately, chained
        together with in-place ``union`` calls (bare form into -en, that into
        -s, then -ing, then -ed), and the result is unioned into an FST
        compiled from an empty description.

        Returns:
            The unioned FST with ``<other>`` symbols expanded.
        """
        def compile_description(text):
            # One fresh compiler per description, as each holds its own buffer.
            rule_compiler = fst.Compiler(isymbols=self.st,
                                         osymbols=self.st,
                                         keep_isymbols=True,
                                         keep_osymbols=True)
            rule_compiler.write(text)
            compiled = rule_compiler.compile()
            fststr.expand_other_symbols(compiled)
            return compiled

        base_compiler = fst.Compiler(isymbols=self.st,
                                     osymbols=self.st,
                                     keep_isymbols=True,
                                     keep_osymbols=True)
        base_compiler.write('')
        morpho = base_compiler.compile()

        ed_text = '\n0 0 <other> <other> \n0 1 e <epsilon> \n1 2 d <epsilon> \n1 0 <epsilon> e \n2 4 <#> <^> \n2 3 <epsilon> e \n3 0 <epsilon> d \n4 5 <epsilon> e \n5 6 <epsilon> d \n6 7 <epsilon> <#> \n7 \n7 7 <other> <other>'
        ing_text = '\n 0 0 <other> <other> \n0 1 i <epsilon> \n1 2 n <epsilon> \n1 0 <epsilon> i \n2 3 g <epsilon> \n2 9 <epsilon> i \n3 4 <#> <^> \n3 10 <epsilon> i \n4 5 <epsilon> i \n5 6 <epsilon> n \n6 7 <epsilon> g \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> n \n10 11 <epsilon> n \n11 0 <epsilon> g'
        s_text = '\n 0 0 <other> <other> \n0 1 s <epsilon> \n1 2 <#> <^> \n1 0 <epsilon> s \n2 3 <epsilon> s \n3 4 <epsilon> <#> \n4 \n4 4 <other> <other>'
        en_text = '\n0 0 <other> <other> \n0 1 e <epsilon> \n1 2 n <epsilon> \n1 0 <epsilon> e \n2 4 <#> <^> \n2 3 <epsilon> e \n3 0 <epsilon> n \n4 5 <epsilon> e \n5 6 <epsilon> n \n6 7 <epsilon> <#> \n7 \n7 7 <other> <other>'
        as_is_text = '\n0 1 <#> <^> \n0 0 <other> <other> \n1 2 <epsilon> <#> \n2 \n2 2 <other> <other>'

        ed_fst = compile_description(ed_text)
        ing_fst = compile_description(ing_text)
        s_fst = compile_description(s_text)
        en_fst = compile_description(en_text)
        as_is_fst = compile_description(as_is_text)

        # Fold from the innermost union outwards: each FST absorbs the chain
        # built so far and becomes the new chain head.
        chain = as_is_fst
        for suffix_fst in (en_fst, s_fst, ing_fst, ed_fst):
            chain = suffix_fst.union(chain)

        morpho.union(chain)
        fststr.expand_other_symbols(morpho)
        return morpho
Пример #16
0
from fststr import fststr
import pywrapfst as fst

# Compile the FST described in e-insertion.txt.
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st,
                        osymbols=st,
                        keep_isymbols=True,
                        keep_osymbols=True)
# The context manager closes the rule file even if reading fails.
with open('e-insertion.txt') as rule_file:
    fst_file = rule_file.read()
print(fst_file, file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Apply the FST to a sample input and print what it produces.
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
Пример #17
0
    def get_in_vocab_fst(self):
        """Build the in-vocabulary FST from ``in_vocab_dictionary_verbs.txt``.

        Each dictionary line is comma separated: the first field is the lemma
        and the second the inflected form. For every entry, an FST mapping the
        inflected form to the lemma followed by ``+Known`` is composed after an
        FST that deletes the trailing ``<#>``; the compositions are unioned in
        place into an FST compiled from an empty description.

        Lines with fewer than two fields (e.g. blank lines) are skipped, and
        surrounding whitespace is stripped so that a trailing newline never
        ends up inside a word.

        Returns:
            The unioned in-vocabulary FST.
        """
        alphabet = fststr.EN_SYMB
        st = fststr.symbols_table_from_alphabet(
            alphabet)  # <class 'pywrapfst.SymbolTable'>
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        in_vocab_fst = compiler.compile()

        entries = []
        with open('in_vocab_dictionary_verbs.txt', 'r') as f:
            for line in f:
                fields = line.strip().split(',')
                if len(fields) < 2:
                    continue
                entries.append((fields[0], fields[1]))

        # FST that strips the end-of-word marker. It is identical for every
        # entry, so it is built once instead of once per dictionary line.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        a_fst = compiler.compile()
        fststr.expand_other_symbols(a_fst)

        for lemma_word, form_word in entries:
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            # Walk both words in lockstep; the shorter one is padded with
            # <epsilon>. This covers both "form longer" and "lemma longer".
            length = max(len(form_word), len(lemma_word))
            for i in range(length):
                in_sym = form_word[i] if i < len(form_word) else '<epsilon>'
                out_sym = lemma_word[i] if i < len(lemma_word) else '<epsilon>'
                compiler.write('{} {} {} {}\n'.format(i, i + 1, in_sym,
                                                      out_sym))
            compiler.write('{} {} <epsilon> +Known\n'.format(
                length, length + 1))
            compiler.write(str(length + 1))
            b_fst = compiler.compile()
            fststr.expand_other_symbols(b_fst)

            c_fst = fst.compose(a_fst.arcsort(sort_type="olabel"),
                                b_fst.arcsort(sort_type="ilabel"))
            in_vocab_fst.union(c_fst)

        return in_vocab_fst
Пример #18
0
 def generateFst(self, data, st):
     """Build the union of per-word FSTs described by ``data``.

     ``data`` holds one ``stem,inflected[,...]`` entry per line. For each entry
     an FST is compiled whose state 0 is final and loops on ``<other>``; it
     reads the inflected form (writing the stem, then ``<epsilon>`` for the
     extra inflected characters) and finally reads ``<#>`` while writing
     ``+Known`` on an arc back to state 0.

     Processing stops at the first line whose stem is empty. Entries whose
     stem is longer than the inflected form, and lines without a comma, are
     skipped.

     Args:
         data: Newline-separated dictionary text.
         st: Symbol table used for both input and output symbols.

     Returns:
         The union of all entry FSTs, or ``None`` if no entry was built.
     """
     rootFst = None
     for line in data.split("\n"):
         stemNinf = line.split(",")[:2]
         stem = stemNinf[0]
         if stem == "":
             # Blank line (e.g. after the trailing newline): stop here.
             return rootFst
         if len(stemNinf) < 2:
             # No inflected form on this line; nothing to map.
             continue
         inf = stemNinf[1]
         stemLen = len(stem)
         infLen = len(inf)
         if stemLen > infLen:
             continue

         arcs = ["0\n", "0 0 <other> <other>\n"]  # 0 as final state
         for i, stemChar in enumerate(stem):
             arcs.append("{} {} {} {}\n".format(i, i + 1, inf[i], stemChar))
         # Characters of the inflected form beyond the stem are consumed
         # without producing output.
         for i in range(stemLen, infLen):
             arcs.append("{} {} {} <epsilon>\n".format(i, i + 1, inf[i]))
         # The word path ends in state infLen in every case, so the closing arc
         # must start there. (Starting it at stemLen + 1 when both words have
         # the same length left it unreachable.)
         arcs.append("{} 0 <#> +Known".format(infLen))

         compiler = fst.Compiler(isymbols=st,
                                 osymbols=st,
                                 keep_isymbols=True,
                                 keep_osymbols=True)
         compiler.write("".join(arcs))
         other = compiler.compile()
         fststr.expand_other_symbols(other)
         rootFst = other if rootFst is None else rootFst.union(other)
     return rootFst
    def buildpostProcessFST(self, input_str):
        """Build the post-processing FST for ``input_str``.

        Three pieces are combined:
          1. A seed FST (compiled from the single line ``0``) unioned with
             every ``FST_post_*`` rule file found in the current directory.
          2. An "original form" FST that copies the characters of
             ``input_str`` up to the first ``+`` and rewrites ``<#>`` to
             ``+Guess``.
          3. The FST described in ``FST_finalclearance.txt``, which the union
             of (1) and (2) is composed with.

        Args:
            input_str: The word being processed, optionally followed by a
                ``+...`` tag.

        Returns:
            The composition of the unioned post FSTs with the clearance FST.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

        # Seed FST that the file-based FSTs are unioned into.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        initposts = '0\n'
        print(initposts, file=compiler)
        initFSTpost = compiler.compile()
        fststr.expand_other_symbols(initFSTpost)

        post_files = [
            filename for filename in os.listdir('.')
            if filename.startswith("FST_post_")
        ]

        # Compile each text file into an FST and union it into initFSTpost.
        for f in post_files:
            compiler = fst.Compiler(isymbols=st,
                                    osymbols=st,
                                    keep_isymbols=True,
                                    keep_osymbols=True)
            # The context manager closes the rule file even if reading fails.
            with open(f) as post_file:
                post = post_file.read()
            print(post, file=compiler)
            post_FST = compiler.compile()
            fststr.expand_other_symbols(post_FST)
            initFSTpost = initFSTpost.union(post_FST)

        # FST that handles an input that is already in its original form.
        s = ''
        tracker = 0
        for ch in input_str:
            if ch == '+':
                s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1,
                                                     tracker + 1)
                tracker += 1
                break
            s += '{} {} {} {}\n'.format(tracker, tracker + 1, ch, ch)
            tracker += 1
        # Rewrite the trailing <#> to +Guess.
        # NOTE(review): when input_str contains '+', the loop above has already
        # emitted one such arc, so this adds a second one after it. Kept as in
        # the original; confirm whether that is intended.
        s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1, tracker + 1)
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        print(s, file=compiler)
        original_case_FST = compiler.compile()
        fststr.expand_other_symbols(original_case_FST)
        initFSTpost = initFSTpost.union(original_case_FST)

        # Last FST: compose with the final-clearance rules.
        compiler = fst.Compiler(isymbols=st,
                                osymbols=st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        with open('FST_finalclearance.txt') as clear_file:
            clear = clear_file.read()
        print(clear, file=compiler)
        clear_FST = compiler.compile()
        fststr.expand_other_symbols(clear_FST)
        lastFST = fst.compose(initFSTpost.arcsort(sort_type="olabel"),
                              clear_FST.arcsort(sort_type="ilabel"))

        return lastFST
Пример #20
0
    def getAlloFST(self):
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write('')
        AlloFST = compiler.compile()
        yRepl = '0 \n0 0 <other> <other> \n0 1 i <epsilon> \n1 2 e <epsilon> \n1 12 <^> <epsilon> \n1 0 <epsilon> i \n2 3 <^> <epsilon> \n2 9 <epsilon> i \n3 4 s <epsilon> \n3 10 <epsilon> i \n4 5 <#> y \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n12 19 <epsilon> i \n13 14 d <epsilon> \n13 20 <epsilon> i \n14 15 <#> y \n15 16 <epsilon> <^> \n16 17 <epsilon> e \n17 18 <epsilon> d \n18 8 <epsilon> <^> \n19 0 <epsilon> <^> \n20 21 <epsilon> <^> \n21 0 <epsilon> e'
        kIns = '\n0 \n0 0 <other> <other> \n0 1 c c \n1 0 <other> <other> \n1 2 k <epsilon> \n2 0 <epsilon> k \n2 3 <^> <epsilon> \n3 4 i <epsilon> \n 3 10 e <epsilon> \n3 14 <epsilon> k \n4 5 n <^> \n5 6 g i \n6 7 <#> n \n7 8 <epsilon> g \n8 9 <epsilon> <#> \n9 \n10 11 d <^> \n11 12 <#> e \n12 8 <epsilon> d \n14 0 <epsilon> <^>'
        eDel = '0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n1 2 i <epsilon> \n1 11 e <epsilon> \n2 3 n <epsilon> \n3 4 g <epsilon> \n4 5 <#> e \n5 6 <epsilon> <^> \n6 7 <epsilon> i \n7 8 <epsilon> n \n8 9 <epsilon> g \n9 10 <epsilon> <#> \n10 \n11 12 d <epsilon> \n11 16 <epsilon> e \n12 13 <#> e \n13 14 <epsilon> <^> \n14 15 <epsilon> e \n15 9 <epsilon> d \n16 0 <epsilon> <^>'
        eInsch = '0 \n0 0 <other> <other> \n0 1 c <epsilon> \n1 2 h <epsilon> \n1 0 <epsilon> c \n2 3 e <epsilon> \n2 11 <epsilon> c \n3 4 <^> <epsilon> '
        eInsch += '\n3 12 <epsilon> c \n4 5 s <epsilon> \n4 12 <epsilon> c \n5 6 <#> c \n6 7 <epsilon> h \n7 8 <epsilon> <^> \n8 9 <epsilon> s \n9 10 <epsilon> <#> \n10 \n10 10 <other> <other> \n11 0 <epsilon> h '
        eInsch += '\n12 13 <epsilon> h \n13 0 <epsilon> e \n14 15 <epsilon> h \n15 16 <epsilon> e \n16 0 <epsilon> <^>'
        eInss = '\n0 \n0 0 <other> <other> \n0 1 s <epsilon> \n1 2 e <epsilon> \n1 12 h <epsilon> \n1 0 <epsilon> s \n2 3 <^> <epsilon> \n2 9 <epsilon> s \n3 4 s <epsilon> \n3 10 <epsilon> s \n4 5 <#> s \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n 12 20 <epsilon> s \n13 14 <^> <epsilon> \n13 21 <epsilon> s \n14 15 s <epsilon> \n14 23 <epsilon> s \n15 16 <#> s \n16 17 <epsilon> h \n17 18 <epsilon> <^> \n18 19 <epsilon> s \n19 8 <epsilon> <#> \n20 0 <epsilon> h \n21 22 <epsilon> h \n22 0 <epsilon> e \n23 24 <epsilon> h \n24 25 <epsilon> e \n25 0 <epsilon> <^>'
        xz = [*'xz']
        eInsxz = '\n0 \n0 0 <other> <other> '
        for i in range(len(xz)):
            c = xz[i]
            n = 11 * i
            eInsxz += '\n0 ' + str(
                n + 1) + ' ' + c + ' <epsilon> ' + '\n' + str(
                    n + 1) + ' ' + str(n + 2) + ' e <epsilon> \n' + str(
                        n + 1) + ' 0 <epsilon> ' + c + '\n' + str(n + 2)
            eInsxz += ' ' + str(n + 3) + ' <^> <epsilon> \n' + str(
                n + 2) + ' ' + str(n + 9) + ' <epsilon> ' + c + ' \n' + str(
                    n + 3) + ' ' + str(n + 4) + ' s <epsilon> \n' + str(n + 4)
            eInsxz += ' ' + str(n + 5) + ' <#> ' + c + ' \n' + str(
                n + 3) + ' ' + str(n + 10) + ' <epsilon> ' + c + ' \n' + str(
                    n + 5) + ' ' + str(n + 6) + ' <epsilon> <^> \n'
            eInsxz += str(n + 6) + ' ' + str(n + 7) + ' <epsilon> s \n' + str(
                n + 7) + ' ' + str(n + 8) + ' <epsilon> <#> \n' + str(
                    n + 8) + ' \n' + str(n +
                                         8) + ' ' + str(n +
                                                        8) + ' <other> <other>'
            eInsxz += ' \n' + str(n + 9) + ' 0 <epsilon> e \n' + str(
                n + 10) + ' ' + str(n + 11) + ' <epsilon> e \n' + str(
                    n + 11) + ' 0 <epsilon> <^>'
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write(eInsch)
        fstInsch = compiler.compile()
        fststr.expand_other_symbols(fstInsch)
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write(eInss)
        fstInss = compiler.compile()
        fststr.expand_other_symbols(fstInss)
        compiler = fst.Compiler(isymbols=self.st,
                                osymbols=self.st,
                                keep_isymbols=True,
                                keep_osymbols=True)
        compiler.write(eInsxz)
        fstInsxz = compiler.compile()
        fststr.expand_other_symbols(fstInsxz)
        fsteIns = fstInsch.union(fstInss.union(fstInsxz))

        consonants = [*'bcdfghjklmnpqrstvwxz']
        consDoub = ''
        consDoub += '\n0 \n0 1 a a \n0 1 e e \n0 1 i i \n0 1 o o \n0 1 u u \n0 1 y y \n0 0 <other> <other>'
        consDoub += '\n1 0 a a \n1 0 e e \n1 0 i i \n1 0 o o \n1 0 u u \n1 0 y y'
        consDoub += '\n2 \n2 2 <other> <other>'
        for i in range(len(consonants)):
            c = consonants[i]
            consDoub += '\n1 ' + str(8 * i + 3) + ' ' + c + ' ' + c
            consDoub += '\n' + str(8 * i +
                                   3) + ' ' + str(8 * i +
                                                  4) + ' ' + c + ' <epsilon>'
            consDoub += '\n' + str(8 * i + 3) + ' 0 <other> <other>'
            consDoub += '\n' + str(8 * i + 4) + ' ' + str(8 * i +
                                                          5) + ' <^> <^>'
            consDoub += '\n' + str(8 * i + 5) + ' ' + str(8 * i + 6) + ' i i'
            consDoub += '\n' + str(8 * i + 5) + ' ' + str(8 * i + 9) + ' e e'
            consDoub += '\n' + str(8 * i + 5) + ' 0 <other> <other>'
            consDoub += '\n' + str(8 * i + 6) + ' ' + str(8 * i + 7) + ' n n'
            consDoub += '\n' + str(8 * i + 7) + ' ' + str(8 * i + 8) + ' g g'
            consDoub += '\n' + str(8 * i + 8) + ' 2 <#> <#>'
            consDoub += '\n' + str(8 * i + 9) + ' ' + str(8 * i + 10) + ' d d'
            consDoub += '\n' + str(8 * i + 9) + ' 0 <other> <other>'
            consDoub += '\n' + str(8 * i + 10) + ' 2 <#> <#>'
        ycompiler = fst.Compiler(isymbols=self.st,
                                 osymbols=self.st,
                                 keep_isymbols=True,
                                 keep_osymbols=True)
        ycompiler.write(yRepl)
        yReplNew = ycompiler.compile()
        fststr.expand_other_symbols(yReplNew)

        kcompiler = fst.Compiler(isymbols=self.st,
                                 osymbols=self.st,
                                 keep_isymbols=True,
                                 keep_osymbols=True)
        kcompiler.write(kIns)
        kInsNew = kcompiler.compile()
        fststr.expand_other_symbols(kInsNew)

        edcompiler = fst.Compiler(isymbols=self.st,
                                  osymbols=self.st,
                                  keep_isymbols=True,
                                  keep_osymbols=True)
        edcompiler.write(eDel)
        eDelNew = edcompiler.compile()
        fststr.expand_other_symbols(eDelNew)

        cdcompiler = fst.Compiler(isymbols=self.st,
                                  osymbols=self.st,
                                  keep_isymbols=True,
                                  keep_osymbols=True)
        cdcompiler.write(consDoub)
        consDoubNew = cdcompiler.compile()
        fststr.expand_other_symbols(consDoubNew)

        AlloFST.union(
            yReplNew.union(
                kInsNew.union(fsteIns.union(eDelNew.union(consDoubNew)))))
        fststr.expand_other_symbols(AlloFST)
        return AlloFST
Пример #21
0
 def lemmatize(self, str):
     """Run ``str`` through ``self.fstOverall`` and return the result.

     ``fststr.expand_other_symbols`` is called on the transducer before
     every application, then the input is handed to ``fststr.apply``.
     The return value is whatever ``fststr.apply`` produces.

     NOTE(review): the parameter name shadows the builtin ``str``; it is
     kept because callers may pass it by keyword.
     """
     transducer = self.fstOverall
     fststr.expand_other_symbols(transducer)
     result = fststr.apply(str, transducer)
     return result
Пример #22
0
 def delemmatize(self, str):
     """Run ``str`` through the inverse of ``self.fstOverall``.

     ``fststr.expand_other_symbols`` is called on the transducer, the
     transducer is inverted, ``fststr.apply`` is run against the inverted
     machine, and the transducer is inverted again so that
     ``self.fstOverall`` is left in its original direction.

     The second inversion runs in a ``finally`` clause: if
     ``fststr.apply`` raises, the shared transducer is still restored
     instead of staying inverted for every later call.

     NOTE(review): this relies on ``invert()`` mutating the FST in place
     and returning it, which is how the original code used it -- confirm
     against the FST library in use.

     The parameter name shadows the builtin ``str``; it is kept because
     callers may pass it by keyword.
     """
     fststr.expand_other_symbols(self.fstOverall)
     inverted = self.fstOverall.invert()
     try:
         return fststr.apply(str, inverted)
     finally:
         # Undo the inversion even on error so self.fstOverall keeps its
         # forward (lemmatizing) direction.
         self.fstOverall.invert()