def buildpreProcessFST_delemmatize(self):
    """Build the preprocessing FST used for delemmatization.

    Starts from a trivial seed FST (single final state, no arcs) and
    unions in every FST described by a local "FST_pre_*" text file.

    Returns:
        The compiled union FST.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: state 1 declared final, no arcs.
    initpres = '1\n'
    print(initpres, file=compiler)
    initFSTpre = compiler.compile()
    fststr.expand_other_symbols(initFSTpre)
    # Every AT&T-format FST description named "FST_pre_*" in the cwd.
    pre_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_pre_")
    ]
    # Compile each file with a fresh compiler and union into the seed.
    for f in pre_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as pre_file:
            print(pre_file.read(), file=compiler)
        pre_FST = compiler.compile()
        fststr.expand_other_symbols(pre_FST)
        initFSTpre = initFSTpre.union(pre_FST)
    return initFSTpre
def consonant_doubling():
    """Build the consonant-doubling allomorphy FST.

    Maps a morpheme boundary <^> after vowel+consonant to a second copy
    of that consonant, then requires the suffix "ed" or "ing" followed
    by <#> (e.g. "stop<^>ed" is rewritten toward "stopped").
    State layout (hand-numbered):
      0 = start/neutral, 1 = just saw a consonant, 2 = just saw a vowel,
      3..len(consonant)+2 = "vowel then consonant c_i" (one per consonant),
      len+3..len+5 = the 'e','d',final chain; len+6..len+8 = 'i','n','g' chain.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
    consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z']
    vowel = ['a', 'e', 'i', 'o', 'u']
    # State 0 is final; <other> and <#> pass through unchanged.
    compiler.write('0\n')
    compiler.write('0 0 <other> <other>\n')
    compiler.write('0 0 <#> <#>\n')
    for v in vowel:
        compiler.write('0 2 ' + v + ' ' + v + '\n')
    for c in consonant:
        compiler.write('0 1 ' + c + ' ' + c + '\n')
        compiler.write('1 1 ' + c + ' ' + c + '\n')
    for v in vowel:
        compiler.write('1 2 ' + v + ' ' + v + '\n')
    # 'i'/'u' after a vowel stay in the vowel state (diphthong-ish case).
    compiler.write('2 2 i i\n')
    compiler.write('2 2 u u\n')
    for i in range(len(consonant)):
        # vowel state + consonant i -> dedicated state i+3
        compiler.write('2 ' + str(i+3) + ' ' + consonant[i] + ' ' + consonant[i] + '\n')
        # <^> is rewritten as a second copy of consonant i, branching to
        # the "ed" chain (len+3) or the "ing" chain (len+6).
        compiler.write(str(i+3) + ' ' + str(len(consonant)+3) + ' ' + '<^>' + ' ' + consonant[i] + '\n')
        compiler.write(str(i+3) + ' ' + str(len(consonant)+6) + ' ' + '<^>' + ' ' + consonant[i] + '\n')
        # Any further consonant/vowel instead of <^> falls back to 1 / 2.
        for c in consonant:
            compiler.write(str(i+3) + ' 1 ' + c + ' ' + c + '\n')
        for v in vowel:
            compiler.write(str(i+3) + ' 2 ' + v + ' ' + v + '\n')
    # Shared suffix chains: "e d <#>" and "i n g <#>" back to state 0.
    compiler.write(str(len(consonant)+3) + ' ' + str(len(consonant)+4) + ' e e' + '\n')
    compiler.write(str(len(consonant)+4) + ' ' + str(len(consonant)+5) + ' d d' + '\n')
    compiler.write(str(len(consonant)+5) + ' 0 <#> <#>' + '\n')
    compiler.write(str(len(consonant)+6) + ' ' + str(len(consonant)+7) + ' i i' + '\n')
    compiler.write(str(len(consonant)+7) + ' ' + str(len(consonant)+8) + ' n n' + '\n')
    compiler.write(str(len(consonant)+8) + ' ' + str(len(consonant)+5) + ' g g' + '\n')
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def buildpreProcessFST(self, curr_str):
    """Compile a linear FST that copies curr_str character by character.

    On the first '+' or '<' the remainder of the string is emitted on a
    single epsilon-input arc; a final <epsilon>:<#> arc is then appended
    and the last state is marked final.
    """
    lines = ['0\n']
    state = 0
    for ch in curr_str:
        if ch == '+' or ch == '<':
            # Tag/marker reached: output the rest of the string at once.
            # Note: state is deliberately NOT advanced here, matching the
            # original behavior.
            lines.append('{} {} <epsilon> {}\n'.format(
                state, state + 1, curr_str[state:len(curr_str)]))
            break
        lines.append('{} {} {} {}\n'.format(state, state + 1, ch, ch))
        state += 1
    # Terminate with an epsilon:<#> arc and declare the target state final.
    lines.append('{} {} <epsilon> <#>\n{}\n'.format(state, state + 1,
                                                    state + 1))
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    print(''.join(lines), file=compiler)
    FSTpre = compiler.compile()
    fststr.expand_other_symbols(FSTpre)
    return FSTpre
def ch_sh_e_insertion():
    """Compile the ch/sh e-insertion allomorphy FST from its spec file.

    Returns:
        The compiled FST described by 'ch_sh_e_insertion.txt'.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('ch_sh_e_insertion.txt') as src:
        print(src.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def y_replacement():
    """Compile the y-replacement allomorphy FST from its spec file.

    Returns:
        The compiled FST described by 'y_replacement.txt'.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('y_replacement.txt') as src:
        print(src.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def del_sharp():
    """Compile an FST that copies input and deletes the <#> end marker.

    State 0 (final) loops on <other>; reading <#> outputs epsilon and
    moves to final state 1.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    spec = ('0\n'
            '0 0 <other> <other>\n'
            '0 1 <#> <epsilon>\n'
            '1\n')
    compiler.write(spec)
    machine = compiler.compile()
    fststr.expand_other_symbols(machine)
    return machine
def get_compiler_from_file_name(self, file_name):
    """Compile and return the FST described by the given AT&T-format file.

    Note: despite the name, this returns the compiled FST, not a compiler.

    Args:
        file_name: path to an AT&T-format FST description.

    Returns:
        The compiled FST with <other> symbols expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Context manager closes the file even if compile() raises
    # (the original closed it manually, leaking on error paths).
    with open(file_name) as in_file:
        print(in_file.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def buildInVocabFST(self):
    """Build the in-vocabulary FST from the verb dictionary.

    Each dictionary line is "lemma,surface,formname"; a linear FST mapping
    surface -> lemma followed by <#>:<epsilon> and <epsilon>:+Known is
    built per line and unioned into a seed FST.

    Returns:
        The union FST over all dictionary entries.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits1 = '0\n'
    print(inits1, file=compiler)
    initFST1 = compiler.compile()
    fststr.expand_other_symbols(initFST1)
    # with-block closes the dictionary file (the original leaked it).
    with open('in_vocab_dictionary_verbs.txt', 'r') as dict_file:
        dict_lines = dict_file.readlines()
    for line in dict_lines:
        # Fields: [0]=lemma form, [1]=surface form, [2]=form name.
        line = line.strip()
        line = line.rstrip(',')
        lineList = line.split(',')
        s = ''
        for i in range(len(lineList[1])):
            try:
                s += '{} {} {} {}\n'.format(i, i + 1, lineList[1][i],
                                            lineList[0][i])
            except IndexError:
                # Lemma is shorter than the surface form: consume the
                # extra surface character, output nothing. (Was a bare
                # except, which also hid real errors.)
                s += '{} {} {} <epsilon>\n'.format(i, i + 1, lineList[1][i])
        s += '{} {} <#> <epsilon>\n'.format(len(lineList[1]),
                                            len(lineList[1]) + 1)
        s += '{} {} <epsilon> +Known\n{}\n'.format(
            len(lineList[1]) + 1, len(lineList[1]) + 2,
            len(lineList[1]) + 2)
        # Union this entry's FST into the accumulator.
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        print(s, file=compiler)
        currFST = compiler.compile()
        fststr.expand_other_symbols(currFST)
        initFST1 = initFST1.union(currFST)
    return initFST1
def general():
    """Compile a cleanup FST that copies symbols and strips <^> markers.

    State 0 (final) loops on <other> and <#>; reading <^> outputs epsilon
    and moves to state 1, which routes back to 0 on <other> or <#>.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    spec = ('0 0 <other> <other>\n'
            '0 0 <#> <#>\n'
            '0\n'
            '0 1 <^> <epsilon>\n'
            '1 0 <other> <other>\n'
            '1 0 <#> <#>\n')
    compiler.write(spec)
    machine = compiler.compile()
    fststr.expand_other_symbols(machine)
    return machine
def get_morphotactics():
    """Compile the morphotactics FST: +Guess expands to <^> plus one of
    the verbal suffixes ('', 's', 'ed', 'en', 'ing') followed by <#>.

    Built as the union of one small linear FST per suffix.
    """
    suffix = ['', 's', 'ed', 'en', 'ing']
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Empty seed FST to union the per-suffix machines into.
    c = compiler.compile()
    for s in suffix:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        spec = ['0 0 <other> <other>\n', '0 1 +Guess <^>\n']
        # Emit the suffix one character at a time on epsilon inputs.
        for i, ch in enumerate(s):
            spec.append('{} {} <epsilon> {}\n'.format(i + 1, i + 2, ch))
        n = len(s)
        spec.append('{} {} <epsilon> <#>\n'.format(n + 1, n + 2))
        spec.append(str(n + 2))  # final state (no trailing newline)
        compiler.write(''.join(spec))
        c = c.union(compiler.compile())
    fststr.expand_other_symbols(c)
    return c
def buildAllomFST(self):
    """Build the allomorphy FST as the union of all "FST_allom_*" files.

    Returns:
        The compiled union FST (seeded with a trivial one-state FST).
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits3 = '0\n'
    print(inits3, file=compiler)
    initFST3 = compiler.compile()
    fststr.expand_other_symbols(initFST3)
    # All AT&T-format allomorphy rule files in the cwd.
    allom_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_allom_")
    ]
    for f in allom_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as allom_file:
            print(allom_file.read(), file=compiler)
        allom_FST = compiler.compile()
        fststr.expand_other_symbols(allom_FST)
        initFST3 = initFST3.union(allom_FST)
    return initFST3
def buildMorphFST(self):
    """Build the morphotactics FST as the union of all "FST_morph_*" files.

    Returns:
        The compiled union FST (seeded with a trivial one-state FST).
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits2 = '0\n'
    print(inits2, file=compiler)
    initFST2 = compiler.compile()
    fststr.expand_other_symbols(initFST2)
    # All AT&T-format morphotactics rule files in the cwd.
    morph_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_morph_")
    ]
    for f in morph_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as morph_file:
            print(morph_file.read(), file=compiler)
        morph_FST = compiler.compile()
        fststr.expand_other_symbols(morph_FST)
        initFST2 = initFST2.union(morph_FST)
    return initFST2
def __init__(self): self.st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) # create preprocessing FST compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write('0 \n 0 0 <other> <other> \n0 0 <epsilon> <#>') preprocessFST = compiler.compile() fststr.expand_other_symbols(preprocessFST) # gets subFSTs for in-vocab FST self.inVocabFile = open("in_vocab_dictionary_verbs.txt") inVocabFST = self.getInVocabFST() morphoFST = self.getMorphoFST() alloFST = self.getAlloFST() # creates out-of-vocab FST compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write( '\n0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n0 2 <#> +Guess \n1 1 <other> <epsilon> \n1 2 <#> +Guess \n2' ) oovPostFST = compiler.compile() fststr.expand_other_symbols(oovPostFST) temp = alloFST.union(oovPostFST) oovFST = fst.compose(morphoFST.arcsort(sort_type="olabel"), temp.arcsort(sort_type="ilabel")) oovFST = fst.compose(oovFST.arcsort(sort_type="olabel"), oovPostFST.arcsort(sort_type="ilabel")) # creates overall FST as union of each of the sub-FSTs self.fstOverall = fst.compose( preprocessFST.arcsort(sort_type="olabel"), inVocabFST.union(oovFST).arcsort(sort_type="ilabel"))
from fststr import fststr
import pywrapfst as fst

# Build the e-insertion FST from its AT&T-format description.
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st, osymbols=st,
                        keep_isymbols=True, keep_osymbols=True)
# with-block closes the handle (the original leaked it)
with open('e-insertion.txt') as fst_source:
    print(fst_source.read(), file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Smoke-test: apply the FST to a boundary-marked input.
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
def get_in_vocab_fst(self):
    """Build the in-vocabulary lemmatizer FST from the verb dictionary.

    For each (lemma, inflected-form) pair, a linear FST mapping
    form -> lemma followed by <epsilon>:+Known is composed with a
    <#>-deleting copier, and the result is unioned into the accumulator.

    Returns:
        The union FST over all dictionary entries.
    """
    alphabet = fststr.EN_SYMB
    st = fststr.symbols_table_from_alphabet(
        alphabet)  # <class 'pywrapfst.SymbolTable'>
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Empty seed FST; entries are unioned in below.
    in_vocab_fst = compiler.compile()
    lemma = []
    verb_in_form = []
    with open('in_vocab_dictionary_verbs.txt', 'r') as f:
        for line in f.readlines():
            lemma.append(line.split(',')[0])
            verb_in_form.append(line.split(',')[1])
    for idx in range(len(lemma)):
        # a_fst: copies <other> symbols and deletes the trailing <#>.
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        a_fst = compiler.compile()
        fststr.expand_other_symbols(a_fst)
        lemma_word = lemma[idx]
        form_word = verb_in_form[idx]
        # b_fst: one linear ladder mapping form -> lemma. The shorter of
        # the two strings is padded with <epsilon>, which unifies the two
        # near-identical branches of the original (same arcs emitted).
        n = max(len(form_word), len(lemma_word))
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        for i in range(n):
            isym = form_word[i] if i < len(form_word) else '<epsilon>'
            osym = lemma_word[i] if i < len(lemma_word) else '<epsilon>'
            compiler.write(str(i) + ' ' + str(i + 1) + ' ' + isym + ' ' +
                           osym + '\n')
        compiler.write(str(n) + ' ' + str(n + 1) + ' <epsilon> +Known\n')
        compiler.write(str(n + 1))
        b_fst = compiler.compile()
        fststr.expand_other_symbols(b_fst)
        c_fst = fst.compose(a_fst.arcsort(sort_type="olabel"),
                            b_fst.arcsort(sort_type="ilabel"))
        # pywrapfst union mutates in_vocab_fst in place.
        in_vocab_fst.union(c_fst)
    return in_vocab_fst
def get_in_vocab_fst(self):
    """Read the verb dictionary and delegate FST construction to
    self.generateFst, returning its result."""
    with open("in_vocab_dictionary_verbs.txt", "r") as f:
        data = f.read()
    symbol_table = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    return self.generateFst(data, symbol_table)
def buildpostProcessFST(self, input_str):
    """Build the postprocessing FST for a given input string.

    Unions every "FST_post_*" file with a string-specific FST that
    accepts input_str unchanged (the already-lemma case, tagged +Guess),
    then composes the result with the final-clearance FST which drops
    outputs still ending in <#>.

    Args:
        input_str: the surface string being lemmatized.

    Returns:
        The composed postprocessing FST.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    initposts = '0\n'
    print(initposts, file=compiler)
    initFSTpost = compiler.compile()
    fststr.expand_other_symbols(initFSTpost)
    post_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_post_")
    ]
    for f in post_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as post_file:
            print(post_file.read(), file=compiler)
        post_FST = compiler.compile()
        fststr.expand_other_symbols(post_FST)
        initFSTpost = initFSTpost.union(post_FST)
    # FST accepting input_str as-is (input already in original form).
    s = ''
    tracker = 0
    for i in range(len(input_str)):
        if (input_str[i] == '+'):
            s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1,
                                                 tracker + 1)
            tracker += 1
            break
        else:
            s += '{} {} {} {}\n'.format(tracker, tracker + 1,
                                        input_str[tracker],
                                        input_str[tracker])
            tracker += 1
    # Rewrite the trailing <#> as +Guess.
    # NOTE(review): when a '+' was found above, this appends a second
    # <#>:+Guess arc from the incremented state — preserved as in the
    # original; confirm this is intentional.
    s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1, tracker + 1)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    print(s, file=compiler)
    original_case_FST = compiler.compile()
    fststr.expand_other_symbols(original_case_FST)
    initFSTpost = initFSTpost.union(original_case_FST)
    # Final clearance: drop words still ending in <#>, keep +Guess/+Known.
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('FST_finalclearance.txt') as clear_file:
        print(clear_file.read(), file=compiler)
    clear_FST = compiler.compile()
    fststr.expand_other_symbols(clear_FST)
    lastFST = fst.compose(initFSTpost.arcsort(sort_type="olabel"),
                          clear_FST.arcsort(sort_type="ilabel"))
    return lastFST
class Lemmatizer:
    """FST-based lemmatizer / delemmatizer.

    The class body builds the rule FSTs once at class-definition time:
    an in-vocabulary transducer from the dictionary file, plus an
    out-of-vocabulary chain assembled from the allomorphy rules below.
    `rule` lemmatizes; `de_rule` delemmatizes (inverted direction).
    """

    def e_insertion():
        """Allomorphy FST loaded from 'e-insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open('e-insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def k_insertion():
        """Allomorphy FST loaded from 'k-insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('k-insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def get_morphotactics():
        """Union of suffix FSTs: +Guess -> <^> + ('', s, ed, en, ing) + <#>."""
        suffix = ['', 's', 'ed', 'en', 'ing']
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        c = compiler.compile()  # empty seed FST
        for s in suffix:
            compiler = fst.Compiler(isymbols=st, osymbols=st,
                                    keep_isymbols=True, keep_osymbols=True)
            compiler.write('0 0 <other> <other>\n')
            compiler.write('0 1 +Guess <^>\n')
            l = len(s)
            # Emit the suffix one character at a time on epsilon inputs.
            for i in range(l):
                compiler.write(str(i+1) + ' ' + str(i+2) + ' <epsilon> '
                               + s[i] + '\n')
            compiler.write(str(l+1) + ' ' + str(l+2) + ' <epsilon> <#>\n')
            compiler.write(str(l+2))
            suffix_rule = compiler.compile()
            c = c.union(suffix_rule)
        fststr.expand_other_symbols(c)
        return c

    def general():
        """Cleanup FST: copies symbols/<#> through and strips <^>."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        compiler.write('0\n')
        compiler.write('0 1 <^> <epsilon>\n')
        compiler.write('1 0 <other> <other>\n')
        compiler.write('1 0 <#> <#>\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def e_deletion():
        """Allomorphy FST loaded from 'silent-e-deletion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('silent-e-deletion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def ch_sh_e_insertion():
        """Allomorphy FST loaded from 'ch_sh_e_insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('ch_sh_e_insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def y_replacement():
        """Allomorphy FST loaded from 'y_replacement.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('y_replacement.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def del_sharp():
        """FST that copies input and deletes the <#> end marker."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def consonant_doubling():
        """Consonant-doubling FST: <^> after vowel+consonant is rewritten
        as a second copy of that consonant, followed by 'ed'/'ing' + <#>.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                     'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z']
        vowel = ['a', 'e', 'i', 'o', 'u']
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        for v in vowel:
            compiler.write('0 2 ' + v + ' ' + v + '\n')
        for c in consonant:
            compiler.write('0 1 ' + c + ' ' + c + '\n')
            compiler.write('1 1 ' + c + ' ' + c + '\n')
        for v in vowel:
            compiler.write('1 2 ' + v + ' ' + v + '\n')
        compiler.write('2 2 i i\n')
        compiler.write('2 2 u u\n')
        for i in range(len(consonant)):
            # vowel state + consonant i -> dedicated state i+3; <^> there
            # doubles the consonant and branches to the ed/ing chains.
            compiler.write('2 ' + str(i+3) + ' ' + consonant[i] + ' '
                           + consonant[i] + '\n')
            compiler.write(str(i+3) + ' ' + str(len(consonant)+3) + ' '
                           + '<^>' + ' ' + consonant[i] + '\n')
            compiler.write(str(i+3) + ' ' + str(len(consonant)+6) + ' '
                           + '<^>' + ' ' + consonant[i] + '\n')
            for c in consonant:
                compiler.write(str(i+3) + ' 1 ' + c + ' ' + c + '\n')
            for v in vowel:
                compiler.write(str(i+3) + ' 2 ' + v + ' ' + v + '\n')
        # Shared suffix chains "e d <#>" and "i n g <#>" back to state 0.
        compiler.write(str(len(consonant)+3) + ' ' + str(len(consonant)+4)
                       + ' e e' + '\n')
        compiler.write(str(len(consonant)+4) + ' ' + str(len(consonant)+5)
                       + ' d d' + '\n')
        compiler.write(str(len(consonant)+5) + ' 0 <#> <#>' + '\n')
        compiler.write(str(len(consonant)+6) + ' ' + str(len(consonant)+7)
                       + ' i i' + '\n')
        compiler.write(str(len(consonant)+7) + ' ' + str(len(consonant)+8)
                       + ' n n' + '\n')
        compiler.write(str(len(consonant)+8) + ' ' + str(len(consonant)+5)
                       + ' g g' + '\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    # ---- class-level construction of the lemmatization rules ----
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    lemma = []
    allomorphy = []
    with open("in_vocab_dictionary_verbs.txt", "r") as f:
        for line in f.readlines():
            lemma.append(line.split(',')[0])
            allomorphy.append(line.split(',')[1])
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    rule = compiler.compile()  # empty seed; in-vocab pairs unioned below
    for index in range(len(lemma)):
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        if len(allomorphy[index]) >= len(lemma[index]):
            for i in range(len(allomorphy[index])):
                if i < len(lemma[index]):
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i] + ' '
                                   + lemma[index][i] + '\n')
                else:
                    # surface longer than lemma: consume, output nothing
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i]
                                   + ' <epsilon>' + '\n')
            l = len(allomorphy[index])
            compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>'
                           + ' +Known\n')
            compiler.write(str(l+1))
            rule.union(compiler.compile())  # in-place union
        else:
            for i in range(len(lemma[index])):
                if i < len(allomorphy[index]):
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i] + ' '
                                   + lemma[index][i] + '\n')
                else:
                    # lemma longer than surface: emit on epsilon input
                    compiler.write(str(i) + ' ' + str(i+1) + ' <epsilon>'
                                   + ' ' + lemma[index][i] + '\n')
            l = len(lemma[index])
            compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>'
                           + ' +Known\n')
            compiler.write(str(l+1))
            rule.union(compiler.compile())  # in-place union
    # Delemmatization for in-vocab words is the inverse transducer.
    de_iv_rule = rule.copy().invert()
    morphotactics_rule = get_morphotactics()
    e_insertion_rule = e_insertion()
    k_insertion_rule = k_insertion()
    e_deletion_rule = e_deletion()
    general_rule = general()
    ch_sh_e_insertion_rule = ch_sh_e_insertion()
    y_replacement_rule = y_replacement()
    del_sharp_rule = del_sharp()
    consonant_doubling_rule = consonant_doubling()
    # OOV chain: morphotactics o (union of allomorphy rules) o del_sharp.
    new_rule = (k_insertion_rule.union(e_insertion_rule)
                .union(general_rule).union(e_deletion_rule)
                .union(ch_sh_e_insertion_rule).union(y_replacement_rule)
                .union(consonant_doubling_rule))
    de_oov = fst.compose(morphotactics_rule.arcsort(sort_type="olabel"),
                         new_rule.arcsort(sort_type="ilabel"))
    de_oov_rule = fst.compose(de_oov.arcsort(sort_type="olabel"),
                              del_sharp_rule.arcsort(sort_type="ilabel"))
    oov_rule = de_oov_rule.copy().invert()
    # Final rules for the lemmatizer and delemmatizer.
    rule = rule.union(oov_rule)
    de_rule = de_iv_rule.union(de_oov_rule)

    def lemmatize(self, in_str):
        """Return the set of candidate lemma analyses for in_str."""
        out_set = set()
        for i in fststr.apply(in_str, self.rule):
            out_set.add(i)
        # A suffixed input should not also be guessed as its own lemma.
        # endswith avoids the original's IndexError on 1-char input, and
        # discard (not remove) avoids KeyError when the candidate is absent.
        if (in_str.endswith(('ing', 'ed', 'en'))
                or (in_str.endswith('s') and not in_str.endswith('ss'))):
            out_set.discard(in_str + '+Guess')
        return out_set

    def delemmatize(self, in_str):
        """Return the set of surface forms generated from lemma in_str."""
        out_set = set()
        for i in fststr.apply(in_str, self.de_rule):
            out_set.add(i)
        return out_set
def __init__(self):
    # get the symbol table
    # NOTE(review): the table is bound to a local and immediately
    # discarded — nothing is stored on the instance. Presumably this was
    # meant to be self.st; confirm against the methods that use it.
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    return