def buildpreProcessFST_delemmatize(self):
    """Build the preprocessing FST used for delemmatization.

    Starts from a trivial seed FST (single final state, no arcs) and
    unions in every FST described by a local "FST_pre_*" text file.

    Returns:
        The compiled union FST.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: state 1 declared final, no arcs.
    initpres = '1\n'
    print(initpres, file=compiler)
    initFSTpre = compiler.compile()
    fststr.expand_other_symbols(initFSTpre)
    # Every AT&T-format FST description named "FST_pre_*" in the cwd.
    pre_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_pre_")
    ]
    # Compile each file with a fresh compiler and union into the seed.
    for f in pre_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as pre_file:
            print(pre_file.read(), file=compiler)
        pre_FST = compiler.compile()
        fststr.expand_other_symbols(pre_FST)
        initFSTpre = initFSTpre.union(pre_FST)
    return initFSTpre
def consonant_doubling():
    """Build the consonant-doubling allomorphy FST.

    Maps a morpheme boundary <^> after vowel+consonant to a second copy
    of that consonant, then requires the suffix "ed" or "ing" followed
    by <#> (e.g. "stop<^>ed" is rewritten toward "stopped").
    State layout (hand-numbered):
      0 = start/neutral, 1 = just saw a consonant, 2 = just saw a vowel,
      3..len(consonant)+2 = "vowel then consonant c_i" (one per consonant),
      len+3..len+5 = the 'e','d',final chain; len+6..len+8 = 'i','n','g' chain.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
    consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z']
    vowel = ['a', 'e', 'i', 'o', 'u']
    # State 0 is final; <other> and <#> pass through unchanged.
    compiler.write('0\n')
    compiler.write('0 0 <other> <other>\n')
    compiler.write('0 0 <#> <#>\n')
    for v in vowel:
        compiler.write('0 2 ' + v + ' ' + v + '\n')
    for c in consonant:
        compiler.write('0 1 ' + c + ' ' + c + '\n')
        compiler.write('1 1 ' + c + ' ' + c + '\n')
    for v in vowel:
        compiler.write('1 2 ' + v + ' ' + v + '\n')
    # 'i'/'u' after a vowel stay in the vowel state (diphthong-ish case).
    compiler.write('2 2 i i\n')
    compiler.write('2 2 u u\n')
    for i in range(len(consonant)):
        # vowel state + consonant i -> dedicated state i+3
        compiler.write('2 ' + str(i+3) + ' ' + consonant[i] + ' ' + consonant[i] + '\n')
        # <^> is rewritten as a second copy of consonant i, branching to
        # the "ed" chain (len+3) or the "ing" chain (len+6).
        compiler.write(str(i+3) + ' ' + str(len(consonant)+3) + ' ' + '<^>' + ' ' + consonant[i] + '\n')
        compiler.write(str(i+3) + ' ' + str(len(consonant)+6) + ' ' + '<^>' + ' ' + consonant[i] + '\n')
        # Any further consonant/vowel instead of <^> falls back to 1 / 2.
        for c in consonant:
            compiler.write(str(i+3) + ' 1 ' + c + ' ' + c + '\n')
        for v in vowel:
            compiler.write(str(i+3) + ' 2 ' + v + ' ' + v + '\n')
    # Shared suffix chains: "e d <#>" and "i n g <#>" back to state 0.
    compiler.write(str(len(consonant)+3) + ' ' + str(len(consonant)+4) + ' e e' + '\n')
    compiler.write(str(len(consonant)+4) + ' ' + str(len(consonant)+5) + ' d d' + '\n')
    compiler.write(str(len(consonant)+5) + ' 0 <#> <#>' + '\n')
    compiler.write(str(len(consonant)+6) + ' ' + str(len(consonant)+7) + ' i i' + '\n')
    compiler.write(str(len(consonant)+7) + ' ' + str(len(consonant)+8) + ' n n' + '\n')
    compiler.write(str(len(consonant)+8) + ' ' + str(len(consonant)+5) + ' g g' + '\n')
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def buildpreProcessFST(self, curr_str):
    """Compile a linear FST that copies curr_str character by character.

    On the first '+' or '<' the remainder of the string is emitted on a
    single epsilon-input arc; a final <epsilon>:<#> arc is then appended
    and the last state is marked final.
    """
    lines = ['0\n']
    state = 0
    for ch in curr_str:
        if ch == '+' or ch == '<':
            # Tag/marker reached: output the rest of the string at once.
            # Note: state is deliberately NOT advanced here, matching the
            # original behavior.
            lines.append('{} {} <epsilon> {}\n'.format(
                state, state + 1, curr_str[state:len(curr_str)]))
            break
        lines.append('{} {} {} {}\n'.format(state, state + 1, ch, ch))
        state += 1
    # Terminate with an epsilon:<#> arc and declare the target state final.
    lines.append('{} {} <epsilon> <#>\n{}\n'.format(state, state + 1,
                                                    state + 1))
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    print(''.join(lines), file=compiler)
    FSTpre = compiler.compile()
    fststr.expand_other_symbols(FSTpre)
    return FSTpre
def ch_sh_e_insertion():
    """Compile the ch/sh e-insertion allomorphy FST from its spec file.

    Returns:
        The compiled FST described by 'ch_sh_e_insertion.txt'.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('ch_sh_e_insertion.txt') as src:
        print(src.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def y_replacement():
    """Compile the y-replacement allomorphy FST from its spec file.

    Returns:
        The compiled FST described by 'y_replacement.txt'.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('y_replacement.txt') as src:
        print(src.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def del_sharp():
    """Compile an FST that copies input and deletes the <#> end marker.

    State 0 (final) loops on <other>; reading <#> outputs epsilon and
    moves to final state 1.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    spec = ('0\n'
            '0 0 <other> <other>\n'
            '0 1 <#> <epsilon>\n'
            '1\n')
    compiler.write(spec)
    machine = compiler.compile()
    fststr.expand_other_symbols(machine)
    return machine
def get_compiler_from_file_name(self, file_name):
    """Compile and return the FST described by the given AT&T-format file.

    Note: despite the name, this returns the compiled FST, not a compiler.

    Args:
        file_name: path to an AT&T-format FST description.

    Returns:
        The compiled FST with <other> symbols expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Context manager closes the file even if compile() raises
    # (the original closed it manually, leaking on error paths).
    with open(file_name) as in_file:
        print(in_file.read(), file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def buildInVocabFST(self):
    """Build the in-vocabulary FST from the verb dictionary.

    Each dictionary line is "lemma,surface,formname"; a linear FST mapping
    surface -> lemma followed by <#>:<epsilon> and <epsilon>:+Known is
    built per line and unioned into a seed FST.

    Returns:
        The union FST over all dictionary entries.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits1 = '0\n'
    print(inits1, file=compiler)
    initFST1 = compiler.compile()
    fststr.expand_other_symbols(initFST1)
    # with-block closes the dictionary file (the original leaked it).
    with open('in_vocab_dictionary_verbs.txt', 'r') as dict_file:
        dict_lines = dict_file.readlines()
    for line in dict_lines:
        # Fields: [0]=lemma form, [1]=surface form, [2]=form name.
        line = line.strip()
        line = line.rstrip(',')
        lineList = line.split(',')
        s = ''
        for i in range(len(lineList[1])):
            try:
                s += '{} {} {} {}\n'.format(i, i + 1, lineList[1][i],
                                            lineList[0][i])
            except IndexError:
                # Lemma is shorter than the surface form: consume the
                # extra surface character, output nothing. (Was a bare
                # except, which also hid real errors.)
                s += '{} {} {} <epsilon>\n'.format(i, i + 1, lineList[1][i])
        s += '{} {} <#> <epsilon>\n'.format(len(lineList[1]),
                                            len(lineList[1]) + 1)
        s += '{} {} <epsilon> +Known\n{}\n'.format(
            len(lineList[1]) + 1, len(lineList[1]) + 2,
            len(lineList[1]) + 2)
        # Union this entry's FST into the accumulator.
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        print(s, file=compiler)
        currFST = compiler.compile()
        fststr.expand_other_symbols(currFST)
        initFST1 = initFST1.union(currFST)
    return initFST1
def general():
    """Compile a cleanup FST that copies symbols and strips <^> markers.

    State 0 (final) loops on <other> and <#>; reading <^> outputs epsilon
    and moves to state 1, which routes back to 0 on <other> or <#>.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    spec = ('0 0 <other> <other>\n'
            '0 0 <#> <#>\n'
            '0\n'
            '0 1 <^> <epsilon>\n'
            '1 0 <other> <other>\n'
            '1 0 <#> <#>\n')
    compiler.write(spec)
    machine = compiler.compile()
    fststr.expand_other_symbols(machine)
    return machine
def get_morphotactics():
    """Compile the morphotactics FST: +Guess expands to <^> plus one of
    the verbal suffixes ('', 's', 'ed', 'en', 'ing') followed by <#>.

    Built as the union of one small linear FST per suffix.
    """
    suffix = ['', 's', 'ed', 'en', 'ing']
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Empty seed FST to union the per-suffix machines into.
    c = compiler.compile()
    for s in suffix:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        spec = ['0 0 <other> <other>\n', '0 1 +Guess <^>\n']
        # Emit the suffix one character at a time on epsilon inputs.
        for i, ch in enumerate(s):
            spec.append('{} {} <epsilon> {}\n'.format(i + 1, i + 2, ch))
        n = len(s)
        spec.append('{} {} <epsilon> <#>\n'.format(n + 1, n + 2))
        spec.append(str(n + 2))  # final state (no trailing newline)
        compiler.write(''.join(spec))
        c = c.union(compiler.compile())
    fststr.expand_other_symbols(c)
    return c
def buildAllomFST(self):
    """Build the allomorphy FST as the union of all "FST_allom_*" files.

    Returns:
        The compiled union FST (seeded with a trivial one-state FST).
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits3 = '0\n'
    print(inits3, file=compiler)
    initFST3 = compiler.compile()
    fststr.expand_other_symbols(initFST3)
    # All AT&T-format allomorphy rule files in the cwd.
    allom_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_allom_")
    ]
    for f in allom_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as allom_file:
            print(allom_file.read(), file=compiler)
        allom_FST = compiler.compile()
        fststr.expand_other_symbols(allom_FST)
        initFST3 = initFST3.union(allom_FST)
    return initFST3
def buildMorphFST(self):
    """Build the morphotactics FST as the union of all "FST_morph_*" files.

    Returns:
        The compiled union FST (seeded with a trivial one-state FST).
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    inits2 = '0\n'
    print(inits2, file=compiler)
    initFST2 = compiler.compile()
    fststr.expand_other_symbols(initFST2)
    # All AT&T-format morphotactics rule files in the cwd.
    morph_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_morph_")
    ]
    for f in morph_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as morph_file:
            print(morph_file.read(), file=compiler)
        morph_FST = compiler.compile()
        fststr.expand_other_symbols(morph_FST)
        initFST2 = initFST2.union(morph_FST)
    return initFST2
def __init__(self): self.st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) # create preprocessing FST compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write('0 \n 0 0 <other> <other> \n0 0 <epsilon> <#>') preprocessFST = compiler.compile() fststr.expand_other_symbols(preprocessFST) # gets subFSTs for in-vocab FST self.inVocabFile = open("in_vocab_dictionary_verbs.txt") inVocabFST = self.getInVocabFST() morphoFST = self.getMorphoFST() alloFST = self.getAlloFST() # creates out-of-vocab FST compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write( '\n0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n0 2 <#> +Guess \n1 1 <other> <epsilon> \n1 2 <#> +Guess \n2' ) oovPostFST = compiler.compile() fststr.expand_other_symbols(oovPostFST) temp = alloFST.union(oovPostFST) oovFST = fst.compose(morphoFST.arcsort(sort_type="olabel"), temp.arcsort(sort_type="ilabel")) oovFST = fst.compose(oovFST.arcsort(sort_type="olabel"), oovPostFST.arcsort(sort_type="ilabel")) # creates overall FST as union of each of the sub-FSTs self.fstOverall = fst.compose( preprocessFST.arcsort(sort_type="olabel"), inVocabFST.union(oovFST).arcsort(sort_type="ilabel"))
from fststr import fststr
import pywrapfst as fst

# Build the e-insertion FST from its AT&T-format description.
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st, osymbols=st,
                        keep_isymbols=True, keep_osymbols=True)
# with-block closes the handle (the original leaked it)
with open('e-insertion.txt') as fst_source:
    print(fst_source.read(), file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Smoke-test: apply the FST to a boundary-marked input.
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
def get_in_vocab_fst(self):
    """Build the in-vocabulary lemmatizer FST from the verb dictionary.

    For each (lemma, inflected-form) pair, a linear FST mapping
    form -> lemma followed by <epsilon>:+Known is composed with a
    <#>-deleting copier, and the result is unioned into the accumulator.

    Returns:
        The union FST over all dictionary entries.
    """
    alphabet = fststr.EN_SYMB
    st = fststr.symbols_table_from_alphabet(
        alphabet)  # <class 'pywrapfst.SymbolTable'>
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Empty seed FST; entries are unioned in below.
    in_vocab_fst = compiler.compile()
    lemma = []
    verb_in_form = []
    with open('in_vocab_dictionary_verbs.txt', 'r') as f:
        for line in f.readlines():
            lemma.append(line.split(',')[0])
            verb_in_form.append(line.split(',')[1])
    for idx in range(len(lemma)):
        # a_fst: copies <other> symbols and deletes the trailing <#>.
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        a_fst = compiler.compile()
        fststr.expand_other_symbols(a_fst)
        lemma_word = lemma[idx]
        form_word = verb_in_form[idx]
        # b_fst: one linear ladder mapping form -> lemma. The shorter of
        # the two strings is padded with <epsilon>, which unifies the two
        # near-identical branches of the original (same arcs emitted).
        n = max(len(form_word), len(lemma_word))
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        for i in range(n):
            isym = form_word[i] if i < len(form_word) else '<epsilon>'
            osym = lemma_word[i] if i < len(lemma_word) else '<epsilon>'
            compiler.write(str(i) + ' ' + str(i + 1) + ' ' + isym + ' ' +
                           osym + '\n')
        compiler.write(str(n) + ' ' + str(n + 1) + ' <epsilon> +Known\n')
        compiler.write(str(n + 1))
        b_fst = compiler.compile()
        fststr.expand_other_symbols(b_fst)
        c_fst = fst.compose(a_fst.arcsort(sort_type="olabel"),
                            b_fst.arcsort(sort_type="ilabel"))
        # pywrapfst union mutates in_vocab_fst in place.
        in_vocab_fst.union(c_fst)
    return in_vocab_fst
def get_in_vocab_fst(self):
    """Read the verb dictionary and delegate FST construction to
    self.generateFst, returning its result."""
    with open("in_vocab_dictionary_verbs.txt", "r") as f:
        data = f.read()
    symbol_table = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    return self.generateFst(data, symbol_table)
def buildpostProcessFST(self, input_str):
    """Build the postprocessing FST for a given input string.

    Unions every "FST_post_*" file with a string-specific FST that
    accepts input_str unchanged (the already-lemma case, tagged +Guess),
    then composes the result with the final-clearance FST which drops
    outputs still ending in <#>.

    Args:
        input_str: the surface string being lemmatized.

    Returns:
        The composed postprocessing FST.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Seed: single final state 0, no arcs.
    initposts = '0\n'
    print(initposts, file=compiler)
    initFSTpost = compiler.compile()
    fststr.expand_other_symbols(initFSTpost)
    post_files = [
        filename for filename in os.listdir('.')
        if filename.startswith("FST_post_")
    ]
    for f in post_files:
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open(f) as post_file:
            print(post_file.read(), file=compiler)
        post_FST = compiler.compile()
        fststr.expand_other_symbols(post_FST)
        initFSTpost = initFSTpost.union(post_FST)
    # FST accepting input_str as-is (input already in original form).
    s = ''
    tracker = 0
    for i in range(len(input_str)):
        if (input_str[i] == '+'):
            s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1,
                                                 tracker + 1)
            tracker += 1
            break
        else:
            s += '{} {} {} {}\n'.format(tracker, tracker + 1,
                                        input_str[tracker],
                                        input_str[tracker])
            tracker += 1
    # Rewrite the trailing <#> as +Guess.
    # NOTE(review): when a '+' was found above, this appends a second
    # <#>:+Guess arc from the incremented state — preserved as in the
    # original; confirm this is intentional.
    s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1, tracker + 1)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    print(s, file=compiler)
    original_case_FST = compiler.compile()
    fststr.expand_other_symbols(original_case_FST)
    initFSTpost = initFSTpost.union(original_case_FST)
    # Final clearance: drop words still ending in <#>, keep +Guess/+Known.
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # with-block closes the handle (the original leaked it)
    with open('FST_finalclearance.txt') as clear_file:
        print(clear_file.read(), file=compiler)
    clear_FST = compiler.compile()
    fststr.expand_other_symbols(clear_FST)
    lastFST = fst.compose(initFSTpost.arcsort(sort_type="olabel"),
                          clear_FST.arcsort(sort_type="ilabel"))
    return lastFST
class Lemmatizer:
    """FST-based lemmatizer / delemmatizer.

    The class body builds the rule FSTs once at class-definition time:
    an in-vocabulary transducer from the dictionary file, plus an
    out-of-vocabulary chain assembled from the allomorphy rules below.
    `rule` lemmatizes; `de_rule` delemmatizes (inverted direction).
    """

    def e_insertion():
        """Allomorphy FST loaded from 'e-insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        # with-block closes the handle (the original leaked it)
        with open('e-insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def k_insertion():
        """Allomorphy FST loaded from 'k-insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('k-insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def get_morphotactics():
        """Union of suffix FSTs: +Guess -> <^> + ('', s, ed, en, ing) + <#>."""
        suffix = ['', 's', 'ed', 'en', 'ing']
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        c = compiler.compile()  # empty seed FST
        for s in suffix:
            compiler = fst.Compiler(isymbols=st, osymbols=st,
                                    keep_isymbols=True, keep_osymbols=True)
            compiler.write('0 0 <other> <other>\n')
            compiler.write('0 1 +Guess <^>\n')
            l = len(s)
            # Emit the suffix one character at a time on epsilon inputs.
            for i in range(l):
                compiler.write(str(i+1) + ' ' + str(i+2) + ' <epsilon> '
                               + s[i] + '\n')
            compiler.write(str(l+1) + ' ' + str(l+2) + ' <epsilon> <#>\n')
            compiler.write(str(l+2))
            suffix_rule = compiler.compile()
            c = c.union(suffix_rule)
        fststr.expand_other_symbols(c)
        return c

    def general():
        """Cleanup FST: copies symbols/<#> through and strips <^>."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        compiler.write('0\n')
        compiler.write('0 1 <^> <epsilon>\n')
        compiler.write('1 0 <other> <other>\n')
        compiler.write('1 0 <#> <#>\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def e_deletion():
        """Allomorphy FST loaded from 'silent-e-deletion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('silent-e-deletion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def ch_sh_e_insertion():
        """Allomorphy FST loaded from 'ch_sh_e_insertion.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('ch_sh_e_insertion.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def y_replacement():
        """Allomorphy FST loaded from 'y_replacement.txt'."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        with open('y_replacement.txt') as src:
            print(src.read(), file=compiler)
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def del_sharp():
        """FST that copies input and deletes the <#> end marker."""
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 <#> <epsilon>\n')
        compiler.write('1\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    def consonant_doubling():
        """Consonant-doubling FST: <^> after vowel+consonant is rewritten
        as a second copy of that consonant, followed by 'ed'/'ing' + <#>.
        """
        st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                     'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z']
        vowel = ['a', 'e', 'i', 'o', 'u']
        compiler.write('0\n')
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 0 <#> <#>\n')
        for v in vowel:
            compiler.write('0 2 ' + v + ' ' + v + '\n')
        for c in consonant:
            compiler.write('0 1 ' + c + ' ' + c + '\n')
            compiler.write('1 1 ' + c + ' ' + c + '\n')
        for v in vowel:
            compiler.write('1 2 ' + v + ' ' + v + '\n')
        compiler.write('2 2 i i\n')
        compiler.write('2 2 u u\n')
        for i in range(len(consonant)):
            # vowel state + consonant i -> dedicated state i+3; <^> there
            # doubles the consonant and branches to the ed/ing chains.
            compiler.write('2 ' + str(i+3) + ' ' + consonant[i] + ' '
                           + consonant[i] + '\n')
            compiler.write(str(i+3) + ' ' + str(len(consonant)+3) + ' '
                           + '<^>' + ' ' + consonant[i] + '\n')
            compiler.write(str(i+3) + ' ' + str(len(consonant)+6) + ' '
                           + '<^>' + ' ' + consonant[i] + '\n')
            for c in consonant:
                compiler.write(str(i+3) + ' 1 ' + c + ' ' + c + '\n')
            for v in vowel:
                compiler.write(str(i+3) + ' 2 ' + v + ' ' + v + '\n')
        # Shared suffix chains "e d <#>" and "i n g <#>" back to state 0.
        compiler.write(str(len(consonant)+3) + ' ' + str(len(consonant)+4)
                       + ' e e' + '\n')
        compiler.write(str(len(consonant)+4) + ' ' + str(len(consonant)+5)
                       + ' d d' + '\n')
        compiler.write(str(len(consonant)+5) + ' 0 <#> <#>' + '\n')
        compiler.write(str(len(consonant)+6) + ' ' + str(len(consonant)+7)
                       + ' i i' + '\n')
        compiler.write(str(len(consonant)+7) + ' ' + str(len(consonant)+8)
                       + ' n n' + '\n')
        compiler.write(str(len(consonant)+8) + ' ' + str(len(consonant)+5)
                       + ' g g' + '\n')
        c = compiler.compile()
        fststr.expand_other_symbols(c)
        return c

    # ---- class-level construction of the lemmatization rules ----
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    lemma = []
    allomorphy = []
    with open("in_vocab_dictionary_verbs.txt", "r") as f:
        for line in f.readlines():
            lemma.append(line.split(',')[0])
            allomorphy.append(line.split(',')[1])
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    rule = compiler.compile()  # empty seed; in-vocab pairs unioned below
    for index in range(len(lemma)):
        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        if len(allomorphy[index]) >= len(lemma[index]):
            for i in range(len(allomorphy[index])):
                if i < len(lemma[index]):
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i] + ' '
                                   + lemma[index][i] + '\n')
                else:
                    # surface longer than lemma: consume, output nothing
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i]
                                   + ' <epsilon>' + '\n')
            l = len(allomorphy[index])
            compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>'
                           + ' +Known\n')
            compiler.write(str(l+1))
            rule.union(compiler.compile())  # in-place union
        else:
            for i in range(len(lemma[index])):
                if i < len(allomorphy[index]):
                    compiler.write(str(i) + ' ' + str(i+1) + ' '
                                   + allomorphy[index][i] + ' '
                                   + lemma[index][i] + '\n')
                else:
                    # lemma longer than surface: emit on epsilon input
                    compiler.write(str(i) + ' ' + str(i+1) + ' <epsilon>'
                                   + ' ' + lemma[index][i] + '\n')
            l = len(lemma[index])
            compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>'
                           + ' +Known\n')
            compiler.write(str(l+1))
            rule.union(compiler.compile())  # in-place union
    # Delemmatization for in-vocab words is the inverse transducer.
    de_iv_rule = rule.copy().invert()
    morphotactics_rule = get_morphotactics()
    e_insertion_rule = e_insertion()
    k_insertion_rule = k_insertion()
    e_deletion_rule = e_deletion()
    general_rule = general()
    ch_sh_e_insertion_rule = ch_sh_e_insertion()
    y_replacement_rule = y_replacement()
    del_sharp_rule = del_sharp()
    consonant_doubling_rule = consonant_doubling()
    # OOV chain: morphotactics o (union of allomorphy rules) o del_sharp.
    new_rule = (k_insertion_rule.union(e_insertion_rule)
                .union(general_rule).union(e_deletion_rule)
                .union(ch_sh_e_insertion_rule).union(y_replacement_rule)
                .union(consonant_doubling_rule))
    de_oov = fst.compose(morphotactics_rule.arcsort(sort_type="olabel"),
                         new_rule.arcsort(sort_type="ilabel"))
    de_oov_rule = fst.compose(de_oov.arcsort(sort_type="olabel"),
                              del_sharp_rule.arcsort(sort_type="ilabel"))
    oov_rule = de_oov_rule.copy().invert()
    # Final rules for the lemmatizer and delemmatizer.
    rule = rule.union(oov_rule)
    de_rule = de_iv_rule.union(de_oov_rule)

    def lemmatize(self, in_str):
        """Return the set of candidate lemma analyses for in_str."""
        out_set = set()
        for i in fststr.apply(in_str, self.rule):
            out_set.add(i)
        # A suffixed input should not also be guessed as its own lemma.
        # endswith avoids the original's IndexError on 1-char input, and
        # discard (not remove) avoids KeyError when the candidate is absent.
        if (in_str.endswith(('ing', 'ed', 'en'))
                or (in_str.endswith('s') and not in_str.endswith('ss'))):
            out_set.discard(in_str + '+Guess')
        return out_set

    def delemmatize(self, in_str):
        """Return the set of surface forms generated from lemma in_str."""
        out_set = set()
        for i in fststr.apply(in_str, self.de_rule):
            out_set.add(i)
        return out_set
def __init__(self):
    # get the symbol table
    # NOTE(review): the table is bound to a local and immediately
    # discarded — nothing is stored on the instance. Presumably this was
    # meant to be self.st; confirm against the methods that use it.
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    return