# Add the "common" directory to the module search path before the imports
# below (presumably hfconv, generate_lex_common and voikkoutils live there --
# confirm). `sys` itself must already be imported above this point.
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs
from xml.dom import Node

# Flag attributes read from flags.txt under the vocabulary data directory.
flag_attributes = voikkoutils.readFlagAttributes(
    generate_lex_common.VOCABULARY_DATA + "/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map, compiled from hfconv's modern class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# One intermediate lexicon file is opened per suffix listed here.
vocabularyFileSuffixes = [
    "ep", "ee", "es", "em", "t", "nl", "l", "n", "h", "p", "a", "s", "c"
]
# NOTE(review): presumably filled with the opened files keyed by suffix; the
# statement that stores them is not visible in this chunk -- confirm.
vocabularyFiles = {}
for fileSuffix in vocabularyFileSuffixes:
    # Open <destdir>/joukahainen-<suffix>.lexc for writing, encoded as UTF-8.
    vocFile = codecs.open(
        OPTIONS["destdir"] + "/joukahainen-" + fileSuffix + ".lexc", 'w', 'UTF-8')
    # Start the file with a header line (the header continues past this chunk).
    vocFile.write(
        "! This is automatically generated intermediate lexicon file for\n")
    vocFile.write(
import sys
# Add the "common" directory to the module search path before the imports
# below (presumably the project-local modules live there -- confirm).
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs

# Flag attributes read from flags.txt under the vocabulary data directory.
flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + "/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map, compiled from hfconv's modern class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# Main output lexicon, opened as "joukahainen.lex" in the destination directory.
main_vocabulary = generate_lex_common.open_lex(OPTIONS["destdir"], "joukahainen.lex")


def frequency(word):
    """Return the frequency class of a word element as an int.

    The value is taken from the first "fclass" element found under `word`,
    read through generate_lex_common.tValue (presumably the element's text
    content -- confirm) and converted with int(). If `word` has no "fclass"
    element, 7 is returned as the fallback.
    """
    fclass = word.getElementsByTagName("fclass")
    if len(fclass) == 0:
        return 7
    return int(generate_lex_common.tValue(fclass[0]))


# Check the style flags of the word according to current options.
# Returns True if the word is acceptable, otherwise returns False.
def check_style(word):
    global OPTIONS
# Tail of a list literal that begins above this chunk (presumably the list
# bound to `historical`, which is compiled right after it -- confirm).
# NOTE(review): each entry looks like (class name, two-letter code or "-",
# [(gradation or None, pattern, example word), ...]); the exact meaning is
# defined by hfconv.compileClassmapREs -- confirm there.
(None, u"(..*CO)itUs", u"aivoitus"),
            (None, u"(...*O)tUs", u"jaotus"),
            (None, u"(.*V)s", u"vastaus"),
        ],
    ),
    (u"veranta", u"sw", [(u"nt", u"(.*n)tA", u"veranta")]),
    (
        u"vieras",
        u"ws",
        [(None, u"(.*[lr]iA)s", u"utelias"), (u"k", u"(.*mek)As", u"iäkäs"), (u"k", u"(.*k)As", u"varas")],
    ),
    (u"vihanta", u"sw", [(u"nt", u"(.*n)tA", u"vihanta")]),
    (u"virkkaa", u"sw", [(u"kk", u"(.*k)kAA", u"virkkaa")]),
]

# Compile the historical entries first, then append the compiled modern
# class map after them, so the historical rules come first in `classmap`.
classmap = hfconv.compileClassmapREs(historical)
classmap.extend(hfconv.compileClassmapREs(hfconv.modern_classmap))

# Regex template for a whole word: everything before the ending is captured
# as `alku`, and the ending must be one of three named alternatives
# (`keltainen`, `symboli_ym`, `maineikas`). Uppercase A, O, U and C are
# placeholders that are expanded into character classes below.
pattern = (
    u"^(?P<alku>.*)(?:"
    + u"(?P<keltainen>C[aouyäö]i?nen)|"
    + u"(?P<symboli_ym>[^aeouyäö]o[dfglmnrv]i)|"
    + u"(?P<maineikas>[mntv]eikAs)"
    + u")$"
)
# Expand the placeholders. Apart from the placeholders (and the "P" of
# "(?P<...>"), the template is lowercase, and no replacement text contains an
# uppercase letter, so the four substitutions cannot interfere with each other.
# Only A and C actually occur in this template; the O and U substitutions
# leave it unchanged.
pattern = pattern.replace(u"A", u"[aä]")
pattern = pattern.replace(u"O", u"[oö]")
pattern = pattern.replace(u"U", u"[uy]")
pattern = pattern.replace(u"C", u"[bcdfghjklmnpqrstvwxzšžçðñþß]")
# Compiled with IGNORECASE, so matching does not depend on letter case.
rx = re.compile(pattern, re.IGNORECASE)
# Tail of a list literal that begins above this chunk (presumably the list
# bound to `historical`, which is compiled right after it -- confirm).
# NOTE(review): each entry looks like (class name, two-letter code or '-',
# [(gradation or None, pattern, example word), ...]); the exact meaning is
# defined by hfconv.compileClassmapREs -- confirm there.
(u'valmis', u'ws', [(None, u'(.*)is', u'valmis')]),
    (u'vastaus', u'-', [
        (None, u'(lootu)s', u'vastaus'),
        (None, u'(..*CO)itUs', u'aivoitus'),
        (None, u'(...*O)tUs', u'jaotus'),
        (None, u'(.*V)s', u'vastaus'),
    ]),
    (u'veranta', u'sw', [(u'nt', u'(.*n)tA', u'veranta')]),
    (u'vieras', u'ws', [(None, u'(.*[lr]iA)s', u'utelias'), (u'k', u'(.*mek)As', u'iäkäs'), (u'k', u'(.*k)As', u'varas')]),
    (u'vihanta', u'sw', [(u'nt', u'(.*n)tA', u'vihanta')]),
    (u'virkkaa', u'sw', [(u'kk', u'(.*k)kAA', u'virkkaa')])
]

# Compile the historical entries first, then append the compiled modern
# class map after them, so the historical rules come first in `classmap`.
classmap = hfconv.compileClassmapREs(historical)
classmap.extend(hfconv.compileClassmapREs(hfconv.modern_classmap))

# Regex template for a whole word: everything before the ending is captured
# as `alku`, and the ending must be one of three named alternatives
# (`keltainen`, `symboli_ym`, `maineikas`). Uppercase A, O, U and C are
# placeholders that are expanded into character classes below.
pattern = u"^(?P<alku>.*)(?:" + \
    u"(?P<keltainen>C[aouyäö]i?nen)|" + \
    u"(?P<symboli_ym>[^aeouyäö]o[dfglmnrv]i)|" + \
    u"(?P<maineikas>[mntv]eikAs)" + \
    u")$"
# Expand the placeholders. Apart from the placeholders (and the "P" of
# "(?P<...>"), the template is lowercase, and no replacement text contains an
# uppercase letter, so the four substitutions cannot interfere with each other.
# Only A and C actually occur in this template; the O and U substitutions
# leave it unchanged.
pattern = pattern.replace(u"A", u"[aä]")
pattern = pattern.replace(u"O", u"[oö]")
pattern = pattern.replace(u"U", u"[uy]")
pattern = pattern.replace(u"C", u"[bcdfghjklmnpqrstvwxzšžçðñþß]")
# Compiled with IGNORECASE, so matching does not depend on letter case.
rx = re.compile(pattern, re.IGNORECASE)

# Pattern string: one of the four listed prefixes followed by at least one
# more character. Where it is compiled and used is not visible in this chunk.
begin = u"(amerikan|jälleen|tiibetin|uudelleen).+"