def handle_word(word):
    """Write the VFST lexicon entries for one vocabulary ``word`` element.

    ``word`` is a DOM element; its ``infclass``, ``classes``, ``forms``,
    ``inflection`` and ``baseform`` children and its ``id`` attribute are
    read.  One line per alternative form is written to the vocabulary file
    selected from the module-level ``vocabularyFiles`` by VFST word class.
    The module-level ``OPTIONS`` and ``CLASSMAP`` are read, never modified.

    Returns ``None`` in all cases.  Nothing is written for words that are
    filtered out (flags, style, usage, frequency, the "poikkeava"
    inflection class, or a missing inflection / VFST word class).

    Terminates the program with ``sys.exit(1)`` when no inflection class
    mapping is found for a form, or when the alternative spellings fail
    the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon,
    # but only if not generating the Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if (frequency(word) == OPTIONS["frequency"]
            and generate_lex_common.has_flag(word, "confusing")):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    voikko_infclass = None
    if OPTIONS["sukija"]:
        # Only the first "historical" class is considered, and it is kept
        # only if it is one of the classes accepted below.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") == "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                if voikko_infclass == "banaali":
                    # "banaali" inflects like "paperi".
                    voikko_infclass = "paperi"
                elif voikko_infclass == "pasuuna":
                    voikko_infclass = "peruna"
                if voikko_infclass not in [
                        "aavistaa-av1", "arvelu", "arvelu-av1",
                        "haravoida-av2", "karahka", "matala", "paperi",
                        "paperi-av1", "peruna"]:
                    voikko_infclass = None
                break
    if voikko_infclass is None:
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    if voikko_infclass == "poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if (wordclasses[0] not in ["interjection", "prefix", "abbreviation",
                               "conjunction", "adverb"]
            and voikko_infclass is None):
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class is None:
        return

    # Get diacritics
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    diacritics = "".join(get_diacritics(word, altforms, vfst_word_class))

    # Get forced vowel type
    if voikko_infclass is None and vfst_word_class != "[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information
    debug_info = ""
    if OPTIONS["sourceid"]:
        debug_info = '[Xs]%s[X]' % word.getAttribute("id")[1:].replace("0", "%0")

    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        outputBaseform = altform.replace('|', '')
        wordform = outputBaseform.replace('=', '')
        # A form is "single part" when it contains none of '|', '=', '-'.
        if len(altform) == len(wordform.replace('-', '')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            # sys.stderr is a text stream: write the str itself, not bytes.
            sys.stderr.write("ERROR: VFST class not found for (%s, %s)\n"
                             % (wordform, voikko_infclass))
            sys.exit(1)
        if vfst_word_class == "[La]":
            jatko = get_abbreviation_jatko(word, altform)
        elif vfst_word_class == "[Ls]":
            jatko = get_adverb_jatko(word, altform)
        else:
            jatko = jatko.title()
        if vfst_word_class in ["[Ls]", "[Lc]", "[Lh]"]:
            # An explicit baseform overrides the one derived from the
            # alternative form; the last "baseform" element wins.
            for element in word.getElementsByTagName("baseform"):
                wordform = generate_lex_common.tValue(element)
                outputBaseform = wordform.replace('|', '')
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): if vtype is none of the three values below,
        # vfst_vtype keeps the value from the previous form (or is unbound
        # on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            vfst_vtype = 'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            vfst_vtype = 'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            vfst_vtype = 'aä'
        vocabularyFile = vocabularyFiles[
            vfst_word_class.replace("[L", "").replace("]", "")]
        alku = alku.lower()
        (rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

        if OPTIONS["no-baseform"]:
            outputBaseform = ""

        if vfst_word_class == "[Lh]":
            entry = '%s%s%s%s:%s # ;' % (
                vfst_word_class, debug_info, rakenne,
                injectBaseformToStructure(outputBaseform, alkuWithTags), alku)
            vocabularyFile.write(entry + "\n")
            continue
        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        # Vowel type in derived verbs
        # NOTE(review): diacritics accumulates across alternative forms;
        # confirm that this is intended when a word has several forms.
        if jatko in ["Heittää", "Muistaa", "Juontaa", "Hohtaa", "Murtaa",
                     "Nousta", "Loistaa", "Jättää", "Kihistä"]:
            diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)

        if (jatko == "Kihistä" and vtype == voikkoutils.VOWEL_FRONT
                and "y" not in alku and "ä" not in alku
                and "ö" not in alku and "e" in alku):
            jatko = "Helistä"

        if (jatko == "Nainen"
                and vfst_class_prefix in ["Laatusana", "NimiLaatusana"]
                and altform.endswith("inen")):
            jatko = "NainenInen"

        if vfst_word_class == "[Lp]":
            entry = '[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' \
                % (debug_info, rakenne, alkuWithTags, diacritics, infoFlags,
                   alku, diacritics, get_prefix_jatko(word, altform))
        else:
            entry = '%s%s%s%s%s%s:%s%s %s%s_%s ;' \
                % (vfst_word_class, debug_info, rakenne, infoFlags,
                   injectBaseformToStructure(outputBaseform, alkuWithTags),
                   diacritics, alku, diacritics, vfst_class_prefix, jatko,
                   vfst_vtype)
        vocabularyFile.write(entry + "\n")

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind("="),
                                         multiPartForm.rfind("|"),
                                         multiPartForm.rfind("-")) + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(
                    "ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
def handle_word(word):
    """Write the Malaga lexicon entries for one vocabulary ``word`` element.

    ``word`` is a DOM element; its ``infclass``, ``classes``, ``forms``,
    ``inflection`` and ``baseform`` children and its ``id`` attribute are
    read.  One entry per alternative form is passed to
    ``generate_lex_common.write_entry`` together with the module-level
    ``main_vocabulary``.  The module-level ``OPTIONS`` and ``CLASSMAP`` are
    read, never modified.

    Returns ``None`` in all cases.  Nothing is written for words that are
    filtered out (flags, style, usage, frequency, the "poikkeava"
    inflection class, or a missing inflection / Malaga word class).

    Terminates the program with ``sys.exit(1)`` when no inflection class
    mapping is found for a form, or when the alternative spellings fail
    the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon
    if (generate_lex_common.has_flag(word, "not_voikko")
            and "sukija" not in OPTIONS["extra-usage"]):
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if (frequency(word) == OPTIONS["frequency"]
            and generate_lex_common.has_flag(word, "confusing")):
        return

    # Get the inflection class. Exactly one inflection class is needed;
    # "historical" classes are ignored here.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == "poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if (wordclasses[0] not in ["interjection", "prefix", "abbreviation",
                               "conjunction", "adverb"]
            and voikko_infclass is None):
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # An explicit baseform (the first one, if several) overrides the
    # alternative form in the "perusmuoto" field.
    baseformTags = word.getElementsByTagName("baseform")
    if len(baseformTags) > 0:
        baseform = generate_lex_common.tValue(baseformTags[0])
    else:
        baseform = None

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type
    if voikko_infclass is None and malaga_word_class != "lyhenne":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information and additional attributes
    additional_attributes = get_additional_attributes(word)
    if OPTIONS["sourceid"]:
        additional_attributes = (additional_attributes
                                 + ', sourceid: "%s"' % word.getAttribute("id"))

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in generate_lex_common.tValues(
            word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace('|', '').replace('=', '')
        # A form is "single part" when it contains none of '|', '=', '-'.
        if len(altform) == len(wordform.replace('-', '')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            errorstr = "ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
            # sys.stderr is a text stream: write the str itself, not bytes.
            sys.stderr.write(errorstr)
            sys.exit(1)
        if malaga_word_class == "lyhenne":
            jatko = get_abbreviation_jatko(word, altform)
        elif malaga_word_class == "seikkasana":
            jatko = get_adverb_jatko(word)
        if malaga_word_class == "etuliite":
            vtype = voikkoutils.VOWEL_BOTH
            malaga_jatko = get_prefix_jatko(word)
        else:
            if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
                vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
            else:
                vtype = forced_inflection_vtype
            malaga_jatko = "<" + jatko + ">"
        # NOTE(review): if vtype is none of the three values below,
        # malaga_vtype keeps the value from the previous form (or is
        # unbound on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = 'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = 'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = 'aä'
        # Computed once and reused in the entry below.
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
        if baseform is None:
            altBaseform = altform
        else:
            altBaseform = baseform
        # Abbreviation entries carry no "perusmuoto" field.
        if malaga_word_class == "lyhenne":
            perusmuotoEntry = ""
        else:
            perusmuotoEntry = 'perusmuoto: "%s", ' % altBaseform
        entry = '[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
            % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko,
               malaga_vtype, malaga_flags, rakenne, additional_attributes)
        generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind("="),
                                         multiPartForm.rfind("|"),
                                         multiPartForm.rfind("-")) + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(
                    "ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija lexicon entries for one vocabulary ``word`` element.

    ``main_vocabulary`` and ``vocabulary_files`` are passed through
    unchanged to ``generate_lex_common.write_entry`` and
    ``write_word_without_accents``.  ``word`` is a DOM element; its
    ``infclass``, ``classes``, ``forms`` and ``inflection`` children are
    read.  The module-level ``classmap``, ``words``, ``rx_begin`` and
    ``rx_end`` are read as well.

    Returns ``None`` in all cases.  Nothing is written for words flagged
    "not_sukija", words in the "poikkeava" inflection class, or words
    without a usable inflection / Malaga word class.  A form for which no
    inflection class mapping is found produces a "#Malaga class not found"
    line instead of an entry, and processing continues with the next form.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # A "historical" class takes precedence over the other classes.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break

    if voikko_infclass in [
        u"antautua",
        u"kaihtaa",
        u"laittaa",
        u"paahtaa",
        u"taittaa",
        u"veranta",
        u"vihanta",
        u"virkkaa",
    ]:
        voikko_infclass = voikko_infclass + u"-av1"

    if voikko_infclass is None:
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break

    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type. A word without an "inflection" element falls
    # back to the default instead of failing on an empty node list.
    forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    if voikko_infclass is not None:
        inflection_elements = word.getElementsByTagName("inflection")
        if len(inflection_elements) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflection_elements[0])

    # Process all alternative forms
    for altform in generate_lex_common.tValues(
            word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u"|", u"").replace(u"=", u"")
        # NOTE(review): this reassignment persists for the remaining
        # alternative forms of the same word -- confirm that is intended.
        if (voikko_infclass == u"nuolaista-av2") and (
                wordform in [u"häväistä", u"vavista"]):
            voikko_infclass = u"nuolaista"

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap)

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): if vtype is none of the three values below,
        # malaga_vtype keeps the value from the previous form (or is
        # unbound on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u"ä"
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u"a"
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u"aä"
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        # Computed once and reused in the entry below.
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if alku is None:
            generate_lex_common.write_entry(
                main_vocabulary,
                vocabulary_files,
                word,
                u"#Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass),
            )
            continue

        # Forms that are not needed in the Sukija lexicon.
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None:
            continue
        if rx_end.match(wordform) is not None:
            continue

        # Some words have two inflection paradigms in the vocabulary; in
        # Sukija the paradigms have been merged, so one of them can be
        # dropped.
        if (wordform in [u"ori", u"ripsi", u"sini", u"täti", u"äiti"]) and (
                jatko == u"risti"):
            continue
        if (wordform == u"kampi") and (jatko == u"sampi"):
            continue

        # Adjust the value of the alku field: drop the last four
        # characters of the word form.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Output the entry, then hand it to write_word_without_accents.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' % (
            wordform,
            alku,
            malaga_word_class,
            jatko,
            malaga_vtype,
            malaga_flags,
            rakenne,
        )
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry)
        write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform)
def handle_word(word):
    """Write the VFST lexicon entries for one vocabulary ``word`` element.

    ``word`` is a DOM element; its ``infclass``, ``classes``, ``forms`` and
    ``inflection`` children and its ``id`` attribute are read.  One line
    per alternative form is written to the vocabulary file selected from
    the module-level ``vocabularyFiles`` by VFST word class.  The
    module-level ``OPTIONS`` and ``CLASSMAP`` are read, never modified.

    Returns ``None`` in all cases.  Nothing is written for words that are
    filtered out (flags, style, usage, frequency, the "poikkeava"
    inflection class, or a missing inflection / VFST word class).

    Terminates the program with ``sys.exit(1)`` when no inflection class
    mapping is found for a form, or when the alternative spellings fail
    the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon,
    # but only if not generating the Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if (frequency(word) == OPTIONS["frequency"]
            and generate_lex_common.has_flag(word, "confusing")):
        return

    # Get the inflection class. Exactly one inflection class is needed;
    # "historical" classes are ignored here.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if (wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation",
                               u"adverb"]
            and voikko_infclass is None):
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class is None:
        return

    # Get diacritics
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))

    # Get forced vowel type
    if voikko_infclass is None and vfst_word_class != u"[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information
    # NOTE(review): debug_info is built but not included in any entry
    # written below -- confirm whether the source id should be emitted.
    debug_info = u""
    if OPTIONS["sourceid"]:
        debug_info = u', sourceid: "%s"' % word.getAttribute("id")

    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # A form is "single part" when it contains none of '|', '=', '-'.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        vocabularyFile = vocabularyFiles[
            vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
        # Report a missing class before jatko is used: presumably jatko is
        # missing whenever alku is, and jatko.title() would then raise
        # before the error could be reported.
        if alku is None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(vocabularyFile, {}, word, errorstr)
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)
        if vfst_word_class == u"[La]":
            jatko = u"Lyhenne"
        elif vfst_word_class == u"[Ls]":
            jatko = get_adverb_jatko(word)
        else:
            jatko = jatko.title()
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): if vtype is none of the three values below,
        # vfst_vtype keeps the value from the previous form (or is unbound
        # on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            vfst_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            vfst_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            vfst_vtype = u'aä'
        # Computed once and reused in the entries below.
        rakenne = get_structure(altform, vfst_word_class)

        if vfst_word_class == u"[Lh]":
            entry = u'%s[Xp]%s[X]%s%s:%s # ;' % (
                vfst_word_class, wordform, rakenne, alku, alku)
            vocabularyFile.write(entry + u"\n")
            continue
        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        alku = alku.lower()

        # Vowel type in derived verbs
        # NOTE(review): diacritics accumulates across alternative forms;
        # confirm that this is intended when a word has several forms.
        if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa",
                     u"Nousta", u"Loistaa", u"Jättää"]:
            diacritics = diacritics + vowel_type_for_derived_verb(alku)

        if vfst_word_class == u"[Lp]":
            entry = u'[Lp]%s:%s EtuliitteenJatko_%s;' \
                % (wordform, wordform, get_prefix_jatko(word))
        else:
            entry = u'%s[Xp]%s[X]%s%s%s%s:%s%s %s%s_%s ;' \
                % (vfst_word_class, wordform, rakenne, infoFlags, alku,
                   diacritics, alku, diacritics, vfst_class_prefix, jatko,
                   vfst_vtype)
        vocabularyFile.write(entry + u"\n")

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind(u"="),
                                         multiPartForm.rfind(u"|"),
                                         multiPartForm.rfind(u"-")) + 1:]
            if lastPart not in singlePartForms:
                # Encoded like the other error message written above.
                errorstr = u"ERROR: suspicious alternative spelling: %s\n" \
                    % multiPartForm
                sys.stderr.write(errorstr.encode(u"UTF-8"))
                sys.exit(1)
def handle_word(word):
    """Write the Malaga lexicon entries for one vocabulary ``word`` element.

    ``word`` is a DOM element; its ``infclass``, ``classes``, ``forms``,
    ``inflection`` and ``baseform`` children and its ``id`` attribute are
    read.  One entry per alternative form is passed to
    ``generate_lex_common.write_entry`` together with the module-level
    ``main_vocabulary``.  The module-level ``OPTIONS`` and ``CLASSMAP`` are
    read, never modified.

    Returns ``None`` in all cases.  Nothing is written for words that are
    filtered out (flags, style, usage, frequency, the "poikkeava"
    inflection class, or a missing inflection / Malaga word class).

    Terminates the program with ``sys.exit(1)`` when no inflection class
    mapping is found for a form, or when the alternative spellings fail
    the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon
    if (generate_lex_common.has_flag(word, "not_voikko")
            and "sukija" not in OPTIONS["extra-usage"]):
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if (frequency(word) == OPTIONS["frequency"]
            and generate_lex_common.has_flag(word, "confusing")):
        return

    # Get the inflection class. Exactly one inflection class is needed;
    # "historical" classes are ignored here.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if (wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation",
                               u"conjunction", u"adverb"]
            and voikko_infclass is None):
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # An explicit baseform (the first one, if several) overrides the
    # alternative form in the "perusmuoto" field.
    baseformTags = word.getElementsByTagName("baseform")
    if len(baseformTags) > 0:
        baseform = generate_lex_common.tValue(baseformTags[0])
    else:
        baseform = None

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type
    if voikko_infclass is None and malaga_word_class != u"lyhenne":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information and additional attributes
    additional_attributes = get_additional_attributes(word)
    if OPTIONS["sourceid"]:
        additional_attributes = (additional_attributes
                                 + u', sourceid: "%s"' % word.getAttribute("id"))

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in generate_lex_common.tValues(
            word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # A form is "single part" when it contains none of '|', '=', '-'.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)
        if malaga_word_class == u"lyhenne":
            jatko = get_abbreviation_jatko(word, altform)
        elif malaga_word_class == u"seikkasana":
            jatko = get_adverb_jatko(word)
        if malaga_word_class == u"etuliite":
            vtype = voikkoutils.VOWEL_BOTH
            malaga_jatko = get_prefix_jatko(word)
        else:
            if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
                vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
            else:
                vtype = forced_inflection_vtype
            malaga_jatko = u"<" + jatko + u">"
        # NOTE(review): if vtype is none of the three values below,
        # malaga_vtype keeps the value from the previous form (or is
        # unbound on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        # Computed once and reused in the entry below.
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
        if baseform is None:
            altBaseform = altform
        else:
            altBaseform = baseform
        # Abbreviation entries carry no "perusmuoto" field.
        if malaga_word_class == u"lyhenne":
            perusmuotoEntry = u""
        else:
            perusmuotoEntry = u'perusmuoto: "%s", ' % altBaseform
        entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
            % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko,
               malaga_vtype, malaga_flags, rakenne, additional_attributes)
        generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind(u"="),
                                         multiPartForm.rfind(u"|"),
                                         multiPartForm.rfind(u"-")) + 1:]
            if lastPart not in singlePartForms:
                # Encoded like the other error message written above.
                errorstr = u"ERROR: suspicious alternative spelling: %s\n" \
                    % multiPartForm
                sys.stderr.write(errorstr.encode(u"UTF-8"))
                sys.exit(1)
def handle_word(word):
    """Write the VFST lexicon entries for one vocabulary ``word`` element.

    ``word`` is a DOM element; its ``infclass``, ``classes``, ``forms``,
    ``inflection`` and ``baseform`` children and its ``id`` attribute are
    read.  One line per alternative form is written to the vocabulary file
    selected from the module-level ``vocabularyFiles`` by VFST word class.
    The module-level ``OPTIONS`` and ``CLASSMAP`` are read, never modified.

    Returns ``None`` in all cases.  Nothing is written for words that are
    filtered out (flags, style, usage, frequency, the "poikkeava"
    inflection class, or a missing inflection / VFST word class).

    Terminates the program with ``sys.exit(1)`` when no inflection class
    mapping is found for a form, or when the alternative spellings fail
    the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon,
    # but only if not generating the Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if (frequency(word) == OPTIONS["frequency"]
            and generate_lex_common.has_flag(word, "confusing")):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    voikko_infclass = None
    if OPTIONS["sukija"]:
        # Only the first "historical" class is considered, and it is kept
        # only if it is one of the classes accepted below.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") == "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                if voikko_infclass == "banaali":
                    # "banaali" inflects like "paperi".
                    voikko_infclass = "paperi"
                elif voikko_infclass == "pasuuna":
                    voikko_infclass = "peruna"
                if voikko_infclass not in [
                        "aavistaa-av1", "arvelu", "arvelu-av1", "asema-av1",
                        "haravoida-av2", "karahka", "kiiski", "matala",
                        "paperi", "paperi-av1", "peruna"]:
                    voikko_infclass = None
                break
    if voikko_infclass is None:
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    if voikko_infclass == "poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if (wordclasses[0] not in ["interjection", "prefix", "abbreviation",
                               "conjunction", "adverb"]
            and voikko_infclass is None):
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class is None:
        return

    # Get diacritics
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    diacritics = "".join(get_diacritics(word, altforms, vfst_word_class))

    # Get forced vowel type
    if voikko_infclass is None and vfst_word_class != "[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information
    debug_info = ""
    if OPTIONS["sourceid"]:
        debug_info = '[Xs]%s[X]' % word.getAttribute("id")[1:].replace("0", "%0")

    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        outputBaseform = altform.replace('|', '')
        wordform = outputBaseform.replace('=', '')
        # A form is "single part" when it contains none of '|', '=', '-'.
        if len(altform) == len(wordform.replace('-', '')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        # The only place a missing class is reported; alku is not
        # reassigned to None anywhere below.
        if alku is None:
            errorstr = "ERROR: VFST class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            sys.stderr.write(errorstr)
            sys.exit(1)
        if vfst_word_class == "[La]":
            jatko = get_abbreviation_jatko(word, altform)
        elif vfst_word_class == "[Ls]":
            jatko = get_adverb_jatko(word, altform)
        else:
            jatko = jatko.title()
        if vfst_word_class in ["[Ls]", "[Lc]", "[Lh]"]:
            # An explicit baseform overrides the one derived from the
            # alternative form; the last "baseform" element wins.
            for element in word.getElementsByTagName("baseform"):
                wordform = generate_lex_common.tValue(element)
                outputBaseform = wordform.replace('|', '')
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): if vtype is none of the three values below,
        # vfst_vtype keeps the value from the previous form (or is unbound
        # on the first one) -- confirm that this cannot happen.
        if vtype == voikkoutils.VOWEL_FRONT:
            vfst_vtype = 'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            vfst_vtype = 'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            vfst_vtype = 'aä'
        vocabularyFile = vocabularyFiles[
            vfst_word_class.replace("[L", "").replace("]", "")]
        alku = alku.lower()
        (rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

        if OPTIONS["no-baseform"]:
            outputBaseform = ""

        if vfst_word_class == "[Lh]":
            entry = '%s%s%s%s:%s # ;' % (
                vfst_word_class, debug_info, rakenne,
                injectBaseformToStructure(outputBaseform, alkuWithTags), alku)
            vocabularyFile.write(entry + "\n")
            continue
        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        # Vowel type in derived verbs
        # NOTE(review): diacritics accumulates across alternative forms;
        # confirm that this is intended when a word has several forms.
        if jatko in ["Heittää", "Muistaa", "Juontaa", "Hohtaa", "Murtaa",
                     "Nousta", "Loistaa", "Jättää", "Kihistä", "Kyntää2"]:
            diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)

        if (jatko == "Kihistä" and vtype == voikkoutils.VOWEL_FRONT
                and "y" not in alku and "ä" not in alku
                and "ö" not in alku and "e" in alku):
            jatko = "Helistä"

        if (jatko == "Nainen"
                and vfst_class_prefix in ["Laatusana", "NimiLaatusana"]
                and altform.endswith("inen")):
            jatko = "NainenInen"

        if vfst_word_class == "[Lp]":
            entry = '[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' \
                % (debug_info, rakenne, alkuWithTags, diacritics, infoFlags,
                   alku, diacritics, get_prefix_jatko(word, altform))
        else:
            entry = '%s%s%s%s%s%s:%s%s %s%s_%s ;' \
                % (vfst_word_class, debug_info, rakenne, infoFlags,
                   injectBaseformToStructure(outputBaseform, alkuWithTags),
                   diacritics, alku, diacritics, vfst_class_prefix, jatko,
                   vfst_vtype)
        vocabularyFile.write(entry + "\n")

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind("="),
                                         multiPartForm.rfind("|"),
                                         multiPartForm.rfind("-")) + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(
                    "ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija lexicon entries for every spelling of one word.

    Nothing is written when the word carries the "not_sukija" flag, when its
    inflection class is "poikkeava", when it has no inflection class and its
    first word class is not "interjection", or when no Malaga word class can
    be determined for it.

    Args:
        main_vocabulary: Passed through unchanged to
            generate_lex_common.write_entry and write_word_without_accents.
        vocabulary_files: Passed through unchanged to the same two writers.
        word: DOM element describing the word. Its "infclass", "classes",
            "inflection" and "forms" descendants are read.

    Returns:
        None. All output goes through generate_lex_common.write_entry and
        write_word_without_accents.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # A class marked "historical" takes precedence over the regular one.
    infclass_elements = word.getElementsByTagName("infclass")
    voikko_infclass = None
    for infclass in infclass_elements:
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break

    # These historical classes are mapped to their "-av1" variants.
    if voikko_infclass in [
            u"antautua", u"kaihtaa", u"laittaa", u"paahtaa",
            u"taittaa", u"veranta", u"vihanta", u"virkkaa"]:
        voikko_infclass = voikko_infclass + u"-av1"

    # No historical class: fall back to the first non-historical one.
    if voikko_infclass is None:
        for infclass in infclass_elements:
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break

    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type. A missing <inflection> element falls back to the
    # default, as the other handle_word variants in this file do, instead of
    # failing with an IndexError.
    inflection_elements = word.getElementsByTagName("inflection")
    if voikko_infclass is None or not inflection_elements:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(
            inflection_elements[0])

    # Process all alternative forms
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    for altform in altforms:
        # The lookup form is the spelling without '|' and '=' markers.
        wordform = altform.replace(u'|', u'').replace(u'=', u'')

        # NOTE(review): this override is stored in voikko_infclass itself, so
        # it also applies to any later forms of the same word.
        if voikko_infclass == u"nuolaista-av2" and wordform in [
                u"häväistä", u"vavista"]:
            voikko_infclass = u"nuolaista"

        # alku means "beginning" and jatko means "continuation".
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap)

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype

        # NOTE(review): any other vtype value leaves malaga_vtype unset on the
        # first form, or stale from the previous form afterwards.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        malaga_vtype = new_vtype(malaga_vtype, wordform)

        if alku is None:
            # Record the failed lookup as a comment line and move on.
            generate_lex_common.write_entry(
                main_vocabulary, vocabulary_files, word,
                u"#Malaga class not found for (%s, %s)\n"
                % (wordform, voikko_infclass))
            continue

        # Forms that are not needed: those found in `words` or matched by
        # `rx_begin` / `rx_end` (all three are defined outside this function).
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None:
            continue
        if rx_end.match(wordform) is not None:
            continue

        # Some words have two inflection paradigms in the vocabulary. In
        # Sukija the paradigms have been merged, so one of them can be
        # dropped.
        if (wordform in [u'ori', u'ripsi', u'sini', u'täti', u'äiti']
                and jatko == u'risti'):
            continue
        if wordform == u'kampi' and jatko == u'sampi':
            continue

        # Fix up the beginning for the "rakentaa" continuation: it is the
        # word form without its last four characters.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Output.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' \
            % (wordform, alku, malaga_word_class, jatko, malaga_vtype,
               malaga_flags,
               generate_lex_common.get_structure(altform, malaga_word_class))
        generate_lex_common.write_entry(
            main_vocabulary, vocabulary_files, word, entry)
        write_word_without_accents(
            main_vocabulary, vocabulary_files, word, entry, wordform)
def handle_word(word):
    """Write the VFST lexicon entries for every spelling of one word.

    The word is skipped when it is flagged "not_voikko" (unless
    OPTIONS["sukija"] is set), when check_style or check_usage rejects it,
    when its frequency exceeds OPTIONS["frequency"] (or equals it while the
    word is flagged "confusing"), when its inflection class is "poikkeava",
    when it has no inflection class and its first word class is not one of
    interjection/prefix/abbreviation/conjunction/adverb, or when no VFST word
    class can be determined for it.

    Args:
        word: DOM element describing the word. Its "infclass", "classes",
            "forms" and "inflection" descendants are read.

    Returns:
        None. Entries are written to the file selected from the module-level
        `vocabularyFiles` mapping.

    Raises:
        SystemExit: With status 1 when the inflection class lookup yields no
            beginning for a form, or when a multi part spelling does not end
            with one of the single part spellings.
    """
    global OPTIONS
    global CLASSMAP

    # Drop words that are not needed in the Voikko lexicon
    # but only if not generating Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # Classes marked "historical" are ignored here.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] not in [
            u"interjection", u"prefix", u"abbreviation",
            u"conjunction", u"adverb"] and voikko_infclass is None:
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class is None:
        return

    # Get diacritics
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))

    # Get forced vowel type
    if voikko_infclass is None and vfst_word_class != u"[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if inflectionElement:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # NOTE(review): OPTIONS["sourceid"] is not reflected in any entry written
    # by this function.
    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # A spelling is "single part" when it contains no '|', '=' or '-'.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)

        # The output file is keyed by the class tag without "[L" and "]".
        vocabularyFile = vocabularyFiles[
            vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]

        # Report a failed lookup before jatko is used. Otherwise
        # jatko.title() below could raise first and hide this diagnostic
        # (assumes jatko is also None when alku is - TODO confirm).
        if alku is None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(vocabularyFile, {}, word, errorstr)
            # NOTE(review): writing encoded bytes assumes a byte-oriented
            # stderr - confirm for the interpreter in use.
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)

        if vfst_word_class == u"[La]":
            jatko = get_abbreviation_jatko(word, altform)
        elif vfst_word_class == u"[Ls]":
            jatko = get_adverb_jatko(word, altform)
        else:
            jatko = jatko.title()

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype

        # NOTE(review): any other vtype value leaves vfst_vtype unset on the
        # first form, or stale from the previous form afterwards.
        if vtype == voikkoutils.VOWEL_FRONT:
            vfst_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            vfst_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            vfst_vtype = u'aä'

        alku = alku.lower()
        (rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

        # "[Lh]" words get a fixed entry shape and need nothing further.
        if vfst_word_class == u"[Lh]":
            entry = u'%s[Xp]%s[X]%s%s:%s # ;' % (
                vfst_word_class, wordform, rakenne, alkuWithTags, alku)
            vocabularyFile.write(entry + u"\n")
            continue

        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        # Vowel type in derived verbs
        # NOTE(review): diacritics is shared by all forms of the word, so
        # this suffix accumulates across loop iterations.
        if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa",
                     u"Nousta", u"Loistaa", u"Jättää", u"Kihistä"]:
            diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)

        if (jatko == u"Nainen"
                and vfst_class_prefix in [u"Laatusana", u"NimiLaatusana"]
                and altform.endswith(u"inen")):
            jatko = u"NainenInen"

        if vfst_word_class == u"[Lp]":
            entry = u'[Lp]%s%s%s:%s%s EtuliitteenJatko_%s;' \
                % (wordform, diacritics, infoFlags, wordform, diacritics,
                   get_prefix_jatko(word, altform))
        else:
            entry = u'%s[Xp]%s[X]%s%s%s%s:%s%s %s%s_%s ;' \
                % (vfst_word_class, wordform, rakenne, infoFlags,
                   alkuWithTags, diacritics, alku, diacritics,
                   vfst_class_prefix, jatko, vfst_vtype)
        vocabularyFile.write(entry + u"\n")

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            # The last part starts right after the final '=', '|' or '-'.
            lastSeparator = max(multiPartForm.rfind(u"="),
                                multiPartForm.rfind(u"|"),
                                multiPartForm.rfind(u"-"))
            lastPart = multiPartForm[lastSeparator + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(
                    u"ERROR: suspicious alternative spelling: %s\n"
                    % multiPartForm)
                sys.exit(1)