def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija lexicon entries for one vocabulary ``word`` element.

    The inflection class is taken from the first ``infclass`` element whose
    ``type`` attribute is ``"historical"``; if there is none, the first
    non-historical ``infclass`` is used instead.  One entry is written for
    every alternative form of the word unless the form is filtered out.

    Nothing is written when the word carries the ``not_sukija`` flag, when its
    inflection class is ``poikkeava``, when it has no inflection class and its
    first word class is not ``interjection``, or when no Malaga word class is
    found for its word classes.

    Args:
        main_vocabulary: Passed through unchanged to the entry writers.
        vocabulary_files: Passed through unchanged to the entry writers.
        word: DOM element of the word; read with ``getElementsByTagName``.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Historical classes that get the "-av1" suffix appended.
    av1_classes = (
        u"antautua",
        u"kaihtaa",
        u"laittaa",
        u"paahtaa",
        u"taittaa",
        u"veranta",
        u"vihanta",
        u"virkkaa",
    )

    # Get the inflection class: prefer the historical one.
    infclasses = word.getElementsByTagName("infclass")
    voikko_infclass = None
    for infclass in infclasses:
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass in av1_classes:
        voikko_infclass = voikko_infclass + u"-av1"
    if voikko_infclass is None:
        # No historical class: fall back to the first non-historical one.
        for infclass in infclasses:
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes.
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags.
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type.
    if voikko_infclass is None:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(
            word.getElementsByTagName("inflection")[0]
        )

    # Process all alternative forms.
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u"|", u"").replace(u"=", u"")

        # NOTE: this override is not reset, so it also applies to any
        # remaining alternative forms of the same word.
        if voikko_infclass == u"nuolaista-av2" and wordform in (u"häväistä", u"vavista"):
            voikko_infclass = u"nuolaista"

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap
        )

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u"ä"
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u"a"
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u"aä"
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if alku is None:
            # Record the failure in the output and move on to the next form.
            generate_lex_common.write_entry(
                main_vocabulary,
                vocabulary_files,
                word,
                u"#Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass),
            )
            continue

        # Forms that are not needed in the output.
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None:
            continue
        if rx_end.match(wordform) is not None:
            continue

        # Some words have two inflection paradigms in the vocabulary; in
        # Sukija the paradigms are merged, so one of them can be dropped.
        if wordform in (u"ori", u"ripsi", u"sini", u"täti", u"äiti") and jatko == u"risti":
            continue
        if wordform == u"kampi" and jatko == u"sampi":
            continue

        # Adjust the alku field: for this class it is the word form without
        # its last four characters.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Write the entry.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' % (
            wordform,
            alku,
            malaga_word_class,
            jatko,
            malaga_vtype,
            malaga_flags,
            rakenne,
        )
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry)
        write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform)
def handle_word(word):
    """Write the Voikko lexicon entries for one vocabulary ``word`` element.

    One entry per alternative form is written to the module-level
    ``main_vocabulary`` through ``generate_lex_common.write_entry``.

    Nothing is written when any of these holds:
      * the word has the ``not_voikko`` flag and ``"sukija"`` is not in
        ``OPTIONS["extra-usage"]``;
      * ``check_style`` or ``check_usage`` rejects the word;
      * its frequency is above ``OPTIONS["frequency"]``, or equal to it while
        the word has the ``confusing`` flag;
      * its first non-historical inflection class is ``poikkeava``;
      * it has no non-historical inflection class and its first word class is
        not interjection, prefix, abbreviation, conjunction or adverb;
      * no Malaga word class is found for its word classes.

    The process is terminated with ``sys.exit(1)`` when no Malaga inflection
    class is found for a form, or when a multi part alternative form does not
    end with one of the single part forms.

    Args:
        word: DOM element of the word; read with ``getElementsByTagName`` and
            ``getAttribute``.
    """
    # Drop words that are not needed in the Voikko lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    word_frequency = frequency(word)
    if word_frequency >= OPTIONS["frequency"] + 1:
        return
    if word_frequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
        return

    # Get the inflection class: the first one that is not historical.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes.
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    uninflected_classes = (u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb")
    if wordclasses[0] not in uninflected_classes and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # An explicit base form, when present, overrides the alternative form
    # in the "perusmuoto" field.
    baseform_tags = word.getElementsByTagName("baseform")
    if len(baseform_tags) > 0:
        baseform = generate_lex_common.tValue(baseform_tags[0])
    else:
        baseform = None

    # Get malaga flags.
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type; it is only looked up for words that have an
    # inflection class or are abbreviations.
    forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    if voikko_infclass is not None or malaga_word_class == u"lyhenne":
        inflection_elements = word.getElementsByTagName("inflection")
        if len(inflection_elements) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(inflection_elements[0])

    # Construct debug information and additional attributes.
    additional_attributes = get_additional_attributes(word)
    if OPTIONS["sourceid"]:
        additional_attributes = additional_attributes + u', sourceid: "%s"' % word.getAttribute("id")

    # Process all alternative forms.
    singlePartForms = []
    multiPartForms = []
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # A form is "single part" when it contains none of '|', '=' and '-'.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)

        if malaga_word_class == u"lyhenne":
            jatko = get_abbreviation_jatko(word, altform)
        elif malaga_word_class == u"seikkasana":
            jatko = get_adverb_jatko(word)

        if malaga_word_class == u"etuliite":
            vtype = voikkoutils.VOWEL_BOTH
            malaga_jatko = get_prefix_jatko(word)
        else:
            if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
                vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
            else:
                vtype = forced_inflection_vtype
            malaga_jatko = u"<" + jatko + u">"

        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'

        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if baseform is None:
            altBaseform = altform
        else:
            altBaseform = baseform
        # Abbreviations are written without a "perusmuoto" field.
        if malaga_word_class == u"lyhenne":
            perusmuotoEntry = u""
        else:
            perusmuotoEntry = u'perusmuoto: "%s", ' % altBaseform

        entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
            % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko, malaga_vtype,
               malaga_flags, rakenne, additional_attributes)
        generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            last_separator = max(multiPartForm.rfind(u"="),
                                 multiPartForm.rfind(u"|"),
                                 multiPartForm.rfind(u"-"))
            lastPart = multiPartForm[last_separator + 1:]
            if lastPart not in singlePartForms:
                # Encoded explicitly, like the other error message in this
                # function, so a non-ASCII word form cannot break the write.
                errorstr = u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm
                sys.stderr.write(errorstr.encode(u"UTF-8"))
                sys.exit(1)
def handle_word(word):
    """Write the Voikko lexicon entries for one vocabulary ``word`` element.

    One entry per alternative form is written to the module-level
    ``main_vocabulary`` through ``generate_lex_common.write_entry``.

    Nothing is written when any of these holds:
      * the word has the ``not_voikko`` flag and ``"sukija"`` is not in
        ``OPTIONS["extra-usage"]``;
      * ``check_style`` or ``check_usage`` rejects the word;
      * its frequency is above ``OPTIONS["frequency"]``, or equal to it while
        the word has the ``confusing`` flag;
      * its first non-historical inflection class is ``poikkeava``;
      * it has no non-historical inflection class and its first word class is
        not interjection, prefix, abbreviation, conjunction or adverb;
      * no Malaga word class is found for its word classes.

    The process is terminated with ``sys.exit(1)`` when no Malaga inflection
    class is found for a form, or when a multi part alternative form does not
    end with one of the single part forms.

    Args:
        word: DOM element of the word; read with ``getElementsByTagName`` and
            ``getAttribute``.
    """
    # Drop words that are not needed in the Voikko lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    word_frequency = frequency(word)
    if word_frequency >= OPTIONS["frequency"] + 1:
        return
    if word_frequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
        return

    # Get the inflection class: the first one that is not historical.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == "poikkeava":
        return

    # Get the word classes.
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    uninflected_classes = ("interjection", "prefix", "abbreviation", "conjunction", "adverb")
    if wordclasses[0] not in uninflected_classes and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # An explicit base form, when present, overrides the alternative form
    # in the "perusmuoto" field.
    baseform_tags = word.getElementsByTagName("baseform")
    if len(baseform_tags) > 0:
        baseform = generate_lex_common.tValue(baseform_tags[0])
    else:
        baseform = None

    # Get malaga flags.
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type; it is only looked up for words that have an
    # inflection class or are abbreviations.
    forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    if voikko_infclass is not None or malaga_word_class == "lyhenne":
        inflection_elements = word.getElementsByTagName("inflection")
        if len(inflection_elements) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(inflection_elements[0])

    # Construct debug information and additional attributes.
    additional_attributes = get_additional_attributes(word)
    if OPTIONS["sourceid"]:
        additional_attributes = additional_attributes + ', sourceid: "%s"' % word.getAttribute("id")

    # Process all alternative forms.
    singlePartForms = []
    multiPartForms = []
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace('|', '').replace('=', '')
        # A form is "single part" when it contains none of '|', '=' and '-'.
        if len(altform) == len(wordform.replace('-', '')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            errorstr = "ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
            # sys.stderr is a text stream: write the str itself, not bytes.
            sys.stderr.write(errorstr)
            sys.exit(1)

        if malaga_word_class == "lyhenne":
            jatko = get_abbreviation_jatko(word, altform)
        elif malaga_word_class == "seikkasana":
            jatko = get_adverb_jatko(word)

        if malaga_word_class == "etuliite":
            vtype = voikkoutils.VOWEL_BOTH
            malaga_jatko = get_prefix_jatko(word)
        else:
            if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
                vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
            else:
                vtype = forced_inflection_vtype
            malaga_jatko = "<" + jatko + ">"

        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = 'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = 'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = 'aä'

        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if baseform is None:
            altBaseform = altform
        else:
            altBaseform = baseform
        # Abbreviations are written without a "perusmuoto" field.
        if malaga_word_class == "lyhenne":
            perusmuotoEntry = ""
        else:
            perusmuotoEntry = 'perusmuoto: "%s", ' % altBaseform

        entry = '[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
            % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko, malaga_vtype,
               malaga_flags, rakenne, additional_attributes)
        generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

    # Sanity check for alternative forms: if there are both multi part forms
    # and single part forms then all multi part forms must end with a part
    # contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            last_separator = max(multiPartForm.rfind("="),
                                 multiPartForm.rfind("|"),
                                 multiPartForm.rfind("-"))
            lastPart = multiPartForm[last_separator + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write("ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija lexicon entries for one vocabulary ``word`` element.

    The inflection class is taken from the first ``infclass`` element whose
    ``type`` attribute is ``"historical"``; if there is none, the first
    non-historical ``infclass`` is used instead.  One entry is written for
    every alternative form of the word unless the form is filtered out.

    Nothing is written when the word carries the ``not_sukija`` flag, when its
    inflection class is ``poikkeava``, when it has no inflection class and its
    first word class is not ``interjection``, or when no Malaga word class is
    found for its word classes.

    Args:
        main_vocabulary: Passed through unchanged to the entry writers.
        vocabulary_files: Passed through unchanged to the entry writers.
        word: DOM element of the word; read with ``getElementsByTagName``.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Historical classes that get the "-av1" suffix appended.
    av1_classes = (
        u"antautua", u"kaihtaa", u"laittaa", u"paahtaa",
        u"taittaa", u"veranta", u"vihanta", u"virkkaa",
    )

    # Get the inflection class: prefer the historical one.
    infclasses = word.getElementsByTagName("infclass")
    voikko_infclass = None
    for infclass in infclasses:
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass in av1_classes:
        voikko_infclass = voikko_infclass + u"-av1"
    if voikko_infclass is None:
        # No historical class: fall back to the first non-historical one.
        for infclass in infclasses:
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes.
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags.
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type.
    if voikko_infclass is None:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(
            word.getElementsByTagName("inflection")[0])

    # Process all alternative forms.
    for altform in generate_lex_common.tValues(
            word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')

        # NOTE: this override is not reset, so it also applies to any
        # remaining alternative forms of the same word.
        if voikko_infclass == u"nuolaista-av2" and wordform in (u"häväistä", u"vavista"):
            voikko_infclass = u"nuolaista"

        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap)

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if alku is None:
            # Record the failure in the output and move on to the next form.
            generate_lex_common.write_entry(
                main_vocabulary, vocabulary_files, word,
                u"#Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass))
            continue

        # Forms that are not needed in the output.
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None:
            continue
        if rx_end.match(wordform) is not None:
            continue

        # Some words have two inflection paradigms in the vocabulary; in
        # Sukija the paradigms are merged, so one of them can be dropped.
        if wordform in (u'ori', u'ripsi', u'sini', u'täti', u'äiti') and jatko == u'risti':
            continue
        if wordform == u'kampi' and jatko == u'sampi':
            continue

        # Adjust the alku field: for this class it is the word form without
        # its last four characters.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Write the entry.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' \
            % (wordform, alku, malaga_word_class, jatko, malaga_vtype, malaga_flags, rakenne)
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry)
        write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform)