# diff test.out ~/Lataukset/vv* | grep '>.*[+][?]' | gawk '{print $2}' |flookup -i sukija.fst | gawk 'length($0) > 0'
# cp test.out ~/Lataukset/vvfst-sukija-testi.out

# Values of the style and usage flags taken directly from Joukahainen:
# grep -A1 '<style>' ../vocabulary/joukahainen.xml|grep flag|sort -u|gawk '{printf "%s,", substr($1,7,length($1)-13)}'
# grep -A1 '<usage>' ../vocabulary/joukahainen.xml|grep flag|sort -u|gawk '{printf "%s,", substr($1,7,length($1)-13)}'

import codecs
import getopt
import re
import sys
from types import *

# The project-local helper module is looked up in the "common" directory,
# so the search path must be extended before it is imported.
sys.path.append("common")
import generate_lex_common

# Options as returned by the shared helper; "destdir" is the only key
# read in this part of the file.
OPTIONS = generate_lex_common.get_options()

# UTF-8 file handles under OPTIONS["destdir"]:
#   all.lexc               - opened for reading
#   all-sukija.lexc        - opened for writing
#   poikkeavat-sukija.lexc - opened for reading
infile = codecs.open(OPTIONS["destdir"] + u"/all.lexc", "r", "UTF-8")
outfile = codecs.open(OPTIONS["destdir"] + u"/all-sukija.lexc", 'w', 'UTF-8')
sukijafile = codecs.open(OPTIONS["destdir"] + u"/poikkeavat-sukija.lexc", 'r', 'UTF-8')

C = u"[qwrtpsšdfghjklzžxcvbnm]"  # Consonants.
K = u"[qwrtpsšdfghjklzžxcvbnmaiou]"  # Consonants + some vowels.
V = u"[aeiouüyåäö]"  # Vowels.
A = u"[aä]"
U = u"[uy]"


def makeRePattern(wordClass, word):
    # Pattern text: "[<wordClass>]" at the start, an optional "[I..]" tag,
    # then "[Xp]", any characters, and finally <word> followed by "[X]".
    # NOTE(review): only this first statement of the function is visible in
    # the reviewed span; confirm the remainder of the body.
    u = u"^\\[%s\\](\\[I..\\])?\\[Xp\\].*%s\\[X\\]" % (wordClass, word)
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

import sys

# Project-local modules are looked up in the "common" directory, so the
# search path must be extended before they are imported.
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs

flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + "/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

main_vocabulary = generate_lex_common.open_lex(OPTIONS["destdir"], "joukahainen.lex")


def frequency(word):
    """Return the integer value of the first ``fclass`` element of *word*.

    Args:
        word: A DOM node providing ``getElementsByTagName``.

    Returns:
        ``int(generate_lex_common.tValue(node))`` for the first ``fclass``
        element found under *word*, or 7 when *word* has no such element.

    Raises:
        ValueError: If the value produced by ``tValue`` cannot be
            converted by ``int``.
    """
    fclass = word.getElementsByTagName("fclass")
    if not fclass:
        # No <fclass> element present: fall back to the fixed default.
        return 7
    return int(generate_lex_common.tValue(fclass[0]))


# Check the style flags of the word according to current options.