def testInitWithCorrectDictWorks(self):
    """Check that the dictionary variant given to ``Voikko`` is the one used.

    The word "amifostiini" must be rejected by the ``fi-x-standard``
    dictionary and accepted by the ``fi-x-medicine`` dictionary.
    """
    # Release the handle created in setUp before opening another variant.
    self.voikko.terminate()
    self.voikko = Voikko(u"fi-x-standard")
    # assertFalse/assertTrue replace the failIf/failUnless aliases, which
    # were removed from unittest in Python 3.12.
    self.assertFalse(self.voikko.spell(u"amifostiini"))

    self.voikko.terminate()
    self.voikko = Voikko(u"fi-x-medicine")
    self.assertTrue(self.voikko.spell(u"amifostiini"))
def initVoikko():
    """Populate the module-level ``_voikko`` and ``_dictInfo`` mappings.

    One ``Voikko`` instance is created per tag in ``ALLOWED_DICTS`` and
    stored in ``_voikko`` under that tag. Afterwards every dictionary
    reported by ``Voikko.listDicts()`` whose ``<language>-x-<variant>`` tag
    is in ``ALLOWED_DICTS`` is recorded in ``_dictInfo``.
    """
    global _voikko

    # Create and configure one handle per allowed dictionary tag.
    for dictTag in ALLOWED_DICTS:
        handle = Voikko(dictTag)
        handle.setIgnoreDot(False)
        handle.setAcceptUnfinishedParagraphsInGc(True)
        _voikko[dictTag] = handle

    # Keep the dictionary descriptions only for the allowed tags.
    for dictionary in Voikko.listDicts():
        candidateTag = dictionary.language + u"-x-" + dictionary.variant
        if candidateTag not in ALLOWED_DICTS:
            continue
        _dictInfo[candidateTag] = dictionary
def __openHandleWithVariant(self, language, fullVariant):
    """Open a Voikko handle for ``fullVariant`` and register it for ``language``.

    The new handle is stored in ``self.__handles[language]`` and then
    receives every globally configured boolean and integer option.

    Returns the handle on success. If a ``VoikkoException`` is raised at any
    point, its first argument is stored in
    ``self.__initializationErrors[language]`` and ``None`` is returned.
    """
    logging.debug("VoikkoHandlePool.__openHandleWithVariant")
    try:
        handle = Voikko(fullVariant, self.getDictionaryPath())
        self.__handles[language] = handle
        # Replay the pool-wide option settings on the fresh handle.
        for optionId, enabled in self.__globalBooleanOptions.items():
            handle.setBooleanOption(optionId, enabled)
        for optionId, value in self.__globalIntegerOptions.items():
            handle.setIntegerOption(optionId, value)
    except VoikkoException as error:
        self.__initializationErrors[language] = error.args[0]
        return None
    return handle
def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
    """Check that a second ``Voikko`` object does not disturb ``self.voikko``.

    A ``fi-x-medicine`` instance accepts "amifostiini" while ``self.voikko``
    keeps rejecting it, both while the second instance exists and after it
    has been deleted.
    """
    medicalVoikko = Voikko(u"fi-x-medicine")
    # assertTrue/assertFalse replace the failUnless/failIf aliases, which
    # were removed from unittest in Python 3.12.
    self.assertTrue(medicalVoikko.spell(u"amifostiini"))
    self.assertFalse(self.voikko.spell(u"amifostiini"))

    del medicalVoikko
    # The original handle must behave the same once the other one is gone.
    self.assertFalse(self.voikko.spell(u"amifostiini"))
def setUp(self):
    """Create the ``Voikko`` instance for language tag "fi" used by the test."""
    handle = Voikko(u"fi")
    self.voikko = handle
def testInitWithPathWorks(self):
    """Check that ``Voikko`` can be constructed with an explicit ``path``.

    The handle is re-created with ``path=u"/path/to/nowhere"`` and must
    still accept the word "kissa".
    """
    # TODO: better test
    self.voikko.terminate()
    self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
    # assertTrue replaces the failUnless alias, which was removed from
    # unittest in Python 3.12.
    self.assertTrue(self.voikko.spell(u"kissa"))
def tryInit():
    """Try to open the dictionary variant "fi-x-non-existent-variant".

    On success the new instance is stored in ``self.voikko``, where ``self``
    comes from the enclosing scope.
    """
    handle = Voikko(u"fi-x-non-existent-variant")
    self.voikko = handle
"""Contains functions for retrieving pre-processed words from one teletext frontpage image.

See instructions in words_from_image()
"""
import re
from typing import List, Tuple

import pytesseract
from PIL import Image, ImageOps
from libvoikko import Voikko

# these settings only work in Windows environment
# (hard-coded locations for the Voikko library search path and the
# Tesseract executable; both are applied at import time)
Voikko.setLibrarySearchPath("C:/python37/DLLs")
voikko = Voikko("fi-x-morphoid")
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'


def words_from_image(filename: str) -> List[List[str]]:
    """Retrieve pre-processed words from given 'filename' containing teletext frontpage image.

    Return value is a list of lists e.g. [['word1', 'word2'], ['word1', 'word2', 'word3']]
    """
    # NOTE(review): only the image preparation steps are visible here; the
    # part that produces the documented return value is presumably further
    # down and not shown in this view.

    # make image black and white
    image = Image.open(filename).convert('1').convert('RGB')
    # invert black and white
    image = ImageOps.invert(image)
import re from libvoikko import Voikko # v = Voikko('fi-x-morphoid') v = Voikko("fi") # Replace all non letter characters with space RE_WS_REPLACE = re.compile(r"[^\w]", re.UNICODE) RE_FIND_COMPOUNDS = re.compile(r"\(([\w+]+)\)", re.UNICODE) def voikko_analyze(text): text = RE_WS_REPLACE.sub(" ", text) words = text.split(" ") # Strip spaces words = [x.strip() for x in words] # Remove empty items words = filter(None, words) # Loop all words and analyze them analyzed = [] for word in words: aword = v.analyze(word) if aword: i = 0 for f in aword: i += 1 f["found"] = True f["original"] = word wordbases = RE_FIND_COMPOUNDS.findall(f.get("WORDBASES", "")) f["wordbase_list"] = [
def __init__(self, attributes, langtag="fi"):
    """Store the configuration and create the Voikko instance.

    Args:
        attributes: Stored unchanged as ``self.attributes``.
        langtag: Language tag passed to ``Voikko``; defaults to "fi".
    """
    # NOTE(review): this method has no `input` parameter, so the name below
    # resolves in the enclosing scope (the builtin `input` function unless
    # the module defines its own) - confirm whether this is intended.
    self.input = input
    self.attributes = attributes
    self.voikko = Voikko(langtag)
    # Called last, after the attributes above have been assigned.
    self.__init_feature_names()
def __init__(self, langtag="fi", binary=False, stop_word_classes=()):
    """Create the Voikko instance and initialise the parent class.

    Args:
        langtag: Language tag passed to ``Voikko``; defaults to "fi".
        binary: Forwarded to the parent initialiser as ``binary``.
        stop_word_classes: Iterable of stop word classes. It is copied into
            a ``set`` stored as ``self.stop_word_classes``. The default is
            an empty tuple rather than a list so that no mutable object is
            shared between calls; the resulting set is the same.
    """
    self.voikko = Voikko(langtag)
    self.stop_word_classes = set(stop_word_classes)
    super().__init__(binary=binary)
# Print today's lunch entry from the menu page and collect the base form of
# every word in it.

# NOTE(review): "FI_fi" is not the usual spelling of the Finnish locale name
# ("fi_FI") - confirm that the target platform accepts it.
locale.setlocale(locale.LC_ALL, "FI_fi")

# Weekday name in the active locale; a command-line argument overrides it.
weekday = datetime.datetime.now().strftime("%A")  # e.g. Tiistai
if len(sys.argv) > 1:
    weekday = sys.argv[1]
pattern = re.compile(".*{}.*".format(weekday))

URL = "http://pompier.fi/espa/lounas/"
text = get_html(URL)
soup = BeautifulSoup(text)
# columns = soup.find_all('strong')
todays_lunch = soup.find(text=pattern)
if todays_lunch is None:
    # find() returns None when nothing matches; stop with a readable message
    # instead of failing with AttributeError on `.parent` below.
    sys.exit("No lunch entry matching {!r} found at {}".format(weekday, URL))

# The menu text sits two levels above the node that matched the weekday.
lunch_text = todays_lunch.parent.parent.text
print(lunch_text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")

# Turn hyphens and line breaks into spaces so a plain split yields words.
ttt = lunch_text.replace("-", " ").replace("\r", " ").replace("\n", " ")

all_words = []
for word in ttt.split(" "):
    word = word.strip("\n\r,.")
    if not word:
        # Consecutive separators produce empty tokens; they carry no word.
        continue
    foo = v.analyze(word)
    print("-- " + word + "--")
    # Use the base form of the first analysis, or the word itself when
    # Voikko returns no analysis.
    if foo and "BASEFORM" in foo[0]:
        base = foo[0]["BASEFORM"]
    else:
        base = word
    all_words.append(base)
    print(": " + base)
from libvoikko import Voikko

voikko = Voikko("fi")


# from https://stackoverflow.com/a/1988826/95357
class Memoize:
    """Callable wrapper that caches results keyed by the positional arguments."""

    def __init__(self, f):
        self.f = f
        self.memo = {}

    def __call__(self, *args):
        cache = self.memo
        if args in cache:
            return cache[args]
        # Warning: You may wish to do a deepcopy here if returning objects
        result = self.f(*args)
        cache[args] = result
        return result


@Memoize
def analyze_word(form):
    """Return the Voikko analysis of ``form``; repeated forms come from the cache."""
    return voikko.analyze(form)
from flask import Flask, request from flask_restful import Resource, Api from flask import jsonify import sys from libvoikko import Voikko app = Flask(__name__) api = Api(app) v = Voikko('fi') class Finnish_text_analysis(Resource): def get(self): word = request.args.get('word') return (jsonify(self.process(word))) class Analyze(Finnish_text_analysis): def process(self, word): return v.analyze(word) class Spell(Finnish_text_analysis): def process(self, word): return {"spelling": v.spell(word)} class Suggest(Finnish_text_analysis): def process(self, word):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from libvoikko import Voikko

# Package metadata.
AUTHOR = "Viljami Venekoski"
AUTHOR_EMAIL = "*****@*****.**"
VERSION = "0.1"

# Voikko instance for language tag "fi", created when the module is imported.
VOIKKO = Voikko("fi")