def testInitWithCorrectDictWorks(self):
    """A Voikko built with an explicit dictionary variant must use that dictionary.

    "amifostiini" is a medical term: unknown to the standard dictionary,
    known to the medical one. Deprecated failIf/failUnless aliases (removed
    in Python 3.12) replaced with assertFalse/assertTrue.
    """
    self.voikko.terminate()
    self.voikko = Voikko(u"fi-x-standard")
    self.assertFalse(self.voikko.spell(u"amifostiini"))
    self.voikko.terminate()
    self.voikko = Voikko(u"fi-x-medicine")
    self.assertTrue(self.voikko.spell(u"amifostiini"))
class VoikkoCountVectorizer(CountVectorizer):
    """Converts a collection of text documents to a matrix of lemmatized token counts.

    This is similar to scikit-learn CountVectorizer but uses Voikko for
    tokenization and lemmatization. Additionally stop words can be specified
    using word classes that are considered irrelevant for particular task.
    """

    FINNISH_STOPWORD_CLASSES = [
        "huudahdussana", "seikkasana", "lukusana", "asemosana", "sidesana",
        "suhdesana", "kieltosana"
    ]
    """List of closed word classes for Finnish analyzer. Use these if you want
    to concentrate the analysis on nouns, verbs and adjectives only."""

    def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
        """Create a vectorizer for the given BCP 47 language tag.

        stop_word_classes previously defaulted to a shared mutable list
        (the classic mutable-default pitfall); the default is now None,
        which behaves identically for callers.
        """
        self.voikko = Voikko(langtag)
        self.stop_word_classes = set(stop_word_classes) if stop_word_classes is not None else set()
        super().__init__(binary=binary)

    def terminate(self):
        """Release the underlying native Voikko handle."""
        self.voikko.terminate()

    def build_analyzer(self):
        """Return a callable that maps a text to a list of lemmatized tokens.

        Words whose every analysis belongs to a stop-word class are dropped
        (None filtered out); ambiguous lemmas fall back to word.lower().
        """
        check_stop_words = len(self.stop_word_classes) > 0

        def analyse_word(word):
            baseform = None
            is_stop_word = False
            for analysis in self.voikko.analyze(word):
                if check_stop_words and "CLASS" in analysis and analysis[
                        "CLASS"] in self.stop_word_classes:
                    is_stop_word = True
                elif "BASEFORM" in analysis:
                    new_baseform = analysis["BASEFORM"]
                    if baseform is not None and baseform != new_baseform:
                        # Ambiguous lemma: keep the surface form instead.
                        return word.lower()
                    baseform = new_baseform
                else:
                    # Analysis without a base form: keep the surface form.
                    return word.lower()
            if baseform is None:
                if is_stop_word:
                    return None
                return word.lower()
            return baseform

        def analyse_text(text):
            baseforms = [
                analyse_word(token.tokenText)
                for token in self.voikko.tokens(text)
                if token.tokenType == Token.WORD
            ]
            if check_stop_words:
                return [
                    baseform for baseform in baseforms if baseform is not None
                ]
            return baseforms

        return analyse_text
def setInstallationPath(self, path):
    """Record the extension installation path and point libvoikko's library
    search at the platform-specific subdirectory under it.

    The subdirectory name combines platform.system() and
    platform.architecture(), e.g. "Linux-64bit-ELF".
    """
    self.__installationPath = path
    searchPath = os.path.join(
        path, "voikko",
        platform.system() + "-" + "-".join(platform.architecture()))
    # Lazy %s formatting: the message is only built when DEBUG is enabled.
    logging.debug(
        "VoikkoHandlePool.setInstallationPath: library search path is %s",
        searchPath)
    Voikko.setLibrarySearchPath(searchPath)
def __openHandleWithVariant(self, language, fullVariant):
    """Open a Voikko handle for fullVariant, cache it under language and
    apply all globally configured options to it.

    Returns the handle, or None on failure (the error message is recorded
    in self.__initializationErrors for the language).
    """
    logging.debug("VoikkoHandlePool.__openHandleWithVariant")
    try:
        voikkoHandle = Voikko(fullVariant, self.getDictionaryPath())
        self.__handles[language] = voikkoHandle
        # Replay global options so the new handle matches existing ones.
        for booleanOpt, booleanValue in self.__globalBooleanOptions.items():
            voikkoHandle.setBooleanOption(booleanOpt, booleanValue)
        for integerOpt, integerValue in self.__globalIntegerOptions.items():
            voikkoHandle.setIntegerOption(integerOpt, integerValue)
        return voikkoHandle  # stray trailing semicolon removed
    except VoikkoException as e:
        self.__initializationErrors[language] = e.args[0]
        return None
def testListDictsWithoutPath(self):
    """listDicts() without a path returns dictionaries, 'standard' first.

    Deprecated failUnless alias (removed in Python 3.12) replaced with
    assertTrue.
    """
    dicts = Voikko.listDicts()
    self.assertTrue(len(dicts) > 0)
    standard = dicts[0]
    self.assertEqual(
        u"standard", standard.variant,
        u"Standard dictionary must be the default in test environment.")
def initVoikko():
    """Create one configured Voikko handle per allowed dictionary and record
    the matching dictionary metadata in the module-level _dictInfo map."""
    global _voikko
    for allowedDict in ALLOWED_DICTS:
        handle = Voikko(allowedDict)
        handle.setIgnoreDot(False)
        handle.setAcceptUnfinishedParagraphsInGc(True)
        _voikko[allowedDict] = handle
    for dictionary in Voikko.listDicts():
        tag = dictionary.language + u"-x-" + dictionary.variant
        if tag in ALLOWED_DICTS:
            _dictInfo[tag] = dictionary
class VoikkoCountVectorizer(CountVectorizer):
    """CountVectorizer variant that tokenizes and lemmatizes with Voikko.

    Stop words are expressed as Voikko word classes rather than word lists.
    """

    # Closed word classes for Finnish; use as stop_word_classes to keep only
    # nouns, verbs and adjectives in the analysis.
    FINNISH_STOPWORD_CLASSES = ["huudahdussana", "seikkasana", "lukusana",
                                "asemosana", "sidesana", "suhdesana",
                                "kieltosana"]

    def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
        # Default changed from a shared mutable [] to None (same behavior,
        # avoids the mutable-default-argument pitfall).
        self.voikko = Voikko(langtag)
        self.stop_word_classes = set(stop_word_classes) if stop_word_classes is not None else set()
        super().__init__(binary=binary)

    def terminate(self):
        """Release the underlying native Voikko handle."""
        self.voikko.terminate()

    def build_analyzer(self):
        """Return a callable mapping a text to a list of lemmatized tokens."""
        check_stop_words = len(self.stop_word_classes) > 0

        def analyse_word(word):
            baseform = None
            is_stop_word = False
            for analysis in self.voikko.analyze(word):
                if check_stop_words and "CLASS" in analysis and analysis["CLASS"] in self.stop_word_classes:
                    is_stop_word = True
                elif "BASEFORM" in analysis:
                    new_baseform = analysis["BASEFORM"]
                    if baseform is not None and baseform != new_baseform:
                        # Ambiguous lemma: fall back to the surface form.
                        return word.lower()
                    baseform = new_baseform
                else:
                    return word.lower()
            if baseform is None:
                if is_stop_word:
                    return None
                return word.lower()
            return baseform

        def analyse_text(text):
            baseforms = [analyse_word(token.tokenText)
                         for token in self.voikko.tokens(text)
                         if token.tokenType == Token.WORD]
            if check_stop_words:
                return [baseform for baseform in baseforms if baseform is not None]
            return baseforms

        return analyse_text
def testListDictsWithPathAndAttributes(self):
    """A morphology created in a temporary data directory is listed with its
    variant, description, default language and empty script."""
    info = MorphologyInfo()
    info.variant = u"test-variant-name"
    info.description = u"Some test description sakldjasd"
    info.morphology = u"null"
    dataDir = TestDataDir()
    dataDir.createMorphology(info.variant, info)
    dicts = Voikko.listDicts(dataDir.getDirectory())
    dataDir.tearDown()
    # Comprehension instead of list(filter(lambda ...)); same result.
    matching = [aDict for aDict in dicts if aDict.variant == info.variant]
    self.assertEqual(1, len(matching))
    theDict = matching[0]
    self.assertEqual(info.description, theDict.description)
    self.assertEqual(u"fi", theDict.language)
    self.assertEqual(u"", theDict.script)
def testListDictsWithPathAndAttributes(self):
    """Dictionaries found under an extra data directory carry the attributes
    of the morphology that was created there."""
    info = MorphologyInfo()
    info.variant = u"test-variant-name"
    info.description = u"Some test description sakldjasd"
    info.morphology = u"null"
    dataDir = TestDataDir()
    dataDir.createMorphology(info.variant, info)
    dicts = Voikko.listDicts(dataDir.getDirectory())
    dataDir.tearDown()
    withVariant = [candidate for candidate in dicts
                   if candidate.variant == info.variant]
    self.assertEqual(1, len(withVariant))
    found = withVariant[0]
    self.assertEqual(info.description, found.description)
    self.assertEqual(u"fi", found.language)
    self.assertEqual(u"", found.script)
def __openHandleWithVariant(self, language, fullVariant):
    """Open and cache a Voikko handle for fullVariant, applying all global
    options; returns None and records the error message on failure."""
    logging.debug("VoikkoHandlePool.__openHandleWithVariant")
    try:
        handle = Voikko(fullVariant, self.getDictionaryPath())
        self.__handles[language] = handle
        for option, value in self.__globalBooleanOptions.items():
            handle.setBooleanOption(option, value)
        for option, value in self.__globalIntegerOptions.items():
            handle.setIntegerOption(option, value)
        return handle
    except VoikkoException as e:
        self.__initializationErrors[language] = e.args[0]
        return None
def testGetVersion(self):
    """getVersion() returns a string beginning with a digit.

    We can't test for the exact version; assertRegex (anchored, matching the
    original re.match semantics) replaces the deprecated failUnless alias.
    """
    version = Voikko.getVersion()
    self.assertRegex(version, u"^[0-9]")
def setUp(self):
    # Fresh Voikko instance with the default Finnish dictionary for each test.
    self.voikko = Voikko(u"fi")
def testInitWithPathWorks(self):
    """A nonexistent extra dictionary path must not break initialization.

    Deprecated failUnless alias replaced with assertTrue.
    """
    # TODO: better test
    self.voikko.terminate()
    self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
    self.assertTrue(self.voikko.spell(u"kissa"))
#!/usr/bin/env python # -*- coding: utf-8 -*- from libvoikko import Voikko AUTHOR = 'Viljami Venekoski' AUTHOR_EMAIL = "*****@*****.**" VERSION = '0.1' VOIKKO = Voikko("fi")
import re from libvoikko import Voikko # v = Voikko('fi-x-morphoid') v = Voikko("fi") # Replace all non letter characters with space RE_WS_REPLACE = re.compile(r"[^\w]", re.UNICODE) RE_FIND_COMPOUNDS = re.compile(r"\(([\w+]+)\)", re.UNICODE) def voikko_analyze(text): text = RE_WS_REPLACE.sub(" ", text) words = text.split(" ") # Strip spaces words = [x.strip() for x in words] # Remove empty items words = filter(None, words) # Loop all words and analyze them analyzed = [] for word in words: aword = v.analyze(word) if aword: i = 0 for f in aword: i += 1 f["found"] = True f["original"] = word wordbases = RE_FIND_COMPOUNDS.findall(f.get("WORDBASES", "")) f["wordbase_list"] = [
class VoikkoAttributeVectorizer:
    """Converts a collection of text documents to a matrix of counts of words
    having specific value for enumerated morphological analysis attributes.

    Examples
    --------
    >>> from voikko_sklearn import VoikkoAttributeVectorizer
    >>> corpus = [
    ...     'Koiran karvat olivat takussa.',
    ...     'Kissamme goli vanha.'
    ... ]
    >>> vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
    >>> print(vectorizer.get_feature_names())
    ['unknown', 'NUMBER_plural', 'NUMBER_singular', 'PERSON_1', 'PERSON_2', 'PERSON_3', 'PERSON_4']
    >>> X = vectorizer.transform(corpus)
    >>> print(X.toarray())
    [[0.         0.5        0.5        0.         0.         0.25       0.        ]
     [0.33333333 0.         0.66666667 0.         0.         0.         0.        ]]
    """

    def __init__(self, attributes, langtag="fi"):
        """Build the feature space for the given analysis attributes.

        Raises ValueError if an attribute is unknown or not categorial.
        Fix: removed the original `self.input = input` assignment — it stored
        the *builtin* input() function and was never read (copy-paste residue
        from scikit-learn's vectorizer signature).
        """
        self.attributes = attributes
        self.voikko = Voikko(langtag)
        self.__init_feature_names()

    def __init_feature_names(self):
        # Feature 0 ('unknown') counts words with no morphological analysis.
        self.feature_names = ['unknown']
        self.feature_name_to_index = {'unknown': 0}
        for attribute in self.attributes:
            values = self.voikko.attributeValues(attribute)
            if values is None:
                raise ValueError("Attribute '" + attribute +
                                 "' does not exist or is not categorial.")
            values.sort()
            for value in values:
                name = attribute + '_' + value
                self.feature_name_to_index[name] = len(self.feature_names)
                self.feature_names.append(name)

    def terminate(self):
        """Release the underlying native Voikko handle."""
        self.voikko.terminate()

    def build_tokenizer(self):
        """Return a callable extracting word tokens from a text."""
        return lambda text: [token.tokenText
                             for token in self.voikko.tokens(text)
                             if token.tokenType == Token.WORD]

    def get_feature_names(self):
        """Return the ordered list of feature (column) names."""
        return self.feature_names

    def __transform_document(self, document, target_vector):
        """Accumulate normalized attribute-value counts for one document
        into target_vector (a row of the output matrix)."""
        words = self.build_tokenizer()(document)
        wordcount = len(words)
        if wordcount == 0:
            return
        for word in words:
            analysis_list = self.voikko.analyze(word)
            count = len(analysis_list)
            if count == 0:
                target_vector[0] += 1
            else:
                # Weight each analysis by 1/count so an ambiguous word still
                # contributes a total of 1 per matching attribute.
                for analysis in analysis_list:
                    for attribute in self.attributes:
                        if attribute in analysis:
                            value = analysis[attribute]
                            target_vector[self.feature_name_to_index[
                                attribute + "_" + value]] += 1.0 / count
        target_vector /= wordcount

    def transform(self, document_list):
        """Transform documents to a sparse document-feature matrix."""
        document_count = len(document_list)
        vector_length = len(self.feature_names)
        data = numpy.zeros((document_count, vector_length),
                           dtype=numpy.float64)
        for i in range(document_count):
            self.__transform_document(document_list[i], data[i])
        return csr_matrix(data)

    def fit(self, document_list):
        """No-op: the feature space is fixed at construction time."""
        return self

    def fit_transform(self, document_list):
        """Equivalent to transform(); fit() is a no-op."""
        return self.transform(document_list)
def testListDictsWithoutPath(self):
    """listDicts() without a path lists dictionaries with 'standard' first.

    Deprecated failUnless alias replaced with assertTrue.
    """
    dicts = Voikko.listDicts()
    self.assertTrue(len(dicts) > 0)
    standard = dicts[0]
    self.assertEqual(u"standard", standard.variant,
                     u"Standard dictionary must be the default in test environment.")
def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
    """Two Voikko instances with different dictionaries coexist, and deleting
    one leaves the other functional.

    Deprecated failUnless/failIf aliases replaced with assertTrue/assertFalse.
    """
    medicalVoikko = Voikko(u"fi-x-medicine")
    self.assertTrue(medicalVoikko.spell(u"amifostiini"))
    self.assertFalse(self.voikko.spell(u"amifostiini"))
    del medicalVoikko
    self.assertFalse(self.voikko.spell(u"amifostiini"))
class LibvoikkoTest(unittest.TestCase):
    """Tests for the libvoikko Python bindings.

    Fix: deprecated unittest aliases (failUnless, failIf, assertEquals,
    assertNotEquals) replaced with their modern equivalents; the aliases
    were removed in Python 3.12.
    """

    def setUp(self):
        # Fresh default Finnish Voikko instance per test.
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        pass  # do nothing, just check that setUp and tearDown complete successfully

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.assertTrue(medicalVoikko.spell(u"amifostiini"))
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.assertFalse(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.assertTrue(d1 < d2)
        self.assertTrue(d2 < d3)
        self.assertTrue(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.assertTrue(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(u"standard", standard.variant,
                         u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.assertTrue(u"fi" in langs,
                        u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        # "amifostiini" is only in the medical dictionary.
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.assertTrue(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.assertTrue(self.voikko.spell(u"määrä"))
        self.assertFalse(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.assertTrue(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testHyphenationPattern(self):
        # NOTE(review): whitespace in these expected patterns is significant
        # (each character position mirrors the input word) — verify the exact
        # spacing against the canonical upstream test file.
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual(" - = - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual(" = ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual(" - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual(" - -= - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual(" - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to",
                         self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        # \u00ad is the soft hyphen; written as an escape for readability
        # (same runtime string as the original literal).
        self.assertEqual(u"kis\u00adsa",
                         self.voikko.hyphenate(u"kissa", u"\u00ad", True))
        self.assertEqual(u"kuor\u00adma-au\u00adto",
                         self.voikko.hyphenate(u"kuorma-auto", u"\u00ad", True))
        self.assertEqual(u"vaa\u00adan",
                         self.voikko.hyphenate(u"vaa'an", u"\u00ad", True))
        self.assertEqual(u"vaa'an",
                         self.voikko.hyphenate(u"vaa'an", u"\u00ad", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.assertFalse(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.assertTrue(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.assertFalse(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.assertTrue(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.assertFalse(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.assertTrue(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.assertTrue(self.voikko.spell(u"Äiti"))
        self.assertFalse(self.voikko.spell(u"Ääiti"))
        self.assertTrue(self.voikko.spell(u"š"))
        self.assertTrue(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.assertFalse(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.assertTrue(self.voikko.spell("KISSA"))
        self.assertFalse(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.assertFalse(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.assertTrue(self.voikko.spell("*****@*****.**"))
        self.assertFalse(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.assertFalse(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.assertTrue(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.assertFalse(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.assertTrue(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ",
                                                          "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ",
                                                          "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa",
                                                             "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po",
                         self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is
        # actually increased
        self.voikko.setSpellerCacheSize(3)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is
        # actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.assertFalse(u"koira" in self.voikko.suggest(u"koari"))
        self.assertTrue(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.assertTrue(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.assertTrue(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.assertTrue(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may
        # change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertTrue(self.voikko.spell(longWord))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertFalse(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may
        # change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = \
            "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.assertFalse(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira",
                         self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa\0koira",
                                                          "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)
        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)
        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.assertFalse(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with
        # a number
        self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
def tryInit():
    # Helper closure fragment: attempts to open a Voikko instance for a
    # dictionary variant that does not exist.
    # NOTE(review): `self` is the enclosing test case — this is a nested
    # function from a test method (used with assertRaises(VoikkoException,
    # tryInit)), not a free function; confirm against the full test file.
    self.voikko = Voikko(u"fi-x-non-existent-variant")
def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
    """Create a Voikko-backed vectorizer for the given language tag.

    Fix: stop_word_classes previously defaulted to a shared mutable []
    (the mutable-default-argument pitfall); None is the safe equivalent
    and is backward compatible for all callers.
    """
    self.voikko = Voikko(langtag)
    self.stop_word_classes = set(stop_word_classes) if stop_word_classes is not None else set()
    super().__init__(binary=binary)
def testListSupportedSpellingLanguagesWithoutPath(self):
    """Finnish must be among the supported spelling languages.

    Deprecated failUnless alias replaced with assertIn (clearer failure
    output, same check and same custom message).
    """
    langs = Voikko.listSupportedSpellingLanguages()
    self.assertIn(u"fi", langs,
                  u"Finnish dictionary must be present in the test environment")
def __init__(self, attributes, langtag="fi"):
    """Build the vectorizer's feature space for the given analysis attributes.

    Fix: removed the original `self.input = input` assignment — it stored
    the *builtin* input() function (copy-paste residue from scikit-learn's
    vectorizer signature) and served no purpose.
    """
    self.attributes = attributes
    self.voikko = Voikko(langtag)
    self.__init_feature_names()
# This is an example application
import pandas as pd

from libvoikko import Voikko

# Demonstrate morphological analysis of a single word form.
v = Voikko("fi")
print(v.analyze("autossa"))

df1 = pd.read_csv("./data/input.csv")
df2 = pd.read_csv("./data/input.csv")
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked result.
df1 = pd.concat([df1, df2])
df1.to_csv("./data/output.csv")
logging.getLogger().setLevel(logging.DEBUG) def messageBox(messageText): ctx = uno.getComponentContext() sManager = ctx.ServiceManager toolkit = sManager.createInstance("com.sun.star.awt.Toolkit") msgbox = toolkit.createMessageBox(None, ERRORBOX, BUTTONS_OK, "Error initializing Voikko", messageText) return msgbox.execute() if not PropertyManager.loadingFailed: try: # Force initialization of property manager so that it is done before anything else. PropertyManager.getInstance() # We could check for specific version but this at least ensures that libvoikko is installed # (this would throw an exception if it's not). Voikko.getVersion() # name of g_ImplementationHelper is significant, Python component loader expects to find it g_ImplementationHelper = unohelper.ImplementationHelper() g_ImplementationHelper.addImplementation(SettingsEventHandler, \ SettingsEventHandler.IMPLEMENTATION_NAME, SettingsEventHandler.SUPPORTED_SERVICE_NAMES,) g_ImplementationHelper.addImplementation(SpellChecker, \ SpellChecker.IMPLEMENTATION_NAME, SpellChecker.SUPPORTED_SERVICE_NAMES,) g_ImplementationHelper.addImplementation(Hyphenator, \ Hyphenator.IMPLEMENTATION_NAME, Hyphenator.SUPPORTED_SERVICE_NAMES,) g_ImplementationHelper.addImplementation(GrammarChecker, \ GrammarChecker.IMPLEMENTATION_NAME, GrammarChecker.SUPPORTED_SERVICE_NAMES,) except OSError as e:
def testListSupportedSpellingLanguagesWithoutPath(self):
    """Finnish must be among the supported spelling languages.

    Deprecated failUnless alias replaced with assertIn (same check and
    message, clearer failure output).
    """
    langs = Voikko.listSupportedSpellingLanguages()
    self.assertIn(
        u"fi", langs,
        u"Finnish dictionary must be present in the test environment")
"""Contains functions for retrieving pre-processed words from one teletext frontpage image. See instructions in words_from_image() """ import re from typing import List, Tuple import pytesseract from PIL import Image, ImageOps from libvoikko import Voikko # these settings only work in Windows environment Voikko.setLibrarySearchPath("C:/python37/DLLs") voikko = Voikko("fi-x-morphoid") pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe' def words_from_image(filename: str) -> List[List[str]]: """Retrieve pre-processed words from given 'filename' containing teletext frontpage image. Return value is a list of lists e.g. [['word1', 'word2'], ['word1', 'word2', 'word3']] """ # make image black and white image = Image.open(filename).convert('1').convert('RGB') # invert black and white image = ImageOps.invert(image)
# Scrape today's lunch menu from the restaurant page and print each word's
# Voikko base form.
locale.setlocale(locale.LC_ALL, 'FI_fi')
weekday = datetime.datetime.now().strftime('%A')  # e.g. Tiistai
# An explicit weekday can be given as the first command-line argument.
if len(sys.argv) > 1:
    weekday = sys.argv[1]
pattern = re.compile('.*{}.*'.format(weekday))
URL = 'http://pompier.fi/espa/lounas/'
text = get_html(URL)
soup = BeautifulSoup(text)
# columns = soup.find_all('strong')
# Find the text node mentioning the weekday; its grandparent holds the menu.
todays_lunch = soup.find(text=pattern)
print(todays_lunch.parent.parent.text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")
# Normalize separators to spaces before splitting into words.
ttt = todays_lunch.parent.parent.text.replace('-', ' ').replace('\r', ' ').replace('\n', ' ')
all_words = []
for word in ttt.split(" "):
    word = word.strip('\n\r,.')
    foo = v.analyze(word)
    print("-- " + word + "--")
    # Use the first analysis' base form when available; otherwise keep the
    # surface form.
    if foo and 'BASEFORM' in foo[0]:
        base = foo[0]['BASEFORM']
    else:
        base = word
    all_words.append(base)
    print(": " + base)
print(all_words)
# Scrape today's lunch menu from the restaurant page and print each word's
# Voikko base form.
locale.setlocale(locale.LC_ALL, "FI_fi")
weekday = datetime.datetime.now().strftime("%A")  # e.g. Tiistai
# An explicit weekday can be given as the first command-line argument.
if len(sys.argv) > 1:
    weekday = sys.argv[1]
pattern = re.compile(".*{}.*".format(weekday))
URL = "http://pompier.fi/espa/lounas/"
text = get_html(URL)
soup = BeautifulSoup(text)
# columns = soup.find_all('strong')
# Find the text node mentioning the weekday; its grandparent holds the menu.
todays_lunch = soup.find(text=pattern)
print(todays_lunch.parent.parent.text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")
# Normalize separators to spaces before splitting into words.
ttt = (todays_lunch.parent.parent.text.replace("-", " ").replace("\r", " ").replace(
    "\n", " "))
all_words = []
for word in ttt.split(" "):
    word = word.strip("\n\r,.")
    foo = v.analyze(word)
    print("-- " + word + "--")
    # Use the first analysis' base form when available; otherwise keep the
    # surface form.
    if foo and "BASEFORM" in foo[0]:
        base = foo[0]["BASEFORM"]
    else:
        base = word
    all_words.append(base)
    print(": " + base)
class LibvoikkoTest(unittest.TestCase):
    """Tests for the libvoikko Python bindings.

    Fix: the deprecated unittest aliases failUnless/failIf/assertEquals
    (removed in Python 3.12) were replaced with assertTrue/assertFalse/
    assertEqual. Test behavior is otherwise unchanged.
    """

    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        # do nothing, just check that setUp and tearDown complete succesfully
        pass

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.assertTrue(medicalVoikko.spell(u"amifostiini"))
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.assertFalse(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.assertTrue(d1 < d2)
        self.assertTrue(d2 < d3)
        self.assertTrue(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.assertTrue(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(
            u"standard", standard.variant,
            u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.assertTrue(
            u"fi" in langs,
            u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.assertTrue(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.assertTrue(self.voikko.spell(u"määrä"))
        self.assertFalse(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.assertTrue(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testAttributeValuesForEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"NUMBER")
        self.assertEqual(2, len(values))
        self.assertTrue("singular" in values)
        self.assertTrue("plural" in values)

    def testAttributeValuesForNonEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"BASEFORM")
        self.assertEqual(None, values)

    def testAttributeValuesForUnknownAttribute(self):
        values = self.voikko.attributeValues(u"XYZ")
        self.assertEqual(None, values)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual(" - = - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual(" = ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual(" - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual(" - -= - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual(" - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to",
                         self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        # Separator is U+00AD (soft hyphen).
        self.assertEqual(u"kis­sa", self.voikko.hyphenate(u"kissa", u"­", True))
        self.assertEqual(u"kuor­ma-au­to",
                         self.voikko.hyphenate(u"kuorma-auto", u"­", True))
        self.assertEqual(u"vaa­an", self.voikko.hyphenate(u"vaa'an", u"­", True))
        self.assertEqual(u"vaa'an",
                         self.voikko.hyphenate(u"vaa'an", u"­", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.assertFalse(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.assertTrue(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.assertFalse(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.assertTrue(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.assertFalse(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.assertTrue(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.assertTrue(self.voikko.spell(u"Äiti"))
        self.assertFalse(self.voikko.spell(u"Ääiti"))
        self.assertTrue(self.voikko.spell(u"š"))
        self.assertTrue(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.assertFalse(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.assertTrue(self.voikko.spell("KISSA"))
        self.assertFalse(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.assertFalse(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.assertTrue(self.voikko.spell("*****@*****.**"))
        self.assertFalse(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.assertFalse(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.assertTrue(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.assertFalse(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.assertTrue(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is actually increased
        self.voikko.setSpellerCacheSize(3)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.assertFalse(u"koira" in self.voikko.suggest(u"koari"))
        self.assertTrue(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.assertTrue(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.assertTrue(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.assertTrue(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertTrue(self.voikko.spell(longWord))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertFalse(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.assertFalse(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira",
                         self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)
        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)
        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.assertFalse(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with a number
        self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
#!/usr/bin/env python
"""Print the Voikko morphological analysis of each word given on the command line."""
import sys
from libvoikko import Voikko

print('Analysoidaan annetut sanat:\n')
v = Voikko("fi")
# Pass the 1st argument as it is the app name itself.
words = sys.argv[1:]
for word in words:
    header = 'Sanan {} analyysi:'.format(word)
    print(header)
    print(v.analyze(word))
print('Annetut sanat analysoitu.')
# Small Flask-RESTful service exposing Voikko analysis over HTTP.
from flask import Flask, request
from flask_restful import Resource, Api
from flask import jsonify
import sys
from libvoikko import Voikko

app = Flask(__name__)
api = Api(app)
# Single shared Voikko handle for all requests.
v = Voikko('fi')


class Finnish_text_analysis(Resource):
    # Base resource: subclasses implement process(word); GET reads the
    # "word" query parameter and returns the processed result as JSON.
    def get(self):
        word = request.args.get('word')
        return (jsonify(self.process(word)))


class Analyze(Finnish_text_analysis):
    # Full morphological analysis of the word.
    def process(self, word):
        return v.analyze(word)


class Spell(Finnish_text_analysis):
    # Spell-check result wrapped in a dict.
    def process(self, word):
        return {"spelling": v.spell(word)}


class Suggest(Finnish_text_analysis):
    # NOTE(review): body truncated in this chunk; presumably returns
    # v.suggest(word) — confirm against the full file.
    def process(self, word):
#!/usr/bin/env python
"""Minimal libvoikko demo: print the morphological analysis of one fixed word."""
from libvoikko import Voikko

v = Voikko("fi")
analysis = v.analyze('astetta')
print(analysis)
def setInstallationPath(self, path):
    """Store the installation path and register the matching platform-specific
    subdirectory as libvoikko's native library search path."""
    self.__installationPath = path
    # Subdirectory name derived from the running platform,
    # e.g. "<system>-<bits>-<linkage>" from platform.architecture().
    platformDir = platform.system() + "-" + "-".join(platform.architecture())
    searchPath = os.path.join(path, "voikko", platformDir)
    logging.debug(
        "VoikkoHandlePool.setInstallationPath: library search path is " +
        searchPath)
    Voikko.setLibrarySearchPath(searchPath)
from libvoikko import Voikko

voikko = Voikko("fi")


# from https://stackoverflow.com/a/1988826/95357
class Memoize:
    """Memoization decorator keyed on positional arguments.

    Results are cached forever in a plain dict, so the wrapped function
    should be deterministic and its arguments hashable.
    """

    def __init__(self, f):
        self.f = f
        self.memo = {}

    def __call__(self, *args):
        # Fix: idiomatic membership test ("args not in"), was "not args in"
        # which PEP 8 / flake8 E713 flags.
        if args not in self.memo:
            self.memo[args] = self.f(*args)
        # Warning: You may wish to do a deepcopy here if returning objects
        return self.memo[args]


@Memoize
def analyze_word(form):
    """Return the (cached) Voikko morphological analysis of *form*."""
    return voikko.analyze(form)
def __initAvailableVariants(self):
    """Populate the cached list of dictionary variant labels.

    Each entry is formatted as "<variant>: <description>" for every
    dictionary found under the pool's dictionary path.
    """
    availableDicts = Voikko.listDicts(
        VoikkoHandlePool.getInstance().getDictionaryPath())
    self.__dictionaryVariantList = [
        aDict.variant + ": " + aDict.description for aDict in availableDicts
    ]