Пример #1
0
 def testInitWithCorrectDictWorks(self):
     """Check that the dictionary variant given to Voikko() is the one used.

     The word "amifostiini" must be rejected with the "fi-x-standard"
     variant and accepted with the "fi-x-medicine" variant.
     """
     # Release the handle stored on the test instance before replacing it.
     self.voikko.terminate()
     self.voikko = Voikko(u"fi-x-standard")
     # assertFalse/assertTrue replace failIf/failUnless, which were
     # removed from unittest in Python 3.12.
     self.assertFalse(self.voikko.spell(u"amifostiini"))
     self.voikko.terminate()
     self.voikko = Voikko(u"fi-x-medicine")
     self.assertTrue(self.voikko.spell(u"amifostiini"))
Пример #2
0
 def testInitWithCorrectDictWorks(self):
     """Check that the dictionary variant given to Voikko() is the one used.

     The word "amifostiini" must be rejected with the "fi-x-standard"
     variant and accepted with the "fi-x-medicine" variant.
     """
     # Release the handle stored on the test instance before replacing it.
     self.voikko.terminate()
     self.voikko = Voikko(u"fi-x-standard")
     # assertFalse/assertTrue replace failIf/failUnless, which were
     # removed from unittest in Python 3.12.
     self.assertFalse(self.voikko.spell(u"amifostiini"))
     self.voikko.terminate()
     self.voikko = Voikko(u"fi-x-medicine")
     self.assertTrue(self.voikko.spell(u"amifostiini"))
Пример #3
0
class VoikkoCountVectorizer(CountVectorizer):
    """Converts a collection of text documents to a matrix of lemmatized token counts.

    This is similar to scikit-learn CountVectorizer but uses Voikko for tokenization and
    lemmatization. Additionally stop words can be specified using word classes that are
    considered irrelevant for particular task.
    """

    FINNISH_STOPWORD_CLASSES = [
        "huudahdussana", "seikkasana", "lukusana", "asemosana", "sidesana",
        "suhdesana", "kieltosana"
    ]
    """List of closed word classes for Finnish analyzer. Use these if you want to concentrate the analysis on nouns, verbs and
    adjectives only."""

    def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
        """Create the vectorizer and open a Voikko handle.

        Parameters
        ----------
        langtag : language tag passed to ``Voikko``.
        binary : forwarded to ``CountVectorizer.__init__``.
        stop_word_classes : iterable of word class names (values of the
            "CLASS" analysis attribute) to treat as stop words. ``None``
            (the default) or an empty iterable disables stop word filtering.
        """
        # Kept as an attribute because scikit-learn's get_params() looks up
        # every __init__ parameter by name on the instance.
        self.langtag = langtag
        self.voikko = Voikko(langtag)
        # None instead of a shared mutable [] default.
        if stop_word_classes is None:
            stop_word_classes = ()
        self.stop_word_classes = set(stop_word_classes)
        super().__init__(binary=binary)

    def terminate(self):
        """Release the underlying Voikko handle."""
        self.voikko.terminate()

    def build_analyzer(self):
        """Return a callable that maps a text to its list of lemmatized word tokens."""
        check_stop_words = bool(self.stop_word_classes)

        def analyse_word(word):
            """Return the base form of ``word``, the lowercased word, or None.

            None is returned only for words whose every analysis belongs to a
            stop word class. The lowercased word is returned when the word has
            no analyses, an analysis without "BASEFORM", or conflicting base
            forms.
            """
            baseform = None
            is_stop_word = False
            for analysis in self.voikko.analyze(word):
                if (check_stop_words and "CLASS" in analysis
                        and analysis["CLASS"] in self.stop_word_classes):
                    is_stop_word = True
                elif "BASEFORM" in analysis:
                    new_baseform = analysis["BASEFORM"]
                    if baseform is not None and baseform != new_baseform:
                        # Ambiguous lemma: fall back to the surface form.
                        return word.lower()
                    baseform = new_baseform
                else:
                    return word.lower()
            if baseform is None:
                if is_stop_word:
                    return None
                return word.lower()
            return baseform

        def analyse_text(text):
            """Tokenize ``text`` with Voikko and lemmatize every WORD token."""
            baseforms = [
                analyse_word(token.tokenText)
                for token in self.voikko.tokens(text)
                if token.tokenType == Token.WORD
            ]
            if check_stop_words:
                # None marks a stop word; drop those entries.
                return [
                    baseform for baseform in baseforms if baseform is not None
                ]
            return baseforms

        return analyse_text
 def setInstallationPath(self, path):
     """Remember the installation path and point Voikko at its library directory.

     The library search path is <path>/voikko/<system>-<architecture parts>,
     built from platform.system() and platform.architecture().
     """
     self.__installationPath = path
     platformDir = platform.system() + "-" + "-".join(platform.architecture())
     searchPath = os.path.join(path, "voikko", platformDir)
     logging.debug(
         "VoikkoHandlePool.setInstallationPath: library search path is " +
         searchPath)
     Voikko.setLibrarySearchPath(searchPath)
Пример #5
0
	def __openHandleWithVariant(self, language, fullVariant):
		"""Open a Voikko handle for fullVariant and register it under language.

		All global boolean and integer options are applied to the new handle.
		Returns the handle, or None if a VoikkoException was raised, in which
		case the first exception argument is stored as the initialization
		error for the language.
		"""
		logging.debug("VoikkoHandlePool.__openHandleWithVariant")
		try:
			handle = Voikko(fullVariant, self.getDictionaryPath())
			# The handle is registered before the options are applied.
			self.__handles[language] = handle
			for option, enabled in self.__globalBooleanOptions.items():
				handle.setBooleanOption(option, enabled)
			for option, amount in self.__globalIntegerOptions.items():
				handle.setIntegerOption(option, amount)
		except VoikkoException as error:
			self.__initializationErrors[language] = error.args[0]
			return None
		return handle
Пример #6
0
 def testListDictsWithoutPath(self):
     """Check that listDicts() without a path returns the standard dictionary first."""
     dicts = Voikko.listDicts()
     # assertTrue replaces failUnless, which was removed in Python 3.12.
     self.assertTrue(len(dicts) > 0)
     standard = dicts[0]
     self.assertEqual(
         u"standard", standard.variant,
         u"Standard dictionary must be the default in test environment.")
Пример #7
0
def initVoikko():
    """Fill the module-level _voikko and _dictInfo tables.

    One Voikko handle is created per entry of ALLOWED_DICTS, with "ignore
    dot" switched off and unfinished paragraphs accepted in grammar
    checking. Dictionary information returned by Voikko.listDicts() is
    stored for every dictionary whose "<language>-x-<variant>" tag is
    allowed.
    """
    global _voikko
    for dictTag in ALLOWED_DICTS:
        handle = Voikko(dictTag)
        handle.setIgnoreDot(False)
        handle.setAcceptUnfinishedParagraphsInGc(True)
        _voikko[dictTag] = handle
    for dictionary in Voikko.listDicts():
        fullTag = dictionary.language + u"-x-" + dictionary.variant
        if fullTag not in ALLOWED_DICTS:
            continue
        _dictInfo[fullTag] = dictionary
Пример #8
0
class VoikkoCountVectorizer(CountVectorizer):
	"""CountVectorizer variant that tokenizes and lemmatizes text with Voikko.

	Words can additionally be dropped as stop words based on the word class
	("CLASS" attribute) reported by Voikko's morphological analysis.
	"""

	# Closed word classes of the Finnish analyzer, usable as stop_word_classes.
	FINNISH_STOPWORD_CLASSES = ["huudahdussana", "seikkasana", "lukusana", "asemosana", "sidesana", "suhdesana", "kieltosana"]

	def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
		"""Create the vectorizer and open a Voikko handle.

		langtag is passed to Voikko, binary to CountVectorizer.__init__.
		stop_word_classes is an iterable of word class names to treat as
		stop words; None (the default) or an empty iterable disables the
		filtering.
		"""
		# Kept as an attribute because scikit-learn's get_params() looks up
		# every __init__ parameter by name on the instance.
		self.langtag = langtag
		self.voikko = Voikko(langtag)
		# None instead of a shared mutable [] default.
		if stop_word_classes is None:
			stop_word_classes = ()
		self.stop_word_classes = set(stop_word_classes)
		super().__init__(binary=binary)

	def terminate(self):
		"""Release the underlying Voikko handle."""
		self.voikko.terminate()

	def build_analyzer(self):
		"""Return a callable that maps a text to its list of lemmatized word tokens."""
		check_stop_words = bool(self.stop_word_classes)

		def analyse_word(word):
			"""Return the base form of word, the lowercased word, or None.

			None is returned only for words whose every analysis belongs to
			a stop word class. The lowercased word is returned when the word
			has no analyses, an analysis without "BASEFORM", or conflicting
			base forms.
			"""
			baseform = None
			is_stop_word = False
			for analysis in self.voikko.analyze(word):
				if check_stop_words and "CLASS" in analysis and analysis["CLASS"] in self.stop_word_classes:
					is_stop_word = True
				elif "BASEFORM" in analysis:
					new_baseform = analysis["BASEFORM"]
					if baseform is not None and baseform != new_baseform:
						# Ambiguous lemma: fall back to the surface form.
						return word.lower()
					baseform = new_baseform
				else:
					return word.lower()
			if baseform is None:
				if is_stop_word:
					return None
				return word.lower()
			return baseform

		def analyse_text(text):
			"""Tokenize text with Voikko and lemmatize every WORD token."""
			baseforms = [analyse_word(token.tokenText) for token in self.voikko.tokens(text) if token.tokenType == Token.WORD]
			if check_stop_words:
				# None marks a stop word; drop those entries.
				return [baseform for baseform in baseforms if baseform is not None]
			return baseforms

		return analyse_text
Пример #9
0
 def testListDictsWithPathAndAttributes(self):
     """Check that listDicts(path) reports a dictionary created in that path.

     A morphology with a known variant name and description is created in
     a test data directory; exactly one listed dictionary must carry that
     variant, with the expected description, language and script.
     """
     info = MorphologyInfo()
     info.variant = u"test-variant-name"
     info.description = u"Some test description sakldjasd"
     info.morphology = u"null"
     dataDir = TestDataDir()
     try:
         dataDir.createMorphology(info.variant, info)
         dicts = Voikko.listDicts(dataDir.getDirectory())
     finally:
         # Clean up even when creating or listing the dictionary fails.
         dataDir.tearDown()
     dictsWithCorrectVariant = [aDict for aDict in dicts if aDict.variant == info.variant]
     self.assertEqual(1, len(dictsWithCorrectVariant))
     theDict = dictsWithCorrectVariant[0]
     self.assertEqual(info.description, theDict.description)
     self.assertEqual(u"fi", theDict.language)
     self.assertEqual(u"", theDict.script)
Пример #10
0
 def testListDictsWithPathAndAttributes(self):
     """Check that listDicts(path) reports a dictionary created in that path.

     A morphology with a known variant name and description is created in
     a test data directory; exactly one listed dictionary must carry that
     variant, with the expected description, language and script.
     """
     info = MorphologyInfo()
     info.variant = u"test-variant-name"
     info.description = u"Some test description sakldjasd"
     info.morphology = u"null"
     dataDir = TestDataDir()
     try:
         dataDir.createMorphology(info.variant, info)
         dicts = Voikko.listDicts(dataDir.getDirectory())
     finally:
         # Clean up even when creating or listing the dictionary fails.
         dataDir.tearDown()
     dictsWithCorrectVariant = [
         aDict for aDict in dicts if aDict.variant == info.variant
     ]
     self.assertEqual(1, len(dictsWithCorrectVariant))
     theDict = dictsWithCorrectVariant[0]
     self.assertEqual(info.description, theDict.description)
     self.assertEqual(u"fi", theDict.language)
     self.assertEqual(u"", theDict.script)
Пример #11
0
def initVoikko():
	"""Fill the module-level _voikko and _dictInfo tables.

	One Voikko handle is created per entry of ALLOWED_DICTS, with "ignore
	dot" switched off and unfinished paragraphs accepted in grammar
	checking. Dictionary information returned by Voikko.listDicts() is
	stored for every dictionary whose "<language>-x-<variant>" tag is
	allowed.
	"""
	global _voikko
	for dictTag in ALLOWED_DICTS:
		handle = Voikko(dictTag)
		handle.setIgnoreDot(False)
		handle.setAcceptUnfinishedParagraphsInGc(True)
		_voikko[dictTag] = handle
	for dictionary in Voikko.listDicts():
		fullTag = dictionary.language + u"-x-" + dictionary.variant
		if fullTag not in ALLOWED_DICTS:
			continue
		_dictInfo[fullTag] = dictionary
Пример #12
0
 def __openHandleWithVariant(self, language, fullVariant):
     """Open a Voikko handle for fullVariant and register it under language.

     All global boolean and integer options are applied to the new handle.
     Returns the handle, or None if a VoikkoException was raised, in which
     case the first exception argument is stored as the initialization
     error for the language.
     """
     logging.debug("VoikkoHandlePool.__openHandleWithVariant")
     try:
         handle = Voikko(fullVariant, self.getDictionaryPath())
         # The handle is registered before the options are applied.
         self.__handles[language] = handle
         for option, enabled in self.__globalBooleanOptions.items():
             handle.setBooleanOption(option, enabled)
         for option, amount in self.__globalIntegerOptions.items():
             handle.setIntegerOption(option, amount)
     except VoikkoException as error:
         self.__initializationErrors[language] = error.args[0]
         return None
     return handle
Пример #13
0
 def testGetVersion(self):
     """Check that Voikko.getVersion() returns a string starting with a digit."""
     version = Voikko.getVersion()
     # We can't test for correct version but let's assume it starts with a number
     # (assertTrue replaces failUnless, which was removed in Python 3.12).
     self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
Пример #14
0
 def setUp(self):
     """Open a Finnish ("fi") Voikko handle and store it on the test instance."""
     handle = Voikko(u"fi")
     self.voikko = handle
Пример #15
0
 def testInitWithPathWorks(self):
     """Check that a handle created with an explicit path argument can spell."""
     # TODO: better test
     self.voikko.terminate()
     self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
     # assertTrue replaces failUnless, which was removed in Python 3.12.
     self.assertTrue(self.voikko.spell(u"kissa"))
Пример #16
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from libvoikko import Voikko

# Author and version metadata for this module.
AUTHOR = 'Viljami Venekoski'
AUTHOR_EMAIL = "*****@*****.**"
VERSION = '0.1'
# Module-level Voikko handle for the "fi" language tag, created at import time.
VOIKKO = Voikko("fi")
Пример #17
0
import re

from libvoikko import Voikko

# Alternative dictionary variant, currently disabled:
# v = Voikko('fi-x-morphoid')
# Module-level Voikko handle for the "fi" language tag, created at import time.
v = Voikko("fi")

# Matches any single character that is not a word character (\w, Unicode-aware);
# used to replace such characters with a space.
RE_WS_REPLACE = re.compile(r"[^\w]", re.UNICODE)
# Matches a parenthesized run of word characters and '+' signs and captures
# the text between the parentheses.
RE_FIND_COMPOUNDS = re.compile(r"\(([\w+]+)\)", re.UNICODE)


def voikko_analyze(text):
    text = RE_WS_REPLACE.sub(" ", text)
    words = text.split(" ")
    # Strip spaces
    words = [x.strip() for x in words]
    # Remove empty items
    words = filter(None, words)
    # Loop all words and analyze them
    analyzed = []
    for word in words:
        aword = v.analyze(word)
        if aword:
            i = 0
            for f in aword:
                i += 1
                f["found"] = True
                f["original"] = word
                wordbases = RE_FIND_COMPOUNDS.findall(f.get("WORDBASES", ""))
                f["wordbase_list"] = [
Пример #18
0
class VoikkoAttributeVectorizer:
	"""Converts a collection of text documents to a matrix of counts of words
	having specific value for enumerated morphological analysis attributes.

	Each row is normalized by the number of words in the document, and each
	word's contribution is split evenly between its alternative analyses.
	Words without any analysis are counted in the 'unknown' column.

	Examples
	--------
	>>> from voikko_sklearn import VoikkoAttributeVectorizer
	>>> corpus = [
	...     'Koiran karvat olivat takussa.',
	...     'Kissamme goli vanha.'
	... ]
	>>> vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
	>>> print(vectorizer.get_feature_names())
	['unknown', 'NUMBER_plural', 'NUMBER_singular', 'PERSON_1', 'PERSON_2', 'PERSON_3', 'PERSON_4']
	>>> X = vectorizer.transform(corpus)
	>>> print(X.toarray())
	[[0.         0.5        0.5        0.         0.         0.25       0.        ]
	[0.33333333 0.         0.66666667 0.         0.         0.         0.        ]]
	"""

	def __init__(self, attributes, langtag="fi"):
		"""Open a Voikko handle and enumerate the feature columns.

		attributes is a list of analysis attribute names; langtag is passed
		to Voikko. Raises ValueError if voikko.attributeValues() returns
		None for an attribute.
		"""
		# The original stored the builtin input() function here by accident.
		# NOTE(review): "content" presumably mirrors scikit-learn's
		# vectorizer convention for raw-string input - confirm.
		self.input = "content"
		self.attributes = attributes
		self.voikko = Voikko(langtag)
		self.__init_feature_names()

	def __init_feature_names(self):
		"""Build feature_names and the name-to-column-index mapping."""
		self.feature_names = ['unknown']
		self.feature_name_to_index = {'unknown' : 0}
		for attribute in self.attributes:
			values = self.voikko.attributeValues(attribute)
			if values is None:
				raise ValueError("Attribute '" + attribute + "' does not exist or is not categorial.")
			# sorted() keeps column order deterministic without mutating
			# the list returned by Voikko.
			for value in sorted(values):
				name = attribute + '_' + value
				self.feature_name_to_index[name] = len(self.feature_names)
				self.feature_names.append(name)

	def terminate(self):
		"""Release the underlying Voikko handle."""
		self.voikko.terminate()

	def build_tokenizer(self):
		"""Return a callable that splits a text into the texts of its WORD tokens."""
		return lambda text: [token.tokenText for token in self.voikko.tokens(text) if token.tokenType == Token.WORD]

	def get_feature_names(self):
		"""Return the list of feature (column) names; column 0 is 'unknown'."""
		return self.feature_names

	def __transform_document(self, document, target_vector):
		"""Accumulate the attribute value frequencies of document into target_vector in place."""
		words = self.build_tokenizer()(document)
		wordcount = len(words)
		if wordcount == 0:
			# Nothing to count; leave the row as zeros and avoid dividing by zero.
			return
		for word in words:
			analysis_list = self.voikko.analyze(word)
			count = len(analysis_list)
			if count == 0:
				target_vector[0] += 1
				continue
			# Each alternative analysis contributes an equal share of the word.
			share = 1.0 / count
			for analysis in analysis_list:
				for attribute in self.attributes:
					if attribute in analysis:
						value = analysis[attribute]
						target_vector[self.feature_name_to_index[attribute + "_" + value]] += share
		target_vector /= wordcount

	def transform(self, document_list):
		"""Return a CSR matrix with one row per document and one column per feature.

		document_list may be any iterable of strings; it is materialized once.
		"""
		documents = list(document_list)
		data = numpy.zeros((len(documents), len(self.feature_names)), dtype=numpy.float64)
		for i, document in enumerate(documents):
			# data[i] is a view, so the in-place updates land in data.
			self.__transform_document(document, data[i])
		return csr_matrix(data)

	def fit(self, document_list, y=None):
		"""Do nothing and return self; the features are fixed at construction.

		y is accepted and ignored so that callers using the scikit-learn
		fit(X, y) calling convention can pass it.
		"""
		return self

	def fit_transform(self, document_list, y=None):
		"""Equivalent to transform(document_list); y is accepted and ignored."""
		return self.transform(document_list)
Пример #19
0
 def testListDictsWithoutPath(self):
     """Check that listDicts() without a path returns the standard dictionary first."""
     dicts = Voikko.listDicts()
     # assertTrue replaces failUnless, which was removed in Python 3.12.
     self.assertTrue(len(dicts) > 0)
     standard = dicts[0]
     self.assertEqual(u"standard", standard.variant,
                      u"Standard dictionary must be the default in test environment.")
Пример #20
0
 def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
     """Check that a second handle works next to self.voikko and can be deleted.

     The medicine-variant handle accepts "amifostiini" while the handle on
     the test instance keeps rejecting it, both before and after the
     second handle is deleted.
     """
     medicalVoikko = Voikko(u"fi-x-medicine")
     # assertTrue/assertFalse replace failUnless/failIf, which were
     # removed from unittest in Python 3.12.
     self.assertTrue(medicalVoikko.spell(u"amifostiini"))
     self.assertFalse(self.voikko.spell(u"amifostiini"))
     del medicalVoikko
     self.assertFalse(self.voikko.spell(u"amifostiini"))
Пример #21
0
 def testGetVersion(self):
     """Check that Voikko.getVersion() returns a string starting with a digit."""
     version = Voikko.getVersion()
     # We can't test for correct version but let's assume it starts with a number
     # (assertTrue replaces failUnless, which was removed in Python 3.12).
     self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
Пример #22
0
 def setUp(self):
     """Open a Finnish ("fi") Voikko handle and store it on the test instance."""
     handle = Voikko(u"fi")
     self.voikko = handle
Пример #23
0
class LibvoikkoTest(unittest.TestCase):
    """Tests for the libvoikko Python API, run against a Finnish dictionary.

    Every test gets a fresh ``Voikko(u"fi")`` handle in ``self.voikko`` that
    is terminated in ``tearDown``. Only the non-deprecated assertion names
    (assertTrue, assertFalse, assertEqual, ...) are used, because the
    failUnless/failIf/assertEquals aliases were removed in Python 3.12.
    """

    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        pass  # do nothing, just check that setUp and tearDown complete succesfully

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.assertTrue(medicalVoikko.spell(u"amifostiini"))
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.assertFalse(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.assertTrue(d1 < d2)
        self.assertTrue(d2 < d3)
        self.assertTrue(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.assertTrue(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(u"standard", standard.variant,
                         u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.assertTrue(u"fi" in langs, u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        try:
            dataDir.createMorphology(info.variant, info)
            dicts = Voikko.listDicts(dataDir.getDirectory())
        finally:
            # Clean up even when creating or listing the dictionary fails.
            dataDir.tearDown()
        dictsWithCorrectVariant = [aDict for aDict in dicts if aDict.variant == info.variant]
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.assertTrue(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.assertTrue(self.voikko.spell(u"määrä"))
        self.assertFalse(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.assertIn(u"koira", suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual("    - =  - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual("   =  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual("  -   ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual("  - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual("  - -= -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual("   -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual("   - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to", self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        self.assertEqual(u"kis&shy;sa", self.voikko.hyphenate(u"kissa", u"&shy;", True))
        self.assertEqual(u"kuor&shy;ma-au&shy;to", self.voikko.hyphenate(u"kuorma-auto", u"&shy;", True))
        self.assertEqual(u"vaa&shy;an", self.voikko.hyphenate(u"vaa'an", u"&shy;", True))
        self.assertEqual(u"vaa'an", self.voikko.hyphenate(u"vaa'an", u"&shy;", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.assertFalse(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.assertTrue(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.assertFalse(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.assertTrue(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.assertFalse(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.assertTrue(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.assertTrue(self.voikko.spell(u"Äiti"))
        self.assertFalse(self.voikko.spell(u"Ääiti"))
        self.assertTrue(self.voikko.spell(u"š"))
        self.assertTrue(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.assertFalse(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.assertTrue(self.voikko.spell("KISSA"))
        self.assertFalse(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.assertFalse(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.assertTrue(self.voikko.spell("*****@*****.**"))
        self.assertFalse(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.assertFalse(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.assertTrue(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.assertFalse(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.assertTrue(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is actually increased
        self.voikko.setSpellerCacheSize(3)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.assertNotIn(u"koira", self.voikko.suggest(u"koari"))
        self.assertIn(u"koira", self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.assertIn(u"koira", self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.assertTrue(len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u"lumenero" * 20
        self.assertTrue(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u"kuraattori" * 25
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertTrue(self.voikko.spell(longWord))

        # One more repetition pushes the word over the limit.
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertFalse(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u"kuraattori" * 25
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))

        # One more repetition pushes the word over the limit.
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.assertFalse(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira", self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.", sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)

        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)

        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)

        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.assertFalse(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with a number
        self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
Пример #24
0
 def testInitWithPathWorks(self):
     """Check that a handle created with an explicit path argument can spell."""
     # TODO: better test
     self.voikko.terminate()
     self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
     # assertTrue replaces failUnless, which was removed in Python 3.12.
     self.assertTrue(self.voikko.spell(u"kissa"))
Пример #25
0
 def tryInit():
     """Try to open the dictionary variant "fi-x-non-existent-variant"."""
     missingVariant = u"fi-x-non-existent-variant"
     self.voikko = Voikko(missingVariant)
Пример #26
0
	def __init__(self, langtag="fi", binary=False, stop_word_classes=None):
		"""Create the vectorizer and open a Voikko handle.

		langtag is passed to Voikko, binary to the parent __init__.
		stop_word_classes is an iterable of word class names to treat as
		stop words; None (the default) or an empty iterable gives an empty
		set.
		"""
		# Kept as an attribute because scikit-learn's get_params() looks up
		# every __init__ parameter by name on the instance.
		self.langtag = langtag
		self.voikko = Voikko(langtag)
		# None instead of a shared mutable [] default.
		if stop_word_classes is None:
			stop_word_classes = ()
		self.stop_word_classes = set(stop_word_classes)
		super().__init__(binary=binary)
Пример #27
0
 def testListSupportedSpellingLanguagesWithoutPath(self):
     """Check that "fi" is among the supported spelling languages."""
     langs = Voikko.listSupportedSpellingLanguages()
     # assertTrue replaces failUnless, which was removed in Python 3.12.
     self.assertTrue(u"fi" in langs, u"Finnish dictionary must be present in the test environment")
Пример #28
0
	def __init__(self, attributes, langtag="fi"):
		"""Open a Voikko handle for langtag and enumerate the feature names.

		attributes is the list of analysis attribute names to vectorize.
		"""
		# The original stored the builtin input() function here by accident.
		# NOTE(review): "content" presumably mirrors scikit-learn's
		# vectorizer convention for raw-string input - confirm.
		self.input = "content"
		self.attributes = attributes
		self.voikko = Voikko(langtag)
		self.__init_feature_names()
# This is an example application

import pandas as pd
from libvoikko import Voikko

# Analyze one word with the "fi" dictionary and print the raw analysis result.
v = Voikko("fi")
print(v.analyze("autossa"))

# Read the same input file twice, stack the two frames and write the result.
df1 = pd.read_csv("./data/input.csv")
df2 = pd.read_csv("./data/input.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat is the supported replacement and, like append, keeps each frame's
# original index labels by default.
df1 = pd.concat([df1, df2])
df1.to_csv("./data/output.csv")

Пример #30
0
	logging.getLogger().setLevel(logging.DEBUG)

def messageBox(messageText):
	"""Show an error message box titled "Error initializing Voikko".

	messageText is passed as the text of the box. The return value is whatever
	the message box's execute() call returns.
	"""
	serviceManager = uno.getComponentContext().ServiceManager
	awtToolkit = serviceManager.createInstance("com.sun.star.awt.Toolkit")
	dialog = awtToolkit.createMessageBox(
		None, ERRORBOX, BUTTONS_OK, "Error initializing Voikko", messageText)
	return dialog.execute()

if not PropertyManager.loadingFailed:
	try:
		# Force initialization of property manager so that it is done before anything else.
		PropertyManager.getInstance()
		# We could check for specific version but this at least ensures that libvoikko is installed
		# (this would throw an exception if it's not).
		Voikko.getVersion()
		# name of g_ImplementationHelper is significant, Python component loader expects to find it
		g_ImplementationHelper = unohelper.ImplementationHelper()
		g_ImplementationHelper.addImplementation(SettingsEventHandler, \
		                    SettingsEventHandler.IMPLEMENTATION_NAME,
		                    SettingsEventHandler.SUPPORTED_SERVICE_NAMES,)
		g_ImplementationHelper.addImplementation(SpellChecker, \
		                    SpellChecker.IMPLEMENTATION_NAME,
		                    SpellChecker.SUPPORTED_SERVICE_NAMES,)
		g_ImplementationHelper.addImplementation(Hyphenator, \
		                    Hyphenator.IMPLEMENTATION_NAME,
		                    Hyphenator.SUPPORTED_SERVICE_NAMES,)
		g_ImplementationHelper.addImplementation(GrammarChecker, \
		                    GrammarChecker.IMPLEMENTATION_NAME,
		                    GrammarChecker.SUPPORTED_SERVICE_NAMES,)
	except OSError as e:
Пример #31
0
 def testListSupportedSpellingLanguagesWithoutPath(self):
     langs = Voikko.listSupportedSpellingLanguages()
     self.failUnless(
         u"fi" in langs,
         u"Finnish dictionary must be present in the test environment")
"""Contains functions for retrieving pre-processed words from one teletext frontpage image.

See instructions in words_from_image()
"""

import re
from typing import List, Tuple

import pytesseract
from PIL import Image, ImageOps
from libvoikko import Voikko

# Machine-specific setup: the paths below are hard-coded Windows locations, so
# these settings only work in a Windows environment laid out the same way.
# Directory libvoikko is told to search for its library.
Voikko.setLibrarySearchPath("C:/python37/DLLs")
# Module-level Voikko handle created with the "fi-x-morphoid" language tag.
voikko = Voikko("fi-x-morphoid")
# Path of the Tesseract executable configured for pytesseract.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'


def words_from_image(filename: str) -> List[List[str]]:
    """Retrieve pre-processed words from given 'filename' containing teletext frontpage image.

    Return value is a list of lists
    e.g. [['word1', 'word2'], ['word1', 'word2', 'word3']]
    """

    # make image black and white
    image = Image.open(filename).convert('1').convert('RGB')

    # invert black and white
    image = ImageOps.invert(image)
Пример #33
0
# Weekday names come from the locale set here; a command-line argument, when
# given, is used as the weekday instead of today's name.
locale.setlocale(locale.LC_ALL, 'FI_fi')
if len(sys.argv) > 1:
    weekday = sys.argv[1]
else:
    weekday = datetime.datetime.now().strftime('%A')  # e.g. Tiistai

# Fetch the lunch page and locate the text node that mentions the weekday.
pattern = re.compile('.*{}.*'.format(weekday))
URL = 'http://pompier.fi/espa/lounas/'
text = get_html(URL)
soup = BeautifulSoup(text)
# columns = soup.find_all('strong')
todays_lunch = soup.find(text=pattern)
lunch_text = todays_lunch.parent.parent.text
print(lunch_text)

from libvoikko import Voikko, Token
v = Voikko(u"fi-x-morphoid")

# Turn hyphens and line breaks into spaces before splitting into words.
menu_text = lunch_text
for separator in ('-', '\r', '\n'):
    menu_text = menu_text.replace(separator, ' ')

# Collect the base form of every word, falling back to the word itself when
# the first analysis is missing or has no BASEFORM entry.
all_words = []
for token in menu_text.split(" "):
    word = token.strip('\n\r,.')
    analyses = v.analyze(word)
    print("-- " + word + "--")
    has_baseform = bool(analyses) and 'BASEFORM' in analyses[0]
    base = analyses[0]['BASEFORM'] if has_baseform else word
    all_words.append(base)
    print(":  " + base)


print(all_words)
Пример #34
0
 def tryInit():
     """Try to create a Voikko handle for the "fi-x-non-existent-variant" tag."""
     missingVariant = u"fi-x-non-existent-variant"
     self.voikko = Voikko(missingVariant)
Пример #35
0
# Weekday names come from the locale set here; a command-line argument, when
# given, is used as the weekday instead of today's name.
locale.setlocale(locale.LC_ALL, "FI_fi")
if len(sys.argv) > 1:
    weekday = sys.argv[1]
else:
    weekday = datetime.datetime.now().strftime("%A")  # e.g. Tiistai

# Fetch the lunch page and locate the text node that mentions the weekday.
pattern = re.compile(".*{}.*".format(weekday))
URL = "http://pompier.fi/espa/lounas/"
text = get_html(URL)
soup = BeautifulSoup(text)
# columns = soup.find_all('strong')
todays_lunch = soup.find(text=pattern)
lunch_text = todays_lunch.parent.parent.text
print(lunch_text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")

# Turn hyphens and line breaks into spaces before splitting into words.
menu_text = lunch_text
for separator in ("-", "\r", "\n"):
    menu_text = menu_text.replace(separator, " ")

# Collect the base form of every word, falling back to the word itself when
# the first analysis is missing or has no BASEFORM entry.
all_words = []
for token in menu_text.split(" "):
    word = token.strip("\n\r,.")
    analyses = v.analyze(word)
    print("-- " + word + "--")
    has_baseform = bool(analyses) and "BASEFORM" in analyses[0]
    base = analyses[0]["BASEFORM"] if has_baseform else word
    all_words.append(base)
    print(":  " + base)
Пример #36
0
class LibvoikkoTest(unittest.TestCase):
    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        pass  # do nothing, just check that setUp and tearDown complete succesfully

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.failUnless(medicalVoikko.spell(u"amifostiini"))
        self.failIf(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.failIf(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.failUnless(d1 < d2)
        self.failUnless(d2 < d3)
        self.failUnless(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.failUnless(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(
            u"standard", standard.variant,
            u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.failUnless(
            u"fi" in langs,
            u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.failIf(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.failUnless(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")

        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.failUnless(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")

        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.failUnless(self.voikko.spell(u"määrä"))
        self.failIf(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.failUnless(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testAttributeValuesForEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"NUMBER")
        self.assertEqual(2, len(values))
        self.assertTrue("singular" in values)
        self.assertTrue("plural" in values)

    def testAttributeValuesForNonEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"BASEFORM")
        self.assertEqual(None, values)

    def testAttributeValuesForUnknownAttribute(self):
        values = self.voikko.attributeValues(u"XYZ")
        self.assertEqual(None, values)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual("    - =  - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual("   =  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual("  -   ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual("  - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual("  - -= -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual("   -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual("   - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to",
                         self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        self.assertEqual(u"kis&shy;sa",
                         self.voikko.hyphenate(u"kissa", u"&shy;", True))
        self.assertEqual(u"kuor&shy;ma-au&shy;to",
                         self.voikko.hyphenate(u"kuorma-auto", u"&shy;", True))
        self.assertEqual(u"vaa&shy;an",
                         self.voikko.hyphenate(u"vaa'an", u"&shy;", True))
        self.assertEqual(u"vaa'an",
                         self.voikko.hyphenate(u"vaa'an", u"&shy;", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.failIf(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.failUnless(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.failIf(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.failUnless(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.failIf(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.failUnless(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.failIf(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.failUnless(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.failIf(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.failUnless(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.failUnless(self.voikko.spell(u"Äiti"))
        self.failIf(self.voikko.spell(u"Ääiti"))
        self.failUnless(self.voikko.spell(u"š"))
        self.failUnless(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.failIf(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.failUnless(self.voikko.spell("KISSA"))
        self.failIf(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.failIf(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.failUnless(self.voikko.spell("*****@*****.**"))
        self.failIf(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.failIf(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.failUnless(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.failIf(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.failUnless(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is actually increased
        self.voikko.setSpellerCacheSize(3)
        self.failUnless(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.failUnless(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.failIf(u"koira" in self.voikko.suggest(u"koari"))
        self.failUnless(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.failUnless(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.failUnless(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.failUnless(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) < MAX_WORD_CHARS)
        self.failUnless(self.voikko.spell(longWord))

        longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) > MAX_WORD_CHARS)
        self.failIf(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))

        longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.failIf(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira",
                         self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEquals(
            0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEquals(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEquals(3, len(tokens))
        self.assertEquals(Token.WORD, tokens[0].tokenType)
        self.assertEquals(u"kissa", tokens[0].tokenText)
        self.assertEquals(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEquals(u"\0", tokens[1].tokenText)
        self.assertEquals(Token.WORD, tokens[2].tokenType)
        self.assertEquals(u"koira", tokens[2].tokenText)

        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEquals(4, len(tokens))
        self.assertEquals(Token.WORD, tokens[0].tokenType)
        self.assertEquals(u"kissa", tokens[0].tokenText)
        self.assertEquals(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEquals(u"\0", tokens[1].tokenText)
        self.assertEquals(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEquals(u"\0", tokens[2].tokenText)
        self.assertEquals(Token.WORD, tokens[3].tokenType)
        self.assertEquals(u"koira", tokens[3].tokenText)

        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEquals(2, len(tokens))
        self.assertEquals(Token.WORD, tokens[0].tokenType)
        self.assertEquals(u"kissa", tokens[0].tokenText)
        self.assertEquals(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEquals(u"\0", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEquals(2, len(tokens))
        self.assertEquals(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEquals(u"\0", tokens[0].tokenText)
        self.assertEquals(Token.WORD, tokens[1].tokenType)
        self.assertEquals(u"kissa", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0")
        self.assertEquals(1, len(tokens))
        self.assertEquals(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEquals(u"\0", tokens[0].tokenText)

        self.assertEquals(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.failIf(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with a number
        self.failUnless(re.compile(u"[0-9].*").match(version) is not None)
Пример #37
0
#!/usr/bin/env python

import sys
from libvoikko import Voikko

print('Analysoidaan annetut sanat:\n')

v = Voikko("fi")

# sys.argv[0] is the program name itself, so only the remaining command-line
# arguments are treated as words to analyze.
words = sys.argv[1:]
for word in words:
    heading = 'Sanan {} analyysi:'.format(word)
    print(heading)
    print(v.analyze(word))

print('Annetut sanat analysoitu.')
Пример #38
0
 def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
     medicalVoikko = Voikko(u"fi-x-medicine")
     self.failUnless(medicalVoikko.spell(u"amifostiini"))
     self.failIf(self.voikko.spell(u"amifostiini"))
     del medicalVoikko
     self.failIf(self.voikko.spell(u"amifostiini"))
Пример #39
0
from flask import Flask, request
from flask_restful import Resource, Api
from flask import jsonify

import sys
from libvoikko import Voikko

# Flask application and the flask_restful Api object wrapping it.
app = Flask(__name__)
api = Api(app)

# Module-level Voikko handle for "fi", used by the resource classes below.
v = Voikko('fi')


class Finnish_text_analysis(Resource):
    """Base resource whose GET handler returns ``self.process(word)`` as JSON.

    This class does not define ``process`` itself.
    """

    def get(self):
        """Read the ``word`` query argument and return the processed result as JSON."""
        query_word = request.args.get('word')
        result = self.process(query_word)
        return jsonify(result)


class Analyze(Finnish_text_analysis):
    """Resource whose result is the Voikko analysis of the word."""

    def process(self, word):
        """Return ``v.analyze(word)`` from the module-level Voikko handle."""
        analyses = v.analyze(word)
        return analyses


class Spell(Finnish_text_analysis):
    """Resource whose result is the Voikko spell-check outcome for the word."""

    def process(self, word):
        """Return a dict holding ``v.spell(word)`` under the "spelling" key."""
        spell_result = v.spell(word)
        return {"spelling": spell_result}


class Suggest(Finnish_text_analysis):
    def process(self, word):
Пример #40
0
#!/usr/bin/env python

from libvoikko import Voikko

# Analyze the word 'astetta' with the "fi" dictionary and print the result.
v = Voikko("fi")
analysis = v.analyze('astetta')
print(analysis)
Пример #41
0
	def setInstallationPath(self, path):
		"""Remember the installation path and set the libvoikko library search path.

		The search path is <path>/voikko/<name>, where <name> is platform.system()
		and the parts of platform.architecture() joined with "-".
		"""
		self.__installationPath = path
		platformParts = (platform.system(),) + tuple(platform.architecture())
		platformDir = "-".join(platformParts)
		searchPath = os.path.join(path, "voikko", platformDir)
		logging.debug("VoikkoHandlePool.setInstallationPath: library search path is " + searchPath)
		Voikko.setLibrarySearchPath(searchPath)
Пример #42
0
from libvoikko import Voikko
voikko = Voikko("fi")

# from https://stackoverflow.com/a/1988826/95357


class Memoize:
    """Callable wrapper that caches results keyed by the positional arguments.

    The argument tuple is used as a dict key, so arguments must be hashable.
    A cached result is returned as the same object on every call (no copy).
    """

    def __init__(self, f):
        self.f = f
        self.memo = {}

    def __call__(self, *args):
        if args in self.memo:
            # Warning: You may wish to do a deepcopy here if returning objects
            return self.memo[args]
        # First call with these arguments: compute, store, then return the
        # stored object.
        self.memo[args] = self.f(*args)
        return self.memo[args]


@Memoize
def analyze_word(form):
    """Return ``voikko.analyze(form)``; the Memoize decorator caches it per form."""
    analyses = voikko.analyze(form)
    return analyses
Пример #43
0
	def __initAvailableVariants(self):
		dicts = Voikko.listDicts(VoikkoHandlePool.getInstance().getDictionaryPath())
		self.__dictionaryVariantList = []
		for vDict in dicts:
			dictName = vDict.variant + ": " + vDict.description
			self.__dictionaryVariantList.append(dictName)