def lemmatize_file(filename):
    """Write a lemmatized copy of a Finnish text file and return its path.

    Every line of ``filename`` is tokenized with Voikko. Word tokens are
    replaced by the ``BASEFORM`` of their first morphological analysis.
    Word tokens without a usable analysis, and all non-word tokens, are
    copied through unchanged. The pieces are joined without a separator, so
    whatever tokens Voikko returns for spacing and punctuation are kept as-is.

    :param filename: Path of the text file to lemmatize.
    :return: Path of the written file, ``filename + '_lemmatized'``.
    """
    print('lemmatizing ' + filename)
    lemmatized_filename = filename + '_lemmatized'
    v = Voikko("fi")
    try:
        # Both files are context-managed so they are closed even when
        # tokenizing or analysis raises part-way through the input.
        with open(filename, 'r') as source, \
                open(lemmatized_filename, 'w') as lemmatized_file:
            for sentence in source:
                words_baseform = []
                for word in v.tokens(sentence):
                    baseform = None
                    # Token type 1 is libvoikko's word token (Token.WORD).
                    if word.tokenType == 1:
                        word_analyzed = v.analyze(word.tokenText)
                        if word_analyzed:
                            baseform = word_analyzed[0].get('BASEFORM')
                    # Keep the surface form when there is no analysis or the
                    # analysis has no BASEFORM; appending None would make
                    # the join below fail.
                    if baseform is None:
                        baseform = word.tokenText
                    words_baseform.append(baseform)
                lemmatized_file.write(''.join(words_baseform))
    finally:
        # Always release the native Voikko handle.
        v.terminate()
    return lemmatized_filename
class VoikkoTokenizer:
    """
    Voikko Tokenizer
    ~~~~~~~~~~~~~~~~

    Tokenize text into lower-cased base forms using :class:`Voikko`.

    Getting Voikko to work on Windows
    =================================
    - Download voikko DLL into application directory from:
      https://www.puimula.org/htp/testing/voikko-sdk/win-crossbuild/
    - Download and extract dictionary files into `instance/voikko` directory:
      https://www.puimula.org/htp/testing/voikko-snapshot-v5/
      Select one that contains morphological data.
    """

    # Word classes that are never chosen as the base form of a word.
    # See: https://github.com/voikko/voikko-sklearn/blob/master/voikko_sklearn.py
    _STOPWORD_CLASSES = frozenset((
        "huudahdussana",
        "seikkasana",
        "lukusana",
        "asemosana",
        "sidesana",
        "suhdesana",
    ))

    def __init__(self, lang="fi"):
        """
        Create the tokenizer.

        :param lang: Language code handed to :class:`Voikko`.
        """
        # Voikko dictionary path; only passed on when the directory exists.
        dict_path = instance_path() / "voikko"
        path = str(dict_path) if dict_path.exists() else None

        # Remember how Voikko was constructed so that pickling does not have
        # to guess the language from the list of installed dictionaries.
        self.voikko_lang = lang
        self.voikko_path = path

        # Memo of already stemmed words: surface form -> chosen stem.
        self.stem_map = {}
        self.voikko = Voikko(lang, path=path)

        self.regex_words = re.compile(r"""
            (\w+-(?:\w+)+   # Get wordcharacters conjucated by dash (-)
            |\w{1,}         # OR all word characters len() > 1
            )|(?::[\w]*)    # ignore word characters after colon
            """, re.VERBOSE | re.MULTILINE)

        # A paragraph is discarded when the number of words unknown to Voikko
        # exceeds this fraction of the produced tokens.
        self.err_treshold = 0.5

    def tokenize(self, text: str) -> List[str]:
        """
        Return list of words.

        The text is split into lines and every line is tokenized with
        :meth:`tokenize_paragraph`; the results are concatenated in order.
        """
        # Split into paragraphs.
        paragraphs = text.splitlines()
        # Materialize the result so it matches the declared return type.
        return list(chain.from_iterable(map(self.tokenize_paragraph, paragraphs)))

    def tokenize_paragraph(self, sentence, use_suggestions=True):
        """
        Tokenize words using :class:`~Voikko`

        ..todo:
            - Detect abbreviations from CAPITAL letters.

        :param sentence: Text to tokenize; it is lower-cased first.
        :param use_suggestions: Should stemming use spell checking.
        :return: List of stems, or an empty list when too many words were
            unknown to Voikko.
        """
        # Spell check mistake counter.
        err_count = 0
        stems = []

        # Create list of words from string, separating from non-word characters.
        # The pattern has one capture group, so matches of the "after colon"
        # alternative show up as empty strings and are dropped here.
        words = [w for w in self.regex_words.findall(sentence.lower()) if w != ""]

        for word in words:
            word_stems, unknown = self._stem(word, use_suggestions)
            if unknown:
                err_count += 1
            stems.extend(s for s in word_stems if s)

        if len(stems) * self.err_treshold < err_count:
            # Too many spelling errors. Presume incorrect language, and disregard paragraph.
            logger.debug("Too many spelling errors: %d out of %d", err_count, len(stems))
            return []

        return stems

    def _stem(self, word, use_suggestions):
        """
        Stem a single word.

        :param word: Word to stem.
        :param use_suggestions: When Voikko has no analysis for the word, try
            the first spelling suggestion instead.
        :return: ``(stems, unknown)`` where ``stems`` is a list of stemmed
            words and ``unknown`` tells whether Voikko had no analysis for
            ``word`` (i.e. it counts as a spelling error).
        """
        # Check for previous stemming result.
        stemmed_word = self.stem_map.get(word)
        if stemmed_word is not None:
            return [stemmed_word], False

        analysis = self.analyze(word)
        unknown = not analysis

        if unknown and use_suggestions:
            # Analyze didn't produce results, try spell checking and take
            # the first suggestion.
            suggestions = self.voikko.suggest(word)
            suggested = suggestions[0] if suggestions else None
            logger.debug("Voikko did not found word %r; suggested spelling: %r", word, suggested)
            if suggested is not None:
                # Return tokenized suggestion - it can be two or more words.
                # Suggestions are disabled for the nested call.
                return self.tokenize_paragraph(suggested, use_suggestions=False), True

        # Prefer nimisana over others; the sort is stable so the remaining
        # analyses keep the order Voikko returned them in.
        candidates = sorted(analysis or [], key=lambda a: a.get('CLASS') != "nimisana")

        for candidate in candidates:
            # Find first suitable interpretation of the word.
            if candidate.get("CLASS") in self._STOPWORD_CLASSES:
                continue
            baseform = candidate.get('BASEFORM')
            if baseform is None:
                # An analysis without a base form cannot be used as a stem.
                continue
            baseform = baseform.lower()
            self.stem_map[word] = baseform
            return [baseform], unknown

        # Fall back to given word.
        fallback = word.lower()
        self.stem_map[word] = fallback
        return [fallback], unknown

    # NOTE(review): the LFUCache object is created once, when the class body
    # is evaluated, so it is shared by every instance of this class.
    @cached(LFUCache(maxsize=512))
    def analyze(self, word: str) -> List[Dict]:
        """
        Analyze word, returning morphological data.

        Uses :class:`LFUCache` - least frequently used - cache.
        """
        return self.voikko.analyze(word)

    def __getstate__(self):
        """
        Return pickleable attributes.

        :class:`Voikko` can't be serialized, so remove it. The language and
        dictionary path it was created with are kept in ``voikko_lang`` and
        ``voikko_path`` so it can be rebuilt on unpickling.
        """
        state = self.__dict__.copy()
        del state['voikko']
        return state

    def __setstate__(self, state):
        """
        Restore attributes and recreate the :class:`Voikko` instance.

        States pickled before ``voikko_path`` existed are accepted; for
        those no explicit dictionary path is passed to :class:`Voikko`.
        """
        state = dict(state)
        state.setdefault('voikko_path', None)
        state['voikko'] = Voikko(state['voikko_lang'], path=state['voikko_path'])
        self.__dict__.update(state)
bf = word[:word.index(":")] cl = "lukusana" if re.fullmatch(r'\d+', bf) else "nimisana" output += [AltWords(word, [Word(word, bf, case, number, cl)])] cont = True if cont: continue for case in ORDINAL_CASE_REGEXES: if re.fullmatch(ORDINAL_CASE_REGEXES[case], word): bf = word[:word.index(":")] cl = "lukusana" if re.fullmatch(r'\d+', bf) else "nimisana" output += [AltWords(word, [Word(word, bf, case, number, cl, ordinal_like=True)])] cont = True if cont: continue analysis_list = voikko.analyze(word) prefix = "" if len(analysis_list) == 0 and "-" in word: i = word.rindex("-")+1 analysis_list = voikko.analyze(word[i:]) prefix = word[:i].lower() alternatives = [] for analysis in analysis_list: bf = prefix+analysis["BASEFORM"] cl = analysis["CLASS"] if bf in ORDINALS+CARDINALS or re.fullmatch(r'\d+', bf): cl = "lukusana" elif "PARTICIPLE" in analysis and analysis["PARTICIPLE"] == "agent": cl = "laatusana" number = analysis.get("NUMBER", "") person = analysis.get("PERSON", "")