def read_data(file_path):
    '''Read data into a list of words and store the words into a file
    if the relevant word file does not exist.'''
    if os.path.exists(file_path + '_words'):
        print('reading from word file...')
        with open(file_path + '_words', 'r') as f:
            words = f.read().split('\n')
        return words

    print('reading from data file...')
    v = Voikko("fi")
    with open(file_path) as f:
        words = [
            word.tokenText.lower()
            for word in v.tokens(f.read())
            if word.tokenType == 1 or word.tokenType == 2
        ]
    v.terminate()

    with open(file_path + '_words', 'w') as word_file:
        word_file.write('\n'.join(words))
    return words
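
# Usage sketch (illustrative; 'corpus_fi.txt' is a hypothetical input file, and
# the os and Voikko imports are assumed from the surrounding module). Token types
# 1 and 2 are libvoikko's Token.WORD and Token.PUNCTUATION, so the word list keeps
# words and punctuation but drops whitespace. The first call tokenizes the corpus
# and caches it in 'corpus_fi.txt_words'; later calls read the cache instead.
words = read_data('corpus_fi.txt')
print(len(words), 'tokens, e.g.', words[:10])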
def sentence_to_index(index_file, file_path, dictionary):
    '''Read sentences from file and replace them with their corresponding
    word indices in the dictionary.'''
    print("converting sentences to indices...")
    v = Voikko("fi")
    index_sentences = []
    with open(file_path) as f:
        for sentence in f:
            words = [
                word.tokenText.lower()
                for word in v.tokens(sentence)
                if word.tokenType == 1 or word.tokenType == 2
            ]
            index_words = [
                dictionary[word] if word in dictionary else 0
                for word in words
            ]
            index_sentences.append(index_words)
    v.terminate()

    # save the sentence indices into index_file
    with open(index_file, 'wb') as index_f:
        pkl.dump(index_sentences, index_f, -1)
    return index_sentences
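
# Usage sketch (illustrative; the file names are hypothetical and 'pkl' is assumed
# to be the pickle module imported as 'pkl'). Word indices start at 1 because
# sentence_to_index maps out-of-vocabulary words to 0.
words = read_data('corpus_fi.txt')
dictionary = {word: i for i, word in enumerate(sorted(set(words)), start=1)}
index_sentences = sentence_to_index('corpus_fi_indices.pkl', 'corpus_fi.txt', dictionary)
print(index_sentences[0])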
def lemmatize_file(filename):
    print('lemmatizing ' + filename)
    v = Voikko("fi")
    lemmatized_filename = filename + '_lemmatized'
    lemmatized_file = open(lemmatized_filename, 'w')
    with open(filename, 'r') as f:
        for sentence in f:
            sent_toks = v.tokens(sentence)
            words_baseform = []
            for word in sent_toks:
                if word.tokenType == 1:
                    word_analyzed = v.analyze(word.tokenText)
                    if len(word_analyzed) > 0:
                        words_baseform.append(word_analyzed[0].get('BASEFORM'))
                    else:
                        words_baseform.append(word.tokenText)
                else:
                    words_baseform.append(word.tokenText)
            sent_baseform = ''.join(words_baseform)
            lemmatized_file.write(sent_baseform)
    lemmatized_file.close()
    v.terminate()
    return lemmatized_filename
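
# Usage sketch (illustrative; 'news_fi.txt' is a hypothetical input file and the
# Voikko import is assumed from the surrounding module). Each word token is
# replaced by the BASEFORM of its first Voikko analysis; punctuation, whitespace
# and unrecognized words are written out unchanged.
lemmatized_path = lemmatize_file('news_fi.txt')   # writes 'news_fi.txt_lemmatized'
print(lemmatized_path)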
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, vectorizer)
pyLDAvis.save_html(LDAvis_prepared, str(_instance_path() / "pyldavis.html"))

joblib.dump(lda, LDA_FILE)
joblib.dump(vectorizer, WORDS_FILE)

number_words = 15
suitable_topic_classes = ["nimisana", "nimi"]
topic_labels = {}
topics = np.array([x.argsort()[::-1] for x in lda.components_])
v = Voikko("fi")

def _is_suitable_label(word) -> bool:
    r = v.analyze(word) or []
    for w in r:
        if w.get("CLASS") in suitable_topic_classes:
            return True
        else:
            logger.debug("%s CLASS is %s", word, w.get("CLASS"))
    return False

i = 0
while len(topic_labels) < number_topics:
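
# Separate illustration (not from the original source): _is_suitable_label keeps
# only words whose Voikko word class (the "CLASS" entry of an analysis dict) is a
# noun class, so topic labels end up being nouns or proper names. "kissa" below is
# just an example word.
v_demo = Voikko("fi")
for analysis in v_demo.analyze("kissa"):
    print(analysis.get("CLASS"))   # expected: nimisana
v_demo.terminate()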
class VoikkoTokenizer():
    """
    Voikko Tokenizer
    ~~~~~~~~~~~~~~~~

    Tokenize text.

    Getting Voikko to work on Windows
    =================================

    - Download the Voikko DLL into the application directory from:
      https://www.puimula.org/htp/testing/voikko-sdk/win-crossbuild/

    - Download and extract dictionary files into the `instance/voikko` directory:
      https://www.puimula.org/htp/testing/voikko-snapshot-v5/

      Select one that contains morphological data.
    """

    def __init__(self, lang="fi"):
        # Voikko dictionary path.
        dict_path = instance_path() / "voikko"
        path = str(dict_path) if dict_path.exists() else None

        self.stem_map = {}
        self.voikko = Voikko(lang, path=path)
        self.regex_words = re.compile(r"""
            (\w+-(?:\w+)+      # word characters joined by a dash (-)
            |\w{1,}            # OR one or more word characters
            )|(?::[\w]*)       # ignore word characters after a colon
            """, re.VERBOSE + re.MULTILINE)
        self.err_treshold = 0.5

    def tokenize(self, text: str) -> List[str]:
        """ Return a list of words. """
        # Split into paragraphs.
        paragraphs = text.splitlines()
        tokens = chain(*map(self.tokenize_paragraph, paragraphs))
        return list(tokens)

    def tokenize_paragraph(self, sentence, use_suggestions=True):
        """
        Tokenize words using :class:`~Voikko`.

        .. todo:: Detect abbreviations from CAPITAL letters.

        :param use_suggestions: Should stemming use spell checking.
        """
        # Spell check mistake counter
        err_count = 0

        def _stem(word: str) -> List[str]:
            """
            Return :type:`list` of stemmed words.

            If the word is not found in the Voikko dataset, use the spell-checking
            suggestions and look up the first candidate.
            """
            nonlocal err_count

            # See: https://github.com/voikko/voikko-sklearn/blob/master/voikko_sklearn.py
            FINNISH_STOPWORD_CLASSES = ["huudahdussana", "seikkasana", "lukusana",
                                        "asemosana", "sidesana", "suhdesana"]

            # Check for a previous stemming result.
            stemmed_word = self.stem_map.get(word, None)
            if stemmed_word is not None:
                return [stemmed_word]

            analysis = self.analyze(word)

            if not analysis:
                # If analyze didn't produce results, try spell checking.
                err_count += 1
                analysis = []

                if use_suggestions:
                    # Get the first suggestion.
                    suggested, *xs = self.voikko.suggest(word) or [None]
                    logger.debug(f"Voikko did not find word {word!r}; suggested spelling: {suggested!r}")

                    if suggested is not None:
                        # Return the tokenized suggestion - it can be two or more words.
                        return self.tokenize_paragraph(suggested, use_suggestions=False)

            # Prefer nimisana (nouns) over other word classes.
            analysis = sorted(analysis, key=lambda x: -1 if x.get('CLASS') in ["nimisana"] else 0)

            for _word in analysis:
                # Find the first suitable interpretation of the word.
                _class = _word.get("CLASS", None)
                if _class not in FINNISH_STOPWORD_CLASSES:
                    baseform = _word.get('BASEFORM').lower()
                    self.stem_map[word] = baseform
                    return [baseform]

            # Fall back to the given word.
            self.stem_map[word] = word.lower()
            return [word.lower()]

        # Create a list of words from the string, separating it from non-word characters.
        r = [x for x in re.findall(self.regex_words, sentence.lower()) if x != ""]
        r = [x for x in chain(*map(_stem, r)) if x]

        if len(r) * self.err_treshold < err_count:
            # Too many spelling errors. Presume incorrect language, and disregard the paragraph.
            logger.debug("Too many spelling errors: %d out of %d", err_count, len(r))
            return []

        return r

    @cached(LFUCache(maxsize=512))
    def analyze(self, word: str) -> List[Dict]:
        """
        Analyze a word, returning morphological data.

        Uses :class:`LFUCache` - least frequently used - cache.
        """
        return self.voikko.analyze(word)

    def __getstate__(self):
        """
        Return pickleable attributes.

        :class:`Voikko` can't be serialized, so remove it.
        """
        state = self.__dict__.copy()
        state['voikko_lang'] = self.voikko.listDicts()[0].language
        del state['voikko']
        return state

    def __setstate__(self, state):
        state['voikko'] = Voikko(state['voikko_lang'])
        del state['voikko_lang']
        self.__dict__.update(state)
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import html
import re

from voikko.libvoikko import Voikko, Token

from fatal_error import syntaxError
from inflect import *

LANGUAGE = "fi-x-morpho"
ENCODING = "UTF-8"

voikko = Voikko(LANGUAGE)


def lexCode(code):
    output = []
    for word in re.split(r'(\s|\.|,|;|\[|\]|"[^"]*"|#[^\n]*\n|\([^()]*\))', code):
        if word == "":
            continue
        if re.fullmatch(r'\s|\.|,|;|\[|\]|"[^"]*"|#[^\n]*\n|\([^()]*\)', word):
            output += [Punctuation(word)]
            continue
        cont = False
        for number in CASE_REGEXES:
            for case in CASE_REGEXES[number]:
                if re.fullmatch(CASE_REGEXES[number][case], word):
                    bf = word[:word.index(":")]