def _get_morph():
    """Build a configured Majka analyser for ``majka.w-lt`` in ``NLP_DATA_PATH``.

    Returns:
        A ``majka.Majka`` instance with tag processing switched off,
        ``first_only`` enabled, ``negative`` set to ``"ne"`` and the
        ``IGNORE_CASE`` flag added to its flags.
    """
    dictionary_path = os.path.join(NLP_DATA_PATH, 'majka.w-lt')
    analyser = majka.Majka(dictionary_path)

    analyser.tags = False
    analyser.first_only = True
    analyser.negative = "ne"
    # OR the flag in so any flags that are already set stay untouched;
    # IGNORE_CASE makes lookups disregard the case of the word.
    analyser.flags |= majka.IGNORE_CASE

    return analyser
def convert_words_into_lemmas(text, text_language):
    """Replace every whitespace-separated word of *text* with its lemma.

    The Majka dictionary is loaded from ``../majka/<text_language>.w-lt``.
    Words for which the dictionary returns no entry are kept unchanged.

    Args:
        text: Input string; it is split on whitespace.
        text_language: Base name of the dictionary file (without ``.w-lt``).

    Returns:
        The lemmatised words, each preceded by a single space (so a
        non-empty result starts with a space), or *text* unchanged when the
        dictionary cannot be loaded.
    """
    try:
        morph = majka.Majka("../majka/" + text_language + ".w-lt")
    except Exception:
        # Best effort: without a usable dictionary, hand the text back as is.
        # Catching ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return text

    pieces = []
    for word in text.split():
        matches = morph.find(word)
        # An empty result means the word is unknown; keep its original form.
        pieces.append(" " + (matches[0]['lemma'] if matches else word))
    return "".join(pieces)
def init_lemmatizer(self, majka_path):
    """Create a Majka lemmatizer from the ``majka.w-lt`` file in *majka_path*.

    Args:
        majka_path: Directory that contains ``majka.w-lt``.

    Returns:
        A ``majka.Majka`` instance with ``first_only = True``,
        ``tags = False`` and ``negative = 'ne'``.
    """
    import os  # function-scope import keeps this block self-contained

    # os.path.join picks the platform's separator; a hard-coded backslash
    # would only yield a valid path on Windows.
    dictionary_path = os.path.join(majka_path, 'majka.w-lt')

    lemmatizer = majka.Majka(dictionary_path)
    lemmatizer.first_only = True
    lemmatizer.tags = False
    lemmatizer.negative = 'ne'
    return lemmatizer
import psycopg2
import os
import re
import sys
import majka
import json
import time

# One Majka analyser per dictionary file, loaded at import time from absolute
# paths. The _cz/_en/_de suffixes presumably stand for Czech, English and
# German -- TODO confirm against the dictionary files.
morph_cz = majka.Majka('/home/jhu/PycharmProjects/ConEv/nlp/majka.w-lt')
morph_en = majka.Majka('/home/jhu/PycharmProjects/ConEv/nlp/w-lt.en.fsa')
morph_de = majka.Majka('/home/jhu/PycharmProjects/ConEv/nlp/w-lt.ger.fsa')
# https://nlp.fi.muni.cz/czech-morphology-analyser/

# Module-level PostgreSQL connection and cursor, opened at import time.
# NOTE(review): the credentials are hard-coded in the connection string;
# consider reading them from the environment or a config file.
db_connection = psycopg2.connect(
    "dbname=ConEv user=postgres password=forward host=127.0.0.1")
db = db_connection.cursor()


def find_word_attrs(word, language):
    """Reconfigure the shared ``morph_cz`` analyser.

    NOTE(review): in the statements visible here ``word`` and ``language``
    are never read and nothing is returned; the body only changes settings
    on the module-level ``morph_cz`` object. The function may be incomplete
    -- confirm the intended behaviour.

    Each setting below is switched on and then reset, so the last assignment
    wins: on exit ``flags`` is 0, ``tags`` is True and ``compact_tag`` is
    False (assuming these attributes behave like plain settings).
    """
    #-----------------------------------------------------------------------------------------------CZ
    morph_cz.flags |= majka.ADD_DIACRITICS  # find word forms with diacritics
    morph_cz.flags |= majka.DISALLOW_LOWERCASE  # do not allow lowercase variants to be found
    morph_cz.flags |= majka.IGNORE_CASE  # ignore the word case altogether
    morph_cz.flags = 0  # unset all flags (overrides the three lines above)

    morph_cz.tags = False  # return just the lemma, do not process the tags
    morph_cz.tags = True  # turn tag processing back on (default)

    morph_cz.compact_tag = True  # return tag in compact form (as returned by Majka)
    morph_cz.compact_tag = False  # do not return compact tag (default)
import majka

# Shared analyser used by lemmatize(); it reads 'majka.w-lt' via a relative
# path and is configured once, when the module is imported.
lemmatizer = majka.Majka('majka.w-lt')
lemmatizer.first_only = True
lemmatizer.tags = False
lemmatizer.negative = 'ne'


def lemmatize(word):
    """Return the lemma Majka finds for *word*, or *word* itself if none.

    Args:
        word: The word form to look up in the shared ``lemmatizer``.

    Returns:
        The ``'lemma'`` value of the first entry returned by the analyser,
        or *word* unchanged when the analyser returns no entries.
    """
    matches = lemmatizer.find(word)
    if not matches:
        # Unknown word: fall back to the surface form.
        return word
    return matches[0]['lemma']