def testOverrides(self):
    # run the inflection system once to assure the overrides are loaded (ie.. lazy loading)
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched',))
    # Hack the code to replace the overrides dictionary
    orig_dict = lemminflect.Inflections().overrides_dict
    with self.assertLogs():
        lemmas = lemminflect.getLemma('WORD', 'X')
        self.assertEqual(lemmas, ())
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmas('WORD', 'X')
        self.assertEqual(lemmas, {})
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
        self.assertEqual(lemmas, {})
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')
    lemminflect.Inflections().overrides_dict = {'watch': {'VBD': ('xxx',)}}
    inflections = lemminflect.getInflection('watch', 'VBD', inflect_oov=False)
    self.assertEqual(inflections, ('xxx',))
    # put the original dictionary back
    lemminflect.Inflections().overrides_dict = orig_dict
def __init__(self):
    global lemminflect
    import lemminflect
    self.name = 'LemmInflect'
    self.version_string = 'LemmInflect version: %s' % lemminflect.__version__
    # Force loading dictionary and model so lazy loading doesn't show up in run times
    lemmas = lemminflect.getAllLemmas('testing', 'VERB')
    lemmas = lemminflect.getAllLemmasOOV('xxtesting', 'VERB')
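# A minimal standalone sketch of what the two warm-up calls above return, assuming
# lemminflect's documented API: getAllLemmas gives a dict keyed by upos category,
# while getAllLemmasOOV guesses a lemma for out-of-vocabulary words by rule.
# The commented values are illustrative, not verified output.
import lemminflect

print(lemminflect.getAllLemmas('testing', 'VERB'))       # e.g. {'VERB': ('test',)}
print(lemminflect.getAllLemmasOOV('xxtesting', 'VERB'))  # e.g. {'VERB': ('xxtest',)} (illustrative)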
def testUPOSLog(self):
    with self.assertLogs():
        lemmas = lemminflect.getLemma('WORD', 'X')
        self.assertEqual(lemmas, ())
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmas('WORD', 'X')
        self.assertEqual(lemmas, {})
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
        self.assertEqual(lemmas, {})
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')
def lemmatize_eng(word):
    from lemminflect import getAllLemmas, getAllLemmasOOV
    result = ""
    is_known = True
    is_multiple_forms = False
    for w in word.split():
        try:
            result += list(getAllLemmas(w).values())[0][0] + " "
            if len(list(getAllLemmas(w).values())) > 1:
                is_multiple_forms = True
        except IndexError:
            is_known = False
            result += list(getAllLemmasOOV(w, upos="NOUN").values())[0][0] + " "
    return {
        "normal_form": result,
        "is_known": is_known,
        "is_multiple_forms": is_multiple_forms,
        "pos_tag": "UNKNW",
    }
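# Hypothetical usage of lemmatize_eng above; the exact lemma strings are illustrative
# rather than verified lemminflect output. Known words go through getAllLemmas, and
# anything it cannot find falls back to the rule-based OOV guesser.
if __name__ == '__main__':
    print(lemmatize_eng('dogs barked'))
    # e.g. {'normal_form': 'dog bark ', 'is_known': True,
    #       'is_multiple_forms': ..., 'pos_tag': 'UNKNW'}
    print(lemmatize_eng('xxwords'))  # unknown token: is_known becomes False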
def preprocessing_raw_data(**kwargs):
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    number_of_documents = Variable.get("lemmatize_number_of_documents_eng", default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=['id', 'text'],
               sort=['id'], get_search_obj=True)
    s = s.exclude('exists', field="is_english")
    stopwords = set(get_stop_words('ru') + get_stop_words('en') + stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc.text).split()
            ).split()
            if x not in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            try:
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                result += list(getAllLemmasOOV(word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
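# A standalone sketch of the cleaning + lemmatization step from the task above,
# without the Airflow/Elasticsearch plumbing. The sample text, the tiny stop-word
# set, and the resulting lemma string are illustrative assumptions.
import re
from lemminflect import getAllLemmas, getAllLemmasOOV

def lemmatize_text(text, stop_words=frozenset({'the', 'and'})):
    tokens = [x for x in re.sub('[^a-zA-Z-]', ' ', text.lower()).split()
              if x not in stop_words and len(x) > 2]
    result = ""
    for word in tokens:
        try:
            result += list(getAllLemmas(word).values())[0][0] + " "
        except IndexError:
            result += list(getAllLemmasOOV(word, upos="NOUN").values())[0][0] + " "
    return result

print(lemmatize_text('The dogs were barking loudly'))  # e.g. 'dog be bark loudly ' (illustrative)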
def api_getAllLemmasOOV():
    content = request.json
    result = getAllLemmasOOV(content['word'], content['upos'])
    return jsonify(result)
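# Hypothetical client call for the endpoint above. The route path '/getAllLemmasOOV'
# and the localhost:5000 address are assumptions, since the Flask route decorator is
# not shown; the response value is illustrative.
import requests

resp = requests.post('http://localhost:5000/getAllLemmasOOV',
                     json={'word': 'xxtesting', 'upos': 'VERB'})
print(resp.json())  # e.g. {'VERB': ['xxtest']} -- tuples serialize to JSON lists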
def getLemmaOOVOnly(self, entry, upos):
    lemmas = lemminflect.getAllLemmasOOV(entry.infl, upos)
    lemma = lemmas.get(upos, ())
    if not lemma:
        return ()
    return lemma[0]
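# getAllLemmasOOV returns a dict keyed by upos, so the .get(upos, ()) above yields a
# tuple of candidate lemmas. A quick standalone check of that shape (the printed lemma
# is illustrative, not verified output):
import lemminflect

lemmas = lemminflect.getAllLemmasOOV('xxwatches', 'NOUN')
print(lemmas)  # e.g. {'NOUN': ('xxwatch',)}
candidates = lemmas.get('NOUN', ())
print(candidates[0] if candidates else ())  # first candidate, mirroring getLemmaOOVOnly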