Exemplo n.º 1
0
 def testOverrides(self):
     """Verify that the lazily-loaded overrides dictionary is honored.

     Replaces the overrides dict with a fake entry, checks that
     getInflection() returns the fake value, then restores the original.
     """
     # Run the inflection system once to assure the overrides is loaded
     # (i.e. lazy loading).
     # BUG FIX: the original line built the tuple
     # `getInflection(...), ('watched', )` and discarded it — the intent
     # was clearly an equality assertion.
     self.assertEqual(lemminflect.getInflection('watch', 'VBD'),
                      ('watched', ))
     # Keep a reference so the real overrides can be restored at the end.
     orig_dict = lemminflect.Inflections().overrides_dict
     # An invalid upos tag ('X') should log a warning and yield empties.
     with self.assertLogs():
         lemmas = lemminflect.getLemma('WORD', 'X')
     self.assertEqual(lemmas, ())
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmas('WORD', 'X')
     self.assertEqual(lemmas, {})
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
     self.assertEqual(lemmas, {})
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
     # Hack the code to replace the overrides dictionary with a fake one
     # and confirm the fake inflection is returned.
     lemminflect.Inflections().overrides_dict = {
         'watch': {
             'VBD': ('xxx', )
         }
     }
     inflections = lemminflect.getInflection('watch',
                                             'VBD',
                                             inflect_oov=False)
     self.assertEqual(inflections, ('xxx', ))
     # Put the original dictionary back so later tests are unaffected.
     lemminflect.Inflections().overrides_dict = orig_dict
Exemplo n.º 2
0
 def __init__(self):
     """Load LemmInflect and warm up its dictionaries and OOV model."""
     global lemminflect
     import lemminflect
     self.name = 'LemmInflect'
     self.version_string = f'LemmInflect version: {lemminflect.__version__}'
     # Touch both the dictionary-based and the OOV (model) code paths so
     # lazy loading doesn't show up in the run times measured later.
     _ = lemminflect.getAllLemmas('testing', 'VERB')
     _ = lemminflect.getAllLemmasOOV('xxtesting', 'VERB')
Exemplo n.º 3
0
 def testUPOSLog(self):
     """An unknown upos tag ('X') must log a warning and return empty results."""
     # Each lemma lookup is tried with a bad upos and must produce its
     # "empty" value while emitting at least one log record.
     cases = [
         (lemminflect.getLemma, ()),
         (lemminflect.getAllLemmas, {}),
         (lemminflect.getAllLemmasOOV, {}),
     ]
     for lookup, expected_empty in cases:
         with self.assertLogs():
             lemmas = lookup('WORD', 'X')
         self.assertEqual(lemmas, expected_empty)
     # The spaCy extension should fall back to the token's original text.
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
Exemplo n.º 4
0
def lemmatize_eng(word):
    """Lemmatize each whitespace-separated token of *word* with LemmInflect.

    Returns a dict with:
        normal_form       -- space-joined lemmas (keeps a trailing space,
                             preserved for backward compatibility)
        is_known          -- False if any token fell back to the OOV model
        is_multiple_forms -- True if any token had lemmas under more than
                             one POS tag
        pos_tag           -- always the constant "UNKNW"
    """
    from lemminflect import getAllLemmas, getAllLemmasOOV
    result = ""
    is_known = True
    is_multiple_forms = False
    for w in word.split():
        # BUG FIX: the original called getAllLemmas(w) twice per token
        # (once for the lemma, once for the length check); look it up once.
        forms = list(getAllLemmas(w).values())
        try:
            result += forms[0][0] + " "
            if len(forms) > 1:
                is_multiple_forms = True
        except IndexError:
            # Token not in the dictionary: fall back to the OOV model,
            # treating the token as a noun.
            is_known = False
            result += list(getAllLemmasOOV(w, upos="NOUN").values())[0][0] + " "
    return {
        "normal_form": result,
        "is_known": is_known,
        "is_multiple_forms": is_multiple_forms,
        "pos_tag": "UNKNW",
    }
def preprocessing_raw_data(**kwargs):
    """Lemmatize English documents from Elasticsearch for one worker shard.

    Expected kwargs:
        process_num -- zero-based index of this worker.
        total_proc  -- total number of parallel workers; documents are
                       sharded by ``int(doc.id) % total_proc``.

    NOTE(review): the accumulated ``documents`` list is never written back
    to Elasticsearch inside this block even though ``streaming_bulk`` and
    ``update_generator`` are imported — presumably the bulk-update step
    follows; verify against the full function body.
    """
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']

    # Document cap configured as an Airflow Variable.
    # NOTE(review): if the Variable is absent, int(None) raises TypeError
    # before the `is None` check below can fire — confirm intended.
    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents_eng", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")

    # Fetch only id/text, sorted by id, skipping documents that already
    # have the 'is_english' field set (i.e. already processed).
    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['id', 'text'],
               sort=['id'],
               get_search_obj=True)
    s = s.exclude('exists', field="is_english")

    # Combined Russian + English stopword set.
    # NOTE(review): this local name shadows the imported nltk.corpus
    # `stopwords` module — the module is still read on the right-hand side
    # first, so the expression works as written.
    stopwords = set(
        get_stop_words('ru') + get_stop_words('en') +
        stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        # Shard documents across workers by id.
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        # Hard cap of 50k documents per worker per run.
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        # Non-Latin text is flagged as not English and left unlemmatized.
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        # Keep only Cyrillic/Latin/Kazakh letters and hyphens, drop any
        # token containing '*', then filter stopwords and tokens shorter
        # than 3 chars (filter runs on the original case) and lowercase.
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                       doc.text).split()).split()
            if not x in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            # Dictionary lemma first; fall back to the OOV model as NOUN.
            try:
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                result += list(getAllLemmasOOV(
                    word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
Exemplo n.º 6
0
def api_getAllLemmasOOV():
    """Flask endpoint: JSON body {word, upos} -> OOV lemma candidates."""
    payload = request.json
    lemmas = getAllLemmasOOV(payload['word'], payload['upos'])
    return jsonify(lemmas)
Exemplo n.º 7
0
 def getLemmaOOVOnly(self, entry, upos):
     """Return the first OOV lemma of entry.infl for *upos*, or () if none."""
     candidates = lemminflect.getAllLemmasOOV(entry.infl, upos).get(upos, ())
     # Note the asymmetric return: a single lemma string on success,
     # an empty tuple when the model produced nothing for this upos.
     return candidates[0] if candidates else ()