def testOverrides(self):
    """Verify the inflection overrides dictionary can be swapped at runtime.

    Replaces ``Inflections().overrides_dict`` with a stub, checks that the
    stubbed inflection is returned, then restores the real dictionary.
    """
    # Run the inflection system once to assure the overrides dict is loaded
    # (i.e. lazy loading happens before we hack the dictionary below).
    # BUG FIX: the original line built a throwaway tuple
    # `getInflection(...), ('watched', )` instead of asserting equality.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
    # Keep a reference so the real dictionary can be restored afterwards.
    orig_dict = lemminflect.Inflections().overrides_dict
    # An unknown universal POS tag must log and yield empty results.
    with self.assertLogs():
        lemmas = lemminflect.getLemma('WORD', 'X')
    self.assertEqual(lemmas, ())
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmas('WORD', 'X')
    self.assertEqual(lemmas, {})
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
    self.assertEqual(lemmas, {})
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')
    # Hack the code to replace the overrides dictionary with a stub entry.
    lemminflect.Inflections().overrides_dict = {'watch': {'VBD': ('xxx', )}}
    inflections = lemminflect.getInflection('watch', 'VBD', inflect_oov=False)
    self.assertEqual(inflections, ('xxx', ))
    # Put the original dictionary back so later tests are unaffected.
    lemminflect.Inflections().overrides_dict = orig_dict
def _save_lemmas(self, digest_record, cleared_description):
    """Tokenize *cleared_description*, lemmatize each token, and persist
    per-lemma occurrence counts linked to *digest_record*.

    OOV tokens that still look like words (match ``\\w``) fall back to their
    lowercased surface form. Existing ``DigestRecordLemma`` rows for this
    record are not duplicated.
    """
    from collections import Counter

    words = nltk.word_tokenize(cleared_description)
    # POS buckets whose lemmas we keep; hoisted out of the loop (invariant).
    lemmas_keys = ('NOUN', 'VERB', 'AUX', 'ADV', 'ADJ')
    word_lemmas_counts = Counter()
    for word in words:
        word_lemmas = lemminflect.getAllLemmas(word)
        word_lemmas_plain = [
            l.lower()
            for lk in lemmas_keys if lk in word_lemmas
            for l in word_lemmas[lk]
        ]
        # Fall back to the raw token for OOV entries that look like words.
        if not word_lemmas and re.match(r'\w', word):
            word_lemmas_plain.append(word.lower())
        word_lemmas_counts.update(word_lemmas_plain)
    for lemma_text, lemma_count_in_dr in word_lemmas_counts.items():
        # get_or_create avoids the filter-then-save race of the original.
        lemma_object, _ = Lemma.objects.get_or_create(text=lemma_text)
        already_linked = DigestRecordLemma.objects.filter(
            lemma=lemma_object, digest_record=digest_record).exists()
        if not already_linked:
            DigestRecordLemma(lemma=lemma_object,
                              digest_record=digest_record,
                              count=lemma_count_in_dr).save()
def _get_replacement_words(self, word, word_part_of_speech):
    """Return alternative inflections sharing a lemma with *word*.

    Only nouns, verbs, and adjectives are considered; returns [] when the
    POS is not replaceable or the word has no known lemmas.
    """
    if word_part_of_speech not in self._enptb_to_universal:
        return []
    # Dict mapping universal POS -> lemma candidates for this word.
    lemma_by_pos = lemminflect.getAllLemmas(word)
    if not lemma_by_pos:
        # Unknown word: nothing to substitute.
        return []
    # Map the fine-grained Penn Treebank tag to a universal POS tag.
    upos = self._enptb_to_universal[word_part_of_speech]
    # Prefer a lemma matching the word's POS; otherwise pick one at random.
    if upos in lemma_by_pos:
        chosen_lemma = lemma_by_pos[upos][0]
    else:
        chosen_lemma = random.choice(list(lemma_by_pos.values()))[0]
    # Merge all inflection tuples, deduplicate, and drop the original word.
    merged = {
        form
        for forms in lemminflect.getAllInflections(chosen_lemma, upos=upos).values()
        for form in forms
    }
    return [candidate for candidate in merged if candidate != word]
def get_inflections(orig_tokenized, pos_tagged, constrain_pos):
    """Collect candidate inflections for each content token.

    Returns a list of ``(i, inflections)`` pairs where ``i`` is the token's
    position in the sequence and ``inflections`` is a shuffled list of
    alternative surface forms. When *constrain_pos* is true, only
    inflections matching the token's tagged POS are considered.
    """
    content_pos = {'NOUN', 'VERB', 'ADJ'}
    token_inflections = []
    for idx, token in enumerate(orig_tokenized):
        lemma_map = lemminflect.getAllLemmas(token)
        tag = pos_tagged[idx][1]
        # Skip function words and anything lemminflect does not know.
        if not lemma_map or tag not in content_pos:
            continue
        # Prefer a lemma matching the tagged POS; otherwise pick randomly.
        if tag in lemma_map:
            lemma = lemma_map[tag][0]
        else:
            lemma = random.choice(list(lemma_map.values()))[0]
        if constrain_pos:
            forms_by_tag = lemminflect.getAllInflections(lemma, upos=tag)
        else:
            forms_by_tag = lemminflect.getAllInflections(lemma)
        candidates = list({
            form for forms in forms_by_tag.values() for form in forms
        })
        random.shuffle(candidates)
        token_inflections.append((idx, candidates))
    return token_inflections
def __init__(self):
    """Bind the lemminflect module and warm up its lazy-loaded resources."""
    global lemminflect
    import lemminflect
    self.name = 'LemmInflect'
    self.version_string = 'LemmInflect version: %s' % lemminflect.__version__
    # Touch both the dictionary lookup and the OOV model up front so lazy
    # loading does not show up in later timed runs.
    lemminflect.getAllLemmas('testing', 'VERB')
    lemminflect.getAllLemmasOOV('xxtesting', 'VERB')
def _get_replacement_words(self, word, word_part_of_speech):
    """Return the lemmas lemminflect knows for *word* under its mapped POS.

    Returns an empty list/tuple when the POS is not replaceable or when no
    lemma exists for that POS — never ``None``.
    """
    if word_part_of_speech not in self._flair_to_lemminflect_pos_map:
        # Only nouns, verbs, and adjectives have proper inflections.
        return []
    # `lemminflect.getAllLemmas` returns a dict mapping part-of-speech
    # to available lemmas. Map the flair POS tag onto lemminflect's.
    replacement_inflections_dict = lemminflect.getAllLemmas(word)
    lemminflect_pos = self._flair_to_lemminflect_pos_map[word_part_of_speech]
    # BUG FIX: default to [] (not None) so callers always receive a
    # sequence, consistent with the early-return branch above.
    return replacement_inflections_dict.get(lemminflect_pos, [])
def lemmatize_eng(word):
    """Lemmatize an English word or whitespace-separated phrase.

    Returns a dict with:
      - ``normal_form``: space-joined lemmas (with the original trailing
        space preserved for backward compatibility);
      - ``is_known``: False if any token required the OOV fallback;
      - ``is_multiple_forms``: True if any token lemmatized under more
        than one part of speech;
      - ``pos_tag``: always ``"UNKNW"``.
    """
    from lemminflect import getAllLemmas, getAllLemmasOOV

    pieces = []
    is_known = True
    is_multiple_forms = False
    for w in word.split():
        # Hoisted: the original called getAllLemmas twice per token.
        lemma_groups = list(getAllLemmas(w).values())
        if lemma_groups:
            pieces.append(lemma_groups[0][0])
            if len(lemma_groups) > 1:
                # The token lemmatizes under more than one POS.
                is_multiple_forms = True
        else:
            # Out-of-vocabulary: fall back to the OOV model as a noun.
            is_known = False
            pieces.append(list(getAllLemmasOOV(w, upos="NOUN").values())[0][0])
    # Original behavior: each lemma is followed by a space (O(n) join
    # instead of the quadratic += concatenation).
    result = "".join(piece + " " for piece in pieces)
    return {
        "normal_form": result,
        "is_known": is_known,
        "is_multiple_forms": is_multiple_forms,
        "pos_tag": "UNKNW",
    }
def testUPOSLog(self):
    """An unknown universal POS tag must log a warning and yield empties."""
    with self.assertLogs():
        result = lemminflect.getLemma('WORD', 'X')
    self.assertEqual(result, ())
    with self.assertLogs():
        result = lemminflect.getAllLemmas('WORD', 'X')
    self.assertEqual(result, {})
    with self.assertLogs():
        result = lemminflect.getAllLemmasOOV('WORD', 'X')
    self.assertEqual(result, {})
    # The spaCy extension should still lemmatize normally.
    first_token = self.nlp('I')[0]
    self.assertEqual(first_token._.lemma(), 'I')
def fill_lemmas_and_connections_to_digest_records(apps, schema_editor):
    """Data migration: lemmatize every English DigestRecord's title (plus
    cleared_description when present) and persist per-record lemma counts
    as Lemma / DigestRecordLemma rows, printing coarse progress.

    NOTE(review): `apps`/`schema_editor` follow the Django RunPython
    signature but the ORM models are referenced directly, not via
    `apps.get_model` — presumably intentional; verify against the
    migration's conventions.
    """
    all_valued_records = DigestRecord.objects.filter(
        language=Language.ENGLISH.name)
    # Records per one percent of progress, used for throttled printing.
    one_percent_count = math.ceil(all_valued_records.count() / 100)
    last_printed_percent = None
    for dr_i, dr in enumerate(all_valued_records):
        # Lemmatize title + cleared description as one text blob.
        s = dr.title
        if dr.cleared_description:
            s += ' ' + dr.cleared_description
        words = nltk.word_tokenize(s)
        word_lemmas_counts = {}
        for word in words:
            word_lemmas = lemminflect.getAllLemmas(word)
            # Only these POS buckets contribute lemmas.
            lemmas_keys = ('NOUN', 'VERB', 'AUX', 'ADV', 'ADJ')
            word_lemmas_plain = []
            for lk in lemmas_keys:
                if lk in word_lemmas:
                    word_lemmas_plain += (l.lower() for l in word_lemmas[lk])
            # OOV tokens that still look like words keep their surface form.
            if not word_lemmas and re.match(r'\w', word):
                word_lemmas_plain.append(word.lower())
            # Manual frequency count of lemmas within this record.
            for l in word_lemmas_plain:
                if l not in word_lemmas_counts:
                    word_lemmas_counts[l] = 1
                else:
                    word_lemmas_counts[l] += 1
        for lemma_text, lemma_count_in_dr in word_lemmas_counts.items():
            # Reuse an existing Lemma row when present, else create one.
            existing_lemmas = Lemma.objects.filter(text=lemma_text)
            if not existing_lemmas:
                lemma_object = Lemma(text=lemma_text)
                lemma_object.save()
            else:
                lemma_object = existing_lemmas[0]
            # Avoid duplicating the lemma<->record link.
            existing_digest_record_lemmas = DigestRecordLemma.objects.filter(
                lemma=lemma_object, digest_record=dr)
            if not existing_digest_record_lemmas:
                digest_record_lemma = DigestRecordLemma(
                    lemma=lemma_object, digest_record=dr,
                    count=lemma_count_in_dr)
                digest_record_lemma.save()
        # Print progress at most once per percent of processed records.
        if (dr_i + 1) % one_percent_count == 0:
            current_percent = math.ceil((dr_i + 1) / one_percent_count)
            if last_printed_percent is None or current_percent != last_printed_percent:
                last_printed_percent = current_percent
                print(
                    f'Processed {current_percent}% ({dr_i + 1} records, {all_valued_records.count()} total, {all_valued_records.count() - dr_i - 1} left)'
                )
def testContractionLemmas(self):
    """English contractions must map to their auxiliary-verb lemmas.

    Each assertion checks that the AUX entry of getAllLemmas is (at least)
    the expected tuple; the dict may contain other POS entries as well.
    """
    lemmas = lemminflect.getAllLemmas("'d")
    self.assertTrue(lemmas.items() >= {'AUX': ('will', 'have')}.items())
    lemmas = lemminflect.getAllLemmas("'ll")
    self.assertTrue(lemmas.items() >= {'AUX': ('will', )}.items())
    lemmas = lemminflect.getAllLemmas("'m")
    self.assertTrue(lemmas.items() >= {'AUX': ('be', )}.items())
    lemmas = lemminflect.getAllLemmas("'re")
    self.assertTrue(lemmas.items() >= {'AUX': ('be', )}.items())
    lemmas = lemminflect.getAllLemmas("'s")
    self.assertTrue(lemmas.items() >= {'AUX': ('be', )}.items())
    # FIX: the "'ve" check was duplicated verbatim; it is asserted once.
    lemmas = lemminflect.getAllLemmas("'ve")
    self.assertTrue(lemmas.items() >= {'AUX': ('have', )}.items())
def get_lemmas(self, word, tag=None, pos=None):
    """Return lemmas for *word*.

    If *tag* is given, the universal POS is inferred from it; with a POS
    available only that POS's lemmas are returned, otherwise lemmas across
    every part of speech are gathered.
    """
    if tag:
        # A fine-grained tag determines the universal POS.
        pos = Inflector.tag_to_pos(tag)
    if pos:
        return list(lemminflect.getLemma(word, upos=pos))
    # No POS provided: flatten lemmas from all parts of speech.
    collected = []
    for forms in lemminflect.getAllLemmas(word).values():
        collected.extend(forms)
    return collected
def candidate_edits(self, text: str) -> List[Edit]:
    """Generate candidate edits that swap each token for another
    inflection of one of its lemmas (never the token itself)."""
    tokens = self._spacy.tokenizer(text)
    all_edits = []
    for token in tokens:
        # All base forms lemminflect proposes for this surface token.
        base_forms = set()
        for lemma_group in lemminflect.getAllLemmas(token.text).values():
            base_forms.update(lemma_group)
        # Every surface form reachable from any of those base forms.
        surface_forms = set()
        for base in base_forms:
            for inflection_group in lemminflect.getAllInflections(base).values():
                surface_forms.update(inflection_group)
        substitutes = surface_forms - {token.text}
        all_edits.extend(_edits(token.i, tokens, substitutes))
    return all_edits
def base_form(word):
    """
    Return the base form of the given word.

    A word can be several parts of speech with different base forms
    (e.g. "outing" is the base form of a noun but also an inflection of
    the verb "out"); among the candidates the one minimizing
    `base_form_sort_key` wins. Falls back to the lowercased word when
    lemminflect knows no lemma for it.

    :param word:
    :return: base form
    """
    lowered = word.lower()
    # getAllLemmas may list several base forms per POS (e.g. British and
    # American spellings); the first is the most common, so only it counts.
    candidates = {lemmas[0] for lemmas in getAllLemmas(lowered).values()}
    if not candidates:
        return lowered
    return min(candidates, key=base_form_sort_key)
def random_inflect(source: str, inflection_counts: Dict[str, int] = None) -> str:
    """Randomly re-inflect content words (nouns/verbs/adjectives) in *source*.

    When *inflection_counts* is given, replacement inflections are sampled
    with weights proportional to those counts; otherwise uniformly.
    """
    content_pos = {'NOUN', 'VERB', 'ADJ'}
    tokens = MosesTokenizer(lang='en').tokenize(source)
    # Temporarily undo leading capitalisation so tagging/lemma lookup see
    # the lowercase form; restore it before detokenizing.
    had_leading_cap = False
    if tokens[0][0].isupper():
        had_leading_cap = True
        tokens[0] = tokens[0].lower()
    tagged = nltk.pos_tag(tokens, tagset='universal')  # universal POS tags
    for idx, token in enumerate(tokens):
        lemma_map = lemminflect.getAllLemmas(token)
        tag = tagged[idx][1]
        # Only operate on known content words whose POS has a lemma entry.
        if not (lemma_map and tag in content_pos and tag in lemma_map):
            continue
        lemma = lemma_map[tag][0]
        # All (tag, form) pairs reachable from the lemma under this POS.
        choices = [(form_tag, form)
                   for form_tag, forms in lemminflect.getAllInflections(
                       lemma, upos=tag).items()
                   for form in forms]
        if not choices:
            continue
        if inflection_counts:
            # Weighted random sampling by the supplied tag distribution.
            weights = [inflection_counts[form_tag] for form_tag, _ in choices]
            tokens[idx] = random.choices(choices, weights=weights)[0][1]
        else:
            tokens[idx] = random.choices(choices)[0][1]
    if had_leading_cap:
        tokens[0] = tokens[0].title()
    return MosesDetokenizer(lang='en').detokenize(tokens)
def preprocessing_raw_data(**kwargs):
    """Airflow task: scan ES documents not yet marked `is_english`, flag
    non-Latin texts, and lemmatize Latin-script texts with lemminflect,
    storing the result in `text_lemmatized_eng_lemminflect`.

    kwargs must contain `process_num` and `total_proc` — documents are
    sharded across workers by `doc.id % total_proc`.

    NOTE(review): `streaming_bulk` and `update_generator` are imported but
    never used in this span — presumably the indexing step lives in a
    later (not visible) part of the function; confirm.
    """
    import re
    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin
    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    # Fails fast if the Airflow variable is missing (int(None) raises too).
    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents_eng", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")
    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['id', 'text'],
               sort=['id'],
               get_search_obj=True)
    # Only documents not yet classified as English/non-English.
    s = s.exclude('exists', field="is_english")
    # NOTE: this local rebinds the imported `stopwords` module name after
    # its last use — the set unions Russian + English stop word lists.
    stopwords = set(
        get_stop_words('ru') + get_stop_words('en') +
        stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        # Shard documents across parallel workers by id.
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        # Hard cap of 50k documents per run.
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        # Strip non-word characters (and *-containing tokens), lowercase,
        # drop stop words and tokens of length <= 2.
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)',
                       ' ', doc.text).split()).split()
            if not x in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            # EAFP: IndexError means lemminflect knows no lemma; fall back
            # to the OOV model as a noun.
            try:
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                result += list(getAllLemmasOOV(
                    word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
def runGetAllLemmasTests(self, tests):
    """Each test is a ``(base, upos, form)`` triple; assert that *base*
    appears among the lemmas proposed for *form* under *upos*."""
    for base, upos, form in tests:
        proposed = lemminflect.getAllLemmas(form, upos).get(upos, {})
        self.assertTrue(base in set(proposed),
                        msg='base=%s lemmas=%s' % (base, str(proposed)))
def api_getAllLemmas():
    """HTTP endpoint: return every lemma for the posted word/upos pair."""
    payload = request.json
    lemmas = getAllLemmas(payload['word'], payload['upos'])
    return jsonify(lemmas)
def getLemmaDictOnly(self, entry, upos):
    """Return the first dictionary lemma of ``entry.infl`` for *upos*,
    or () when the dictionary has no entry for that POS."""
    candidates = lemminflect.getAllLemmas(entry.infl, upos).get(upos, ())
    return candidates[0] if candidates else ()