def testGetInflection03(self):
    """Check inflection lookup for 'watch' with and without a upos constraint."""
    expected_full = {
        'NNS': ('watches', 'watch'),
        'NN': ('watch',),
        'VBD': ('watched',),
        'VBG': ('watching',),
        'VBZ': ('watches',),
        'VB': ('watch',),
        'VBP': ('watch',),
    }
    self.assertEqual(lemminflect.getAllInflections('watch'), expected_full)
    # Constraining to VERB keeps exactly the VB* entries.
    expected_verbs = {tag: forms for tag, forms in expected_full.items()
                      if tag.startswith('VB')}
    self.assertEqual(lemminflect.getAllInflections('watch', 'VERB'), expected_verbs)
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched',))
    # 'watch' has no adjectival inflections.
    self.assertEqual(lemminflect.getAllInflections('watch', 'ADJ'), {})
def testGetInfections01(self):
    """Inflections for 'awake'.

    Note: this test may be a bit problematic as the overrides file may
    change. VBN: 'awaked' is applied in overrides but 'awoken' is preferred.
    """
    awake_dict = {
        'VBD': ('awoke',),
        'VBN': ('awoken',),
        'VBG': ('awaking',),
        'VBZ': ('awakes',),
        'VB': ('awake',),
        'VBP': ('awake',),
    }
    self.assertEqual(lemminflect.getAllInflections('awake', 'VERB'), awake_dict)
    # An inflected form is not a lemma, so it yields no inflections.
    self.assertEqual(lemminflect.getAllInflections('awoke', 'VERB'), {})
    # An invalid upos should log a warning and return an empty dict.
    with self.assertLogs():
        infls = lemminflect.getAllInflections('awake', 'X')
    self.assertEqual(infls, {})
def _get_replacement_words(self, word, word_part_of_speech):
    """Return inflected variants of ``word`` as replacement candidates.

    Only nouns, verbs, and adjectives are considered for replacement; any
    other POS yields an empty list, as does a word with no known lemmas.
    """
    if word_part_of_speech not in self._enptb_to_universal:
        return []
    # Dict mapping universal POS -> tuple of candidate lemmas.
    lemma_candidates = lemminflect.getAllLemmas(word)
    if not lemma_candidates:
        return []
    # Map the fine-grained (Penn Treebank) POS to a universal POS.
    universal_pos = self._enptb_to_universal[word_part_of_speech]
    # Prefer a lemma with the same POS; otherwise pick a lemma at random.
    if universal_pos in lemma_candidates:
        lemma = lemma_candidates[universal_pos][0]
    else:
        lemma = random.choice(list(lemma_candidates.values()))[0]
    # Flatten all inflection tuples, dedupe, and drop the original word.
    inflection_tuples = lemminflect.getAllInflections(
        lemma, upos=universal_pos
    ).values()
    unique_forms = list({form for forms in inflection_tuples for form in forms})
    return [form for form in unique_forms if form != word]
def get_inflections(orig_tokenized, pos_tagged, constrain_pos):
    """Gather shuffled inflection candidates for each content token.

    Returns a list whose elements are of the form (i, inflections), where i
    is the token's position in the sequence. Only tokens tagged NOUN/VERB/ADJ
    that have known lemmas are included.
    """
    have_inflections = {'NOUN', 'VERB', 'ADJ'}
    token_inflections = []
    for i, word in enumerate(orig_tokenized):
        lemmas = lemminflect.getAllLemmas(word)
        tag = pos_tagged[i][1]
        if not lemmas or tag not in have_inflections:
            continue
        # Prefer the lemma matching the tagged POS; otherwise pick randomly.
        if tag in lemmas:
            lemma = lemmas[tag][0]
        else:
            lemma = random.choice(list(lemmas.values()))[0]
        # Optionally constrain the inflection lookup to the tagged POS.
        if constrain_pos:
            infl_dict = lemminflect.getAllInflections(lemma, upos=tag)
        else:
            infl_dict = lemminflect.getAllInflections(lemma)
        forms = list({form for forms in infl_dict.values() for form in forms})
        random.shuffle(forms)
        token_inflections.append((i, forms))
    return token_inflections
def all_forms(word):
    """Return a set containing the lowercased word and all its inflections."""
    wl = word.lower()
    forms = {wl}
    # The original used `list` as the loop variable (shadowing the builtin)
    # and reused the function's own name for the result set; both renamed.
    for inflections in getAllInflections(wl).values():
        forms.update(inflections)
    return forms
def testAuxModalInflections(self):
    """Spot-check inflections of modal and auxiliary verbs.

    Each check asserts the returned dict is a superset of the expected
    entries (the library may return additional tags).
    """
    # Modal auxiliary verbs, table-driven: (lemma, expected subset).
    modal_cases = [
        ('can', {'VB': ('can', ), 'VBD': ('could', )}),
        ('may', {'VB': ('may', ), 'VBD': ('might', )}),
        ('will', {'VB': ('will', ), 'VBD': ('would', )}),
        ('shall', {'VB': ('shall', ), 'VBD': ('should', )}),
        ('must', {'VB': ('must', ), 'VBD': ('must', )}),
        ('ought', {'VB': ('ought', ), 'VBD': ('ought', )}),
        ('dare', {'VB': ('dare', )}),
    ]
    for lemma, expected in modal_cases:
        infls = lemminflect.getAllInflections(lemma)
        self.assertTrue(infls.items() >= expected.items())
    # Auxiliary verbs.
    infls = lemminflect.getAllInflections('be')
    self.assertTrue(infls.items() >= {
        'VB': ('be',), 'VBD': ('was', 'were'), 'VBG': ('being',),
        'VBN': ('been',), 'VBP': ('am', 'are'), 'VBZ': ('is',),
    }.items())
    infls = lemminflect.getAllInflections('do')
    self.assertTrue(infls.items() >= {
        'VB': ('do', 'does'), 'VBD': ('did', ),
    }.items())
    infls = lemminflect.getAllInflections('have')
    self.assertTrue(infls.items() >= {
        'VB': ('have', 'has'), 'VBD': ('had',), 'VBG': ('having',),
    }.items())
def all_inflect(w, word_len):
    """Return the set of all inflections of ``w``.

    If ``word_len`` is not None, only inflections of exactly that length
    are kept.
    """
    out = set()
    # Iterate .values() directly — the original walked .items() and never
    # used the key — and use a comprehension instead of filter(lambda ...).
    for forms in getAllInflections(w).values():
        if word_len is None:
            out.update(forms)
        else:
            out.update(form for form in forms if len(form) == word_len)
    return out
def testGetInflection04(self):
    """Out-of-vocabulary handling: 'watch' is not a known adjective."""
    self.assertEqual(lemminflect.getAllInflections('watch', 'ADJ'), {})
    # Without OOV inflection, an unknown (word, tag) pair yields nothing.
    self.assertEqual(
        lemminflect.getInflection('watch', 'JJ', inflect_oov=False), ())
    # With OOV inflection, the word itself is returned for JJ.
    self.assertEqual(
        lemminflect.getInflection('watch', 'JJ', inflect_oov=True), ('watch', ))
    # A known tag still resolves normally.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
def testUPOSLog(self):
    """An invalid upos ('X') should log a message and return an empty result."""
    with self.assertLogs():
        infl = lemminflect.getInflection('WORD', 'X')
    self.assertEqual(infl, ())
    with self.assertLogs():
        infls = lemminflect.getAllInflections('WORD', 'X')
    self.assertEqual(infls, {})
    with self.assertLogs():
        infls = lemminflect.getAllInflectionsOOV('WORD', 'X')
    self.assertEqual(infls, {})
    # The spaCy extension falls back to the original token text.
    token = self.nlp('testing')[0]
    self.assertEqual(token._.inflect('X'), 'testing')
def inflect_lemma(self, lemma, tag=None, pos=None):
    """Inflect a lemma by Penn tag, or by universal POS when no tag is given.

    ``pos`` may be None, in which case inflections for all POS are returned.
    Returns a (possibly empty) list of inflected forms.
    """
    # Tag-based lookup returns a single tuple of forms.
    if tag:
        return list(lemminflect.getInflection(lemma, tag=tag))
    # POS-based lookup: flatten every inflection tuple into one list.
    inflections = []
    for forms in lemminflect.getAllInflections(lemma, upos=pos).values():
        inflections.extend(forms)
    return inflections
def get_lemmas(word: str, pos: PartOfSpeech):
    """Return the lemma of ``word`` for ``pos`` plus its merged inflections.

    Multi-word or punctuated input is rejected with an explanatory message.
    Falls back to the word itself when no lemma is known, and merges regular
    and OOV inflection lookups.
    """
    word = word.lower()
    if " " in word or "." in word:
        # NOTE(review): status 200 for invalid input is unconventional (400
        # would be typical); kept as-is to preserve the API contract.
        return JSONResponse(
            status_code=200,
            content={"message": "Input must contain only a single word without spaces or punctuation."})
    # Look up the lemma once (the original called getLemma twice).
    lemmas = getLemma(word, pos)
    lemma = lemmas[0] if lemmas else word
    inflections = merge_inflections(getAllInflections(lemma, upos=pos),
                                    getAllInflectionsOOV(lemma, upos=pos))
    return {"lemma": lemma, "inflections": inflections}
def candidate_edits(self, text: str) -> List[Edit]:
    """Generate candidate token-substitution edits for ``text``.

    For each token, the substitutes are every inflection of every lemma of
    that token, minus the token's own surface form.
    """
    tokenized = self._spacy.tokenizer(text)
    all_edits = []
    for token in tokenized:
        # Every lemma of the token, across all parts of speech.
        lemmas = set()
        for lemma_tuple in lemminflect.getAllLemmas(token.text).values():
            lemmas.update(lemma_tuple)
        # Every inflection of every lemma.
        inflections = set()
        for lemma in lemmas:
            for infl_tuple in lemminflect.getAllInflections(lemma).values():
                inflections.update(infl_tuple)
        substitutes = inflections - {token.text}
        all_edits.extend(_edits(token.i, tokenized, substitutes))
    return all_edits
def random_inflect(source: str, inflection_counts: Dict[str, int] = None) -> str:
    """Randomly re-inflect content words (nouns/verbs/adjectives) in a sentence.

    If ``inflection_counts`` maps Penn tags to counts, candidates are sampled
    weighted by those counts; otherwise sampling is uniform. Sentence-initial
    capitalization is preserved.
    """
    have_inflections = {'NOUN', 'VERB', 'ADJ'}
    # Tokenize the sentence.
    tokenized = MosesTokenizer(lang='en').tokenize(source)
    # Lowercase a capitalized first token so tagging/lemma lookup work;
    # restore the capitalization at the end.
    upper = False
    if tokenized[0][0].isupper():
        upper = True
        tokenized[0] = tokenized[0].lower()
    # POS tag words in the sentence.
    pos_tagged = nltk.pos_tag(tokenized, tagset='universal')
    for i, word in enumerate(tokenized):
        lemmas = lemminflect.getAllLemmas(word)
        tag = pos_tagged[i][1]
        # Only operate on content words whose tagged POS has a known lemma.
        if not (lemmas and tag in have_inflections and tag in lemmas):
            continue
        lemma = lemmas[tag][0]
        # All (penn_tag, form) candidates for this lemma under the tagged POS.
        candidates = [(penn_tag, form)
                      for penn_tag, forms in
                      lemminflect.getAllInflections(lemma, upos=tag).items()
                      for form in forms]
        if not candidates:
            continue
        # Weighted random sampling when a distribution is given, else uniform.
        if inflection_counts:
            weights = [inflection_counts[penn_tag] for penn_tag, _ in candidates]
            chosen = random.choices(candidates, weights=weights)[0]
        else:
            chosen = random.choices(candidates)[0]
        tokenized[i] = chosen[1]
    if upper:
        tokenized[0] = tokenized[0].title()
    return MosesDetokenizer(lang='en').detokenize(tokenized)
def get_inflections(token):
    """Return the set of all inflected forms of ``token``."""
    result = set()
    # Only the values are needed; the original iterated .items() and
    # discarded every key.
    for forms in lemminflect.getAllInflections(token).values():
        result.update(forms)
    return result
import lemminflect
import itertools

# The points we remove for scores of inflected forms
INFLECTED_PENALTY = 5

# Load the ranked word list: one "word@score" entry per line.
rw = dict()
with open('RankedWiktionary.txt', 'r') as fid:
    for line in fid:
        word, score = line.strip().split('@')
        rw[word] = int(score)

# Go through inflected forms and add them, slightly penalized, when the
# inflection is not already ranked.
rw2 = dict()
for word, score in rw.items():
    infl = lemminflect.getAllInflections(word)
    for word1 in itertools.chain(*infl.values()):
        # Membership test replaces the original bare `try/except`, which
        # swallowed every exception rather than just a missing key.
        if word1 not in rw:
            rw2[word1] = max(1, score - INFLECTED_PENALTY)

# Extend the dictionary
rw.update(rw2)

# Write the list (opened in append mode, as in the original).
with open('RankedWiktionary2.txt', 'a') as fid:
    for word, score in rw.items():
        fid.write(f'{word}@{score}\n')
def api_getAllInflections():
    """HTTP endpoint: inflect the JSON body's 'lemma', constrained by 'upos'."""
    payload = request.json
    inflections = getAllInflections(payload['lemma'], payload['upos'])
    return jsonify(inflections)