def testOverrides(self):
    """Entries in the lemmatizer overrides dictionary take precedence."""
    # Run the lemmatizer once so the lazily-loaded overrides data exists
    # before we swap it out.
    lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
    # Hack: replace the overrides dictionary (the swap being visible through
    # getLemma suggests Lemmatizer() hands back a shared instance).
    saved_dict = lemminflect.Lemmatizer().overrides_dict
    lemminflect.Lemmatizer().overrides_dict = {'waltzes': {'VERB': ('xxx', )}}
    self.assertEqual(
        lemminflect.getLemma('waltzes', 'VERB', lemmatize_oov=False),
        ('xxx', ))
    # Restore the original dictionary so other tests are unaffected.
    lemminflect.Lemmatizer().overrides_dict = saved_dict
def tokenize(self, sentence: Union[str, List[str]], pretokenize: bool = True, map_to_single_char: bool = False) -> List[str]:
    """Tokenize a sentence, replacing each inflected word with its lemma
    followed by a POS marker token such as '[VBD]'.

    :param sentence: raw text, or a token list when ``pretokenize`` is False
        (allows users with custom pretokenizers to pass tokens directly)
    :param pretokenize: run ``self._pretokenize`` on the input first
    :param map_to_single_char: map inflection tokens to single characters
        via ``self.single_char_map``
    :return: list of output tokens
    """
    tokens = self._pretokenize(sentence) if pretokenize else sentence
    ptb_tagged = self.tagger.tag(tokens)
    # Map each Penn-Treebank tag to its universal POS equivalent.
    uni_tags = [map_tag("en-ptb", 'universal', ptb) for (_, ptb) in ptb_tagged]
    output = []
    for (word, ptb), uni in zip(ptb_tagged, uni_tags):
        inflectable = (uni in self.have_inflections
                       and word not in (string.punctuation + '—')
                       and ptb not in self.lemma_tags)
        if inflectable:
            # getLemma returns a tuple; fall back to the surface form when
            # the first entry is empty/falsy.
            lemma = getLemma(word, upos=uni)[0] or word
            output.append(lemma)
            output.append('[' + ptb + ']')
        else:
            output.append(word)
    if map_to_single_char:
        output = [self.single_char_map[tok] if tok in self.inflection_tokens
                  else tok for tok in output]
    return output
def tense_of_verb(verb_str):
    """
    Identifies the verb tense of a word, and returns it in a tuple along with
    its base word.
    @param verb_str: a str containing a verb
    @return: a tuple t, where t[0] is 'AUX' if the verb is a special auxiliary
    verb, is '?' if the verb tense cannot be recognized, and otherwise is
    'VBD', 'VBP', or 'VBZ', which correspond to the Penn Treebank P.O.S. tags
    for past tense, non-3rd person present tense, and 3rd person present
    tense.
    """
    # Auxiliary/modal verbs are handled specially and returned unchanged.
    aux_verbs = [
        'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
        'can', 'could', 'ought'
    ]
    if verb_str.lower() in aux_verbs:
        return ('AUX', verb_str)
    # Fix: getLemma can return an empty tuple; the original indexed [0]
    # unconditionally and would raise IndexError in that case.
    lemmas = getLemma(verb_str, upos='VERB')
    if not lemmas:
        return ('?', verb_str)
    lemm_str = lemmas[0]
    # Compare the input against each inflection of the lemma to find its tag.
    if verb_str in getInflection(lemm_str, tag='VBD'):
        return ('VBD', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBP'):
        return ('VBP', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBZ'):
        return ('VBZ', lemm_str)
    else:
        return ('?', lemm_str)
def testOverrides(self):
    """Entries in the Inflections overrides dictionary take precedence."""
    # Run the inflection system once to assure the lazily-loaded overrides
    # data is in place before we replace it.
    # Fix: the original line was `getInflection(...), ('watched', )` -- a
    # bare tuple expression that asserted nothing; wrap it in assertEqual.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
    # Fix: removed a run of assertLogs/getLemma assertions that was
    # accidentally pasted in from testUPOSLog (that test exists separately).
    # Hack the code to replace the overrides dictionary.
    orig_dict = lemminflect.Inflections().overrides_dict
    lemminflect.Inflections().overrides_dict = {'watch': {'VBD': ('xxx', )}}
    inflections = lemminflect.getInflection('watch', 'VBD', inflect_oov=False)
    self.assertEqual(inflections, ('xxx', ))
    # Put the original dictionary back so other tests are unaffected.
    lemminflect.Inflections().overrides_dict = orig_dict
def match_pronoun_present(verb_str: str, pronoun_str: str) -> str:
    """
    Returns a verb form that matches the passed pronoun.
    This function should only be used for present tense.

    Raises ValueError for an unknown pronoun; on ambiguous lemma or
    inflection lookups it logs a warning and returns the verb unchanged.
    """
    pronoun_str = pronoun_str.lower()
    if pronoun_str not in __pronoun_to_verb_upenn_dict:
        raise ValueError(
            'Unexpected value for pronoun "{}"'.format(pronoun_str))

    bare_verb, negation = split_verb_negation(verb_str)
    # Modal verbs do not inflect for person; keep them as-is.
    if is_modal_verb(bare_verb):
        return verb_str

    candidates = __collapse_lemma_list(getLemma(bare_verb, "VERB"))
    if len(candidates) != 1:
        logging.warning(
            'WARNING: Ambigous or no lemma for "{}". Output was {}. Keeping original verb.'
            .format(verb_str, candidates))
        return verb_str
    lemma = candidates[0]

    forms = getInflection(lemma, __pronoun_to_verb_upenn_dict[pronoun_str])
    if not forms or len(forms) > 2:
        logging.warning(
            'WARNING: Ambigous or no inflection list for lemma "{}" from verb "{}". Output was {}. Keeping original verb.'
            .format(lemma, verb_str, forms))
        return verb_str

    if len(forms) == 2:
        # Two forms: first is used with 'i', second otherwise -- presumably
        # the "am"/"is"-style split; confirm against lemminflect output.
        chosen = forms[0] if pronoun_str == 'i' else forms[1]
    else:
        chosen = forms[0]
    return merge_verb_negation(chosen, negation)
def _tokenize(self, text):
    """Tokenize *text* into wordpieces, replacing each inflected word with
    the wordpieces of its lemma followed by a POS marker token ('[VBD]').

    Fix: removed the commented-out debug ``print`` lines (one of which was
    duplicated); logic is otherwise unchanged.

    :param text: raw input text
    :return: list of wordpiece tokens with POS marker tokens interleaved
    """
    tokenized = self.cased_tokenizer.tokenize(
        text, never_split=self.all_special_tokens)
    ptb_pos_tagged = self.tagger.tag(tokenized)
    # (universal_)pos_tagged in the form of [(word, pos),(word, pos),...]
    universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                            for (token, tag) in ptb_pos_tagged]
    split_tokens = []
    for i, (word, pos) in enumerate(ptb_pos_tagged):
        if self.do_lower_case:
            word = word.lower()
        if universal_pos_tagged[i][
                1] in self.have_inflections and word not in (
                    string.punctuation + '—') and pos not in self.lemma_tags:
            # getLemma returns a tuple (lemma,); fall back to the surface
            # form when the first entry is empty/falsy.
            lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
            if not lemma:
                lemma = word
            wordpieced = self.wordpiece_tokenizer.tokenize(lemma)
            split_tokens.extend(wordpieced)
            split_tokens.append('[' + pos + ']')
        else:
            wordpieced = self.wordpiece_tokenizer.tokenize(word)
            split_tokens.extend(wordpieced)
    return split_tokens
def get_lemmas(word: str, pos: PartOfSpeech):
    """Return the lemma of *word* for *pos* plus all of its inflections.

    Fix: the original called ``getLemma`` a second time to read the first
    element; reuse the already-computed result instead.

    :param word: a single word (no spaces or periods)
    :param pos: part-of-speech category to lemmatize under
    :return: dict with 'lemma' and 'inflections', or a JSONResponse with an
        error message for invalid input
    """
    word = word.lower()
    if " " in word or "." in word:
        # NOTE(review): this is an input-validation failure yet is returned
        # with status 200; consider 400/422 -- kept as-is so existing
        # clients checking the status code are unaffected.
        return JSONResponse(
            status_code=200,
            content={"message": "Input must contain only a single word without spaces or punctuation."})
    # Get the basic lemma version of the word first; fall back to the word
    # itself when no lemma is found.
    lemmas = getLemma(word, pos)
    lemma = lemmas[0] if lemmas else word
    inflections = merge_inflections(getAllInflections(lemma, upos=pos),
                                    getAllInflectionsOOV(lemma, upos=pos))
    return {"lemma": lemma, "inflections": inflections}
def testProperNouns(self):
    """Proper nouns lemmatize under PROPN (and via OOV rules) but not NOUN."""
    # In-vocabulary proper noun: nothing under NOUN, one lemma under PROPN.
    noun_lemmas = lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
    self.assertEqual(len(noun_lemmas), 0)
    propn_lemmas = lemminflect.getLemma('Alaskans', 'PROPN', lemmatize_oov=False)
    self.assertEqual(len(propn_lemmas), 1)
    self.assertEqual(propn_lemmas[0], 'Alaskan')
    # Out-of-vocabulary word: OOV rules yield a lemma for either upos.
    for upos in ('NOUN', 'PROPN'):
        oov_lemmas = lemminflect.getLemma('Axxlaskans', upos, lemmatize_oov=True)
        self.assertEqual(len(oov_lemmas), 1)
        self.assertEqual(oov_lemmas[0], 'Axxlaskan')
    # Same checks through the spaCy token extension (token index 1).
    token = self.nlp('The Alaskans went South.')[1]
    self.assertEqual(token._.lemma(lemmatize_oov=False), 'Alaskan')
    token = self.nlp('The Axxlaskans went South.')[1]
    self.assertEqual(token._.lemma(lemmatize_oov=True), 'Axxlaskan')
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:
    """Return a counterbalanced subset of legal words for *tag*.

    Reads candidate words from ``configs.Dirs.legal_words / '<tag>.csv'``
    (columns 'word' and 'is_legal'), optionally pairs each word with a
    second form (plural for second_tag='NNP', or a verb inflection for
    second_tag starting with 'VB'), filters pairs so the second form is in
    ``vocab`` and differs from the first, then delegates subset selection
    to ``find_counterbalanced_subset``.

    :param tag: POS tag selecting the CSV of candidate words
    :param second_tag: if given, also counterbalance a second word form
    :param seed: random seed forwarded to find_counterbalanced_subset
    :param exclude: words to drop from the candidates before processing
    :param verbose: print each included (first, second) pair
    :return: list of words, or list of (word, second_form) pairs when
        second_tag is given -- exact shape depends on
        find_counterbalanced_subset (defined elsewhere)
    :raises AttributeError: for an unsupported second_tag value
    """
    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')
    # get words with requested tag and order
    df_legal = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    # boolean mask of rows flagged legal in the CSV
    bool_ids = df_legal['is_legal'].astype(bool).tolist()
    first_forms_ = df_legal['word'][bool_ids].tolist()
    # exclude any words ?
    if exclude:
        first_forms_ = [w for w in first_forms_ if w not in exclude]
    # also counterbalance 2nd forms of words ?
    if second_tag is None:
        second_forms_ = None
    elif second_tag == 'NNP':
        # pluralize via the `inflect` package
        plural = inflect.engine()
        second_forms_ = [plural.plural(w) for w in first_forms_]
    elif second_tag.startswith('VB'):
        # inflect to the requested verb tag; getInflection needs the lemma
        lemmas = [getLemma(w, upos='VERB')[0] for w in first_forms_]
        second_forms_ = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
    else:
        raise AttributeError('Invalid arg to second_tag')
    # remove words if their 2nd form is not in vocab or if it is identical to 1st form
    if second_tag is not None:
        first_forms = []
        second_forms = []
        for w1, w2 in zip(first_forms_, second_forms_):
            if w2 in vocab and w2 != w1:
                first_forms.append(w1)
                second_forms.append(w2)
                if verbose:
                    print(f'Included {w1:<12} and {w2:<12}')
        # fail loudly if filtering removed everything
        assert first_forms
        assert second_forms
    else:
        first_forms = first_forms_
        second_forms = second_forms_
    # find subset of words such that their total corpus frequencies are approx equal across corpora
    num_words_in_sample = configs.Data.tag2num_words[tag]
    res = find_counterbalanced_subset(first_forms,
                                      min_size=num_words_in_sample,
                                      max_size=num_words_in_sample+100,
                                      second_forms=second_forms,
                                      seed=seed,
                                      verbose=verbose,
                                      )
    return res
def testUPOSLog(self):
    """An invalid upos ('X') logs a warning and yields an empty result."""
    cases = ((lemminflect.getLemma, ()),
             (lemminflect.getAllLemmas, {}),
             (lemminflect.getAllLemmasOOV, {}))
    for func, expected_empty in cases:
        with self.assertLogs():
            result = func('WORD', 'X')
        self.assertEqual(result, expected_empty)
    # The spaCy extension falls back to the token text itself.
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')
def get_lemmas(self, word, tag=None, pos=None):
    """Return all lemmas of *word*, optionally restricted by PTB tag or upos.

    A PTB ``tag`` takes priority and is mapped to a upos first. When a upos
    is available only that category's lemmas are returned; otherwise lemmas
    from every category are collected.
    """
    if tag:
        # A PTB tag implies the upos category.
        pos = Inflector.tag_to_pos(tag)
    if pos:
        # getLemma returns a tuple of lemma strings for the given upos.
        return list(lemminflect.getLemma(word, upos=pos))
    # No pos available: gather lemmas across all upos categories.
    all_lemmas = []
    for forms in lemminflect.getAllLemmas(word).values():
        all_lemmas += list(forms)
    return all_lemmas
def api_getLemma():
    """Flask endpoint: lemmatize the word in the JSON request body.

    Expects a JSON object with 'word' and 'upos', plus an optional
    'lemmatize_oov' flag.

    Fix: 'lemmatize_oov' previously raised KeyError when absent; it now
    defaults to True, matching lemminflect.getLemma's own default, so
    existing clients that always send the key are unaffected.

    :return: JSON array of lemma strings
    """
    content = request.get_json()
    result = getLemma(content['word'], content['upos'],
                      content.get('lemmatize_oov', True))
    return jsonify(result)
def lemmatize_with_lemminflect(source):
    """Return a list mapping each word in *source* to its tuple of VERB
    lemmas from lemminflect."""
    return [getLemma(word, upos='VERB') for word in source]
def getLemma(self, entry, upos):
    """Return the first lemma of ``entry.infl`` for *upos*, or an empty
    tuple when lemminflect finds none.

    NOTE(review): the success path returns a bare string while the failure
    path returns () -- callers appear to rely on this mixed return type.
    """
    lemmas = lemminflect.getLemma(entry.infl, upos)
    return lemmas[0] if lemmas else ()
def runGetLemmaTests(self, tests):
    """Run a batch of lemmatization checks.

    Each test is a (expected_base, upos, inflected_form) triple; the
    expected base must appear among the lemmas returned for the form.
    """
    for expected_base, upos, form in tests:
        lemmas = lemminflect.getLemma(form, upos)
        self.assertTrue(expected_base in set(lemmas),
                        msg='base=%s lemmas=%s' % (expected_base, str(lemmas)))
def get_stem(token):
    """Return the VERB lemma of *token* via lemminflect.

    Fixes: guards against getLemma returning an empty tuple (the original
    indexed [0] unconditionally and could raise IndexError), falling back
    to the token itself; removed commented-out dead code (spaCy lemma_ and
    PorterStemmer alternatives).
    """
    lemmas = lemminflect.getLemma(token, "VERB")
    return lemmas[0] if lemmas else token
def checkAuxLemmas(self, lemma, infls):
    """Assert that every inflection in *infls* resolves to exactly one AUX
    lemma, and that this lemma equals *lemma*."""
    for inflection in infls:
        result = lemminflect.getLemma(inflection, 'AUX')
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], lemma)