Example #1
 def testOverrides(self):
     # Run the lemmatizer once to ensure the overrides dictionary is loaded (it is lazy-loaded)
     lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
     # Hack the code to replace the overrides dictionary
     # (Lemmatizer() is a singleton, so each call returns the same instance)
     orig_dict = lemminflect.Lemmatizer().overrides_dict
     lemminflect.Lemmatizer().overrides_dict = {
         'waltzes': {
             'VERB': ('xxx', )
         }
     }
     lemmas = lemminflect.getLemma('waltzes', 'VERB', lemmatize_oov=False)
     self.assertEqual(lemmas, ('xxx', ))
     # put the original dictionary back
     lemminflect.Lemmatizer().overrides_dict = orig_dict
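For context, a minimal sketch of the public lemmatization API this test exercises; 'xxwatches' is just an invented out-of-vocabulary word:

import lemminflect

# In-vocabulary lookup returns a tuple of candidate lemmas
print(lemminflect.getLemma('watches', upos='VERB'))  # ('watch',)
# With lemmatize_oov=False an unknown word yields an empty tuple;
# the default lemmatize_oov=True falls back to rule-based guessing
print(lemminflect.getLemma('xxwatches', upos='VERB', lemmatize_oov=False))  # ()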
Example #2
 def tokenize(self,
              sentence: Union[str, List[str]],
              pretokenize: bool = True,
              map_to_single_char: bool = False) -> List[str]:
     if pretokenize:
         pretokenized = self._pretokenize(sentence)
     else:
         # Allow users to pass in a list of tokens if using custom pretokenizers
         pretokenized = sentence
     ptb_pos_tagged = self.tagger.tag(pretokenized)
     universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                             for (token, tag) in ptb_pos_tagged]
     tokenized = []
     for i, (word, pos) in enumerate(ptb_pos_tagged):
         if (universal_pos_tagged[i][1] in self.have_inflections
                 and word not in (string.punctuation + '—')
                 and pos not in self.lemma_tags):
             lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
             if not lemma:
                 lemma = word
             tokenized.append(lemma)
             tokenized.append('[' + pos + ']')
         else:
             tokenized.append(word)
     if map_to_single_char:
         tokenized = [
             self.single_char_map[token]
             if token in self.inflection_tokens else token
             for token in tokenized
         ]
     return tokenized
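The lemma-plus-POS-marker idea above can be sketched in isolation. This is a simplified standalone version, assuming NLTK with its 'averaged_perceptron_tagger' and 'universal_tagset' data installed; the have_inflections set is approximated here with the four open word classes:

import nltk
from nltk.tag.mapping import map_tag
from lemminflect import getLemma

HAVE_INFLECTIONS = {'NOUN', 'VERB', 'ADJ', 'ADV'}

def lemma_pos_tokens(words):
    out = []
    for word, tag in nltk.pos_tag(words):  # Penn Treebank tags
        upos = map_tag('en-ptb', 'universal', tag)
        if upos in HAVE_INFLECTIONS:
            lemmas = getLemma(word, upos=upos)
            out.append(lemmas[0] if lemmas else word)
            out.append('[' + tag + ']')  # keep the fine-grained tag as a marker token
        else:
            out.append(word)
    return out

# lemma_pos_tokens(['She', 'walked', 'home']) might yield
# ['She', 'walk', '[VBD]', 'home', '[NN]'], depending on the tagger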
Example #3
def tense_of_verb(verb_str):
    """
    Identifies the verb tense of a word, and returns it in a tuple along with its base word.
    @param verb_str: a str containing a verb
    @return: a tuple t, where t[0] is 'AUX' if the verb is a special auxiliary verb, 
        is '?' if the verb tense cannot be recognized, and otherwise is 'VBD', 'VBP', or 'VBZ',
        which correspond to the Penn Treebank P.O.S. tags for past tense, non-3rd person present 
        tense, and 3rd person present tense.
    """
    aux_verbs = [
        'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
        'can', 'could', 'ought'
    ]
    if verb_str.lower() in aux_verbs:
        return ('AUX', verb_str)
    lemm_str = getLemma(verb_str, upos='VERB')[0]
    if verb_str in getInflection(lemm_str, tag='VBD'):
        return ('VBD', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBP'):
        return ('VBP', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBZ'):
        return ('VBZ', lemm_str)
    else:
        return ('?', lemm_str)
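Assuming lemminflect's standard data, the helper above should behave roughly as follows:

print(tense_of_verb('walked'))  # expected: ('VBD', 'walk')
print(tense_of_verb('walks'))   # expected: ('VBZ', 'walk')
print(tense_of_verb('is'))      # expected: ('AUX', 'is')
print(tense_of_verb('banana'))  # likely ('?', ...) since no verb inflection matches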
Example #4
 def testOverrides(self):
     # Run the inflection system once to ensure the overrides dictionary is loaded (it is lazy-loaded)
     self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
     # Hack the code to replace the overrides dictionary
     orig_dict = lemminflect.Inflections().overrides_dict
     lemminflect.Inflections().overrides_dict = {
         'watch': {
             'VBD': ('xxx', )
         }
     }
     inflections = lemminflect.getInflection('watch',
                                             'VBD',
                                             inflect_oov=False)
     self.assertEqual(inflections, ('xxx', ))
     # put the original dictionary back
     lemminflect.Inflections().overrides_dict = orig_dict
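The inflection half of the API mirrors the lemmatizer; a quick sketch, with 'xxwatch' as an invented out-of-vocabulary lemma:

import lemminflect

print(lemminflect.getInflection('watch', tag='VBD'))  # ('watched',)
# inflect_oov=False disables rule-based guessing for unknown lemmas
print(lemminflect.getInflection('xxwatch', tag='VBD', inflect_oov=False))  # ()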
Example #5
def match_pronoun_present(verb_str: str, pronoun_str: str) -> str:
    """
    Returns a verb form that matches the passed pronoun.
    This function should only be used for present tense
    """
    pronoun_str = pronoun_str.lower()
    if pronoun_str not in __pronoun_to_verb_upenn_dict:
        raise ValueError(
            'Unexpected value for pronoun "{}"'.format(pronoun_str))
    aff_verb_str, negation_str = split_verb_negation(verb_str)
    if is_modal_verb(aff_verb_str):
        return verb_str
    lemma_lst = getLemma(aff_verb_str, "VERB")
    lemma_lst = __collapse_lemma_list(lemma_lst)
    if len(lemma_lst) != 1:
        logging.warning(
            'WARNING: Ambiguous or no lemma for "{}". Output was {}. Keeping original verb.'
            .format(verb_str, lemma_lst))
        return verb_str
    lemma_str = lemma_lst[0]
    inflect_lst = getInflection(lemma_str,
                                __pronoun_to_verb_upenn_dict[pronoun_str])
    if len(inflect_lst) > 2 or not len(inflect_lst):
        logging.warning(
            'WARNING: Ambiguous or no inflection list for lemma "{}" from verb "{}". Output was {}. Keeping original verb.'
            .format(lemma_str, verb_str, inflect_lst))
        return verb_str
    elif len(inflect_lst) == 2:
        if pronoun_str == 'i':
            new_verb_str = inflect_lst[0]
        else:
            new_verb_str = inflect_lst[1]
    else:
        new_verb_str = inflect_lst[0]
    return merge_verb_negation(new_verb_str, negation_str)
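The len(inflect_lst) == 2 branch handles 'be', whose present-tense inflection list carries two forms; a quick check (the exact tuples depend on the lemminflect version):

from lemminflect import getInflection

print(getInflection('be', tag='VBP'))  # typically ('am', 'are'); index 0 matches 'I'
print(getInflection('be', tag='VBZ'))  # ('is',)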
Example #6
 def _tokenize(self, text):
     tokenized = self.cased_tokenizer.tokenize(
         text, never_split=self.all_special_tokens)
     #print(tokenized)
     ptb_pos_tagged = self.tagger.tag(tokenized)
     #print(ptb_pos_tagged)
     universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                             for (token, tag) in ptb_pos_tagged]
     #print(universal_pos_tagged)
     split_tokens = []
     for i, (word, pos) in enumerate(ptb_pos_tagged):
         if self.do_lower_case:
             word = word.lower()
         if (universal_pos_tagged[i][1] in self.have_inflections
                 and word not in (string.punctuation + '—')
                 and pos not in self.lemma_tags):
             # (universal_)pos_tagged in the form of [(word, pos),(word, pos),...]
             # getLemma returns a tuple (lemma,)
             lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
             if not lemma:
                 lemma = word
             wordpieced = self.wordpiece_tokenizer.tokenize(lemma)
             #print(wordpieced)
             split_tokens.extend(wordpieced)
             split_tokens.append('[' + pos + ']')
         else:
             wordpieced = self.wordpiece_tokenizer.tokenize(word)
             split_tokens.extend(wordpieced)
     return split_tokens
Example #7
def get_lemmas(word: str, pos: PartOfSpeech):
    word = word.lower()

    if (" " in word or "." in word):
        return JSONResponse (status_code = 200, content = {"message": "Input must contain only a single word without spaces or punctuation."})

    # Get the basic lemma version of the word first
    lemmas = getLemma(word, pos)
    if len(lemmas) > 0:
        lemma = getLemma(word, pos)[0]
    else:
        lemma = word

    inflections = merge_inflections(getAllInflections(lemma, upos=pos), getAllInflectionsOOV(lemma, upos=pos))
    
    return {"lemma": lemma, "inflections": inflections}
    
Example #8
 def testProperNouns(self):
     lemmas = lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
     self.assertEqual(len(lemmas), 0)
     lemmas = lemminflect.getLemma('Alaskans', 'PROPN', lemmatize_oov=False)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Alaskan')
     lemmas = lemminflect.getLemma('Axxlaskans', 'NOUN', lemmatize_oov=True)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Axxlaskan')
     lemmas = lemminflect.getLemma('Axxlaskans',
                                   'PROPN',
                                   lemmatize_oov=True)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Axxlaskan')
     token = self.nlp('The Alaskans went South.')[1]
     self.assertEqual(token._.lemma(lemmatize_oov=False), 'Alaskan')
     token = self.nlp('The Axxlaskans went South.')[1]
     self.assertEqual(token._.lemma(lemmatize_oov=True), 'Axxlaskan')
Example #9
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:

    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')

    # get words with requested tag and order
    df_legal = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    bool_ids = df_legal['is_legal'].astype(bool).tolist()
    first_forms_ = df_legal['word'][bool_ids].tolist()

    # exclude any words?
    if exclude:
        first_forms_ = [w for w in first_forms_ if w not in exclude]

    # also counterbalance 2nd forms of words?
    if second_tag is None:
        second_forms_ = None
    elif second_tag == 'NNP':
        plural = inflect.engine()
        second_forms_ = [plural.plural(w) for w in first_forms_]
    elif second_tag.startswith('VB'):
        lemmas = [getLemma(w, upos='VERB')[0] for w in first_forms_]
        second_forms_ = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
    else:
        raise AttributeError('Invalid arg to second_tag')

    # remove words if their 2nd form is not in vocab or if it is identical to 1st form
    if second_tag is not None:
        first_forms = []
        second_forms = []
        for w1, w2 in zip(first_forms_, second_forms_):
            if w2 in vocab and w2 != w1:
                first_forms.append(w1)
                second_forms.append(w2)
                if verbose:
                    print(f'Included {w1:<12} and {w2:<12}')
        assert first_forms
        assert second_forms
    else:
        first_forms = first_forms_
        second_forms = second_forms_

    # find subset of words such that their total corpus frequencies are approx equal across corpora
    num_words_in_sample = configs.Data.tag2num_words[tag]
    res = find_counterbalanced_subset(first_forms,
                                      min_size=num_words_in_sample,
                                      max_size=num_words_in_sample+100,
                                      second_forms=second_forms,
                                      seed=seed,
                                      verbose=verbose,
                                      )

    return res
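The VB* branch works in two steps because getInflection expects a lemma rather than a surface form; in isolation:

from lemminflect import getLemma, getInflection

lemma = getLemma('running', upos='VERB')[0]  # 'run'
print(getInflection(lemma, tag='VBZ'))       # ('runs',)
# Passing the surface form directly, e.g. getInflection('running', tag='VBZ'),
# would not reliably produce the intended third-person form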
Example #10
 def testUPOSLog(self):
     with self.assertLogs():
         lemmas = lemminflect.getLemma('WORD', 'X')
     self.assertEqual(lemmas, ())
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmas('WORD', 'X')
     self.assertEqual(lemmas, {})
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
     self.assertEqual(lemmas, {})
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
Example #11
    def get_lemmas(self, word, tag=None, pos=None):

        lemmas = []

        if tag:
            # infer pos from tag
            pos = Inflector.tag_to_pos(tag)

        if pos:
            # getLemma returns a tuple of candidate lemmas for the given universal POS
            lemmas = list(lemminflect.getLemma(word, upos=pos))
        else:
            # no pos provided, return lemmas for all parts of speech
            lemma_dict = lemminflect.getAllLemmas(word)
            for i in lemma_dict.values():
                lemmas += list(i)

        return lemmas
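For reference, getAllLemmas returns a dict keyed by universal POS, which is what the else branch flattens; for example (exact contents depend on the lemminflect data):

import lemminflect

print(lemminflect.getAllLemmas('watches'))
# e.g. {'NOUN': ('watch',), 'VERB': ('watch',)}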
Example #12
def api_getLemma():
    content = request.get_json()
    result = getLemma(content['word'], content['upos'],
                      content['lemmatize_oov'])
    return jsonify(result)
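The route decorator is not shown above; assuming the handler is registered at a hypothetical /getLemma endpoint, a client call could look like this:

import requests

resp = requests.post('http://localhost:5000/getLemma',  # hypothetical URL
                     json={'word': 'watches', 'upos': 'VERB', 'lemmatize_oov': True})
print(resp.json())  # e.g. ['watch'], since jsonify renders the tuple as a JSON array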
Example #13
def lemmatize_with_lemminflect(source):
    # getLemma returns a tuple of candidate lemmas, so `lemmed`
    # is a list of tuples, one per input word
    lemmed = []
    for w in source:
        lemmed.append(getLemma(w, upos='VERB'))
    return lemmed
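Because each element of the result is a tuple of candidates, a variant that keeps only the first lemma (falling back to the original word, an assumed policy) could be:

def lemmatize_first(source):
    # Keep the most likely lemma per word; fall back to the word itself
    return [(getLemma(w, upos='VERB') or (w,))[0] for w in source]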
Example #14
 def getLemma(self, entry, upos):
     lemmas = lemminflect.getLemma(entry.infl, upos)
     if not lemmas:
         return ()
     # Return only the first (most likely) lemma
     return lemmas[0]
Example #15
 def runGetLemmaTests(self, tests):
     for test in tests:
         base, upos, form = test
         lemmas = lemminflect.getLemma(form, upos)
         self.assertTrue(base in set(lemmas),
                         msg='base=%s  lemmas=%s' % (base, str(lemmas)))
Example #16
def get_stem(token):
    # return nlp(token)[0].lemma_
    # return ps.stem(token)
    return lemminflect.getLemma(token, "VERB")[0]
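The [0] index raises IndexError whenever getLemma returns an empty tuple; a defensive variant, assuming falling back to the raw token is acceptable:

def get_stem_safe(token):
    lemmas = lemminflect.getLemma(token, 'VERB')
    return lemmas[0] if lemmas else token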
Example #17
 def checkAuxLemmas(self, lemma, infls):
     for infl in infls:
         lemmas = lemminflect.getLemma(infl, 'AUX')
         self.assertEqual(len(lemmas), 1)
         self.assertEqual(lemmas[0], lemma)
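The assertions reflect that every auxiliary inflection collapses to a single lemma; for instance:

import lemminflect

for infl in ('am', 'is', 'was', 'were', 'been'):
    print(infl, lemminflect.getLemma(infl, 'AUX'))  # each should print ('be',)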