Example #1
 def testOverrides(self):
     # Run the lemmatizer once to ensure the overrides dictionary is loaded (it is lazy-loaded)
     lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
     # Hack the code to replace the overrides dictionary
     # (Lemmatizer() is a singleton, so each call returns the same instance)
     orig_dict = lemminflect.Lemmatizer().overrides_dict
     lemminflect.Lemmatizer().overrides_dict = {
         'waltzes': {
             'VERB': ('xxx', )
         }
     }
     lemmas = lemminflect.getLemma('waltzes', 'VERB', lemmatize_oov=False)
     self.assertEqual(lemmas, ('xxx', ))
     # put the original dictionary back
     lemminflect.Lemmatizer().overrides_dict = orig_dict
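For context, a minimal sketch of the public lemmatization API this test exercises; 'xxwatches' is just an invented out-of-vocabulary word:

import lemminflect

# In-vocabulary lookup returns a tuple of candidate lemmas
print(lemminflect.getLemma('watches', upos='VERB'))  # ('watch',)
# With lemmatize_oov=False an unknown word yields an empty tuple;
# the default lemmatize_oov=True falls back to rule-based guessing
print(lemminflect.getLemma('xxwatches', upos='VERB', lemmatize_oov=False))  # ()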
Example #2
 def tokenize(self,
              sentence: Union[str, List[str]],
              pretokenize: bool = True,
              map_to_single_char: bool = False) -> List[str]:
     if pretokenize:
         pretokenized = self._pretokenize(sentence)
     else:
         # Allow users to pass in a list of tokens if using custom pretokenizers
         pretokenized = sentence
     ptb_pos_tagged = self.tagger.tag(pretokenized)
     universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                             for (token, tag) in ptb_pos_tagged]
     tokenized = []
     for i, (word, pos) in enumerate(ptb_pos_tagged):
         if (universal_pos_tagged[i][1] in self.have_inflections
                 and word not in (string.punctuation + '—')
                 and pos not in self.lemma_tags):
             lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
             if not lemma:
                 lemma = word
             tokenized.append(lemma)
             tokenized.append('[' + pos + ']')
         else:
             tokenized.append(word)
     if map_to_single_char:
         tokenized = [
             self.single_char_map[token]
             if token in self.inflection_tokens else token
             for token in tokenized
         ]
     return tokenized
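The lemma-plus-POS-marker idea above can be sketched in isolation. This is a simplified standalone version, assuming NLTK with its 'averaged_perceptron_tagger' and 'universal_tagset' data installed; the have_inflections set is approximated here with the four open word classes:

import nltk
from nltk.tag.mapping import map_tag
from lemminflect import getLemma

HAVE_INFLECTIONS = {'NOUN', 'VERB', 'ADJ', 'ADV'}

def lemma_pos_tokens(words):
    out = []
    for word, tag in nltk.pos_tag(words):  # Penn Treebank tags
        upos = map_tag('en-ptb', 'universal', tag)
        if upos in HAVE_INFLECTIONS:
            lemmas = getLemma(word, upos=upos)
            out.append(lemmas[0] if lemmas else word)
            out.append('[' + tag + ']')  # keep the fine-grained tag as a marker token
        else:
            out.append(word)
    return out

# lemma_pos_tokens(['She', 'walked', 'home']) might yield
# ['She', 'walk', '[VBD]', 'home', '[NN]'], depending on the tagger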
Example #3
def tense_of_verb(verb_str):
    """
    Identifies the verb tense of a word, and returns it in a tuple along with its base word.
    @param verb_str: a str containing a verb
    @return: a tuple t, where t[0] is 'AUX' if the verb is a special auxiliary verb, 
        is '?' if the verb tense cannot be recognized, and otherwise is 'VBD', 'VBP', or 'VBZ',
        which correspond to the Penn Treebank P.O.S. tags for past tense, non-3rd person present 
        tense, and 3rd person present tense.
    """
    aux_verbs = [
        'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
        'can', 'could', 'ought'
    ]
    if verb_str.lower() in aux_verbs:
        return ('AUX', verb_str)
    lemm_str = getLemma(verb_str, upos='VERB')[0]
    if verb_str in getInflection(lemm_str, tag='VBD'):
        return ('VBD', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBP'):
        return ('VBP', lemm_str)
    elif verb_str in getInflection(lemm_str, tag='VBZ'):
        return ('VBZ', lemm_str)
    else:
        return ('?', lemm_str)
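Assuming lemminflect's standard data, the helper above should behave roughly as follows:

print(tense_of_verb('walked'))  # expected: ('VBD', 'walk')
print(tense_of_verb('walks'))   # expected: ('VBZ', 'walk')
print(tense_of_verb('is'))      # expected: ('AUX', 'is')
print(tense_of_verb('banana'))  # likely ('?', ...) since no verb inflection matches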
Example #4
 def testOverrides(self):
     # Run the inflection system once to ensure the overrides dictionary is loaded (it is lazy-loaded)
     self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
     # Hack the code to replace the overrides dictionary
     orig_dict = lemminflect.Inflections().overrides_dict
     lemminflect.Inflections().overrides_dict = {
         'watch': {
             'VBD': ('xxx', )
         }
     }
     inflections = lemminflect.getInflection('watch',
                                             'VBD',
                                             inflect_oov=False)
     self.assertEqual(inflections, ('xxx', ))
     # put the original dictionary back
     lemminflect.Inflections().overrides_dict = orig_dict
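The inflection half of the API mirrors the lemmatizer; a quick sketch, with 'xxwatch' as an invented out-of-vocabulary lemma:

import lemminflect

print(lemminflect.getInflection('watch', tag='VBD'))  # ('watched',)
# inflect_oov=False disables rule-based guessing for unknown lemmas
print(lemminflect.getInflection('xxwatch', tag='VBD', inflect_oov=False))  # ()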
Example #5
def match_pronoun_present(verb_str: str, pronoun_str: str) -> str:
    """
    Returns a verb form that matches the passed pronoun.
    This function should only be used for present tense
    """
    pronoun_str = pronoun_str.lower()
    if pronoun_str not in __pronoun_to_verb_upenn_dict:
        raise ValueError(
            'Unexpected value for pronoun "{}"'.format(pronoun_str))
    aff_verb_str, negation_str = split_verb_negation(verb_str)
    if is_modal_verb(aff_verb_str):
        return verb_str
    lemma_lst = getLemma(aff_verb_str, "VERB")
    lemma_lst = __collapse_lemma_list(lemma_lst)
    if len(lemma_lst) != 1:
        logging.warning(
            'WARNING: Ambiguous or no lemma for "{}". Output was {}. Keeping original verb.'
            .format(verb_str, lemma_lst))
        return verb_str
    lemma_str = lemma_lst[0]
    inflect_lst = getInflection(lemma_str,
                                __pronoun_to_verb_upenn_dict[pronoun_str])
    if len(inflect_lst) > 2 or not len(inflect_lst):
        logging.warning(
            'WARNING: Ambiguous or no inflection list for lemma "{}" from verb "{}". Output was {}. Keeping original verb.'
            .format(lemma_str, verb_str, inflect_lst))
        return verb_str
    elif len(inflect_lst) == 2:
        if pronoun_str == 'i':
            new_verb_str = inflect_lst[0]
        else:
            new_verb_str = inflect_lst[1]
    else:
        new_verb_str = inflect_lst[0]
    return merge_verb_negation(new_verb_str, negation_str)
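The len(inflect_lst) == 2 branch handles 'be', whose present-tense inflection list carries two forms; a quick check (the exact tuples depend on the lemminflect version):

from lemminflect import getInflection

print(getInflection('be', tag='VBP'))  # typically ('am', 'are'); index 0 matches 'I'
print(getInflection('be', tag='VBZ'))  # ('is',)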
Example #6
 def _tokenize(self, text):
     tokenized = self.cased_tokenizer.tokenize(
         text, never_split=self.all_special_tokens)
     #print(tokenized)
     ptb_pos_tagged = self.tagger.tag(tokenized)
     #print(ptb_pos_tagged)
     universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                             for (token, tag) in ptb_pos_tagged]
     #print(universal_pos_tagged)
     split_tokens = []
     for i, (word, pos) in enumerate(ptb_pos_tagged):
         if self.do_lower_case:
             word = word.lower()
         if (universal_pos_tagged[i][1] in self.have_inflections
                 and word not in (string.punctuation + '—')
                 and pos not in self.lemma_tags):
             # (universal_)pos_tagged in the form of [(word, pos),(word, pos),...]
             # getLemma returns a tuple (lemma,)
             lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
             if not lemma:
                 lemma = word
             wordpieced = self.wordpiece_tokenizer.tokenize(lemma)
             #print(wordpieced)
             split_tokens.extend(wordpieced)
             split_tokens.append('[' + pos + ']')
         else:
             wordpieced = self.wordpiece_tokenizer.tokenize(word)
             split_tokens.extend(wordpieced)
     return split_tokens
Example #7
def get_lemmas(word: str, pos: PartOfSpeech):
    word = word.lower()

    if (" " in word or "." in word):
        return JSONResponse (status_code = 200, content = {"message": "Input must contain only a single word without spaces or punctuation."})

    # Get the basic lemma version of the word first
    lemmas = getLemma(word, pos)
    if len(lemmas) > 0:
        lemma = getLemma(word, pos)[0]
    else:
        lemma = word

    inflections = merge_inflections(getAllInflections(lemma, upos=pos), getAllInflectionsOOV(lemma, upos=pos))
    
    return {"lemma": lemma, "inflections": inflections}
    
Example #8
 def testProperNouns(self):
     lemmas = lemminflect.getLemma('Alaskans', 'NOUN', lemmatize_oov=False)
     self.assertEqual(len(lemmas), 0)
     lemmas = lemminflect.getLemma('Alaskans', 'PROPN', lemmatize_oov=False)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Alaskan')
     lemmas = lemminflect.getLemma('Axxlaskans', 'NOUN', lemmatize_oov=True)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Axxlaskan')
     lemmas = lemminflect.getLemma('Axxlaskans',
                                   'PROPN',
                                   lemmatize_oov=True)
     self.assertEqual(len(lemmas), 1)
     self.assertEqual(lemmas[0], 'Axxlaskan')
     token = self.nlp('The Alaskans went South.')[1]
     self.assertEqual(token._.lemma(lemmatize_oov=False), 'Alaskan')
     token = self.nlp('The Axxlaskans went South.')[1]
     self.assertEqual(token._.lemma(lemmatize_oov=True), 'Axxlaskan')
Example #9
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:

    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')

    # get words with requested tag and order
    df_legal = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    bool_ids = df_legal['is_legal'].astype(bool).tolist()
    first_forms_ = df_legal['word'][bool_ids].tolist()

    # exclude any words?
    if exclude:
        first_forms_ = [w for w in first_forms_ if w not in exclude]

    # also counterbalance 2nd forms of words?
    if second_tag is None:
        second_forms_ = None
    elif second_tag == 'NNP':
        plural = inflect.engine()
        second_forms_ = [plural.plural(w) for w in first_forms_]
    elif second_tag.startswith('VB'):
        lemmas = [getLemma(w, upos='VERB')[0] for w in first_forms_]
        second_forms_ = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
    else:
        raise AttributeError('Invalid arg to second_tag')

    # remove words if their 2nd form is not in vocab or if it is identical to 1st form
    if second_tag is not None:
        first_forms = []
        second_forms = []
        for w1, w2 in zip(first_forms_, second_forms_):
            if w2 in vocab and w2 != w1:
                first_forms.append(w1)
                second_forms.append(w2)
                if verbose:
                    print(f'Included {w1:<12} and {w2:<12}')
        assert first_forms
        assert second_forms
    else:
        first_forms = first_forms_
        second_forms = second_forms_

    # find subset of words such that their total corpus frequencies are approx equal across corpora
    num_words_in_sample = configs.Data.tag2num_words[tag]
    res = find_counterbalanced_subset(first_forms,
                                      min_size=num_words_in_sample,
                                      max_size=num_words_in_sample+100,
                                      second_forms=second_forms,
                                      seed=seed,
                                      verbose=verbose,
                                      )

    return res
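The VB* branch works in two steps because getInflection expects a lemma rather than a surface form; in isolation:

from lemminflect import getLemma, getInflection

lemma = getLemma('running', upos='VERB')[0]  # 'run'
print(getInflection(lemma, tag='VBZ'))       # ('runs',)
# Passing the surface form directly, e.g. getInflection('running', tag='VBZ'),
# would not reliably produce the intended third-person form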
Example #10
 def testUPOSLog(self):
     with self.assertLogs():
         lemmas = lemminflect.getLemma('WORD', 'X')
     self.assertEqual(lemmas, ())
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmas('WORD', 'X')
     self.assertEqual(lemmas, {})
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
     self.assertEqual(lemmas, {})
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
Example #11
    def get_lemmas(self, word, tag=None, pos=None):

        lemmas = []

        if tag:
            # infer pos from tag
            pos = Inflector.tag_to_pos(tag)

        if pos:
            # getLemma returns a tuple of candidate lemmas for the given universal POS
            lemmas = list(lemminflect.getLemma(word, upos=pos))
        else:
            # no pos provided, return lemmas for all parts of speech
            lemma_dict = lemminflect.getAllLemmas(word)
            for i in lemma_dict.values():
                lemmas += list(i)

        return lemmas
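For reference, getAllLemmas returns a dict keyed by universal POS, which is what the else branch flattens; for example (exact contents depend on the lemminflect data):

import lemminflect

print(lemminflect.getAllLemmas('watches'))
# e.g. {'NOUN': ('watch',), 'VERB': ('watch',)}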
Example #12
def api_getLemma():
    content = request.get_json()
    result = getLemma(content['word'], content['upos'],
                      content['lemmatize_oov'])
    return jsonify(result)
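The route decorator is not shown above; assuming the handler is registered at a hypothetical /getLemma endpoint, a client call could look like this:

import requests

resp = requests.post('http://localhost:5000/getLemma',  # hypothetical URL
                     json={'word': 'watches', 'upos': 'VERB', 'lemmatize_oov': True})
print(resp.json())  # e.g. ['watch'], since jsonify renders the tuple as a JSON array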
Example #13
def lemmatize_with_lemminflect(source):
    # getLemma returns a tuple of candidate lemmas, so `lemmed`
    # is a list of tuples, one per input word
    lemmed = []
    for w in source:
        lemmed.append(getLemma(w, upos='VERB'))
    return lemmed
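Because each element of the result is a tuple of candidates, a variant that keeps only the first lemma (falling back to the original word, an assumed policy) could be:

def lemmatize_first(source):
    # Keep the most likely lemma per word; fall back to the word itself
    return [(getLemma(w, upos='VERB') or (w,))[0] for w in source]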
Example #14
 def getLemma(self, entry, upos):
     lemmas = lemminflect.getLemma(entry.infl, upos)
     if not lemmas:
         return ()
     # Return only the first (most likely) lemma
     return lemmas[0]
Example #15
 def runGetLemmaTests(self, tests):
     for test in tests:
         base, upos, form = test
         lemmas = lemminflect.getLemma(form, upos)
         self.assertTrue(base in set(lemmas),
                         msg='base=%s  lemmas=%s' % (base, str(lemmas)))
Example #16
def get_stem(token):
    # return nlp(token)[0].lemma_
    # return ps.stem(token)
    return lemminflect.getLemma(token, "VERB")[0]
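The [0] index raises IndexError whenever getLemma returns an empty tuple; a defensive variant, assuming falling back to the raw token is acceptable:

def get_stem_safe(token):
    lemmas = lemminflect.getLemma(token, 'VERB')
    return lemmas[0] if lemmas else token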
Example #17
 def checkAuxLemmas(self, lemma, infls):
     for infl in infls:
         lemmas = lemminflect.getLemma(infl, 'AUX')
         self.assertEqual(len(lemmas), 1)
         self.assertEqual(lemmas[0], lemma)
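The assertions reflect that every auxiliary inflection collapses to a single lemma; for instance:

import lemminflect

for infl in ('am', 'is', 'was', 'were', 'been'):
    print(infl, lemminflect.getLemma(infl, 'AUX'))  # each should print ('be',)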