Exemplo n.º 1
0
def tense_of_verb(verb_str):
    """
    Classify a verb form by tense and pair the label with a base word.

    @param verb_str: a str containing a verb
    @return: a tuple (label, word). For one of the special auxiliary verbs
        (matched case-insensitively) the result is ('AUX', verb_str), i.e. the
        verb is handed back unchanged. Otherwise word is the first lemma
        reported by getLemma, and label is the first of 'VBD', 'VBP', 'VBZ'
        (the Penn Treebank P.O.S. tags for past tense, non-3rd person present
        tense and 3rd person present tense) whose inflections of that lemma
        contain verb_str, or '?' if none of them do.
    """
    special_auxiliaries = (
        'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
        'can', 'could', 'ought',
    )
    if verb_str.lower() in special_auxiliaries:
        return ('AUX', verb_str)

    base = getLemma(verb_str, upos='VERB')[0]
    # Tags are probed in this fixed order; the first one that matches wins.
    for penn_tag in ('VBD', 'VBP', 'VBZ'):
        if verb_str in getInflection(base, tag=penn_tag):
            return (penn_tag, base)
    return ('?', base)
Exemplo n.º 2
0
 def testProperNouns(self):
     """Proper nouns inflect only under the NNP/NNPS tags, in and out of vocabulary."""
     # (lemma, tag, inflect_oov, expected forms)
     direct_cases = (
         ('Alaskan', 'NN', False, []),
         ('Alaskan', 'NNP', False, ['Alaskan']),
         ('Alaskan', 'NNPS', False, ['Alaskans']),
         ('Axxlaskan', 'NNP', True, ['Axxlaskan']),
         ('Axxlaskan', 'NNPS', True, ['Axxlaskans']),
     )
     for lemma, tag, oov, expected in direct_cases:
         forms = lemminflect.getInflection(lemma, tag, inflect_oov=oov)
         # Comparing as lists checks both the length and the first element.
         self.assertEqual(list(forms), expected)

     # lemmatize with lemminflect
     lemminflect.Inflections().setUseInternalLemmatizer(True)

     # (sentence, inflect_oov, expected plural of the second token)
     token_cases = (
         ('The Alaskan went South.', False, 'Alaskans'),
         ('The Axxlaskan went South.', True, 'Axxlaskans'),
     )
     for sentence, oov, expected in token_cases:
         token = self.nlp(sentence)[1]
         self.assertEqual(token._.inflect('NNPS', inflect_oov=oov), expected)
Exemplo n.º 3
0
 def testOverrides(self):
     """
     Check unknown-UPOS lemma lookups and the inflection overrides dictionary.

     The overrides dictionary of the shared Inflections object is replaced
     temporarily; it is restored in a ``finally`` clause so that a failing
     assertion cannot leave the replaced dictionary in place for other tests.
     """
     # Run the inflection system once to make sure the overrides are loaded
     # (they are lazily loaded).
     lemminflect.getInflection('watch', 'VBD')
     orig_dict = lemminflect.Inflections().overrides_dict

     # An unknown UPOS logs a message and yields an empty result.
     with self.assertLogs():
         lemmas = lemminflect.getLemma('WORD', 'X')
     self.assertEqual(lemmas, ())
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmas('WORD', 'X')
     self.assertEqual(lemmas, {})
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
     self.assertEqual(lemmas, {})
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')

     # Hack the code to replace the overrides dictionary.
     lemminflect.Inflections().overrides_dict = {
         'watch': {
             'VBD': ('xxx', )
         }
     }
     try:
         inflections = lemminflect.getInflection('watch',
                                                 'VBD',
                                                 inflect_oov=False)
         self.assertEqual(inflections, ('xxx', ))
     finally:
         # Put the original dictionary back, even if the assertion failed.
         lemminflect.Inflections().overrides_dict = orig_dict
Exemplo n.º 4
0
 def testGetInflection04(self):
     """'watch' has no adjective forms unless OOV inflection is allowed."""
     adjective_forms = lemminflect.getAllInflections('watch', 'ADJ')
     self.assertEqual(adjective_forms, {})

     # (inflect_oov, expected JJ forms)
     for oov, expected in ((False, ()), (True, ('watch', ))):
         forms = lemminflect.getInflection('watch', 'JJ', inflect_oov=oov)
         self.assertEqual(forms, expected)

     past_forms = lemminflect.getInflection('watch', 'VBD')
     self.assertEqual(past_forms, ('watched', ))
 def testGetInflectionOOV(self):
     """Out-of-vocabulary lemmas are inflected only when ``inflect_oov`` is set."""
     # (lemma, tag, inflect_oov, expected forms); the trailing labels are
     # carried over from the original test.
     cases = (
         ('xxbike', 'NN', False, ()),
         ('xxbike', 'NNS', False, ()),
         ('xxbike', 'NN', True, ('xxbike',)),             # reg
         ('xxbike', 'NNS', True, ('xxbikes',)),           # reg
         ('xxbaggy', 'JJR', True, ('xxbaggier',)),        # reg
         ('xxclean', 'RBS', True, ('xxcleanest',)),       # reg
         ('xxformat', 'VBG', True, ('xxformatting',)),    # regd
         ('xxbacklog', 'VBD', True, ('xxbacklogged',)),   # regd
         ('xxgenesis', 'NNS', True, ('xxgeneses',)),      # glreg
         ('xxalumus', 'NNS', True, ('xxalumi',)),         # glreg
     )
     for lemma, tag, oov, expected in cases:
         forms = lemminflect.getInflection(lemma, tag, inflect_oov=oov)
         self.assertEqual(forms, expected)
Exemplo n.º 6
0
def match_pronoun_present(verb_str: str, pronoun_str: str) -> str:
    """
    Conjugate a present-tense verb so that it agrees with a pronoun.

    The pronoun is lower-cased and looked up in the pronoun-to-tag table. The
    verb is split into its affirmative part and its negation, the affirmative
    part is re-inflected from its lemma with the tag found for the pronoun,
    and the negation is merged back in. Modal verbs are returned unchanged,
    as are verbs whose lemma or inflection cannot be determined unambiguously
    (those also log a warning).

    This function should only be used for present tense.

    Raises:
        ValueError: if the pronoun is not a key of the pronoun-to-tag table.
    """
    pronoun_str = pronoun_str.lower()
    if pronoun_str not in __pronoun_to_verb_upenn_dict:
        raise ValueError(
            'Unexpected value for pronoun "{}"'.format(pronoun_str))

    aff_verb_str, negation_str = split_verb_negation(verb_str)
    if is_modal_verb(aff_verb_str):
        return verb_str

    lemmas = __collapse_lemma_list(getLemma(aff_verb_str, "VERB"))
    if len(lemmas) != 1:
        logging.warning(
            'WARNING: Ambigous or no lemma for "{}". Output was {}. Keeping original verb.'
            .format(verb_str, lemmas))
        return verb_str
    lemma_str = lemmas[0]

    target_tag = __pronoun_to_verb_upenn_dict[pronoun_str]
    forms = getInflection(lemma_str, target_tag)
    form_count = len(forms)
    if form_count == 0 or form_count > 2:
        logging.warning(
            'WARNING: Ambigous or no inflection list for lemma "{}" from verb "{}". Output was {}. Keeping original verb.'
            .format(lemma_str, verb_str, forms))
        return verb_str

    # With two candidate forms the first is used for "I" and the second for
    # every other pronoun; a single candidate is used as-is.
    use_second_form = form_count == 2 and pronoun_str != 'i'
    new_verb_str = forms[1] if use_second_form else forms[0]
    return merge_verb_negation(new_verb_str, negation_str)
Exemplo n.º 7
0
 def filter_out_tense(self, sent, so, eo, candidates):
     """
     Drop candidates that share a stem with the target word or with an earlier candidate.

     The target word is ``sent[so:eo]``. Every candidate whose stem has not
     been seen yet is added to the result, either unchanged or re-inflected
     to the POS tag that nltk assigns to the target word.

     Returns the list of kept (possibly re-inflected) candidates.
     """
     target = sent[so:eo]
     word_tag = nltk.pos_tag([target])[0][1]
     seen_stems = [self.ps.stem(target)]
     kept = []
     for word in candidates:
         stem = self.ps.stem(word)
         if stem in seen_stems:
             continue
         seen_stems.append(stem)
         try:
             cand_tag = self.tag_for_lemmatizer(word)
             if cand_tag is not None:
                 kept.append(word)
             else:
                 # NOTE(review): re-inflection happens only when no lemmatizer
                 # tag was found (pos=None is passed on) -- this looks
                 # inverted; confirm against tag_for_lemmatizer before changing.
                 lemma = self.lem.lemmatize(word, pos=cand_tag)
                 kept.append(getInflection(lemma, tag=word_tag)[0])
         except IndexError:
             # Lemminflect does not support all POS tags - lemminflect.readthedocs.io/en/latest/tags/
             kept.append(word)
             logger.debug(
                 "ERROR: Lemminflect cannot convert {} with type {}, skipping"
                 .format(word, word_tag))
     return kept
Exemplo n.º 8
0
def inflect(string, mode):
    """
    Inflect the marked word(s) of a space-separated phrase.

    In a multi-word phrase only words wrapped in square brackets (``[word]``)
    are inflected, and the brackets are removed; a single-word phrase is
    inflected with or without brackets. Empty tokens (an empty string, or the
    gaps produced by consecutive spaces) are left untouched.

    Args:
        string: the phrase to process.
        mode: one of "ppart" (VBN), "part" (VBG), "3sg" (VBZ), "pl" (NNS),
            "sg" / "mass" / "singleton" (NN). "inf" and any other value leave
            the word as it is.

    Returns:
        The phrase with the selected words inflected. If
        ``override_inflection`` supplies a value for a selected word, that
        value is returned immediately as the whole result.
    """
    mode_to_tag = {
        "ppart": 'VBN',
        "part": 'VBG',
        "3sg": 'VBZ',
        "sg": 'NN',
        "pl": 'NNS',
        "mass": 'NN',
        "singleton": 'NN',
    }

    words = string.split(" ")
    for i, word in enumerate(words):
        if not word:
            # Nothing to inflect; indexing word[0] here used to raise IndexError.
            continue
        if word.startswith("[") and word.endswith("]"):
            words[i] = word[1:-1]
        elif len(words) > 1:
            continue

        # Local checking for forms 3rd party library does wrong
        override = override_inflection(words[i], mode)
        if override is not None:
            return override

        tag = mode_to_tag.get(mode)
        if tag is not None:
            words[i] = lemminflect.getInflection(words[i], tag=tag)[0]

    return " ".join(words)
Exemplo n.º 9
0
 def testGetInflection03(self):
     """'watch' inflects as both noun and verb; restricting the UPOS narrows the result."""
     noun_forms = {'NNS': ('watches', 'watch'), 'NN': ('watch',)}
     verb_forms = {
         'VBD': ('watched',),
         'VBG': ('watching',),
         'VBZ': ('watches',),
         'VB': ('watch',),
         'VBP': ('watch',),
     }
     every_form = {**noun_forms, **verb_forms}

     self.assertEqual(lemminflect.getAllInflections('watch'), every_form)
     self.assertEqual(lemminflect.getAllInflections('watch', 'VERB'), verb_forms)
     self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched',))
     self.assertEqual(lemminflect.getAllInflections('watch', 'ADJ'), {})
Exemplo n.º 10
0
Arquivo: words.py Projeto: phueb/Zorro
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:
    """
    Return a counterbalanced subset of the legal words stored for ``tag``.

    The words are read from ``<tag>.csv`` in the legal-words directory,
    keeping the rows whose ``is_legal`` value is truthy and dropping anything
    listed in ``exclude``. When ``second_tag`` is given, a second form is
    derived for every word ('NNP': plural via ``inflect``; tags starting with
    'VB': inflection of the verb lemma), and only pairs whose second form is
    in ``vocab`` and differs from the first form are kept. The surviving
    words are passed to ``find_counterbalanced_subset``, whose result is
    returned.

    Raises:
        AttributeError: if ``second_tag`` is neither None, 'NNP', nor a tag
            starting with 'VB'.
    """
    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')

    # load the words stored for the requested tag, keeping only the legal rows
    legal_df = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    legal_mask = legal_df['is_legal'].astype(bool).tolist()
    candidates = legal_df['word'][legal_mask].tolist()

    if exclude:
        candidates = [w for w in candidates if w not in exclude]

    if second_tag is None:
        # nothing to pair up: use the candidates directly
        first_forms = candidates
        second_forms = None
    else:
        # derive the 2nd form of every candidate
        if second_tag == 'NNP':
            engine = inflect.engine()
            other_forms = [engine.plural(w) for w in candidates]
        elif second_tag.startswith('VB'):
            # getInflection requires a lemma as input
            lemmas = [getLemma(w, upos='VERB')[0] for w in candidates]
            other_forms = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]
        else:
            raise AttributeError('Invalid arg to second_tag')

        # keep a pair only if its 2nd form is in vocab and differs from the 1st form
        first_forms = []
        second_forms = []
        for w1, w2 in zip(candidates, other_forms):
            if w2 not in vocab or w2 == w1:
                continue
            first_forms.append(w1)
            second_forms.append(w2)
            if verbose:
                print(f'Included {w1:<12} and {w2:<12}')
        assert first_forms
        assert second_forms

    # find subset of words such that their total corpus frequencies are approx equal across corpora
    sample_size = configs.Data.tag2num_words[tag]
    return find_counterbalanced_subset(first_forms,
                                       min_size=sample_size,
                                       max_size=sample_size+100,
                                       second_forms=second_forms,
                                       seed=seed,
                                       verbose=verbose,
                                       )
Exemplo n.º 11
0
def sample_verb(tag_list, source_tag, source):
    """
    Pick a random inflection of ``source`` for a random tag other than ``source_tag``.

    Args:
        tag_list: candidate tags to draw from.
        source_tag: the tag of the source word; it is never drawn.
        source: the word to inflect.

    Returns:
        A randomly chosen inflected form, or None when no tag other than
        ``source_tag`` is available or no candidate form can be produced.
    """
    other_tags = [tag for tag in tag_list if tag != source_tag]
    if not other_tags:
        # rd.choice would raise on an empty sequence; report "no candidate".
        return None

    tag = rd.choice(other_tags)
    cand_list = getInflection(source, tag)
    if not cand_list:
        # getInflection returns a tuple, so comparing with [] never matched.
        # The OOV result maps each tag to a tuple of forms; flatten it so
        # that rd.choice gets an indexable sequence of words.
        cand_list = [
            form
            for forms in getAllInflectionsOOV(source, upos='VERB').values()
            for form in forms
        ]
    if not cand_list:
        return None
    return rd.choice(cand_list)
Exemplo n.º 12
0
def inflection(pred_lemma, pred_pos, pred_word):
    """
    Return the gerund (VBG) form of a verb lemma, or the word itself.

    Args:
        pred_lemma: the lemma of the word.
        pred_pos: the universal POS of the word; only "VERB" is inflected.
        pred_word: the surface form, returned whenever no inflection applies.

    Returns:
        ``pred_word`` for non-verbs, for lemmas that already end in "ing"
        and for lemmas that have no VBG form; otherwise the first VBG
        inflection of ``pred_lemma``.
    """
    if pred_pos != "VERB":
        return pred_word
    # To cater to the errors in the lemma: a "lemma" that still ends in -ing
    # is kept as the original word. Checked before inflecting so no lookup
    # is made for a result that would be discarded.
    if pred_lemma.lower().endswith("ing"):
        return pred_word
    forms = getInflection(pred_lemma, tag='VBG')
    return forms[0] if forms else pred_word
Exemplo n.º 13
0
 def testUPOSLog(self):
     """An unknown tag/UPOS logs a message and yields an empty result."""
     # (lookup function, expected empty result)
     probes = (
         (lemminflect.getInflection, ()),
         (lemminflect.getAllInflections, {}),
         (lemminflect.getAllInflectionsOOV, {}),
     )
     for lookup, empty in probes:
         with self.assertLogs():
             result = lookup('WORD', 'X')
         self.assertEqual(result, empty)

     # The token extension returns the text unchanged for the unknown tag.
     token = self.nlp('testing')[0]
     self.assertEqual(token._.inflect('X'), 'testing')
Exemplo n.º 14
0
    def inflect_lemma(self, lemma, tag=None, pos=None):
        """
        List the inflections of ``lemma``.

        With a truthy ``tag`` only the forms for that tag are returned;
        otherwise the forms of every tag reported for the universal POS
        ``pos`` (which may be None) are concatenated into one list.
        """
        if tag:
            # tag based
            return list(lemminflect.getInflection(lemma, tag=tag))

        # pos based, can be None too
        forms_by_tag = lemminflect.getAllInflections(lemma, upos=pos)
        return [form for forms in forms_by_tag.values() for form in forms]
Exemplo n.º 15
0
 def convert_tokens_to_string(self, tokens):
     """
     Join tokens into a string, merging wordpieces and applying inflection tokens.

     A token starting with '##' (and longer than two characters) is glued
     onto the previous word. A token listed in ``self.inflection_tokens``
     re-inflects the previous word using the tag spelled between the token's
     first and last characters; it is dropped when there is no previous word
     or when no inflection is found. All other tokens are appended as words.

     Returns the words joined by single spaces.
     """
     result = []
     for token in tokens:
         # combine wordpiece tokens
         if len(token) > 2 and token.startswith('##'):
             if result:
                 result[-1] += token[2:]
             else:
                 result.append(token[2:])
             continue

         if token not in self.inflection_tokens:
             result.append(token)
             continue

         # Guard on `result`, not on the token position: several leading
         # inflection tokens leave nothing to inflect, and result[-1]
         # would raise IndexError.
         if result:
             inflected = getInflection(result[-1], tag=token[1:-1])
             if inflected:
                 result[-1] = inflected[0]
     return ' '.join(result)
Exemplo n.º 16
0
def get_lemminflect(token):
    """
    Collect alternative inflections of a token's lemma.

    For a token whose POS is in ``REPLACE_POS``, the lemma is inflected with
    every tag that ``POS_TO_TAGS`` lists for that POS except the token's own
    tag. Forms equal to the token text (ignoring case) are left out.

    Returns a list of distinct forms; it is empty for any other POS.
    """
    text = token.text
    lemma = token.lemma_
    own_tag = token.tag_
    pos = token.pos_

    if pos not in REPLACE_POS:
        return []

    alternatives = {
        form
        for other_tag in POS_TO_TAGS[pos]
        if other_tag != own_tag
        for form in getInflection(lemma, tag=other_tag)
        if form.lower() != text.lower()
    }
    return list(alternatives)
Exemplo n.º 17
0
    def detokenize(self,
                   tokens: List[str],
                   as_list: bool = False) -> Union[str, List[str]]:
        """
        Turn a token sequence back into words, applying inflection tokens.

        Each token is first mapped through ``self.reverse_single_char_map``
        when it has an entry there. A token listed in
        ``self.inflection_tokens`` re-inflects the previous word using the
        tag spelled between the token's first and last characters; it is
        dropped when there is no previous word or no inflection is found.
        Every other token is appended as a word.

        Args:
            tokens: the tokens to detokenize.
            as_list: return the list of words instead of a string.

        Returns:
            The word list when ``as_list`` is set; otherwise the output of
            ``self.detokenizer.detokenize`` when ``self.pretok_type`` is
            'moses', or the words joined by single spaces.
        """
        result = []
        for token in tokens:
            # undo the single-character substitution, if any
            if token in self.reverse_single_char_map:
                token = self.reverse_single_char_map[token]

            if token not in self.inflection_tokens:
                result.append(token)
                continue

            # Guard on `result`, not on the token position: several leading
            # inflection tokens leave nothing to inflect, and result[-1]
            # would raise IndexError.
            if result:
                inflected = getInflection(result[-1], tag=token[1:-1])
                if inflected:
                    result[-1] = inflected[0]

        if as_list:
            # Allow users to detokenize using their own detokenizers
            return result
        if self.pretok_type == 'moses':
            return self.detokenizer.detokenize(result)
        return ' '.join(result)
Exemplo n.º 18
0
    def __call__(self, sent, index):
        """
        Replace the word at ``sent[index]`` with a differently-tagged inflection.

        The source word is the token's ``org`` when set, otherwise its
        ``lemma``. A tag other than the token's own is drawn at random from
        ``self.tag_list`` and the lower-cased word is inflected with it. When
        a candidate is found it is stored in ``org`` (title-cased if the
        source word was), an auxiliary token may be queued in ``addition``
        for VBG/VBN, and the token is passed through ``self.add_history``.

        Returns:
            ``sent``, modified in place.
        """
        # get word
        if sent[index].org is not None:
            word = sent[index].org
        else:
            word = sent[index].lemma

        # get cand
        cand = None
        tag = None
        source = word.lower()
        if source != '':
            source_tag = sent[index].tag
            tag_list = [t for t in self.tag_list if t != source_tag]
            # rd.choice raises on an empty sequence; with no alternative tag
            # the token is simply left unchanged.
            if tag_list:
                tag = rd.choice(tag_list)
                cand_list = getInflection(source, tag)
                if not cand_list:
                    # getInflection returns a tuple, so comparing with []
                    # never matched. The OOV result maps each tag to a tuple
                    # of forms; flatten it so rd.choice gets a sequence of words.
                    cand_list = [
                        form
                        for forms in getAllInflectionsOOV(
                            source, upos='VERB').values()
                        for form in forms
                    ]
                if cand_list:
                    cand = rd.choice(cand_list)

        # replace to cand
        if cand is not None:
            if word.istitle():
                cand = cand.title()
            sent[index].org = cand
            if ((index >= 1 and sent[index - 1].pos != 'AUX')
                    and (index >= 2 and sent[index - 2].pos != 'AUX')
                    and self.sampler() < self.aux_ratio
                ):  # no AUX right before: for VBG/VBN, insert a form of "have (been)" just before
                if tag == 'VBG':
                    sent[index].addition.append(
                        EnToken(index=sent[index].index - 0.25,
                                org=self.vbg_sampler()))
                elif tag == 'VBN':
                    sent[index].addition.append(
                        EnToken(index=sent[index].index - 0.25,
                                org=self.vbn_sampler()))
            sent[index] = self.add_history(sent[index])
        return sent
Exemplo n.º 19
0
 def inflect(self, tag):
     """Re-inflect this token's lemma with ``tag`` and refresh its text fields."""
     self.tag = tag
     forms = lemminflect.getInflection(self.lemma, tag)
     # The first reported form becomes the new surface text.
     self.text = forms[0]
     self.text_with_ws = self.text + self.whitespace
Exemplo n.º 20
0
def sample_cand(tag_list, source_tag, source):
    """
    Pick a random inflection of ``source`` for a random tag other than ``source_tag``.

    Both draws use ``rd.choice``, so an empty tag selection or an empty
    inflection result raises the error ``rd.choice`` raises for an empty
    sequence.
    """
    other_tags = [candidate for candidate in tag_list if candidate != source_tag]
    chosen_tag = rd.choice(other_tags)
    return rd.choice(getInflection(source, chosen_tag))
Exemplo n.º 21
0
def api_getInflection():
    """
    Return the inflections requested in the JSON body of the current request.

    The body must provide the keys 'lemma', 'tag' and 'inflect_oov', which
    are passed to ``getInflection`` in that order; the result is returned
    through ``jsonify``.
    """
    payload = request.json
    lemma = payload['lemma']
    tag = payload['tag']
    inflect_oov = payload['inflect_oov']
    return jsonify(getInflection(lemma, tag, inflect_oov))
Exemplo n.º 22
0
import json
import codecs

# Load the lemmatised subject-verb-object triples from a JSON file. The
# 'utf-8-sig' codec skips a leading byte-order mark if the file has one.
with codecs.open('svo_triples_lemmatised.txt', 'r', 'utf-8-sig') as json_file:
    svo_triples = json.load(json_file)

# Discard the first three entries.
svo_triples = svo_triples[3:]

# Accumulators for the generated sentences; only pos_sentences is filled in
# the part of the script shown here.
pos_sentences = []
neg_sentences = []
passive_sentences = []
swapped_sentences = []

# NOTE(review): `lem` is not imported in the visible part of this script --
# presumably lemminflect under an alias; confirm.
for triple in svo_triples:
    # Each triple is indexed as (subject, verb, object). With
    # inflect_oov=False an empty tuple comes back for words that cannot be
    # inflected, which is what the check below tests for.
    triple_subject = lem.getInflection(triple[0], tag='NNS', inflect_oov=False)
    triple_verb = lem.getInflection(triple[1], tag='VBD', inflect_oov=False)
    triple_negative_verb = lem.getInflection(triple[1],
                                             tag='VB',
                                             inflect_oov=False)
    triple_passive_verb = lem.getInflection(triple[1],
                                            tag='VBN',
                                            inflect_oov=False)
    triple_object = lem.getInflection(triple[2], tag='NNS', inflect_oov=False)
    # Build a sentence only when subject, past-tense verb and object all have
    # an inflection.
    if triple_subject != () and triple_verb != () and triple_object != ():
        #print((triple_subject,triple_verb,triple_object))
        #print(triple)
        # "<Plural subject> <past-tense verb> <plural object>."
        pos_sentences += [
            triple_subject[0].capitalize() + " " + triple_verb[0] + " " +
            triple_object[0] + "."
        ]
Exemplo n.º 23
0
async def postInflection(word: Word, pos: PartOfSpeech = Body(...)):
  """Return the inflections of ``word.text`` for ``pos.tag`` under the key "inflection"."""
  forms = getInflection(word.text, pos.tag)
  response = { "inflection": forms }
  return response
Exemplo n.º 24
0
def inflection(pred_lemma, pred_pos, pred_word):
    """
    Return the gerund (VBG) form of a verb lemma, or the word itself.

    Only words whose POS is exactly "VERB" are inflected, using the first
    VBG form of ``pred_lemma``; anything else is returned as ``pred_word``.
    """
    if pred_pos != "VERB":
        return pred_word
    gerund_forms = getInflection(pred_lemma, tag='VBG')
    return gerund_forms[0]
Exemplo n.º 25
0
 def testGetInflection02(self):
     """'squirrel' has one singular form and two accepted plural forms."""
     singular_forms = lemminflect.getInflection('squirrel', 'NN')
     self.assertEqual(singular_forms, ('squirrel', ))

     plural_forms = lemminflect.getInflection('squirrel', 'NNS')
     self.assertEqual(plural_forms, ('squirrels', 'squirrel'))