def lemmatize(word, pos):
    # Need to specify pos=NOUN|ADJ|VERB
    if pos is None:
        lemmas = wordnet._morphy(word, NOUN)
    else:
        lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
def lemmatize(self, word, pos=None):
    try:
        if pos is not None:
            return max(wn._morphy(word, pos), key=lambda x: count_word(x, pos))
        else:
            return max(
                chain.from_iterable(
                    wn._morphy(word, pos) for pos in {'n', 'v', 'a', 'r'}),
                key=lambda x: count_word(x))
    except ValueError:
        return word
def stem_word(word):
    # Get the word into its basic form, e.g. going -> go, greatest -> great.
    # It is a rather trivial way to do it.
    stem = wn._morphy(word, ADJ)
    if stem != [] and stem[-1] != word:
        return stem[-1]
    stem = wn._morphy(word, VERB)
    if stem != [] and stem[0] != word:
        return stem[0]
    stem = wn._morphy(word, NOUN)
    if stem == []:
        return word
    return stem[-1]
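# A quick, hedged check of the behaviour described in the comment above, assuming
# ADJ/VERB/NOUN are nltk.corpus.wordnet's POS constants and wn is that module:
from nltk.corpus import wordnet as wn
ADJ, VERB, NOUN = wn.ADJ, wn.VERB, wn.NOUN

print(stem_word('going'))     # 'go'    (found via the VERB lookup)
print(stem_word('greatest'))  # 'great' (found via the ADJ exception list)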
def getBaseFormNLTK(self, token, pos):
    baseForm = ''
    baseList = wn._morphy(token, pos)
    if len(baseList) != 0:
        for base in baseList:
            if base in myGlobal.wordDict:
                baseForm = base
    return baseForm
def createWordVariants(self, pos_tagged_s):
    for w, pos in pos_tagged_s:
        # print "Extracting roots for ", w, " with pos ", pos
        wn_tag = self.getWordnetTagFromPosTag(pos)
        if wn_tag is not None:
            # print "Considering word to morphy ", w, "pos ", pos
            self.root_word_variants[w] = wn._morphy(w.lower(), wn_tag)
        else:
            self.root_word_variants[w] = [wnl.lemmatize(w.lower())]
def _find_target_word_idx(tokens: List[str], word_forms: Set):
    for idx, token in enumerate(tokens):
        token_lower = token.lower()
        lemmas = {
            lemma
            for pos in ['v', 'n', 'a', 'r']
            for lemma in wn._morphy(token_lower, pos)
        }
        if lemmas.intersection(word_forms) or token_lower in word_forms:
            return idx
    raise ValueError(f"Target word was not found {tokens}\n{word_forms}")
def ManageSentence(org_sentence, sentence, T):
    for word, line_num in org_sentence:
        roots = set(wordnet._morphy(word.lower(), wordnet.NOUN) +
                    wordnet._morphy(word.lower(), wordnet.VERB))
        if not roots:
            roots.add(word.lower())
        for root in roots:
            if root in T:
                break
        else:
            sys.exit("error: missing rule for '" + word + "' (or wrong input form)")
        if {word} != roots:
            print(word, 'STRING', ' '.join(roots), line_num)
        else:
            print(word, 'STRING', line_num)
        sentence.append(roots)
    print('ENDFILE\n\n')
def stem(phrase, stem_map):
    phrase_array = phrase.split(' ')
    word = phrase_array[len(phrase_array) - 1]
    stemmed_word = wn._morphy(word, 'n')
    if len(stemmed_word) > 0:
        stemmed_word = stemmed_word[len(stemmed_word) - 1]
        if stemmed_word in stem_map:
            word = stem_map.get(stemmed_word)
        else:
            stem_map[stemmed_word] = word
        phrase_array[len(phrase_array) - 1] = word
        return ' '.join(phrase_array)
    else:
        return phrase
def lemmatize(word, tokens_dict, order=0):
    pos_tag = get_pos(word, tokens_dict)
    poss_lemm = wn._morphy(word, pos_tag)
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] == "VBD":
            for w in poss_lemm:
                if w != word:
                    return w
        print(poss_lemm, tokens_dict[word])
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word, pos_tag)
        return lemmatized
def lemmatize(self, word: str, pos: str = "n") -> str:
    """Lemmatize `word` using WordNet's built-in morphy function.

    Returns the input word unchanged if it cannot be found in WordNet.

    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :type pos: str
    :return: The lemma of `word`, for the given `pos`.
    """
    lemmas = wn._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
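# A minimal usage sketch of the underlying wn._morphy call, assuming the WordNet
# corpus has been downloaded (nltk.download('wordnet')); the example words are
# illustrative only.
from nltk.corpus import wordnet as wn

print(wn._morphy('dogs', 'n'))    # ['dog']   - regular plural, handled by detachment rules
print(wn._morphy('geese', 'n'))   # ['goose'] - irregular plural, handled by the exception list
print(wn._morphy('asdfgh', 'n'))  # []        - unknown words yield an empty list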
def is_too_sim(word, concept, t=0.30):
    for stem in nlwn._morphy(word, pos='n'):
        s = word_concept_sim(stem, concept)
        if s > t:
            return True
        synset = nlwn.synset(concept)
        for ss in nlwn.synsets(stem, pos='n'):
            if synset in features.all_hypernyms(ss):
                return True
    return False
def check_remove(data, tok, word_range, which_type="all"):
    gen_fam_term = ["father", "mother", "son", "daughter", "husband", "wife",
                    "brother", "sister", "grandfather", "grandmother", "grandson",
                    "granddaughter", "uncle", "aunt", "nephew", "niece"]
    gen_term = ["female", "male", "woman", "man", "girl", "boy"]
    pro_lst = ["he", "she", "him", "her", "his", "hers", "himself", "herself"]
    result = []
    for cluster in word_range:
        if which_type == "name":
            # Check if the cluster has a name link.
            # Check all the instances of human names in the sentence and build "name_lst".
            name_lst = []
            for sent_chunk in ne_chunk(pos_tag(tok)):
                if hasattr(sent_chunk, 'label'):
                    if sent_chunk.label() == "PERSON":
                        name_lst.append(' '.join(c[0] for c in sent_chunk))
                        # (print("TESTING", c[0]) for c in sent_chunk)
            result.append(any([(' '.join(w for w in tok[c[0]:c[1] + 1])) in name_lst
                               for c in cluster]))
        elif which_type == "pro":
            # Check if the cluster has only pronoun links.
            result.append(all([(c[0] == c[1]) and tok[c[0]].lower() in pro_lst
                               for c in cluster]))
        elif which_type == "term":
            # Check if the cluster has a gendered term.
            for c in cluster:
                for i in c:
                    word_disam = lesk(tok, tok[i], 'n')
                    # Check the definition assigned by word sense disambiguation:
                    # if the word is a valid English word, check whether it is a person
                    # word and whether its definition carries a gendered meaning.
                    if (word_disam is not None) and (word_disam.lexname() == "noun.person"):
                        # For now look at all nouns in the range; with dependency parsing
                        # this could be restricted to the head noun.
                        result.append(any(
                            [wn_lem.lemmatize(w) in (gen_fam_term + gen_term + pro_lst)
                             and (x in (gen_fam_term + gen_term + pro_lst)
                                  for x in wn._morphy(w, wn.NOUN))  # checks all possible morphological forms
                             for w in word_tokenize(word_disam.definition())]
                            + [tok[i] in (gen_fam_term + gen_term + pro_lst)]))
                    else:
                        result.append(False)
            else:
                continue
        else:
            # Check all conditions at the same time.
            result.append(any([check_remove(data, tok, word_range, which_type="name"),
                               check_remove(data, tok, word_range, which_type="pro"),
                               check_remove(data, tok, word_range, which_type="term")]))
    return any(result)
def normalize(word):
    """
    normalize(word)

    Correctly normalizes a word and returns the normalized form.
    word must be ['word', 'PoS'], with the PoS in nltk format; the result is a
    normalized string 'word'.
    """
    w = word[0]
    if word[1] in NOZMALIZED:
        return w
    else:
        morphy = wordnet._morphy(w, tag_to_wn[word[1][0]])
        if w in morphy and len(morphy) > 1:
            morphy.remove(w)
        return morphy[0] if morphy else w
def _get_info(lemma, pos, info_type):
    results = dict()
    wn_pos = WORDNET_POS[pos] if pos is not None else None
    morphemes = wn._morphy(lemma, pos=wn_pos) if pos is not None else []
    for i, synset in enumerate(set(wn.synsets(lemma, pos=wn_pos))):
        sense_key = None
        for l in synset.lemmas():
            if l.name().lower() == lemma.lower():
                sense_key = l.key()
                break
            elif l.name().lower() in morphemes:
                sense_key = l.key()
        assert sense_key is not None
        results[sense_key] = synset.examples() if info_type == 'examples' else synset.definition()
    return results
def lemmatize(word, tokens_dict, order=0):
    pos_tag = get_pos_wn(word, tokens_dict)
    if pos_tag:
        poss_lemm = wn._morphy(word, pos_tag)
    else:
        poss_lemm = ""
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] in "VBD VBN":
            for w in poss_lemm:
                if w != word:
                    return w
        # print(poss_lemm, tokens_dict[word])
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word)
        return lemmatized
def preprocess(inputfile):
    lemmatizer = WordNetLemmatizer()
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    rows = []
    line = [x.strip().split('\t') for x in inputfile.readlines()[1:]]
    for l in line:
        if l[3].isalpha():
            if l[3] not in verbs:
                if l[2] not in string.punctuation:
                    l[2] = lemmatizer.lemmatize(l[2].lower())
                    rows.append(l)
            else:
                l[2] = wn._morphy(l[2].lower(), pos='v')
                lemmatized = str(l[2]).strip("['").strip("']")
                l[2] = lemmatized
                rows.append(l)
    cols = ["word nº", "sentence nº", "word", "POS tag", "entity type"]
    return pd.DataFrame(rows, columns=cols)
def lemmatize(word, tokens_dict, order=0):
    """(str, dict, int) -> str

    Return the lemmatized word using the dict of tokens and the order of the
    word in the sentence.
    """
    pos_tag = get_pos_wn(word, tokens_dict)
    if pos_tag:
        poss_lemm = wn._morphy(word, pos_tag)
    else:
        poss_lemm = ""
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] in "VBD VBN":
            for w in poss_lemm:
                if w != word:
                    return w
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word)
        return lemmatized
def get_morphy(self, lemma, check_exceptions=True):
    morphy_list = [
        form
        for p in self.pos
        for form in wn._morphy(lemma, p, check_exceptions)
    ]
    return set(morphy_list)
def lemmatize(self, word, pos=NOUN):
    lemmas = _wordnet._morphy(word, pos)
    if not lemmas:
        return word
    lemmas.sort(key=len)
    return lemmas[0]
def lemmatize(self, word, pos=NOUN):
    lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
sentences = []
s = []
sample = open(f"./data/{poem}.txt", "r")
s = sample.read()

# Replace the newline escape character with a space
f = s.replace("\n", " ")

# Iterate through each sentence in the file
for i in sent_tokenize(f):
    # Tokenize the sentence into words
    for j in word_tokenize(i):
        if j.isalnum() and not j.lower() in stop_words and not j.lower() in ignore_words:
            j = j.lower()
            stems = wn._morphy(j, wn.NOUN)
            if len(stems) >= 1:
                j = stems[0]
            # if len(stems) == 1:
            #     print(stems)
            if j not in ignore_words:
                temp.append(j)
                df_dict['word'].append(j)
                df_dict['year'].append(
                    pub_year_df.loc[pub_year_df['poem_title'] == poem, 'pub_year'].values[0])
                df_dict['poem_title'].append(poem)

df = pd.DataFrame.from_dict(df_dict)
uniques = df['word'].unique().tolist()
print(f"df length: {len(df)}")
def main():
    if len(sys.argv) < 2:
        print('Specify an input plain text file!')
        return

    from nltk import download

    inputFile = sys.argv[1]
    if inputFile == '--help':
        download('tagsets')
        from nltk.help import upenn_tagset
        print(upenn_tagset())
        return

    with open(inputFile, 'r') as f:
        texts = f.read()
    texts = texts.split('\n')

    # Process texts
    print('Got texts:', len(texts))

    # Tokenize texts
    print('Tokenizing texts...')
    download('punkt')
    from nltk.tokenize import word_tokenize
    words = set()
    for text in texts:
        text_words = word_tokenize(text)
        words.update(text_words)

    print('Normalization...')
    import re
    # Normalize words (remove punctuation etc)
    words = map(lambda w: re.sub(r'[^a-zA-Z\-\'\`\"]', '', w), words)
    # Filter out unnecessary stuff
    words = filter(lambda w: len(w) > 2 and len(w) <= 45, words)
    words = set(words)
    print('All filtered words:', len(words))

    # # Filter out names words
    # from nltk.tag import StanfordNERTagger
    # st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                        'stanford-ner-2018-10-16/stanford-ner.jar')
    # tagged = st.tag(words)
    # tagged_words_common = filter(lambda tag: tag[1] == 'O', tagged)
    # words = map(lambda tag: tag[0].lower(), tagged_words_common)
    # words = set(words)
    # print('Filtered common words:', len(words))
    # # Print names words (just for info)
    # tagged_names = list(filter(lambda tag: tag[1] != 'O', tagged))
    # print('Names words:', len(tagged_names))
    # print(tagged_names)

    # Cast to lowercase
    words = set(map(str.lower, words))

    print('Processing words...')
    # Stemming, to use stems as keys
    download('stopwords')
    from nltk.stem import SnowballStemmer
    snowball_stemmer = SnowballStemmer('english', ignore_stopwords=True)
    # Lemmatization, to use lemmas as base words
    download('wordnet')
    from nltk.corpus.reader import wordnet as wnr
    from nltk.corpus import wordnet
    # POS tagger
    download('averaged_perceptron_tagger')
    from nltk import pos_tag

    # Process words
    keyed_words = {}
    tagged_words = pos_tag(words)

    def tagged2str(word, tag):
        return '{} ({})'.format(word, tag)

    for word, tag in tagged_words:
        stem = snowball_stemmer.stem(word)
        str_word = tagged2str(word, tag)
        if stem in keyed_words:
            keyed_words[stem] += (str_word, )
        else:
            pos = wnr.VERB if tag in {'VB', 'VBP', 'VBZ', 'VBN', 'VBG', 'VBD', 'VERB'} else wnr.NOUN
            lemmas = wordnet._morphy(word, pos=pos)
            lemma = tagged2str(min(lemmas, key=len), pos) if lemmas else str_word
            keyed_words[stem] = (lemma, str_word)

    print('Keys count:', len(keyed_words))
    print('Writing file ', OUTFILE)

    def mapToCSVLine(item):
        return ','.join(map(str, (item[0], ) + item[1])) + '\n'

    # Output to file
    with open(OUTFILE, 'w') as f:
        f.write(mapToCSVLine(('STEM', ('LEMMA', 'FORMS...'))))  # Header
        for key in sorted(keyed_words):
            f.write(mapToCSVLine((key, keyed_words[key])))

    print('Done! Use "{} --help" to know what the abbreviations mean.'.format(sys.argv[0]))
def lemmatize(self, word, pos=None, multiple_pos_calls=[NOUN], print_results=False):
    # Bingham updates:
    #   Added alternative default (pos=None) processing.
    #   Added print results option.
    if print_results:
        dash = '-' * 50
        hdr_template = '{:<20s}{:>10s}{:>10s}{:>8s}'
        data_template = '{:<20s}{:>10d}{:>10d}{:>8.2f}'
        total_template = '{:<20s}{:>20d}'
        print(dash)
        print(hdr_template.format('Type', 'Number', 'Hits', '% Total'))
        print(dash)
        print(data_template.format('snapwords', len(self._snap_words), self._snap_count,
                                   100 * round(self._snap_count / self._call_count, 3)))
        print(data_template.format('local dictionary', len(self._dictionary), self._dictionary_count,
                                   100 * round(self._dictionary_count / self._call_count, 3)))
        print(data_template.format('remembered words', len(self._word_memory), self._memory_count,
                                   100 * round(self._memory_count / self._call_count, 3)))
        print(data_template.format('* not cached *', 0, len(self._word_memory),
                                   100 * round(len(self._word_memory) / self._call_count, 3)))
        print()
        print(total_template.format('Total', self._call_count))
        return

    self._call_count += 1

    # Always try to find the word in a cached area (snapwords, the local
    # dictionary, and the word memory) before calling WordNet to look it up.
    # All of these areas are optional.

    # Look in snapwords. Snapwords are snapped back (i.e. they are high-frequency
    # words for which the word and the lemma are the same).
    if word in self._snap_words:
        self._snap_count += 1
        return word

    # Build a global key (no pos attached). There might be a global override in
    # the local dictionary.
    key = word + '__'
    if key in self._dictionary:
        self._dictionary_count += 1
        return self._dictionary[key]

    # Initialize the call order. If multiple pos calls have been set, call WordNet
    # in the order provided as an argument until a match is found. Otherwise set
    # up to call WordNet with the pos of noun.
    wordnet_call_order = []
    if pos is None:
        wordnet_call_order = multiple_pos_calls
    else:
        wordnet_call_order.append(pos)

    for call_pos in wordnet_call_order:
        # Build a key for the dictionary and word memory.
        key = word + '_' + call_pos

        # Look in the dictionary for the specific pos.
        if key in self._dictionary:
            self._dictionary_count += 1
            return self._dictionary[key]

        # Look in the word memory for the specific pos.
        if key in self._word_memory:
            self._memory_count += 1
            return self._word_memory[key]

        # No match has been found in the cache. Call WordNet.
        lemmas = wordnet._morphy(word, call_pos)

        # If WordNet returned a lemma, do not try any other pos.
        if lemmas:
            break

    # Remember this word/pos/lemma if the option is set. If WordNet did not find
    # the word, return the word as its own lemma. If found, return the shortest
    # lemma in the returned list.
    if self._remember_words:
        if lemmas:
            self._word_memory[key] = min(lemmas, key=len)
        else:
            self._word_memory[key] = word
    if lemmas:
        return min(lemmas, key=len)
    else:
        return word
#!/usr/bin/env python3
import fileinput

from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet

for line in fileinput.input():
    line = line.strip().lower()
    if not line:
        continue
    lemmas = None
    for pos in ['a', 's', 'r', 'n', 'v']:
        ans = wordnet._morphy(line, pos)
        if ans:
            lemmas = ans
    if lemmas:
        print("COVERED", lemmas)
    else:
        print("NOTCOVERED", line)
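# A hypothetical invocation of the coverage script above (the file name
# check_coverage.py is an assumption, not part of the original). 'corpora'
# resolves through WordNet's noun exception list; a nonsense token does not:
#
#   $ printf 'corpora\nblorptastic\n' | python3 check_coverage.py
#   COVERED ['corpus']
#   NOTCOVERED blorptastic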
        cached[category][syn.name()] = result
        return result


# Iterate over the relations and get what percentage of the nodes they connect are physical
with open('datasets/simplified_english_conceptnet.csv', 'r') as inf:
    for line in inf:
        node_1, relation, node_2 = line.split()[0:3]
        tokens = [relation]
        for node in [node_1, node_2]:
            label = 'NOT_WN'
            if len(node.split('_')) < 2:
                new_node = ''
                if node not in wn_lemmas:
                    nouns = wordnet._morphy(node, wordnet.NOUN)
                    if len(nouns) > 0:
                        new_node = nouns[0]
                    else:
                        verbs = wordnet._morphy(node, wordnet.VERB)
                        if len(verbs) > 0:
                            new_node = verbs[0]
                        else:
                            adjs = wordnet._morphy(node, wordnet.ADJ)
                            if len(adjs) > 0:
                                new_node = adjs[0]
                    if new_node in wn_lemmas:
                        node = new_node
                if node in wn_lemmas:
                    in_category = False
def lemmatize(word, pos='n'):
    from nltk.corpus import wordnet
    lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else None
def verb_vocab(tcm=None, postagger=None, min_length=0):
    """
    Return all verbs found in wordnet in various inflected forms.
    """
    if not postagger:
        postagger = BackoffTagger.from_pickle()

    getpostag = lambda word: postagger.tag([word])[0][1]

    # Most of the time lexeme() returns 4 or 5 words, inflected as declared below.
    # To avoid assumptions about the tagset used, we query the tags using easy
    # examples (the verb give). These POS tags are then bound to lexeme's results.
    infinitive_pos = getpostag("give")
    present_pos = getpostag("gives")
    pres_prog_pos = getpostag("giving")
    past_pos = getpostag("gave")
    past_prog_pos = getpostag("given")

    # Three possibilities for the tense list, depending on how many variations
    # a verb has.
    tenses3 = [infinitive_pos, present_pos, pres_prog_pos]
    tenses4 = tenses3 + [past_pos]
    tenses5 = tenses4 + [past_prog_pos]

    verbs = set()

    for lemma in wn.all_lemma_names(pos='v'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue

        forms = lexeme(lemma)  # all possible conjugations of this verb (lemma)

        if len(forms) == 3:
            forms = zip(forms, tenses3)
        elif len(forms) == 4:
            forms = zip(forms, tenses4)
        elif len(forms) == 5:
            forms = zip(forms, tenses5)
        else:
            # This step can introduce errors, as getpostag isn't guaranteed
            # to return a verb tag.
            forms = [(form, getpostag(form)) for form in forms]

        # Ignore forms that do not map back to the lemma by wordnet's
        # lemmatizer, as they are likely erroneous.
        forms = list(filter(lambda form: lemma in wn._morphy(form[0], 'v'), forms))

        if tcm is not None:
            classes = [classy for syn in wn.synsets(lemma, 'v') for classy in tcm.predict(syn)]
        else:
            classes = [syn.name() for syn in wn.synsets(lemma, 'v')]

        for classy in classes:
            for form, postag in forms:
                if not postag:
                    log.warning("{} has POS==None".format(form))
                    continue
                if postag[0] == 'n':
                    # dirty hack to avoid inconsistency introduced by the postagger
                    continue
                verbs.add((form, postag, classy))
                if "'" in form:
                    # remove ' (couldn't -> couldnt)
                    verbs.add((form.replace("'", ""), postag, classy))

    return verbs
def get_wordnet_relation(target: str, subst: str, pos: Optional[str] = None) -> str:
    """
    Finds the WordNet relation between a target word and a substitute by analyzing
    their synsets. Optionally one can specify the pos tag of the target word for
    more robust analysis.

    Args:
        target: target word
        subst: substitute
        pos: pos tag of the target word

    Returns:
        WordNet relation between the target word and a substitute.
    """
    if pos:
        pos = pos.lower()
    if pos is None:
        pos = wn.NOUN

    if len(subst.split(" ")) > 1:
        return Relation.mwe.name

    if target == subst:
        return Relation.same.name

    if set(wn._morphy(target, pos)).intersection(set(wn._morphy(subst, pos))):
        return Relation.target_form.name

    target_synsets = get_synsets(target, pos=pos)
    subst_synsets = get_synsets(subst, pos=pos)
    if len(subst_synsets) == 0:
        return Relation.unknown_word.name

    target_lemmas = {lemma for ss in target_synsets for lemma in ss.lemma_names()}
    subst_lemmas = {lemma for ss in subst_synsets for lemma in ss.lemma_names()}
    if len(target_lemmas.intersection(subst_lemmas)) > 0:
        return Relation.synonym.name

    if subst in get_similar_tos(target, pos):
        return Relation.similar_to.name

    tgt_sense, sbt_sense = find_nearest_synsets(target_synsets, subst_synsets, pos)
    if tgt_sense is None or sbt_sense is None:
        return Relation.no_path.name

    extract_name = lambda synset: synset.name().split(".")[0]
    tgt_name, sbt_name = extract_name(tgt_sense), extract_name(sbt_sense)

    target_holonyms = get_holonyms(tgt_sense)
    target_meronyms = get_meronyms(tgt_sense)
    if sbt_name in {lemma for ss in target_holonyms for lemma in ss.lemma_names()}:
        return Relation.holonym.name
    if sbt_name in {lemma for ss in target_meronyms for lemma in ss.lemma_names()}:
        return Relation.meronym.name

    target_entailments = {lemma for ss in tgt_sense.entailments() for lemma in ss.lemma_names()}
    if sbt_name in target_entailments:
        return Relation.entailment.name

    subst_entailments = {lemma for ss in sbt_sense.entailments() for lemma in ss.lemma_names()}
    if tgt_name in subst_entailments:
        return Relation.anti_entailment.name

    for common_hypernym in tgt_sense.lowest_common_hypernyms(sbt_sense):
        tgt_hyp_path = tgt_sense.shortest_path_distance(common_hypernym)
        sbt_hyp_path = sbt_sense.shortest_path_distance(common_hypernym)
        if tgt_hyp_path == 1 and sbt_hyp_path == 0:
            return Relation.direct_hypernym.name  # substitute is a hypernym of target
        elif tgt_hyp_path == 0 and sbt_hyp_path == 1:
            return Relation.direct_hyponym.name
        elif tgt_hyp_path > 1 and sbt_hyp_path == 0:
            return Relation.transitive_hypernym.name
        elif tgt_hyp_path == 0 and sbt_hyp_path > 1:
            return Relation.transitive_hyponym.name
        elif tgt_hyp_path == 1 and sbt_hyp_path == 1:
            return Relation.co_hyponym.name
        elif max(tgt_hyp_path, sbt_hyp_path) <= 3:
            return Relation.co_hyponym_3.name

    return Relation.unknown_relation.name
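# A small, hedged illustration of the "target_form" check above, using only
# nltk.corpus.wordnet (the word pair is illustrative): two inflections of the
# same verb map back to the same lemma, so their _morphy sets intersect.
from nltk.corpus import wordnet as wn

target_forms = set(wn._morphy('gave', 'v'))   # {'give'}
subst_forms = set(wn._morphy('gives', 'v'))   # {'give'}
print(bool(target_forms & subst_forms))       # True -> classified as Relation.target_form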
def morphy2(noun, pos=nlwn.NOUN):
    noun = noun.replace(' ', '_').lower()
    return [
        n for n in nlwn._morphy(noun, pos)
        if len(nlwn.synsets(n, pos)) > 0 and len(n) > 0
    ]
def my_lemmatize(self, word, pos=NOUN):
    lemmas = wordnet._morphy(word, pos)
    return (min(lemmas, key=len), True) if lemmas else (word, False)