def lemmatize(word, pos):
    # Need to specify pos=NOUN|ADJ|VERB
    if pos is None:
        lemmas = wordnet._morphy(word, NOUN)
    else:
        lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
def lemmatize(self, word, pos=None):
    try:
        if pos is not None:
            return max(wn._morphy(word, pos), key=lambda x: count_word(x, pos))
        else:
            return max(
                chain.from_iterable(
                    wn._morphy(word, pos) for pos in {'n', 'v', 'a', 'r'}),
                key=lambda x: count_word(x))
    except ValueError:
        return word
def stem_word(word):
    # Get the word into its basic form, e.g. going -> go, greatest -> great.
    # It is a rather trivial way to do it.
    stem = wn._morphy(word, ADJ)
    if stem != [] and stem[-1] != word:
        return stem[-1]
    stem = wn._morphy(word, VERB)
    if stem != [] and stem[0] != word:
        return stem[0]
    stem = wn._morphy(word, NOUN)
    if stem == []:
        return word
    return stem[-1]
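# A quick, hedged check of the behaviour described in the comment above, assuming
# ADJ/VERB/NOUN are nltk.corpus.wordnet's POS constants and wn is that module:
from nltk.corpus import wordnet as wn
ADJ, VERB, NOUN = wn.ADJ, wn.VERB, wn.NOUN

print(stem_word('going'))     # 'go'    (found via the VERB lookup)
print(stem_word('greatest'))  # 'great' (found via the ADJ exception list)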
def getBaseFormNLTK(self, token, pos):
    baseForm = ''
    baseList = wn._morphy(token, pos)
    if len(baseList) != 0:
        for base in baseList:
            if base in myGlobal.wordDict:
                baseForm = base
    return baseForm
def createWordVariants(self, pos_tagged_s):
    for w, pos in pos_tagged_s:
        # print "Extracting roots for ", w, " with pos ", pos
        wn_tag = self.getWordnetTagFromPosTag(pos)
        if wn_tag is not None:
            # print "Considering word to morphy ", w, "pos ", pos
            self.root_word_variants[w] = wn._morphy(w.lower(), wn_tag)
        else:
            self.root_word_variants[w] = [wnl.lemmatize(w.lower())]
def _find_target_word_idx(tokens: List[str], word_forms: Set):
    for idx, token in enumerate(tokens):
        token_lower = token.lower()
        lemmas = {
            lemma
            for pos in ['v', 'n', 'a', 'r']
            for lemma in wn._morphy(token_lower, pos)
        }
        if lemmas.intersection(word_forms) or token_lower in word_forms:
            return idx
    raise ValueError(f"Target word was not found {tokens}\n{word_forms}")
def ManageSentence(org_sentence, sentence, T):
    for word, line_num in org_sentence:
        roots = set(wordnet._morphy(word.lower(), wordnet.NOUN) +
                    wordnet._morphy(word.lower(), wordnet.VERB))
        if not roots:
            roots.add(word.lower())
        for root in roots:
            if root in T:
                break
        else:
            sys.exit("error: missing rule for '" + word + "' (or wrong input form)")
        if {word} != roots:
            print(word, 'STRING', ' '.join(roots), line_num)
        else:
            print(word, 'STRING', line_num)
        sentence.append(roots)
    print('ENDFILE\n\n')
def stem(phrase, stem_map):
    phrase_array = phrase.split(' ')
    word = phrase_array[len(phrase_array) - 1]
    stemmed_word = wn._morphy(word, 'n')
    if len(stemmed_word) > 0:
        stemmed_word = stemmed_word[len(stemmed_word) - 1]
        if stemmed_word in stem_map:
            word = stem_map.get(stemmed_word)
        else:
            stem_map[stemmed_word] = word
        phrase_array[len(phrase_array) - 1] = word
        return ' '.join(phrase_array)
    else:
        return phrase
def lemmatize(word, tokens_dict, order=0):
    pos_tag = get_pos(word, tokens_dict)
    poss_lemm = wn._morphy(word, pos_tag)
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] == "VBD":
            for w in poss_lemm:
                if w != word:
                    return w
        print(poss_lemm, tokens_dict[word])
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word, pos_tag)
        return lemmatized
def lemmatize(self, word: str, pos: str = "n") -> str:
    """Lemmatize `word` using WordNet's built-in morphy function.

    Returns the input word unchanged if it cannot be found in WordNet.

    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :type pos: str
    :return: The lemma of `word`, for the given `pos`.
    """
    lemmas = wn._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
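# A minimal usage sketch of the underlying wn._morphy call, assuming the WordNet
# corpus has been downloaded (nltk.download('wordnet')); the example words are
# illustrative only.
from nltk.corpus import wordnet as wn

print(wn._morphy('dogs', 'n'))    # ['dog']   - regular plural, handled by detachment rules
print(wn._morphy('geese', 'n'))   # ['goose'] - irregular plural, handled by the exception list
print(wn._morphy('asdfgh', 'n'))  # []        - unknown words yield an empty list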
def is_too_sim(word, concept, t=0.30):
    for stem in nlwn._morphy(word, pos='n'):
        s = word_concept_sim(stem, concept)
        if s > t:
            return True
        synset = nlwn.synset(concept)
        for ss in nlwn.synsets(stem, pos='n'):
            if synset in features.all_hypernyms(ss):
                return True
    return False
def check_remove(data, tok, word_range, which_type="all"):
    gen_fam_term = ["father", "mother", "son", "daughter", "husband", "wife",
                    "brother", "sister", "grandfather", "grandmother", "grandson",
                    "granddaughter", "uncle", "aunt", "nephew", "niece"]
    gen_term = ["female", "male", "woman", "man", "girl", "boy"]
    pro_lst = ["he", "she", "him", "her", "his", "hers", "himself", "herself"]
    result = []
    for cluster in word_range:
        if which_type == "name":
            # Check if the cluster has a name link.
            # Check all the instances of human names in the sentence and build "name_lst".
            name_lst = []
            for sent_chunk in ne_chunk(pos_tag(tok)):
                if hasattr(sent_chunk, 'label'):
                    if sent_chunk.label() == "PERSON":
                        name_lst.append(' '.join(c[0] for c in sent_chunk))
                        # (print("TESTING", c[0]) for c in sent_chunk)
            result.append(any([(' '.join(w for w in tok[c[0]:c[1] + 1])) in name_lst
                               for c in cluster]))
        elif which_type == "pro":
            # Check if the cluster has only pronoun links.
            result.append(all([(c[0] == c[1]) and tok[c[0]].lower() in pro_lst
                               for c in cluster]))
        elif which_type == "term":
            # Check if the cluster has a gendered term.
            for c in cluster:
                for i in c:
                    word_disam = lesk(tok, tok[i], 'n')
                    # Check the definition assigned by word sense disambiguation:
                    # if the word is a valid English word, check whether it is a person
                    # word and whether its definition carries a gendered meaning.
                    if (word_disam is not None) and (word_disam.lexname() == "noun.person"):
                        # For now look at all nouns in the range; with dependency parsing
                        # this could be restricted to the head noun.
                        result.append(any(
                            [wn_lem.lemmatize(w) in (gen_fam_term + gen_term + pro_lst)
                             and (x in (gen_fam_term + gen_term + pro_lst)
                                  for x in wn._morphy(w, wn.NOUN))  # checks all possible morphological forms
                             for w in word_tokenize(word_disam.definition())]
                            + [tok[i] in (gen_fam_term + gen_term + pro_lst)]))
                    else:
                        result.append(False)
            else:
                continue
        else:
            # Check all conditions at the same time.
            result.append(any([check_remove(data, tok, word_range, which_type="name"),
                               check_remove(data, tok, word_range, which_type="pro"),
                               check_remove(data, tok, word_range, which_type="term")]))
    return any(result)
def normalize(word):
    """
    normalize(word)

    Correctly normalizes a word and returns the normalized form.
    word must be ['word', 'PoS'], with the PoS in nltk format; the result is a
    normalized string 'word'.
    """
    w = word[0]
    if word[1] in NOZMALIZED:
        return w
    else:
        morphy = wordnet._morphy(w, tag_to_wn[word[1][0]])
        if w in morphy and len(morphy) > 1:
            morphy.remove(w)
        return morphy[0] if morphy else w
def _get_info(lemma, pos, info_type):
    results = dict()
    wn_pos = WORDNET_POS[pos] if pos is not None else None
    morphemes = wn._morphy(lemma, pos=wn_pos) if pos is not None else []
    for i, synset in enumerate(set(wn.synsets(lemma, pos=wn_pos))):
        sense_key = None
        for l in synset.lemmas():
            if l.name().lower() == lemma.lower():
                sense_key = l.key()
                break
            elif l.name().lower() in morphemes:
                sense_key = l.key()
        assert sense_key is not None
        results[sense_key] = synset.examples() if info_type == 'examples' else synset.definition()
    return results
def lemmatize(word, tokens_dict, order=0):
    pos_tag = get_pos_wn(word, tokens_dict)
    if pos_tag:
        poss_lemm = wn._morphy(word, pos_tag)
    else:
        poss_lemm = ""
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] in "VBD VBN":
            for w in poss_lemm:
                if w != word:
                    return w
        # print(poss_lemm, tokens_dict[word])
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word)
        return lemmatized
def preprocess(inputfile):
    lemmatizer = WordNetLemmatizer()
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    rows = []
    line = [x.strip().split('\t') for x in inputfile.readlines()[1:]]
    for l in line:
        if l[3].isalpha():
            if l[3] not in verbs:
                if l[2] not in string.punctuation:
                    l[2] = lemmatizer.lemmatize(l[2].lower())
                    rows.append(l)
            else:
                l[2] = wn._morphy(l[2].lower(), pos='v')
                lemmatized = str(l[2]).strip("['").strip("']")
                l[2] = lemmatized
                rows.append(l)
    cols = ["word nº", "sentence nº", "word", "POS tag", "entity type"]
    return pd.DataFrame(rows, columns=cols)
def lemmatize(word, tokens_dict, order=0):
    """(str, dict, int) -> str

    Return the lemmatized word using the dict of tokens and the order of the
    word in the sentence.
    """
    pos_tag = get_pos_wn(word, tokens_dict)
    if pos_tag:
        poss_lemm = wn._morphy(word, pos_tag)
    else:
        poss_lemm = ""
    if len(poss_lemm) == 1:
        return poss_lemm[0]
    else:
        if tokens_dict[word][order] in "VBD VBN":
            for w in poss_lemm:
                if w != word:
                    return w
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized = wordnet_lemmatizer.lemmatize(word)
        return lemmatized
def get_morphy(self, lemma, check_exceptions=True):
    morphy_list = [
        form
        for p in self.pos
        for form in wn._morphy(lemma, p, check_exceptions)
    ]
    return set(morphy_list)
def lemmatize(self, word, pos=NOUN):
    lemmas = _wordnet._morphy(word, pos)
    if not lemmas:
        return word
    lemmas.sort(key=len)
    return lemmas[0]
def lemmatize(self, word, pos=NOUN):
    lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else word
sentences = []
s = []
sample = open(f"./data/{poem}.txt", "r")
s = sample.read()

# Replace the newline escape character with a space
f = s.replace("\n", " ")

# Iterate through each sentence in the file
for i in sent_tokenize(f):
    # Tokenize the sentence into words
    for j in word_tokenize(i):
        if j.isalnum() and not j.lower() in stop_words and not j.lower() in ignore_words:
            j = j.lower()
            stems = wn._morphy(j, wn.NOUN)
            if len(stems) >= 1:
                j = stems[0]
            # if len(stems) == 1:
            #     print(stems)
            if j not in ignore_words:
                temp.append(j)
                df_dict['word'].append(j)
                df_dict['year'].append(
                    pub_year_df.loc[pub_year_df['poem_title'] == poem, 'pub_year'].values[0])
                df_dict['poem_title'].append(poem)

df = pd.DataFrame.from_dict(df_dict)
uniques = df['word'].unique().tolist()
print(f"df length: {len(df)}")
def main():
    if len(sys.argv) < 2:
        print('Specify an input plain text file!')
        return

    from nltk import download

    inputFile = sys.argv[1]
    if inputFile == '--help':
        download('tagsets')
        from nltk.help import upenn_tagset
        print(upenn_tagset())
        return

    with open(inputFile, 'r') as f:
        texts = f.read()
    texts = texts.split('\n')

    # Process texts
    print('Got texts:', len(texts))

    # Tokenize texts
    print('Tokenizing texts...')
    download('punkt')
    from nltk.tokenize import word_tokenize
    words = set()
    for text in texts:
        text_words = word_tokenize(text)
        words.update(text_words)

    print('Normalization...')
    import re
    # Normalize words (remove punctuation etc)
    words = map(lambda w: re.sub(r'[^a-zA-Z\-\'\`\"]', '', w), words)
    # Filter out unnecessary stuff
    words = filter(lambda w: len(w) > 2 and len(w) <= 45, words)
    words = set(words)
    print('All filtered words:', len(words))

    # # Filter out names words
    # from nltk.tag import StanfordNERTagger
    # st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                        'stanford-ner-2018-10-16/stanford-ner.jar')
    # tagged = st.tag(words)
    # tagged_words_common = filter(lambda tag: tag[1] == 'O', tagged)
    # words = map(lambda tag: tag[0].lower(), tagged_words_common)
    # words = set(words)
    # print('Filtered common words:', len(words))
    # # Print names words (just for info)
    # tagged_names = list(filter(lambda tag: tag[1] != 'O', tagged))
    # print('Names words:', len(tagged_names))
    # print(tagged_names)

    # Cast to lowercase
    words = set(map(str.lower, words))

    print('Processing words...')
    # Stemming, to use stems as keys
    download('stopwords')
    from nltk.stem import SnowballStemmer
    snowball_stemmer = SnowballStemmer('english', ignore_stopwords=True)
    # Lemmatization, to use lemmas as base words
    download('wordnet')
    from nltk.corpus.reader import wordnet as wnr
    from nltk.corpus import wordnet
    # POS tagger
    download('averaged_perceptron_tagger')
    from nltk import pos_tag

    # Process words
    keyed_words = {}
    tagged_words = pos_tag(words)

    def tagged2str(word, tag):
        return '{} ({})'.format(word, tag)

    for word, tag in tagged_words:
        stem = snowball_stemmer.stem(word)
        str_word = tagged2str(word, tag)
        if stem in keyed_words:
            keyed_words[stem] += (str_word, )
        else:
            pos = wnr.VERB if tag in {'VB', 'VBP', 'VBZ', 'VBN', 'VBG', 'VBD', 'VERB'} else wnr.NOUN
            lemmas = wordnet._morphy(word, pos=pos)
            lemma = tagged2str(min(lemmas, key=len), pos) if lemmas else str_word
            keyed_words[stem] = (lemma, str_word)

    print('Keys count:', len(keyed_words))
    print('Writing file ', OUTFILE)

    def mapToCSVLine(item):
        return ','.join(map(str, (item[0], ) + item[1])) + '\n'

    # Output to file
    with open(OUTFILE, 'w') as f:
        f.write(mapToCSVLine(('STEM', ('LEMMA', 'FORMS...'))))  # Header
        for key in sorted(keyed_words):
            f.write(mapToCSVLine((key, keyed_words[key])))

    print('Done! Use "{} --help" to know what the abbreviations mean.'.format(sys.argv[0]))
def lemmatize(self, word, pos=None, multiple_pos_calls=[NOUN], print_results=False):
    # Bingham updates:
    #   Added alternative default (pos=None) processing.
    #   Added print results option.
    if print_results:
        dash = '-' * 50
        hdr_template = '{:<20s}{:>10s}{:>10s}{:>8s}'
        data_template = '{:<20s}{:>10d}{:>10d}{:>8.2f}'
        total_template = '{:<20s}{:>20d}'
        print(dash)
        print(hdr_template.format('Type', 'Number', 'Hits', '% Total'))
        print(dash)
        print(data_template.format('snapwords', len(self._snap_words), self._snap_count,
                                   100 * round(self._snap_count / self._call_count, 3)))
        print(data_template.format('local dictionary', len(self._dictionary), self._dictionary_count,
                                   100 * round(self._dictionary_count / self._call_count, 3)))
        print(data_template.format('remembered words', len(self._word_memory), self._memory_count,
                                   100 * round(self._memory_count / self._call_count, 3)))
        print(data_template.format('* not cached *', 0, len(self._word_memory),
                                   100 * round(len(self._word_memory) / self._call_count, 3)))
        print()
        print(total_template.format('Total', self._call_count))
        return

    self._call_count += 1

    # Always try to find the word in a cached area (snapwords, the local
    # dictionary, and the word memory) before calling WordNet to look it up.
    # All of these areas are optional.

    # Look in snapwords. Snapwords are snapped back (i.e. they are high-frequency
    # words for which the word and the lemma are the same).
    if word in self._snap_words:
        self._snap_count += 1
        return word

    # Build a global key (no pos attached). There might be a global override in
    # the local dictionary.
    key = word + '__'
    if key in self._dictionary:
        self._dictionary_count += 1
        return self._dictionary[key]

    # Initialize the call order. If multiple pos calls have been set, call WordNet
    # in the order provided as an argument until a match is found. Otherwise set
    # up to call WordNet with the pos of noun.
    wordnet_call_order = []
    if pos is None:
        wordnet_call_order = multiple_pos_calls
    else:
        wordnet_call_order.append(pos)

    for call_pos in wordnet_call_order:
        # Build a key for the dictionary and word memory.
        key = word + '_' + call_pos

        # Look in the dictionary for the specific pos.
        if key in self._dictionary:
            self._dictionary_count += 1
            return self._dictionary[key]

        # Look in the word memory for the specific pos.
        if key in self._word_memory:
            self._memory_count += 1
            return self._word_memory[key]

        # No match has been found in the cache. Call WordNet.
        lemmas = wordnet._morphy(word, call_pos)

        # If WordNet returned a lemma, do not try any other pos.
        if lemmas:
            break

    # Remember this word/pos/lemma if the option is set. If WordNet did not find
    # the word, return the word as its own lemma. If found, return the shortest
    # lemma in the returned list.
    if self._remember_words:
        if lemmas:
            self._word_memory[key] = min(lemmas, key=len)
        else:
            self._word_memory[key] = word
    if lemmas:
        return min(lemmas, key=len)
    else:
        return word
#!/usr/bin/env python3
import fileinput

from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet

for line in fileinput.input():
    line = line.strip().lower()
    if not line:
        continue
    lemmas = None
    for pos in ['a', 's', 'r', 'n', 'v']:
        ans = wordnet._morphy(line, pos)
        if ans:
            lemmas = ans
    if lemmas:
        print("COVERED", lemmas)
    else:
        print("NOTCOVERED", line)
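# A hypothetical invocation of the coverage script above (the file name
# check_coverage.py is an assumption, not part of the original). 'corpora'
# resolves through WordNet's noun exception list; a nonsense token does not:
#
#   $ printf 'corpora\nblorptastic\n' | python3 check_coverage.py
#   COVERED ['corpus']
#   NOTCOVERED blorptastic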
        cached[category][syn.name()] = result
        return result


# Iterate over the relations and get what percentage of the nodes they connect are physical
with open('datasets/simplified_english_conceptnet.csv', 'r') as inf:
    for line in inf:
        node_1, relation, node_2 = line.split()[0:3]
        tokens = [relation]
        for node in [node_1, node_2]:
            label = 'NOT_WN'
            if len(node.split('_')) < 2:
                new_node = ''
                if node not in wn_lemmas:
                    nouns = wordnet._morphy(node, wordnet.NOUN)
                    if len(nouns) > 0:
                        new_node = nouns[0]
                    else:
                        verbs = wordnet._morphy(node, wordnet.VERB)
                        if len(verbs) > 0:
                            new_node = verbs[0]
                        else:
                            adjs = wordnet._morphy(node, wordnet.ADJ)
                            if len(adjs) > 0:
                                new_node = adjs[0]
                    if new_node in wn_lemmas:
                        node = new_node
                if node in wn_lemmas:
                    in_category = False
def lemmatize(word, pos='n'):
    from nltk.corpus import wordnet
    lemmas = wordnet._morphy(word, pos)
    return min(lemmas, key=len) if lemmas else None
def verb_vocab(tcm=None, postagger=None, min_length=0):
    """
    Return all verbs found in wordnet in various inflected forms.
    """
    if not postagger:
        postagger = BackoffTagger.from_pickle()

    getpostag = lambda word: postagger.tag([word])[0][1]

    # Most of the time lexeme() returns 4 or 5 words, inflected as declared below.
    # To avoid assumptions about the tagset used, we query the tags using easy
    # examples (the verb give). These POS tags are then bound to lexeme's results.
    infinitive_pos = getpostag("give")
    present_pos = getpostag("gives")
    pres_prog_pos = getpostag("giving")
    past_pos = getpostag("gave")
    past_prog_pos = getpostag("given")

    # Three possibilities for the tense list, depending on how many variations
    # a verb has.
    tenses3 = [infinitive_pos, present_pos, pres_prog_pos]
    tenses4 = tenses3 + [past_pos]
    tenses5 = tenses4 + [past_prog_pos]

    verbs = set()

    for lemma in wn.all_lemma_names(pos='v'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue

        forms = lexeme(lemma)  # all possible conjugations of this verb (lemma)

        if len(forms) == 3:
            forms = zip(forms, tenses3)
        elif len(forms) == 4:
            forms = zip(forms, tenses4)
        elif len(forms) == 5:
            forms = zip(forms, tenses5)
        else:
            # This step can introduce errors, as getpostag isn't guaranteed
            # to return a verb tag.
            forms = [(form, getpostag(form)) for form in forms]

        # Ignore forms that do not map back to the lemma by wordnet's
        # lemmatizer, as they are likely erroneous.
        forms = list(filter(lambda form: lemma in wn._morphy(form[0], 'v'), forms))

        if tcm is not None:
            classes = [classy for syn in wn.synsets(lemma, 'v') for classy in tcm.predict(syn)]
        else:
            classes = [syn.name() for syn in wn.synsets(lemma, 'v')]

        for classy in classes:
            for form, postag in forms:
                if not postag:
                    log.warning("{} has POS==None".format(form))
                    continue
                if postag[0] == 'n':
                    # dirty hack to avoid inconsistency introduced by the postagger
                    continue
                verbs.add((form, postag, classy))
                if "'" in form:
                    # remove ' (couldn't -> couldnt)
                    verbs.add((form.replace("'", ""), postag, classy))

    return verbs
def get_wordnet_relation(target: str, subst: str, pos: Optional[str] = None) -> str:
    """
    Finds the WordNet relation between a target word and a substitute by analyzing
    their synsets. Optionally one can specify the pos tag of the target word for
    more robust analysis.

    Args:
        target: target word
        subst: substitute
        pos: pos tag of the target word

    Returns:
        WordNet relation between the target word and a substitute.
    """
    if pos:
        pos = pos.lower()
    if pos is None:
        pos = wn.NOUN

    if len(subst.split(" ")) > 1:
        return Relation.mwe.name

    if target == subst:
        return Relation.same.name

    if set(wn._morphy(target, pos)).intersection(set(wn._morphy(subst, pos))):
        return Relation.target_form.name

    target_synsets = get_synsets(target, pos=pos)
    subst_synsets = get_synsets(subst, pos=pos)
    if len(subst_synsets) == 0:
        return Relation.unknown_word.name

    target_lemmas = {lemma for ss in target_synsets for lemma in ss.lemma_names()}
    subst_lemmas = {lemma for ss in subst_synsets for lemma in ss.lemma_names()}
    if len(target_lemmas.intersection(subst_lemmas)) > 0:
        return Relation.synonym.name

    if subst in get_similar_tos(target, pos):
        return Relation.similar_to.name

    tgt_sense, sbt_sense = find_nearest_synsets(target_synsets, subst_synsets, pos)
    if tgt_sense is None or sbt_sense is None:
        return Relation.no_path.name

    extract_name = lambda synset: synset.name().split(".")[0]
    tgt_name, sbt_name = extract_name(tgt_sense), extract_name(sbt_sense)

    target_holonyms = get_holonyms(tgt_sense)
    target_meronyms = get_meronyms(tgt_sense)
    if sbt_name in {lemma for ss in target_holonyms for lemma in ss.lemma_names()}:
        return Relation.holonym.name
    if sbt_name in {lemma for ss in target_meronyms for lemma in ss.lemma_names()}:
        return Relation.meronym.name

    target_entailments = {lemma for ss in tgt_sense.entailments() for lemma in ss.lemma_names()}
    if sbt_name in target_entailments:
        return Relation.entailment.name

    subst_entailments = {lemma for ss in sbt_sense.entailments() for lemma in ss.lemma_names()}
    if tgt_name in subst_entailments:
        return Relation.anti_entailment.name

    for common_hypernym in tgt_sense.lowest_common_hypernyms(sbt_sense):
        tgt_hyp_path = tgt_sense.shortest_path_distance(common_hypernym)
        sbt_hyp_path = sbt_sense.shortest_path_distance(common_hypernym)
        if tgt_hyp_path == 1 and sbt_hyp_path == 0:
            return Relation.direct_hypernym.name  # substitute is a hypernym of target
        elif tgt_hyp_path == 0 and sbt_hyp_path == 1:
            return Relation.direct_hyponym.name
        elif tgt_hyp_path > 1 and sbt_hyp_path == 0:
            return Relation.transitive_hypernym.name
        elif tgt_hyp_path == 0 and sbt_hyp_path > 1:
            return Relation.transitive_hyponym.name
        elif tgt_hyp_path == 1 and sbt_hyp_path == 1:
            return Relation.co_hyponym.name
        elif max(tgt_hyp_path, sbt_hyp_path) <= 3:
            return Relation.co_hyponym_3.name

    return Relation.unknown_relation.name
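# A small, hedged illustration of the "target_form" check above, using only
# nltk.corpus.wordnet (the word pair is illustrative): two inflections of the
# same verb map back to the same lemma, so their _morphy sets intersect.
from nltk.corpus import wordnet as wn

target_forms = set(wn._morphy('gave', 'v'))   # {'give'}
subst_forms = set(wn._morphy('gives', 'v'))   # {'give'}
print(bool(target_forms & subst_forms))       # True -> classified as Relation.target_form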
def morphy2(noun, pos=nlwn.NOUN):
    noun = noun.replace(' ', '_').lower()
    return [
        n for n in nlwn._morphy(noun, pos)
        if len(nlwn.synsets(n, pos)) > 0 and len(n) > 0
    ]
def my_lemmatize(self, word, pos=NOUN):
    lemmas = wordnet._morphy(word, pos)
    return (min(lemmas, key=len), True) if lemmas else (word, False)