def lowest_common_hypernym(fr):
    """
    Returns the lowest common hypernym of the two mentions (based on WordNet).
    Again assuming that the last word = head word, and that it represents the
    phrase. Also considering only the first sense.
    """
    try:
        i_final = wn.morphy(re.sub(r"\W", r"", fr.i_token.split('_')[-1]))
        j_final = wn.morphy(re.sub(r"\W", r"", fr.j_token.split('_')[-1]))
        if i_final is None or j_final is None:
            return "lowest_common_hypernym={}".format(False)
        if _is_pronoun(i_final) or _is_pronoun(j_final):
            return "lowest_common_hypernym={}".format(False)
        i_synsets = wn.synsets(i_final)
        j_synsets = wn.synsets(j_final)
        lowest_common_hypernym = i_synsets[0].lowest_common_hypernyms(j_synsets[0])[0]
        return "lowest_common_hypernym={}".format(lowest_common_hypernym)
    except wn_error:
        return "lowest_common_hypernym={}".format(False)
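# Illustrative sketch (added; not part of the original feature function): the core
# WordNet call behind lowest_common_hypernym, applied to two first-sense synsets.
# Assumes `from nltk.corpus import wordnet as wn` and a downloaded WordNet corpus;
# the example words are arbitrary.
def _demo_lowest_common_hypernym(word_a='dog', word_b='cat'):
    from nltk.corpus import wordnet as wn
    synset_a = wn.synsets(word_a)[0]
    synset_b = wn.synsets(word_b)[0]
    # lowest_common_hypernyms returns a list; the feature above keeps its first element
    return synset_a.lowest_common_hypernyms(synset_b)[0]  # e.g. Synset('carnivore.n.01')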
def get_similarity(self, word1, word2):
    """Compute word similarity based on the WordNet lexical database."""
    # stem each word with morphy, falling back to the original form
    if wn.morphy(word1.lower()) != None:
        word1 = wn.morphy(word1.lower())
    if wn.morphy(word2.lower()) != None:
        word2 = wn.morphy(word2.lower())
    word1_synsets = wn.synsets(word1)
    word2_synsets = wn.synsets(word2)
    # keep the best path similarity over all synset pairs
    sim = 0
    for syn1 in word1_synsets:
        w1 = wn.synset(syn1.name())
        for syn2 in word2_synsets:
            w2 = wn.synset(syn2.name())
            tmp = w1.path_similarity(w2)
            if tmp > sim:
                sim = tmp
    return sim
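# Hedged usage sketch (added for illustration; the class that owns get_similarity is
# not shown in this snippet, so the standalone function below is an assumption). It
# reproduces the max-over-synset-pairs path similarity for a concrete word pair,
# guarding against path_similarity returning None.
def _demo_path_similarity(word1='dog', word2='cat'):
    from nltk.corpus import wordnet as wn
    best = 0
    for s1 in wn.synsets(wn.morphy(word1) or word1):
        for s2 in wn.synsets(wn.morphy(word2) or word2):
            sim = s1.path_similarity(s2)
            if sim is not None and sim > best:
                best = sim
    return best  # dog/cat typically score around 0.2 under path similarity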
def expand_queries(file):
    '''
    For each term in a query, takes the first synset of the word from wordnet
    and adds all synonyms of that synset
    '''
    file = open(file)
    for sentence in file:
        sentence = sentence.strip()
        if sentence.find('<text>') != -1:
            query = sentence[sentence.find('>')+1: sentence.rfind('<')]
            additions = ''
            updated_q = nltk.pos_tag(nltk.wordpunct_tokenize(query.lower()))
            full_q = query
            for word, pos in updated_q:
                if word not in stopwords.words('english'):
                    looking_for = str(word) + '.' + str(get_wordnet_pos(pos)) + '.01'
                    synsets = wn.synsets(word)
                    if looking_for in str(synsets):
                        new_words = (wn.synset(looking_for).lemma_names)  # was .definition
                        for new_word in new_words:
                            if new_word.lower() != word.lower():
                                full_q = full_q + ' ' + str(new_word)
                    else:
                        if wn.morphy(word) != None:
                            word = wn.morphy(word)
                            looking_for = str(word) + '.' + str(get_wordnet_pos(pos)) + '.01'
                            print str(looking_for) + ' THIS IS WORD'
                            synsets = wn.synsets(word)
                            if looking_for in str(synsets):
                                new_words = (wn.synset(looking_for).lemma_names)  # was .definition
                                for new_word in new_words:
                                    if new_word.lower() != word.lower():
                                        full_q = full_q + ' ' + str(new_word)
            print query + ' ' + full_q
def preprocessWords(lst):
    index = 0
    while index < len(lst):
        word = lst[index].lower()
        if word not in reservedWordList:
            # special handling from java code
            if word == 'financial':
                lst[index] = 'finance'
            # avoid _id is a word in dscrp
            if word == '_id':
                lst[index] = 'id'
            # only VERB and NOUN are saved; unclear whether wn.morphy has many
            # part-of-speech stems, which it returns as wn.morphy(word)
            if wn.morphy(word, wn.VERB) or wn.morphy(word, wn.NOUN):
                if wn.morphy(word) != word:
                    lst[index] = wn.morphy(word)
                    word = lst[index]
            elif wn.morphy(PorterStemmer().stem_word(word)):
                lst[index] = PorterStemmer().stem_word(word)
                word = lst[index]
            else:
                del lst[index]
                continue
        if len(word) == 1 or word in stopWordList or word.isdigit():
            del lst[index]
            continue
        index += 1
    return lst
def subclass(feats):
    if string_match(feats).endswith("False"):
        try:
            result = False
            i_clean = wn.morphy(feats.i_cleaned.lower(), wn.NOUN)
            i_synsets = wn.synsets(i_clean)
            j_clean = wn.morphy(feats.j_cleaned.lower(), wn.NOUN)
            j_synsets = wn.synsets(j_clean)

            def get_common_hypernym(i_synset, j_synset):
                i_hypernyms = i_synset.hypernyms()
                j_hypernyms = j_synset.hypernyms()
                if len(i_hypernyms) == 0:
                    i_synset = i_synset.instance_hypernyms()[0]
                if len(j_hypernyms) == 0:
                    j_synset = j_synset.instance_hypernyms()[0]
                subc = i_synset.common_hypernyms(j_synset)
                return (i_synset in subc) or (j_synset in subc)

            for synset in i_synsets:
                for syn in j_synsets:
                    result = get_common_hypernym(synset, syn)
                    if result:
                        break
                if result:
                    break
            return "subclass={}".format(result)
        except:  # wn_error
            return "subclass={}".format(False)
    else:
        return "subclass={}".format(False)
def same_hypernym(fr):
    """
    True if the two mentions have the same hypernym in WordNet.
    In multiword mentions, considering only the last word (I'm assuming
    last word = head). Not considering pronouns.
    Most of the logic was borrowed from Julia's WN function in the coref
    project - thank you.
    """
    try:
        i_final = wn.morphy(re.sub(r"\W", r"", fr.i_token.split('_')[-1]))
        j_final = wn.morphy(re.sub(r"\W", r"", fr.j_token.split('_')[-1]))
        if i_final is None or j_final is None:
            return "same_hypernym={}".format(False)
        if _is_pronoun(i_final) or _is_pronoun(j_final):
            return "same_hypernym={}".format(False)
        i_synsets = wn.synsets(i_final)
        j_synsets = wn.synsets(j_final)
        for i_synset in i_synsets:
            i_hypernym_set = set(i_synset.hypernyms())
            for j_synset in j_synsets:
                j_hypernym_set = set(j_synset.hypernyms())
                if i_hypernym_set.intersection(j_hypernym_set):
                    return "same_hypernym={}".format(True)
        return "same_hypernym={}".format(False)
    except wn_error:
        return "same_hypernym={}".format(False)
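# Illustrative sketch (added; not from the original): the direct-hypernym
# intersection test that same_hypernym performs, on two plain nouns. Word choices
# are arbitrary examples.
def _demo_same_hypernym(word_a='dog', word_b='wolf'):
    from nltk.corpus import wordnet as wn
    hyps_a = {h for s in wn.synsets(word_a) for h in s.hypernyms()}
    hyps_b = {h for s in wn.synsets(word_b) for h in s.hypernyms()}
    return bool(hyps_a & hyps_b)  # True when any sense pair shares a direct hypernym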
def _wnbase(self):
    if self.postag == 'n':
        return wn.morphy(self.lemma, wn.NOUN)
    elif self.postag == 'v':
        return wn.morphy(self.lemma, wn.VERB)
    elif self.postag == 'a':
        return wn.morphy(self.lemma, wn.ADJ)
    return None
def ApplyBNB(doc_tokens, classes_postings, condprob, prior, vocabulary, selected_features):
    # Assumes global dictionaries defined: stop_words, names, negation_words
    global stop_words, names, negation_words
    scores = dict()
    for c in classes_postings:
        scores[c] = 0  # math.log(prior[c])
        negation_found = False
        adverb_found = False
        adverb_condprob = 0.0
        doc_features = []
        for t in doc_tokens:
            t = t.lower()
            if constants.LA and t in negation_words:
                negation_found = True
                continue
            if t in stop_words:
                continue
            if t in names:
                continue
            isAdj = wn.morphy(t, wn.ADJ) is not None
            isNoun = wn.morphy(t, wn.NOUN) is not None
            isVerb = wn.morphy(t, wn.VERB) is not None
            isAdv = wn.morphy(t, wn.ADV) is not None
            if constants.LA and negation_found:
                negation_found = False
                continue
            t = process_word(t)
            if t not in vocabulary:
                continue
            if constants.FEATURE_SELECTION is not None and t not in selected_features[c]:
                continue
            doc_features.append(t)
        vocab = vocabulary
        if constants.FEATURE_SELECTION is not None:
            vocab = selected_features[c]
        for t in vocabulary:
            if t in doc_features:
                scores[c] += math.log(condprob[t][c])
            else:
                scores[c] += math.log(1.0 - condprob[t][c])
    diff = math.fabs(scores["0"] - scores["1"])
    return (scores, diff)
def getRoot(w, tag=False):
    if tag == False:
        for tag in ['v', 'n', 'a', 'r']:
            r = wordnet.morphy(w, tagequiv(tag))
            if r:
                return r
        return w
    try:
        return wordnet.morphy(w, tag)
    except:
        return w
def get_synonyms_as_set(input_word):
    if input_word is None:
        return set()
    synonyms = set()
    synSets = wn.synsets(input_word)
    for syn in synSets:
        for lemma_name in syn.lemma_names():
            if wn.morphy(lemma_name) is not None:
                synonyms.add(str(wn.morphy(lemma_name).encode('utf-8').decode('ascii', 'ignore')))
    return synonyms
def getGroup(count, word, threshold, wordsSeen, groups):
    word = "".join(l for l in word if l not in string.punctuation)
    best = 0
    group = word
    # search for existing
    if(wordsSeen.has_key(word)):
        return wordsSeen.get(word)
    # get synset of word
    if(wn.synsets(word)):
        wordSyn = wn.synsets(word)[0]
    elif(wn.morphy(word)):
        # fall back to the first synset of the morphy base form
        wordSyn = wn.synsets(wn.morphy(word))[0]
    else:
        # no synset; use word
        wordsSeen.update({word: group})
        if(groups.has_key(group)):
            newValue = groups.get(group)
            newValue.update([word])
            groups.update({group: newValue})
        else:
            newValue = set()
            newValue.update([word])
            groups.update({group: newValue})
        wordsSeen.update({word: group})
        return word
    # compare to each group
    # is there a way to compare one word to many words?
    for super_word in count.keys():
        # get synset of group being tested against
        comparisons = groups.get(super_word)
        sim = nSim(wordSyn, comparisons)
        if(sim >= threshold and sim > best):
            group = super_word
            best = sim
    wordsSeen.update({word: group})
    if(groups.has_key(group)):
        newValue = groups.get(group)
        newValue.update([word])
        groups.update({group: newValue})
    else:
        newValue = set()
        newValue.update([word])
        groups.update({group: newValue})
    wordsSeen.update({word: group})
    return group
def chunktaged(tokens, tagged, word):
    '''
    Extract the meaningful chunk (phrase) from the sentence.
    Also can be imagined as a phrase detection.

    PARAMETER LIST:
    tokens is a list of the words in the sentence:
        ['I', 'previously', 'booked', 'the', 'nice', 'flight', '.']
    tagged is a list of tuples consisting of word and POS:
        [('I', 'PRP'), ('previously', 'RB'), ('booked', 'VBD'), ('the', 'DT'),
         ('nice', 'JJ'), ('flight', 'NN'), ('.', '.')]
    word is what we look up for: 'booked'

    The return value should be a phrase like 'turn_on' or just the origin word.

    The rules as our knowledge:
    1. consecutive nouns
    2. verb before a preposition
    '''
    word_index = tokens.index(word)
    if (pos_map.has_key(tagged[word_index][1])):
        word_pos = pos_map[tagged[word_index][1]]
    else:
        return word
    if (word_pos == 'VERB' and (wn.morphy(word, wn.VERB) != None)):
        word = wn.morphy(word, wn.VERB)
    elif (word_pos == 'NOUN' and (wn.morphy(word, wn.NOUN) != None)):
        word = wn.morphy(word, wn.NOUN)
    if word_index == len(tokens) - 1:
        return word
    if (pos_map.has_key(tagged[word_index + 1][1])):
        next_word_pos = pos_map[tagged[word_index + 1][1]]
    else:
        return word
    if (word_pos == 'VERB' and next_word_pos == 'PP') or \
       (word_pos == 'NOUN' and next_word_pos == 'NOUN'):
        possible_chunk = word + '_' + tokens[word_index + 1]
        # in case the consecutive Noun is not a phrase
        if wn.synsets(possible_chunk) == []:
            return word
        else:
            return possible_chunk
    else:
        return word
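# Usage sketch based on the docstring example above (added for illustration; it
# assumes the module-level pos_map used by chunktaged maps 'VBD' -> 'VERB',
# 'NN' -> 'NOUN', and so on):
#
#   tokens = ['I', 'previously', 'booked', 'the', 'nice', 'flight', '.']
#   tagged = nltk.pos_tag(tokens)
#   chunktaged(tokens, tagged, 'booked')   # -> 'book' (no verb+preposition chunk here)
#
# The phrase check itself relies on WordNet listing multiword lemmas with underscores:
def _demo_phrase_lookup(candidate='turn_on'):
    from nltk.corpus import wordnet as wn
    return wn.synsets(candidate) != []  # True: 'turn_on' is a WordNet lemma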
def get_roots(sentence):
    roots = []
    for idx, token in enumerate(sentence.clean_tokens):
        if sentence.tokens_pos[idx] == "VB":
            root = wn.morphy(token, wn.VERB)
        else:
            root = wn.morphy(token)
        if root is None:
            root = token
        roots.append(root)
    return roots
def main():
    punIn = raw_input("Pun File: ")  # get it it's a pun on "punning" hah hah
    f = open(punIn, "r")
    for line in f:
        posList = POSCheck(line)  # returns a list of words that stood out in the POS tagging
        hList = homophoneCheck(line)  # returns a list of homophones, along with the original word from the sentence
        print (posList)
        print (hList)
        extText = POSextract(line)  # returns a list with all of the important words extracted
        print (extText)
        hiscore = 0
        highSim = []
        for word in extText:
            for i in range(0, len(hList)):
                hSim = conceptCheck(word, hList[i])
                if hSim == []:
                    continue
                elif hSim[2] > hiscore:
                    highSim = hSim
                    hiscore = highSim[2]
            for a in range(0, len(hList)):
                mword = wn.morphy(word)
                if mword:
                    hMorphSim = conceptCheck(mword, hList[a])
                    if hMorphSim == []:
                        continue
                    elif hMorphSim[2] > hiscore:
                        highSim = hMorphSim
                        hiscore = highSim[2]
                else:
                    break
            for j in range(0, len(posList)):
                pSim = conceptCheck(word, posList[j])
                if pSim == []:
                    continue
                elif pSim[2] > hiscore:
                    highSim = pSim
                    hiscore = highSim[2]
            for b in range(0, len(posList)):
                mword = wn.morphy(word)
                if mword:
                    pMorphSim = conceptCheck(mword, posList[b])
                    if pMorphSim == []:
                        continue
                    elif pMorphSim[2] > hiscore:
                        highSim = pMorphSim
                        hiscore = highSim[2]
                else:
                    break
        print (highSim)
def simple_pos_morphy(self, word):
    """
    Helper function for simpletextprocess function. It doesn't process
    the PoS tagging in the first place.

    @param word: the raw word before morph
    @type word: C{string}
    """
    morphied = wn.morphy(word, wn.NOUN)
    if morphied != None:
        return morphied
    else:
        morphied = wn.morphy(word, wn.VERB)
        if morphied != None:
            return morphied
    return word
def get_antonyms_as_set(input_word):
    if input_word is None:
        return set()
    antonyms = set()
    synonyms = wn.synsets(input_word)
    for syn in synonyms:
        lemmas = syn.lemmas()
        for lem in lemmas:
            for ant in lem.antonyms():
                if wn.morphy(ant.name()) is not None:
                    antonyms.add(str(wn.morphy(ant.name()).encode('utf-8').decode('ascii', 'ignore')))
    return antonyms
def __load_lesk_vector(self, dicty, window_size=100):
    # Generate WordSets of surrounding words
    other_sets = []
    words = self.words_window(window_size)
    for word, pos in words:
        baseword = wn.morphy(word)
        if baseword is not None:
            pos = penn_to_wn(pos)
            synsets = wn.synsets(baseword, pos=pos) if pos is not None else wn.synsets(baseword)
            for synset in synsets:
                other_sets.append(WordSet(synset.definition))
    # Loop through possible wordsets and note counts
    counts = []
    for sense in self.all_senses(dicty):
        curr_set = WordSet(sense.gloss)
        counts.append(curr_set.overlap(other_sets))
    # Normalize and return
    countfirsts = [count[0] for count in counts]
    countfirsts_max = max(countfirsts)
    if countfirsts_max > 0:
        return [float(count) / countfirsts_max for count in countfirsts]
    else:
        return [0.0 for count in countfirsts]
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []
    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])
    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)
                    nltk.download()
                    sys.exit(2)
    return result
def wordnet_formatted(word):
    word = word.lower()
    words = search_wordnet(word)
    response_data = []
    for found_word in words:
        word_data = dict()
        word_data['label'] = found_word.lemma
        word_data['desc'] = str(found_word.definition)
        word_data['sensenum'] = found_word.sensenum
        response_data.append(word_data)
        synsetid = found_word.lemma + '.' + str(found_word.pos) + '.' + str(found_word.sensenum)
        word_data['synset'] = synsetid
    stem = wn.morphy(word)
    if stem is not None and stem != word:
        # print '*' + str(stem) + '*'
        word_stem_words = search_wordnet(stem)
        for found_word in word_stem_words:
            word_data = dict()
            word_data['label'] = found_word.lemma
            word_data['desc'] = str(found_word.definition)
            word_data['sensenum'] = found_word.sensenum
            response_data.append(word_data)
            synsetid = found_word.lemma + '.' + str(found_word.pos) + '.' + str(found_word.sensenum)
            word_data['synset'] = synsetid
    return response_data
def get_words(text):
    text = text.lower()
    words = word_tokenize(text)
    taggedWords = nltk.pos_tag(words)
    word_list = []
    # stemming: map the Penn Treebank tag onto a WordNet POS so morphy gets the
    # right part of speech (verbs -> VERB, adjectives -> ADJ, default NOUN)
    for index, item in enumerate(taggedWords):
        if 'VB' in item[1]:
            pos = wn.VERB
        elif 'JJ' in item[1]:
            pos = wn.ADJ
        else:
            pos = wn.NOUN
        test = wn.morphy(item[0], pos)
        if test is None:
            word_list.append(item[0])
        else:
            word_list.append(test)
    return word_list
def find_candidates(self, property_subtree):
    if not isinstance(property_subtree, ParentedTree):
        raise AttributeError
    candidates = set(self.__get_property_string_forms(property_subtree))
    new_candidates = set()
    for candidate in candidates:
        for label in self.__fetch_from_wikibase(candidate):
            new_candidates.add(label)
    candidates.update(new_candidates)
    new_candidates = set()
    for candidate in candidates:
        new_candidates.update(self.__fetch_synonyms_and_hypernyms(candidate))
    candidates.update(new_candidates)
    new_candidates = set()
    for candidate in candidates:
        for POS in [wordnet.ADJ, wordnet.ADV, wordnet.NOUN, wordnet.VERB]:
            morphy = wordnet.morphy(candidate, POS)
            if morphy is not None:
                new_candidates.add(morphy)
    candidates.update(new_candidates)
    return candidates
def __get_sem_class__(token):
    token = re.sub(r'[`]', '', token)
    per_pronouns = ["she", "he", "they", "you", "we", "i", "them", "her",
                    "him", "us", "who", "whom"]
    if token.lower() in per_pronouns:
        sem_class = "PER"
    else:
        sem_class = "PER"
        # it will crash with NNP that are not GPE, so probably people
        try:
            token_clean = wn.morphy(token.lower(), wn.NOUN)
            token_sense = "" + token_clean + ".n.01"
            token_synset = wn.synset(token_sense)
            for synset in SEM_CLASSES.keys():
                hypernyms = token_synset.hypernyms()
                if len(hypernyms) == 0:
                    # need to get the instance
                    token_synset = token_synset.instance_hypernyms()[0]
                if synset in token_synset.common_hypernyms(synset):
                    sem_class = SEM_CLASSES[synset]
                    break
                else:
                    sem_class = "OTHER"
        except:  # wn_error
            pass
    return sem_class
def __set_has_homework_or_assignment(text: str, replacement_text: str, word_list: list):
    """
    Checks if the text contains synonyms to homework, and replaces words with 'has_homework'

    Arguments:
        text (str): Text to check for "homework words"
        replacement_text (str): Text to replace "homework" words with
        word_list (list): List of words to use as comparison against the text

    Returns:
        str: Text with replaced "homework words", or the original text
    """
    word_set = set()
    tokenized_text = nltk.word_tokenize(text)
    # loop through all the words to see if it contains homework or its synonyms
    for word in tokenized_text:
        word_lem = wordnet.morphy(word, wordnet.NOUN)
        if (word_lem is not None) and (word_lem in word_list):
            word_set.add(word)
    # convert to list and sort based on length
    word_set = list(word_set)
    word_set.sort(key=len, reverse=True)
    # replace those words, if any, with the replacement text
    for word in word_set:
        text = text.replace(word, replacement_text)
    return text
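# Illustrative sketch (added; not from the original source): the lemma check that
# drives the replacement above. wordnet.morphy(word, wordnet.NOUN) maps plural or
# inflected nouns onto the base form that word_list is compared against. The word
# list here is a made-up example.
def _demo_homework_lemma_check(word_list=("homework", "assignment")):
    from nltk.corpus import wordnet
    word = "assignments"
    word_lem = wordnet.morphy(word, wordnet.NOUN)  # -> 'assignment'
    return word_lem is not None and word_lem in word_list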
def guessFromGrammarStructs(meaning_generator):
    # NUM_HAIKUS = 400
    # with open('haikus.json') as haikus_file:
    #     dataset = json.load(haikus_file)
    # pos_counter = tokenize_dataset(dataset, haikus_limit=NUM_HAIKUS)
    for x in xrange(1, 10):
        # pos_tags = []
        # for i in xrange(3):
        #     grammar_struct = pick_random_structure(pos_counter)
        #     print(grammar_struct)
        #     pos_tags += list(grammar_struct)
        #     pos_tags += ['\n']
        pos_tags = ['DT', 'JJ', 'NN', '\n',
                    'NNS', 'VBG', '\n',
                    'JJ', 'JJ', 'NN']
        haiku = ''
        seedWord = 'nature'
        oldWords = [meaning_generator.random_word('NN')]
        for postag in pos_tags:
            lastWord = wordFromPOStag(postag, seedWord, oldWords, meaning_generator)
            oldWords += [wn.morphy(lastWord)]
            haiku += lastWord + ' '
        print(haiku)
        print("")
def document_extraction(self, document):
    features = {}
    words_in_doc = get_tokens(document)
    for word in self.words.keys()[:min(len(self.words), self.amount_of_words)]:
        word = wn.morphy(word) or word
        features['contains(%s)' % word] = (word in words_in_doc)
    return features
def clean_file(text, entities):
    clean = []
    recognised = 0
    dirty = []
    ctext = clean_text(text)
    tokens = nltk.word_tokenize(ctext)
    unusual = unusual_words(tokens)
    for token in tokens:
        ltoken = token.lower().strip()
        if ltoken in PUNCTUATION:
            clean.append(token)
            recognised += 1
        else:
            if len(ltoken) > 1 or ltoken in ['a', 'i']:
                if ltoken in unusual:
                    stem = wn.morphy(ltoken)
                    if stem:
                        clean.append(token)
                        recognised += 1
                    else:
                        if ltoken in entities:
                            clean.append(token)
                            recognised += 1
                        else:
                            dirty.append(token)
                            clean.append('[?]')
                else:
                    clean.append(token)
                    recognised += 1
            else:
                dirty.append(token)
    return {'clean': clean,
            'dirty': dirty,
            'recognised': recognised,
            'total': len(tokens)}
def generate_line(pos_tags, word_dump, inspiration, meaning_generator):
    words = []
    for tag in pos_tags:
        word = generate_word(tag, pos_tags, words, word_dump, inspiration, meaning_generator)
        words.append(word)
        word_dump.append(wn.morphy(word))
    return words
def statement_stemmer(stmt):
    """
    str -> [list of strs]

    Return list of stemmed words from a single statement string.
    """
    stmt_stems = []
    # only stem statements with strings (used in statement_corpus() below)
    if type(stmt) is float:
        return
    # stmt_text_str = rev_text.encode('utf-8')  ## deprecated.
    stmt_text_str = stmt.lower()
    sentence_list = nltk.sent_tokenize(stmt_text_str)
    for sent in sentence_list:
        word_list = nltk.word_tokenize(sent)
        for word in word_list:
            # compare with WordNet Corpus
            wn_word = wordnet.morphy(word)
            if wn_word is not None:
                wn_stem = PorterStemmer().stem_word(wn_word)
                stmt_stems.append(wn_stem)
    return stmt_stems
def nominalise(verb):
    # to be used when turning verbs to noun forms when extracting simple co-occ.
    stem = STP.stem(verb)
    # going with pure stem
    for suff in SUFFIXES:
        noun_form = wn.morphy(stem + suff, wn.NOUN)
        if noun_form != None:
            return noun_form
    # trying with the last character of the stem doubled
    stem += stem[-1]
    for suff in SUFFIXES:
        noun_form = wn.morphy(stem + suff, wn.NOUN)
        if noun_form != None:
            return noun_form
    # returning None if nothing works
    return None
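# Hedged example of the suffix-probing step nominalise relies on (illustration only;
# SUFFIXES and STP are defined elsewhere in the original module, so the stem and
# suffix values below are assumptions):
def _demo_nominalise_step(stem='deriv', suffixes=('ation', 'ment', 'er')):
    from nltk.corpus import wordnet as wn
    for suff in suffixes:
        noun_form = wn.morphy(stem + suff, wn.NOUN)
        if noun_form is not None:
            return noun_form  # e.g. 'derivation' for stem 'deriv' + suffix 'ation'
    return None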
def search_files_by_noun(noun, folder_path):
    """
    Search files that contain noun
    :param noun: searching noun
    :param folder_path: folder with files
    :return: list of the files which contain noun
    """
    # get general form of the noun
    noun = wordnet.morphy(noun, wordnet.NOUN)
    if noun is None:
        raise ValueError("Input word isn't noun")
    # check that directory exists
    if not os.path.exists(folder_path):
        raise IOError('Path "' + folder_path + '" doesn\'t exist')
    if not os.path.isdir(folder_path):
        raise IOError('"' + folder_path + '" isn\'t directory')
    # get file list
    file_list = os.listdir(folder_path)
    file_list = [os.path.join(folder_path, f) for f in file_list]
    file_list = filter(lambda f: path.isfile(f), file_list)
    file_list = filter(lambda f: mimetypes.guess_type(f)[0].startswith("text/"), file_list)
    # search word in the file
    process_pool = Pool()
    find_fun_args = [(noun, f_val) for f_val in file_list]
    file_list_contain_word = process_pool.map(_find_noun, find_fun_args)
    res_files = [os.path.basename(f_val)
                 for f_val, mark in zip(file_list, file_list_contain_word) if mark]
    return res_files
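# Usage sketch (illustrative; the folder path is a made-up example and _find_noun is
# defined elsewhere in the original module):
#
#   search_files_by_noun("dogs", "/tmp/notes")   # the query is normalised to 'dog' first
#
# The normalisation step on its own:
def _demo_normalise_noun(noun="dogs"):
    from nltk.corpus import wordnet
    base = wordnet.morphy(noun, wordnet.NOUN)
    if base is None:
        raise ValueError("Input word isn't noun")
    return base  # -> 'dog'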
def morphy_stem(word):
    """
    Simple stemmer
    """
    stem = wn.morphy(word)
    if stem:
        return stem.lower()
    else:
        return word.lower()
def get_lemma(word):
    """Lemmatize a word (for NLP)."""
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
def get_lemma(word):
    # nltk.download('wordnet')
    from nltk.corpus import wordnet as wn
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
def morphy(multi_word):
    morphyed_words = []
    for word in multi_word.split("_"):
        morphyed_word = wn.morphy(word, wn.VERB)
        if morphyed_word == None:
            morphyed_word = word
            # print word, "not found in WN"
        morphyed_words.append(morphyed_word)
    r = "_".join(morphyed_words)
    return r
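# Illustrative sketch (added; not part of the original): per-token verb
# lemmatisation of an underscore-joined phrase, mirroring morphy() above.
def _demo_multiword_morphy(multi_word="turned_off"):
    from nltk.corpus import wordnet as wn
    parts = [wn.morphy(w, wn.VERB) or w for w in multi_word.split("_")]
    return "_".join(parts)  # -> 'turn_off'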
def pos(a, p):
    ret = set()
    for i in a:
        m = wn.morphy(i)
        if m is None:
            continue
        for ss in wn.synsets(m):
            if ss.pos() == p:
                ret.add(i)
    return list(ret)
def getWordnetCandidates(clue, length):
    clue = re.sub('[' + string.punctuation + ']', '', clue.lower())
    if clue != wn.morphy(clue):  # TODO reverse the morph transform
        morphedclue = wn.morphy(clue)
        morph = clue.replace(morphedclue, '')
        clue = morphedclue
        print("morph-", morph)
    synsets = wn.synsets(clue)
    names = functools.reduce(lambda x, y: x + y.lemma_names(), synsets, [])
    for syn in synsets:
        if syn.hyponyms():
            for hyposet in syn.hyponyms():
                names += [lemma.name() for lemma in hyposet.lemmas()]
        if syn.hypernyms():
            for hyperset in syn.hypernyms():
                names += [lemma.name() for lemma in hyperset.lemmas()]
    names = list({x for x in names if len(x) == length})
    return names
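# Hedged sketch (added; not part of the original): gathering fixed-length candidate
# lemmas from a clue's synsets plus their hyponyms and hypernyms, as the function
# above does. The clue and length are arbitrary examples.
def _demo_candidates(clue="dog", length=5):
    from nltk.corpus import wordnet as wn
    names = []
    for syn in wn.synsets(clue):
        names += syn.lemma_names()
        for related in syn.hyponyms() + syn.hypernyms():
            names += related.lemma_names()
    return sorted({n for n in names if len(n) == length})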
def lemmatize_word(word):
    # make words lower, example: Python => python
    word = word.lower()
    # lemmatize, example: cooked => cook
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
def preproc_word(self, w, pos=None):
    if pos == 'j':
        pos = 'a'
    w = re.sub(r'[\$,\{\}\[\]\(\)`\'\":;!\?\.]', '', w).lower()
    w = re.sub(r'\-', '_', w)  # hyphen -> underscore
    if w == 'an':
        w = 'a'  # dirty hack....
    if w == 'oclock':
        w = 'o\'clock'
    if w.isdigit():
        w = '<NUM>'
    wp = wn.morphy(w, pos=pos)
    if wp is None:
        wp = w
    return wp
def get_synsets(text):
    words = get_words(text)
    synsets = []
    for word in words:
        word_synsets = wn.synsets(word, NOUN)
        if word_synsets == []:
            # fall back to the morphy base form when the surface form has no synsets
            morphied_word = wn.morphy(word, NOUN)
            if morphied_word is not None:
                word_synsets += wn.synsets(morphied_word, NOUN)
        synsets += word_synsets
    return synsets
def notNeutral(word, pos):
    morphy = wn.morphy(word, pos)
    if morphy is not None:
        if morphy in lexiconDict or morphy in disgustingWords:
            return True
    if word in lexiconDict or word in disgustingWords:
        return True
    else:
        return False
def suitable_word(seedWord, oldWords, POStag, meaning_generator):
    for x in xrange(1, 3):
        for word in reversed(oldWords + [seedWord]):
            # will say, the seedword and then last word is most significant
            newWord = meaning_generator.associate(word, POStag)
            if newWord != None and wn.morphy(newWord) not in oldWords:
                return newWord
    # couldn't find any, let's just return random word
    return meaning_generator.random_word(POStag)
def printStuff(s):
    line = 1
    print('Stemmer:')
    for i in range(len(s)):
        if s[i - 1] == ';' or (s[i] != ';' and isfloat(s[i - 1]) == True):
            line = line + 1
        if isfloat(s[i]) == True:
            print(s[i] + ' DOUBLE ' + str(line))
        elif isOP(s[i]) == True:
            print(s[i] + ' OP ' + str(line))
        else:
            s[i] = s[i].lower()
            if isLegal(s[i]) and wordnet.morphy(s[i]) is not None:
                print(s[i] + ' STRING ' + str(line) + ' ' + wordnet.morphy(s[i]))
            else:
                print(s[i] + ' STRING ' + str(line))
    print('ENDFILE')
def score_words(self):
    '''
    generates self.word_scores and self.min_word_scores for the markov chain calculation
    '''
    g = nx.DiGraph()
    added = set()
    for n in self.ranked_nodes:
        score = self.ranked_nodes[n]
        triple = json.loads(n)
        # sdistance = abs(TextBlob(triple[0][0] + ' ' + triple[2][0]).sentiment.polarity - self.overall_sentiment)
        score = score  # / sdistance
        a = wn.morphy(triple[0][0]) or triple[0][0]
        b = wn.morphy(triple[2][0]) or triple[2][0]
        if a not in added:
            g.add_node(a, pos=triple[0][1], weight=score)
            added.add(a)
        if b not in added:
            g.add_node(b, pos=triple[2][1], weight=score)
            added.add(b)
    for n in self.ranked_nodes:
        score = self.ranked_nodes[n]
        triple = json.loads(n)
        a = wn.morphy(triple[0][0]) or triple[0][0]
        b = wn.morphy(triple[2][0]) or triple[2][0]
        relationship = triple[1]
        g.add_edge(a, b, dep=relationship, weight=score)
    nx.draw(g, with_labels=True)
    plt.show()
    o = g.out_degree()
    i = g.in_degree()
    t = {}
    for n in o:
        t[n] = o[n] + i[n]
    s = sorted(t, key=t.get)
    pprint(s)
    ranked = nx.pagerank(g)
    self.word_scores = ranked
    self.min_word_score = min(self.word_scores.values())
def get_pun_token(pun):
    """
    Return the pun word's ID together with a set containing the pun token and its lemma
    """
    for wordID, word in pun.items():
        try:
            if word['ispun']:
                return wordID, set([word['token'], wn.morphy(word['token'])])
        except:
            pass
def _lemmatize(word):
    """
    Use morphy from WordNet to find the base form of verbs.
    """
    from nltk.corpus import wordnet as wn
    lemma = wn.morphy(word, pos=wn.VERB)
    if lemma is not None:
        return lemma
    return word
def word_in_awl(word):
    global awl_words
    if word in string.punctuation:
        return 0
    lemma = wn.morphy(word)
    if lemma is None:
        lemma = word
    if lemma in awl_words:
        return 1
    return 0
def get_lemma(word):
    """Return the lemmatized form of the input word.

    Keyword arguments:
    word -- the word to be lemmatized.
    """
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
def valid_en_word(word):
    global all_words
    if word in string.punctuation:
        return False
    lemma = wn.morphy(word)
    if lemma is None:
        lemma = word
    if lemma in all_words:
        return True
    return False
def findword(self, word):
    if word.isdigit():
        w = word
    else:
        w = wn.morphy(word.strip())
    if w in self.dictionary:
        return self.dictionary[w]
    return set()
def _get_similarity_wordnet_2word(self, word1, word2):
    # stem each word with morphy, falling back to the original form
    if wn.morphy(word1.lower()) != None:
        word1 = wn.morphy(word1.lower())
    if wn.morphy(word2.lower()) != None:
        word2 = wn.morphy(word2.lower())
    # check the cache in both orders before hitting WordNet
    key1 = '(%s,%s)' % (word1, word2)
    key2 = '(%s,%s)' % (word2, word1)
    if self.sim_2word.has_key(key1):
        return self.sim_2word[key1]
    if self.sim_2word.has_key(key2):
        return self.sim_2word[key2]
    word1_synsets = wn.synsets(word1)
    word2_synsets = wn.synsets(word2)
    # keep the best path similarity over all synset pairs
    sim = 0
    for syn1 in word1_synsets:
        w1 = wn.synset(syn1.name())
        for syn2 in word2_synsets:
            w2 = wn.synset(syn2.name())
            tmp = w1.path_similarity(w2)
            if tmp > sim:
                sim = tmp
            if sim == 1.0:
                break
        if sim == 1.0:
            break
    self.sim_2word[key1] = sim
    self.sim_2word[key2] = sim
    return sim
def get_synsets(text):
    tokens = nlp(text)
    synsets = []
    for t in tokens:
        if t.pos_ == "NOUN":
            word = t.text
            word_synsets = wn.synsets(word, NOUN)
            if word_synsets == []:
                # fall back to the morphy base form when the surface form has no synsets
                morphied_word = wn.morphy(word, NOUN)
                if morphied_word is not None:
                    word_synsets += wn.synsets(morphied_word, NOUN)
            synsets += word_synsets
    return synsets
def lesk(word1, sentence):
    bestsense = None
    maxoverlap = 0
    # fall back to the surface form when morphy finds no base form
    word = wordnet.morphy(word1) or word1
    for sense in wordnet.synsets(word):
        overlap = calculateOverlap(sense, sentence)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = sense
    return bestsense
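# Hedged sketch of the overlap scoring used above (added for illustration;
# calculateOverlap is defined elsewhere in the original module, so this stand-in
# simply counts shared tokens between the context and each synset gloss):
def _demo_gloss_overlap(word="bank", sentence="I deposited money at the bank"):
    from nltk.corpus import wordnet
    context = set(sentence.lower().split())
    best, best_overlap = None, 0
    for sense in wordnet.synsets(wordnet.morphy(word) or word):
        overlap = len(context & set(sense.definition().lower().split()))
        if overlap > best_overlap:
            best, best_overlap = sense, overlap
    return best  # the synset whose gloss shares the most tokens with the context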
def morphy_stem(word):
    """
    Simple stemmer
    """
    # Morphy returns the base form of a word, ie, dogs -> dog
    # unknown 'stem' returns word.lower()
    stem = wn.morphy(word)
    if stem:
        return stem.lower()
    else:
        return '0'
def stemWord(words_list):
    result = []
    for word in words_list:
        word = word.strip()
        word = word.lower()
        aa = wn.morphy(word)
        if aa == None:
            # stemming with morphy can return None, so keep the original word
            result.append(word)
        else:
            result.append(aa)
    return result
def __init__(self, word, part_of_speech, lookuptable):
    self.part_of_speech = morphy_tag[part_of_speech] if part_of_speech in morphy_tag else wordnet.NOUN
    self.word = wordnet.morphy(word, self.part_of_speech)  # Lemmatization
    self.synset = listify(wordnet.synsets(word, pos=self.part_of_speech)) if self.word else None
    self.orphan = not self.synset
    self.db = lookuptable
    self.lemmatizer = WordNetLemmatizer()
    self.kernel = {}
def nounify(adj_word):
    set_of_related_nouns = set()
    for lemma in wn.lemmas(wn.morphy(adj_word, wn.ADJ), pos="a"):
        for related_form in lemma.derivationally_related_forms():
            for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                if wn.synset('person.n.01') in synset.closure(lambda s: s.hypernyms()):
                    set_of_related_nouns.add(synset)
    return set_of_related_nouns
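# Illustrative check of the derivational step used by nounify (added; not part of
# the original, and the adjective is an arbitrary example): collect the noun forms
# that WordNet lists as derivationally related to an adjective's lemmas.
def _demo_derivationally_related(adj_word="happy"):
    from nltk.corpus import wordnet as wn
    related = set()
    for lemma in wn.lemmas(wn.morphy(adj_word, wn.ADJ) or adj_word, pos="a"):
        for form in lemma.derivationally_related_forms():
            related.add(form.name())
    return related  # e.g. includes 'happiness'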
def find_lemma(start, text, lemmas, ll_conn, is_kfx):
    from nltk.corpus import wordnet as wn
    for match in re.finditer(r'[a-zA-Z\u00AD]{3,}', text):
        lemma = wn.morphy(match.group(0).replace('\u00AD', '').lower())
        if lemma in lemmas:
            if is_kfx:
                index = start + match.start()
            else:
                index = start + len(text[:match.start()].encode('utf-8'))
            insert_lemma(ll_conn, (index, ) + tuple(lemmas[lemma]))
def find_lemma(start, text, lemmas, ll_conn):
    from nltk.corpus import wordnet as wn
    bytes_str = isinstance(text, bytes)
    pattern = b'[a-zA-Z]{3,}' if bytes_str else r'[a-zA-Z]{3,}'
    for match in re.finditer(pattern, text):
        word = match.group(0).decode('utf-8') if bytes_str else match.group(0)
        lemma = wn.morphy(word.lower())
        if lemma in lemmas:
            insert_lemma(ll_conn, (start + match.start(),) + tuple(lemmas[lemma]))
def get_lemma(self, word):
    """
    Create a lemmatized version of a word
    :param word:
    :return:
    """
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
def get_lemma(word):
    """
    The lemma corresponding to a word
    :param word: a single word from the dictionary
    :return: the lemma
    """
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma