def unify_query(query):
    """A peek of sorts: look up a WordNet noun synset for the built query.

    param: list of Word objs
    return: list of synset entries from wn (possibly empty)
    """
    # Build the textual query from the Word objects.
    query = build_wn_query(query)
    print('wordnet query: {0}'.format(query))
    s = wordnet.synsets(singularize(query), pos=wordnet.NOUN)
    if len(s) == 0:
        # This is a bit hacky: it's based on the assumption that a failed
        # lookup may be a two-word NN, i.e. "thrill ride" fails, "ride" doesn't.
        print('no entry for {0}..'.format(query))
        parts = query.split()
        # BUG FIX: the original indexed parts[1] unconditionally, which raises
        # IndexError for a single-word query with no synsets.
        if len(parts) > 1:
            s = wordnet.synsets(singularize(parts[1]), pos=wordnet.NOUN)
            if len(s) == 0:
                print('no entry for {0}'.format(parts[1]))
    return s
def test_synset_is_physical_object(self):
    """A physical entity ("tree") passes; an abstraction ("truth") does not."""
    from extract import synset_is_physical_object
    physical = wordnet.synsets("tree")[0]
    self.assertTrue(synset_is_physical_object(physical))
    abstract = wordnet.synsets("truth")[0]
    self.assertFalse(synset_is_physical_object(abstract))
def special_similarity(word):
    """Score how 'special'-like a word is; a literal prefix match wins outright."""
    if word.startswith("special"):
        return 1.0
    reference_synsets = [
        wordnet.synsets('special', pos=wordnet.ADJECTIVE)[1],
        wordnet.synsets('special', pos=wordnet.ADJECTIVE)[3],
    ]
    return custom_similarity(word, reference_synsets)
def test():
    # Ad-hoc debug driver for the wordnet helper functions in this module.
    word = 'walk'
    pos = 'VB'
    print(wordnet.synsets(word, pos))
    return  # NOTE(review): this early return makes everything below unreachable
    hypers = get_words(word, pos, 'hyper', True, 10)
    # for hyper in hypers:
    #     print(hyper, hypers[hyper])
    print(hypers.keys())
    # print('fruit' in hypers)
    hypos = get_words(word, pos, 'hypo', True, 1)
    # for hypo in hypos:
    #     print(hypo, hypos[hypo])
    print(hypos.keys())
    print()
    print(get_synonyms('big', 'JJ'))
    print(get_antonyms('big', 'JJ'))
    print(get_similar('big', 'JJ'))
    print()
    print(get_synonyms('man', 'NN'))
    print(get_antonyms('man', 'NN'))
    print(get_similar('man', 'NN'))
    print()
    print(wordnet.synsets('go', pos='VB')[0])
    print(wordnet.synsets('nice', pos='JJ')[0])
    print(wordnet.synsets('well', pos='RB')[0])
    print(wordnet.synsets('musical instrument', pos="NN")[0])
def number_similarity(word):
    """Similarity of word to a few hand-picked noun senses of 'number'."""
    reference_synsets = [
        wordnet.synsets('number')[0],
        wordnet.synsets('number')[1],
        wordnet.synsets('number')[4],
    ]
    return custom_similarity(word, reference_synsets, pos=wordnet.NOUN)
def alphabet_similarity(word):
    """Similarity of word to hand-picked senses of 'alphabet'/'character'/'letter'."""
    reference_synsets = [
        wordnet.synsets('alphabet')[0],
        wordnet.synsets('character')[-2],
        wordnet.synsets('letter')[1],
    ]
    return custom_similarity(word, reference_synsets, pos=wordnet.NOUN)
def hypernym_search(text, search_word):
    """Find words in text whose first synset has search_word in its hypernym chain.

    search_word may contain several alternatives joined with '|'.
    Returns a set of the matching word strings.
    """
    try:
        from pattern.en import wordnet
    except ImportError:  # was a bare except: don't mask unrelated errors
        print(
            'Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip'
        )
        sys.exit()
    output = []
    try:
        for search_word in search_word.split('|'):
            synset = wordnet.synsets(search_word)[0]
            pos = synset.pos
            possible_words = re_search(text, pos)
            for match in possible_words:
                word = match[0].string
                synsets = wordnet.synsets(word)
                if len(synsets) > 0:
                    hypernyms = synsets[0].hypernyms(recursive=True)
                    if any(search_word == h.senses[0] for h in hypernyms):
                        output.append(word)
    except IndexError:
        # search_word itself had no synsets; treat as "no matches".
        pass
    return set(output)
def word_similarity(word1, word2):
    """Similarity of 2 words as a score from 0 to 1, uses wordnet.

    Returns 0 when either word has no synsets or similarity computation fails.
    """
    from pattern.en import wordnet
    try:
        a = wordnet.synsets(word1)[0]
        b = wordnet.synsets(word2)[0]
        return wordnet.similarity(a, b)
    except Exception:
        # Was a bare except: keep the best-effort 0 fallback, but no longer
        # swallow KeyboardInterrupt/SystemExit.
        return 0
def get_related_noun_or_not(noun, d=True):
    """Return a random hyponym/hypernym sense of noun, or noun itself.

    d: guard flag — allows exactly one retry with the singularized form.
    """
    w = wordnet.synsets(noun)
    if w:
        w = w[0]
        hypos = w.hyponyms()
        hypers = w.hypernyms()
        if hypos + hypers:
            related = random.choice(hypos + hypers)
            if related and related.senses:
                return related.senses[0]
    elif wordnet.synsets(singularize(noun)) and d:
        # BUG FIX: the False flag is this function's recursion guard (d=False),
        # not an argument to singularize(); originally it was passed as
        # singularize's second positional argument.
        return get_related_noun_or_not(singularize(noun), False)
    return noun
def hypernym_search(text, search_word):
    """Find words in text whose first synset has search_word in its hypernym chain.

    Returns a set of matching word strings (empty when search_word is unknown).
    """
    output = []
    search_synsets = wordnet.synsets(search_word)
    # BUG FIX: the original indexed [0] unconditionally and raised IndexError
    # for a search_word with no synsets.
    if not search_synsets:
        return set(output)
    synset = search_synsets[0]
    pos = synset.pos
    possible_words = re_search(text, pos)
    for match in possible_words:
        word = match[0].string
        synsets = wordnet.synsets(word)
        if len(synsets) > 0:
            hypernyms = synsets[0].hypernyms(recursive=True)
            if any(search_word == h.senses[0] for h in hypernyms):
                output.append(word)
    return set(output)
def get_related_or_not(word, d=True, pos='NN'):
    """Return a random sense of word or one of its hyponyms/hypernyms, else word.

    d: guard flag — allows exactly one retry with the singularized form.
    pos: part-of-speech tag passed through to wordnet.
    """
    w = wordnet.synsets(word, pos=pos)
    if w:
        w = w[0]
        hypos = w.hyponyms()
        hypers = w.hypernyms()
        if hypos + hypers:
            # The word's own synset is also a candidate here.
            related = random.choice([w] + hypos + hypers)
            if related and related.senses:
                return related.senses[0]
    elif wordnet.synsets(singularize(word)) and d:
        # BUG FIX: `False` and `pos` are this function's own arguments
        # (d=False, pos=pos); originally both were passed into singularize().
        return get_related_or_not(singularize(word), False, pos)
    return word
def custom_similarity(word, synsets, pos=None):
    """Collect wordnet.similarity scores between word's synsets and target synsets.

    NOTE(review): the original body built `similarities` but never returned
    anything, so callers (number_similarity etc.) receive None. That behavior
    is preserved — confirm whether an aggregate (max/avg) return was intended.
    """
    word = singularize(word.lower())
    similarities = []
    if pos:
        word_synsets = wordnet.synsets(word, pos=pos)
    else:
        word_synsets = wordnet.synsets(word)
    for i in synsets:
        for j in word_synsets:
            try:
                similarities.append(wordnet.similarity(i, j))
            except Exception:
                # BUG FIX: "except Exception, e" is Python-2-only syntax and
                # the bound `e` was unused; this file otherwise uses py3 forms.
                pass
def wordnet_potential_parent(word1, pos1, word2, pos2, min_sim=0.0):
    """Collect (common-ancestor synset, similarity) pairs for two words.

    Only pairs whose similarity exceeds min_sim are kept.
    """
    candidates1 = wn.synsets(word1, pos1)
    candidates2 = wn.synsets(word2, pos2)
    parents = []
    for a in candidates1:
        for b in candidates2:
            ancestor = wn.ancestor(a, b)
            if not ancestor:
                continue
            score = wn.similarity(a, b)
            if score > min_sim:
                parents.append((ancestor, score))
    return parents
def lemma_is_geological_formation(lemma):
    """True iff some non-proper noun synset of lemma is a geological formation.

    A lemma whose synsets are ALL proper nouns is rejected outright.
    """
    synsets = wordnet.synsets(lemma, wordnet.NOUN)
    if synsets and all(synset_is_proper(s) for s in synsets):
        return False
    common_senses = [s for s in synsets if not synset_is_proper(s)]
    return any(synset_is_geological_formation(s) for s in common_senses)
def find_replacement(word, pos):
    """Maybe replace word with a random BASIC word that has a synset for pos.

    Returns (candidate, True) with probability 0.3 when a candidate exists,
    otherwise (word, False).
    """
    shuffle(BASIC)
    try:
        candidate = next(w for w in BASIC if wordnet.synsets(w, pos=pos))
    except StopIteration:
        # Was a bare except: "no candidate found" is the only expected failure.
        return word, False
    return (candidate, True) if random() < 0.3 else (word, False)
def synonyms(lemma, pos=NOUN):
    """Return every synonym of lemma (excluding lemma itself) across its synsets."""
    # Renamed the accumulator: the original local list shadowed this function's
    # own name, which would break any recursive or reflective use.
    results = list()
    for synset in wordnet.synsets(lemma, pos):
        for synonym in synset.synonyms:
            if synonym != lemma:
                results.append(synonym)
    return results
def validate(self, tag, media):
    """Validate a (tag, image-url) pair with WordNet + Google Vision labels.

    result: 1 -> good (keep image); 0 -> bad (discard image);
            -1 -> cannot validate (keep).
    """
    tag = singularize(tag).lower()
    synset = wordnet.synsets(tag, pos=NOUN)
    if not synset:
        return -1
    category = synset[0].lexname
    if self.VALIDATE_CATEGORY == 'all':
        pass
    elif category in self.VALIDATE_CATEGORY:
        pass  # do not return yet
    else:
        return -1  # not 'all' and cannot be validated
    img = requests.get(media)
    gImage = google.cloud.vision.types.Image(content=img.content)
    response = self.vision_client.label_detection(image=gImage)
    # BUG FIX: this was a `map(...)` iterator, which is consumed by the first
    # `in` test under Python 3, making every later membership check fail.
    # A set of confident labels gives repeatable O(1) lookups.
    labels = {d.description for d in response.label_annotations if d.score > 0.9}
    # Check if tag is among the confidently detected labels.
    if tag in labels:
        return 1
    # Compare synonyms of the tag's first synset against the labels.
    for synonym in synset[0].synonyms:
        # BUG FIX: the original re-tested `tag` here instead of `synonym`,
        # so the synonym loop could never match anything new.
        if singularize(synonym).lower() in labels:
            return 1
    return 0
def glossdef(token):
    """Return the glosses of every synset of every term in token.

    param token: iterable of term strings (name kept for interface compatibility).
    """
    # Rewritten: the original shadowed `token` with its loop variable and
    # assigned the None result of list.append back to it; output is identical.
    return [synset.gloss for term in token for synset in wordnet.synsets(term)]
def shift(noun):
    """Return [synonym, gloss] for a random hyponym of the given noun.

    Falls back to the noun's own first synset when it has no hyponyms;
    raises IndexError when the noun has no synsets at all.
    """
    base = wordnet.synsets(noun)[0]
    candidates = base.hyponyms(recursive=True) or [base]
    picked = choice(candidates)
    return [picked.synonyms[0], picked.gloss]
def sentiment(content):
    # Score the sentiment of `content` using SentiWordNet-style weights:
    # sums per-word subjectivity, weighted by position within the sentence.
    # Lazily load the sentiment lexicon on first call.
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  #adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content))
    for sentence in sentences:
        for index, word in enumerate(sentence.words):
            if word.string != '' and word.type in relevant_types:
                try:
                    synset = wordnet.synsets(word.string, word.type)
                except KeyError:
                    #incorrect part of speech tag or not in wordnet, skip it
                    continue
                # (positive, negative, objective) weights of the first sense.
                # NOTE(review): synset may be an empty list here, in which case
                # [0] raises IndexError — confirm whether that can occur.
                pos, neg, obj = synset[0].weight
                #weights concluding statements
                #idea from [Ohana, Tierney '09]
                documentpos = index / float(len(sentence.words))
                #weights more subjective statements
                subjscore = ((pos - neg) * (1 - obj))
                score = score + subjscore * documentpos
    return score
def lemma_is_natural(lemma):
    """True iff some non-proper noun synset of lemma is a natural object.

    A lemma whose synsets are ALL proper nouns is rejected outright.
    """
    synsets = wordnet.synsets(lemma, wordnet.NOUN)
    if synsets and all(synset_is_proper(s) for s in synsets):
        return False
    common_senses = [s for s in synsets if not synset_is_proper(s)]
    return any(synset_is_natural(s) for s in common_senses)
def get_random_word(t, ref=False):
    """Return a random word from a set filtering on lexname category if necessary."""
    # If there are entries in the lexnames list for the given POS tag, limit
    # results to those categories; otherwise any word for that POS will do.
    word = None
    allowed = POS[t]['lexnames']
    if len(allowed):
        lexname = ''
        # Keep drawing until a word's first synset falls in an allowed category.
        while lexname not in allowed:
            word = random.choice(POS[t]['words'])[0]
            matches = wordnet.synsets(get_singular(word), pos=t)
            if matches:
                lexname = matches[0].lexname
    else:
        word = random.choice(POS[t]['words'])[0]
    # If required, prefix with an article.
    if ref:
        word = referenced(word)
    return word.lower()
def list_hyponyms(word):
    """First senses of every recursive hyponym of word's first synset, else []."""
    matches = wordnet.synsets(word)
    if not matches:
        return []
    return [h.senses[0] for h in matches[0].hyponyms(recursive=True)]
def expand(term, limit=3):
    """Return {term} plus up to `limit` recursive hypernym senses of term.

    EXPANSION IS ONLY DONE FOR THE FIRST SYNSET.
    Always returns a set.
    """
    try:
        hypernyms = WN.synsets(term, 'NN')[0].hypernyms(recursive=True)
        return {w.senses[0] for w in hypernyms[:limit]}.union({term})
    except IndexError:
        # BUG FIX: the original returned {} — an empty *dict* — on this path
        # while the success path returns a set; callers now always get a set.
        return set()
def define_word(self, word):
    """Append a short parenthesised gloss to word when WordNet defines it."""
    matches = wordnet.synsets(word)
    if not matches:
        return word
    gloss = matches[0].gloss
    # Keep only the first clause of the gloss.
    semicolon = gloss.find(';')
    if semicolon > -1:
        gloss = gloss[:semicolon]
    return word + " (comprising of " + gloss + ") "
def get_synonyms(word, pos):
    """ return a set of strings, lowercase unless proper noun """
    ans = set()
    for ss in wordnet.synsets(word, pos):
        # Underscores in multi-word synonyms become spaces.
        ans.update(syn.replace('_', ' ') for syn in ss.synonyms)
    return ans
def wn_filter_pos(text, pos):
    """True when text has synsets for pos and none of their synonyms is
    capitalised (i.e. no proper-noun sense).

    BUG FIX: the original fell off the end and returned None when there were
    no synsets; an explicit boolean is returned now (same truthiness).
    """
    synsets = wordnet.synsets(text, pos=pos)
    for s in synsets:
        for synonym in s.synonyms:
            if synonym[0].isupper():
                return False
    return len(synsets) > 0
def define_word(self, word):
    """Annotate word with the first clause of its WordNet gloss, if any."""
    found = wordnet.synsets(word)
    if found:
        definition = found[0].gloss
        # Truncate at the first semicolon, keeping only the leading clause.
        if ";" in definition:
            definition = definition.split(";", 1)[0]
        word = word + " (comprising of " + definition + ") "
    return word
def WordnetFeatures(token):
    """First senses of the depth-2 recursive hypernyms of token's first synset."""
    matches = wordnet.synsets(token)
    if not matches:
        return []
    chain = matches[0].hypernyms(depth=2, recursive=True)
    # hypernym.extend(synset.hyponyms(depth=2,recursive=True))
    return [h.senses[0] for h in chain]
def lemma_is_person(lemma):
    """Decide whether lemma denotes a person."""
    synsets = wordnet.synsets(lemma, wordnet.NOUN)
    # If ALL the synsets are proper, then it's a person!
    if synsets and all(synset_is_proper(s) for s in synsets):
        return True
    # Otherwise, check ONLY the non-proper synsets.
    common_senses = [s for s in synsets if not synset_is_proper(s)]
    return any(synset_is_person(s) for s in common_senses)
def max_ic(words, pos):
    """Return the synonym with the highest information content among words.

    One synset is sampled at random per word; returns None when no word
    has any synsets for the given pos.
    """
    all_synsets = [wordnet.synsets(w, pos=pos) for w in words]
    sampled = [random.choice(group) for group in all_synsets if group]
    if not sampled:
        return None
    scored = [(s.synonyms[0], s.ic) for s in sampled]
    if not scored:
        return None
    best_word, _ = max(scored, key=lambda pair: pair[1])
    return best_word
def hypernym_search(text, search_word):
    """Find words in text whose first synset has search_word in its hypernym chain.

    search_word may contain several alternatives joined with '|'.
    Returns a set of matching word strings.
    """
    try:
        from pattern.en import wordnet
    except ImportError:  # was a bare except: don't mask unrelated errors
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()
    output = []
    for search_word in search_word.split('|'):
        # Guard unknown search words (the sibling variant of this function
        # catches the IndexError; here we skip them explicitly).
        search_synsets = wordnet.synsets(search_word)
        if not search_synsets:
            continue
        synset = search_synsets[0]
        pos = synset.pos
        possible_words = re_search(text, pos)
        for match in possible_words:
            word = match[0].string
            synsets = wordnet.synsets(word)
            if len(synsets) > 0:
                hypernyms = synsets[0].hypernyms(recursive=True)
                if any(search_word == h.senses[0] for h in hypernyms):
                    output.append(word)
    return set(output)
def get_similar(word, pos):
    """ return a set of strings, lowercase unless proper noun """
    ans = set()
    for ss in wordnet.synsets(word, pos):
        # Hoisted: the original called ss.similar() twice per synset.
        similar_synsets = ss.similar()  # a list of synsets
        if similar_synsets:
            for ss1 in similar_synsets:
                similars = [
                    synonym.replace('_', ' ') for synonym in ss1.synonyms
                ]
                ans.update(similars)
    return ans
def list_hypernyms(search_word):
    """For each synset of search_word, list the first senses of its recursive hypernyms.

    Returns a list of lists, one per synset.
    """
    try:
        from pattern.en import wordnet
    except ImportError:  # was a bare except: don't mask unrelated errors
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()
    output = []
    for synset in wordnet.synsets(search_word):
        hypernyms = synset.hypernyms(recursive=True)
        output.append([h.senses[0] for h in hypernyms])
    return output
def getOccurencies(dico, lis): synlist = {} #Loop on chapters, j is the key, dico[j] is the dictionnary of this chapter with a child "fulltext" for j in dico: c = dico[j]["fulltext"] wordpos = {} exclude = set(string.punctuation) #c = ''.join(ch for ch in c if ch not in exclude) #print c #Loop on words in list c = c.lower() for i in range(len(c)): for word in lis: if c[i:len(word) + i] == word: wordpos[i] = word """ for l in lis: wlen = 0 for pos, words in enumerate(c.split()): #print words #wlen += len(words) #print words #for en, char in enumerate(c): if l == words and l not in wordpos: #if not l in wordpos: wordpos[pos] = l #break #wordpos[l] = pos """ #End of word loop #Get synonyms for l in lis: syns = wordnet.synsets(l) for k in syns: if not l in synlist: synlist[l] = [] synlist[l].extend(k.synonyms) #End of synonyms loop #We update dico dico[j]["occurencies"] = wordpos #End of loop on chapters setsyn = set() lis1 = [] for word in synlist: synlist[word] = list(set(synlist[word])) #print(synlist) return dico, synlist
def list_hyponyms(word):
    """First senses of every recursive hyponym of word's first synset, else [].

    Requires the `pattern` package; exits with install instructions if missing.
    """
    try:
        from pattern.en import wordnet
    except ImportError:  # was a bare except: don't mask unrelated errors
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()
    output = []
    synsets = wordnet.synsets(word)
    if len(synsets) > 0:
        hyponyms = synsets[0].hyponyms(recursive=True)
        output = [h.senses[0] for h in hyponyms]
    return output
def get_related(word, plural=False):
    """Sibling words (co-hyponyms) of word, ranked by similarity.

    Only siblings with similarity strictly between 0.5 and 1 and not
    containing the word itself are returned, most similar first.
    """
    found = wordnet.synsets(word)
    if not found:
        return []
    base = found[0]
    if not base.hypernym:
        return []
    transform = pluralize if plural else (lambda x: x)
    scored = []
    for sibling in base.hypernym.hyponyms():
        name = transform(' '.join(sibling[0].split('_')))
        scored.append((name, sibling.similarity(base)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [pair for pair in scored
            if pair[1] > 0.5 and pair[1] < 1 and word not in pair[0]]
def parse_phrases(documents):
    """Print the phrase/word/POS breakdown of each document, with WordNet
    synsets shown for POS classes listed in WN_POS.

    BUG FIX: the original printed an undefined name `i` (NameError at runtime);
    it is now the index of the document being parsed. Print statements were
    also normalized to the call form used elsewhere in this file.
    """
    for i, document in enumerate(documents):
        ptree = parsetree(document, relations=True, lemmata=True)
        for sentence in ptree:
            print(i, sentence.string)
            for phrase in sentence.phrases:
                for word in phrase.words:
                    if word.pos in WN_POS:
                        print(i, phrase, word, word.pos, wn.synsets(word.lemma, word.pos))
                    else:
                        print(i, phrase, word, word.pos)
            print('\n')
def all_synsets(word, pos=None):
    """Synsets of word for one POS class, or across all four when pos is None.

    pos: None, or one of 'NOUN' | 'VERB' | 'ADJ' | 'ADV'.
    Raises KeyError for any other pos string (unchanged behavior).
    """
    # Renamed from `map`, which shadowed the builtin of the same name.
    pos_map = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJECTIVE,
        'ADV': wordnet.ADVERB,
    }
    if pos is None:
        pos_list = [wordnet.VERB, wordnet.ADJECTIVE, wordnet.NOUN, wordnet.ADVERB]
    else:
        pos_list = [pos_map[pos]]
    ret = []
    for pos in pos_list:
        ret.extend(wordnet.synsets(word, pos=pos))
    return ret
def test_wordnet():
    # Scratch test: dumps every wordnet relation for a single word.
    # NOTE(review): the successive reassignments leave only "MongoDb" in
    # effect; the earlier values look like leftover experiments.
    from pattern.en import wordnet
    word = "bird"
    word = "Java"
    word = "C++"
    word = "MongoDb"
    for s in wordnet.synsets(word) :
        print 'Definition:', s.gloss
        print ' Synonyms:', s.synonyms
        print ' Hypernyms:', s.hypernyms()
        print ' Hyponyms:', s.hyponyms()
        print ' Holonyms:', s.holonyms()
        print ' Meronyms:', s.meronyms()
def is_animate(lemma):
    """Heuristic animacy test: True when any non-proper synset of lemma has
    'person' among the synonyms of its recursive hypernyms.

    This "works" but is very eager to grant animacy even for words that have
    one synset with a person hypernym (things like "rock" count as "animate"
    because a "rock" can be a person that you depend on).
    """
    # Removed an unused `hypernyms = list()` accumulator from the original.
    for synset in wordnet.synsets(lemma, pos=NOUN):
        # Skip synsets that are proper nouns, as these are always animate!
        if any(s[0].isupper() for s in synset.synonyms):
            continue
        synonyms = list()
        for s in synset.hypernyms(recursive=True):
            synonyms.extend(s.synonyms)
        if 'person' in synonyms:
            return True
    return False
def get_words(word, pos, hyper_hypo, recursive=False, depth=None):
    """Map hypernym/hyponym synonyms of word to their Zipf frequencies.

    hyper_hypo: 'hyper' or 'hypo' — which direction to walk from each synset.
    return: dict { str : zipf_freq (log scale) }
    """
    frequencies = {}
    for ss in wordnet.synsets(word, pos):
        if hyper_hypo == 'hyper':
            related = ss.hypernyms(recursive, depth)
        elif hyper_hypo == 'hypo':
            related = ss.hyponyms(recursive, depth)
        else:
            related = []
        for entry in related:
            for syn in entry.synonyms:
                cleaned = syn.replace('_', ' ')
                frequencies[cleaned] = zipf_frequency(cleaned, 'en')
    return frequencies
def get_alternations(word, pos, synset_id, nlp, verbose=False):
    """Return alternations (same-POS co-hyponyms) of the synset identified by synset_id.

    Only alternations with positive nlp-similarity to word are kept.
    return: {word_str : simi_score}
    """
    alternations = {}
    for ss in wordnet.synsets(word, pos):
        if int(ss.id) != int(synset_id):
            continue
        for hyper in ss.hypernyms():
            for hypo in hyper.hyponyms():
                if hypo.pos != pos:
                    continue  # only want the same pos
                for synonym in hypo.synonyms:
                    if synonym.lower() == word.lower():
                        continue  # skip the query word itself
                    if synonym in alternations:
                        continue  # similarity already computed
                    score = nlp(word).similarity(nlp(synonym))
                    if score > 0:
                        alternations[synonym] = score
    if verbose:
        print('found {} alternations'.format(len(alternations)))
    return alternations
def get_tables(words): """Build a list of tables for the SQL statement from random words""" # http://wordnet.princeton.edu/man/lexnames.5WN.html lexnames = ['noun.plant', 'noun.animal', 'noun.food', 'noun.shape', 'noun.body', 'noun.artifact', 'noun.object' ] # Loop until we find a table name that is less than the MAX_TABLE_NAME_LENGTH and has a # noun category that is in the list of lexnames above tables = [] for i in range(0, MAX_TABLES): lexname = '' while lexname not in lexnames: ((word, tag), f) = random.choice(words) word = word.lower() s = wordnet.synsets(word) if len(s): s = s[0] lexname = s.lexname if len(s.hypernym) > MAX_TABLE_NAME_LENGTH: lexname = '' else: lexname = '' tables.append(s) print '------------------------------------------------------------------------' print word print tables[0].hyponyms() print '------------------------------------------------------------------------' return tables
def add_vocab(self,word):
    # Register `word` in the vocabulary and bump the running word count.
    # NOTE(review): the branches below overlap — a word present in both
    # self.synsets and self.vocab increments total_words in the first block
    # AND again in the second; confirm whether that double count is intended.
    if word in self.synsets:
        self.synsets[word]+=1 #count frequency
        if word in self.vocab:
            self.total_words+=1
        else:
            self.vocab[word] = word
            self.total_words+=1
        pass
    if word in self.vocab:
        self.total_words+= 1
        pass
    elif not word in self.synsets:
        # Store the singular form only when WordNet actually knows it;
        # otherwise keep the word as-is.
        singular = singularize(word)
        if not wordnet.synsets(singular):
            singular = word
        self.vocab[word] = singular
        self.total_words+= 1
    elif word in self.synsets:
        self.vocab[word] = word
        self.total_words+= 1