def load(self, pos):
    """Build the WordNet tree for part-of-speech *pos* under self.root."""
    wn = self.wn
    # Nouns all descend from 'entity'; every other POS has multiple roots
    # (any synset without hypernyms).
    if pos == 'n':
        roots = wn.synsets('entity')
    else:
        roots = [s for s in wn.all_synsets(pos) if not s.hypernyms()]
    self.root = WordNetTreeNode('root')
    for synset in roots:
        self.__append_synset(synset, self.root)
    # The walk above is not guaranteed to build the entire tree: hyponyms()
    # occasionally omits entries (e.g. portugal.n.01 is missing from
    # european_country.n.01.hyponyms() even though hypernym_paths() shows
    # european_country.n.01 as an ancestor).  Patch in anything skipped.
    index = self.hashtable()
    for synset in wn.all_synsets(pos):
        if synset.name() not in index:
            for path in synset.hypernym_paths():
                keys = [s.name() for s in path]
                self.__extend(keys, is_internal=len(path[-1].hyponyms()) > 0)
def prepare(self):
    """Fill the module-level verbs/nouns/adjectives/adverbs word lists
    from WordNet lemma names (underscores replaced with spaces)."""
    for verb in wn.all_synsets('v'):
        for lemma in verb.lemmas():
            # Frame id 1 is the "Something ----s" (intransitive) frame.
            if 1 in lemma.frame_ids():
                # NOTE(review): this inner loop reuses the name `lemma` and
                # re-appends every lemma of the synset once per qualifying
                # lemma, so verbs gets duplicates — confirm this is intended.
                for lemma in verb.lemmas():
                    verbs.append(str(lemma.name()).replace('_', ' '))
    for noun in wn.all_synsets('n'):
        for lemma in noun.lemmas():
            # Nouns are stored in pluralised form.
            nouns.append(self.plural(str(lemma.name()).replace('_', ' ')))
    for adj in wn.all_synsets('a'):
        for lemma in adj.lemmas():
            adjectives.append(str(lemma.name()).replace('_', ' '))
    for adv in wn.all_synsets('r'):
        for lemma in adv.lemmas():
            adverbs.append(str(lemma.name()).replace('_', ' '))
def list_nouns(): global NOUNS print "[+] Creating list of nouns... (This only has to be done once)" if WIKI_LANGUAGE == 'en': ## Make list of nouns from wordnet NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} ## TODO CREATE A SEPARATE LIST FOR NOUNS ENDING IN S elif WIKI_LANGUAGE == 'es': ## Make list of nouns from cess_esp list = nltk.corpus.cess_esp.tagged_words() sust = [] for elem in list: if elem[1][0] == 'n': sust.append(elem[0]) NOUNS = set(sust) # TODO german language support # elif WIKI_LANGUAGE == 'de': else: print "[!] Language not recognised, using English." ## Make list of nouns from wordnet NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} print " Done!"
def populateBars(): connection = mdb.connect('localhost', 'user', 'pass', 'barlytics') current = connection.cursor() nounsList = [] adjectiveList = [] cityList = ['San Francisco', 'Chicago', 'New York', 'Austin', 'Seattle'] print "here" count = 0 for synset in list(wn.all_synsets('n')): nounsList.append(str(synset.name).split('.')[0]) count = count + 1 if count >= 50000: break count= 0 print "here" for synset in list(wn.all_synsets('a')): adjectiveList.append(str(synset.name).split('.')[0]) count = count + 1 if count >= 50000: break print "here" finalList = [] for i in range(10000): string = '' string = "The " + adjectiveList[randint(0, len(adjectiveList) - 1)].capitalize() string = string + " " + nounsList[randint(0, len(nounsList) - 1)].capitalize() finalList.append(string) name = string license = str(randint(1000000, 9000000)) city = str(address.city()) phone = str(phone_number.phone_number_format(0)) addr = str(randint(1, 255)) + " " + address.street_name() query = 'insert into bars values("' + name + '", "' + license + '", "' + city + '", "' + phone + '", "' + addr + '"); ' print query try: current.execute(query) except mdb.IntegrityError: print "integrity error:" print 'commit' connection.commit()
def _run_extract(self):
    """Extract all two-word AN and NN compounds from WordNet lemmas and
    write them to self.ANpath / self.NNpath (one phrase per line)."""
    #extract all 2 word AN and NN compounds from WN and write to file
    print "Extracting noun compounds from WN"
    discards=[]
    allsynsets=list(wn.all_synsets(self.parameters['pos']))
    # Outside testing mode, process every synset (self.n is the cap).
    if not self.parameters['testing']:
        self.n=len(allsynsets)
    for synset in list(wn.all_synsets(self.parameters['pos']))[:self.n]:
        for lemma in synset.lemmas: #walk over all lemmas for all synsets
            # Multiword lemmas are underscore-joined, e.g. "coffee_cup".
            words=lemma.name.split('_')
            if len(words)==2:#check 2 words
                poslist=[]
                for word in words:
                    poslist.append(PairGenerator.getpos(word))#generate a PosList for this pair of words
                # pop() order: head = last word, modifier = first word.
                headpos=poslist.pop()
                if 'N' in headpos:#is 'N' a possible part of speech for the head word (last word in the list)
                    phrase=words.pop()+'/N'
                    modpos=poslist.pop()
                    mod=words.pop()
                    if 'N' in modpos: #is 'N' a poss part of speech for mod
                        NNphrase=phrase+":nn-DEP:"+mod+'/N'
                        self.NNs.append(NNphrase)
                    if 'J' in modpos:#is 'J' a poss part of speech for mod
                        ANphrase=phrase+":amod-DEP:"+mod+'/J'
                        self.ANs.append(ANphrase)
                    if len(modpos)==0:#only considering J and N for mod
                        discards.append(lemma.name)
                else:#only considering N for head
                    discards.append(lemma.name)
    print len(self.NNs),self.NNs
    print len(self.ANs),self.ANs
    print len(discards),discards
    #write lists to file
    with open(self.ANpath,'w') as outstream:
        for AN in self.ANs:
            outstream.write(AN+'\n')
    with open(self.NNpath,'w') as outstream:
        for NN in self.NNs:
            outstream.write(NN+'\n')
    return
def exercise3(): print print "Exercise - 3" ss = [w for w in wn.all_synsets('v')] result = sum([len(ss[i].hypernyms()) for i in range(len(ss))]) print "Total number of hypernyms of 'v' is: %d" %result print "Average number of hypernyms is: %f" %(result/float(len(ss)))
def getAllGlossLinks(useTagger=False, useverbs=False, reflexive=False, n=10000): links = {} print "Gathering synsets" synsets = [ss for ss in wordnet.all_synsets()] n = 0 for ss in synsets: print "%.3f"%(float(n)/float(len(synsets))) n += 1 ssname = ss.name defn = wordboundary.split(ss.definition.strip()) if useTagger: defn = [(form, wdnettags[tag[0]]) for form, tag in useTagger.tag(defn) if not form == "" and tag[0] in wdnettags] if not ssname in links: links[ssname] = {} for w in defn[:n]: if type(w) == "str": wsynsets = wordnet.synsets(w) else: wsynsets = wordnet.synsets(w[0], w[1]) for s in wsynsets: sname = s.name links[ssname][sname] = True if reflexive: if not sname in links: links[sname] = {} links[sname][ssname] = True if not ssname in links: print ssname, defn for l in links: ll = links[l] for d in ll: links[l][d] = 1.0/float(len(ll)) return links
def __init__(self): t0 = time() print 'initalizing random word generator' self.s_articles = ['A', 'The'] self.o_articles = ['a','the'] self.prepositions = ['of','in','to','for','with','on','at','from','by', 'about','as','into','like','through','after','over','out','around'] self.nouns = list(wn.all_synsets(wn.NOUN)) self.verbs = list(wn.all_synsets(wn.VERB)) self.adjectives = list(wn.all_synsets(wn.ADJ)) self.adverbs = list(wn.all_synsets(wn.ADV)) t1 = time() runTime = t1-t0 print 'word list initalized in ' + str(runTime) + ' seconds'
def main(argv):
    """Report WordNet adjective lemma/synset statistics and vocabulary
    overlap with the Huang, Manaal and Brown word lists."""
    huang_vocab = LoadHuang()
    manaal_vocab = LoadManaal()
    brown_vocab = LoadBrown()
    # Lemma and synset populations, progressively filtered:
    # alphabetic-only, single-sense, and noun-compatible subsets.
    all_lemmas = {x.lower() for x in wn.all_lemma_names(pos=wn.ADJ)}
    all_alpha_lemmas = {x for x in all_lemmas if x.isalpha()}
    all_synsets = set(wn.all_synsets(pos=wn.ADJ))
    all_alpha_synsets = {x for x in all_synsets if IsAlphaSS(x)}
    all_lemmas_with_single_synset = {x for x in all_lemmas if IsSingleSynset(x)}
    all_lemmas_ambig_synset = {x for x in all_lemmas if not IsSingleSynset(x)}
    all_lemmas_with_single_synset_alpha = {x for x in all_lemmas_with_single_synset if x.isalpha()}
    all_lemmas_ambig_synset_alpha = {x for x in all_lemmas_ambig_synset if x.isalpha()}
    all_alpha_lemmas_has_noun = {x for x in all_alpha_lemmas if LemmaHasNoun(x)}
    all_alpha_lemmas_has_noun_single_lexname = {x for x in all_alpha_lemmas_has_noun if IsNounSingleLexName(x)}
    # Population sizes.
    print "all_lemmas:", len(all_lemmas)
    print "all_alpha_lemmas:", len(all_alpha_lemmas)
    print "all_synsets:", len(all_synsets)
    print "all_alpha_synsets:", len(all_alpha_synsets)
    print "all_lemmas_with_single_synset:", len(all_lemmas_with_single_synset)
    print "all_lemmas_ambig_synset:", len(all_lemmas_ambig_synset)
    print "all_lemmas_with_single_synset_alpha", len(all_lemmas_with_single_synset_alpha)
    print "all_lemmas_ambig_synset_alpha", len(all_lemmas_ambig_synset_alpha)
    print "all_alpha_lemmas_has_noun", len(all_alpha_lemmas_has_noun)
    print "all_alpha_lemmas_has_noun_single_lexname", len(all_alpha_lemmas_has_noun_single_lexname)
    # Overlap of each external vocabulary with the alphabetic lemmas.
    print "huang.intersect(all_alpha_lemmas)", len(huang_vocab.intersection(all_alpha_lemmas))
    print "manaal.intersect(all_alpha_lemmas)", len(manaal_vocab.intersection(all_alpha_lemmas))
    print "brown.intersect(all_alpha_lemmas)", len(brown_vocab.intersection(all_alpha_lemmas))
    print "huang*manaal*brown*all_alpha_lemmas", len(huang_vocab.intersection(all_alpha_lemmas, manaal_vocab, brown_vocab))
    # Overlap restricted to single-synset alphabetic lemmas.
    print "huang.intersect(all_lemmas_with_single_synset_alpha)", \
        len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha))
    print "manaal.intersect(all_lemmas_with_single_synset_alpha)", len(manaal_vocab.intersection(all_lemmas_with_single_synset_alpha))
    print "brown.intersect(all_lemmas_with_single_synset_alpha)", len(brown_vocab.intersection(all_lemmas_with_single_synset_alpha))
    print "huang*manaal*brown*all_lemmas_with_single_synset_alpha", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha, manaal_vocab, brown_vocab))
def convert_all_to_basic(reviews):
    """Rewrite each review, substituting tokens with their 'basic' noun
    form where check_basic finds one.  Returns the rewritten reviews."""
    print("Process Started")
    print("Gettin all nouns....")
    # Single-word noun synsets with short head words only.
    candidates = [s for s in wn.all_synsets(wn.NOUN)
                  if '-' not in s.name()
                  and '_' not in s.name()
                  and len(s.name().split('.')[0]) < 12]
    print("Processing basic logic probability...")
    basics = []
    filter_basic_logic(candidates, basics)
    print("Removing redundancy...")
    deduped = sorted(set(basics))
    remove_unwanted(deduped)
    rewritten = []
    for review in reviews:
        pieces = []
        for token in word_tokenize(review):
            replacement = check_basic(token, deduped)
            pieces.append(replacement if replacement else token)
        # Each output review keeps the original's leading space.
        rewritten.append(''.join(' ' + piece for piece in pieces))
    return rewritten
def wn_pos_dist():
    """Count the Synsets in each WordNet POS category.

    Returns a defaultdict mapping POS tag to synset count.
    """
    # One-dimensional count dict with 0 as the default value:
    cats = defaultdict(int)
    # The counting loop:
    for synset in wn.all_synsets():
        # NOTE(review): `pos` is a method (synset.pos()) on modern nltk;
        # this keeps the attribute access of the original — confirm version.
        cats[synset.pos] += 1
    # BUG FIX: the counts were previously computed and then discarded.
    return cats
def load_corpora( self ): print "Loading corpora..." pth = os.path.realpath( os.path.dirname(__file__) ) nltk.data.path.append( os.path.join( pth, "nltk_data" ) ) from nltk.corpus import wordnet as wn self._adjectives = list(wn.all_synsets('a')) self._nouns = list(wn.all_synsets('n')) with open( os.path.join( pth, "firstnames.txt") ) as fh: self._firstnames = fh.readlines() with open( os.path.join( pth, "surnames.txt") ) as fh: self._surnames = fh.readlines()
def populate_cache():
    """Write adjective and noun word-list files under CACHE_PATH.

    Lemmas containing digits, blacklisted words, '_to' phrases and
    zero-frequency lemmas are excluded; underscores become spaces.
    """
    adjectives = set()
    nouns = set()
    for wordset, kind in ((adjectives, wordnet.ADJ), (nouns, wordnet.NOUN)):
        for synset in wordnet.all_synsets(kind):
            for lemma in synset.lemmas():
                name = lemma.name()
                if re.search(r'\d', name):
                    continue
                if name in BLACKLIST or name.endswith('_to'):
                    continue
                if lemma.count() <= 0:
                    continue
                wordset.add(name.replace('_', ' '))
    os.mkdir(CACHE_PATH)
    for words, filename in ((adjectives, 'adjectives'), (nouns, 'nouns')):
        with open(os.path.join(CACHE_PATH, filename), 'w') as f:
            f.writelines(u'{}\n'.format(w) for w in words)
def list_nouns(): ## TODO CREATE A SEPARATE LIST FOR NOUNS ENDING IN S global NOUNS print "[+] Creating list of nouns... (This only has to be done once)" ## Make list of nouns in wordnet NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} print " Done!"
def ex26_branchingfactor():
    """Ch.2 ex.26: average branching factor of the noun hypernym hierarchy."""
    from nltk.corpus import wordnet as wn
    num_synsets = 0
    num_hyponyms = 0
    # branchingfactor_r (defined elsewhere) recursively accumulates
    # hyponym and synset counts for each tree rooted at noun_synset.
    for noun_synset in wn.all_synsets("n"):
        (num_hyponyms, num_synsets) = \
            branchingfactor_r(noun_synset, num_synsets, num_hyponyms)
    # NOTE(review): under Python 2 this is integer division, which truncates
    # the branching factor — confirm whether a float result was intended.
    print "branching factor=", (num_hyponyms / num_synsets)
def print_all_synset_categories():
    """Return every WordNet noun synset for research purposes.

    (The name is historical: the function returns rather than prints.)
    """
    # Materialize directly instead of the original's list() + append loop.
    return list(wordnet.all_synsets('n'))
def getTaggedHyps():
    """Map each POS tag occurring in WordNet to its hypernym table
    (as produced by getAllHyps for that tag)."""
    all_synsets = [ss for ss in wordnet.all_synsets()]
    pos_tags = {}
    for ss in all_synsets:
        pos_tags[ss.pos] = True
    return dict((tag, getAllHyps(all_synsets, tag=tag)) for tag in pos_tags)
def getUpDownLinks():
    """Build an adjacency map over hypernym and hyponym edges.

    Returns {synset_name: {neighbour_name: True}}.
    """
    links = {}
    for ss in wordnet.all_synsets():
        for s in ss.hypernyms() + ss.hyponyms():
            # setdefault replaces the original bare try/except, which would
            # also have swallowed unrelated errors (even KeyboardInterrupt).
            links.setdefault(ss.name, {})[s.name] = True
    return links
def synonyms(word, lch_threshold=2.26):
    """Yield (word_synset, other_synset, lch) triples whose Leacock-Chodorow
    similarity meets *lch_threshold*."""
    for net1 in wn.synsets(word):
        for net2 in wn.all_synsets():
            try:
                lch = net1.lch_similarity(net2)
            # lch_similarity raises for POS mismatches; catch Exception
            # instead of the original bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.
            except Exception:
                continue
            # Some nltk versions return None instead of raising on a
            # mismatch; guard so the comparison is safe on Python 3 too.
            if lch is not None and lch >= lch_threshold:
                yield (net1, net2, lch)
def print_all_synset_categories():
    """Return all domains and categories for research purposes.

    (The name is historical: the function returns rather than prints.)

    :rtype categories (list): A list of all wordnet noun synsets.
    """
    # Materialize directly instead of the original's list() + append loop.
    return list(wordnet.all_synsets('n'))
def wordlistGen(type_="a"): words = list(wordnet.all_synsets(type_)) word_strs = [] random.shuffle(words) for word in words[:1000]: word = str(word.lemma_names()[0]) print word word_strs.append(str(word)) pickle.dump(word_strs, open(os.getcwd() + "/" + type_ + ".p", "wb"))
def ex13(): from nltk.corpus import wordnet as wn num_synsets = 0 num_synsets_wo_hyponyms = 0 for noun_synset in wn.all_synsets("n"): if len(noun_synset.hyponyms()) == 0: num_synsets_wo_hyponyms = num_synsets_wo_hyponyms + 1 num_synsets = num_synsets + 1 print num_synsets_wo_hyponyms * 100 / num_synsets
def load(self, pos):
    """(Re)build the tree under self.root for part-of-speech *pos*."""
    # Nouns share the single root 'entity'; any other POS treats every
    # hypernym-less synset as a root.
    if pos == 'n':
        tops = wn.synsets('entity')
    else:
        tops = [s for s in wn.all_synsets(pos) if not s.hypernyms()]
    self.root = WordNetTreeNode('root')
    for top in tops:
        self.__append_synset(top, self.root)
def export_wn_lexicon(output_fpath): with codecs.open(output_fpath, "w", "utf-8") as output: num_lemmas = 0 for i, synset in enumerate(wordnet.all_synsets()): for lemma in synset.lemmas(): print >> output, lemma.name() num_lemmas += 1 print "Wordnet vocabulary:", output_fpath print "# lemmas:", num_lemmas
def __init__(self):
    """Precompute self.basic_word_list from short, single-word noun synsets."""
    # Candidates: noun synsets whose name has no '-' or '_' and whose
    # head word is shorter than 12 characters.
    candidates = [s for s in wn.all_synsets(wn.NOUN)
                  if s.name().find('-') == -1
                  and s.name().find('_') == -1
                  and len(s.name().split('.')[0]) < 12]
    filtered = self.filter_basic_logic(candidates)
    deduped = list(set(filtered))
    self.basic_word_list = self.remove_unwanted(deduped)
def _get_wordnet_words():
    """Get the set of all words known by WordNet.

    This is the set of all lemma names for all synonym sets in WordNet,
    lowercased.
    """
    words = set()
    for synset in wordnet.all_synsets():
        for word in synset.lemma_names():
            words.add(word.lower())
    return words
def ex27_polysemy(): from nltk.corpus import wordnet as wn for pos in ["n", "v", "a"]: synsets = wn.all_synsets(pos) num_synsets = 0 num_senses = 0 for synset in synsets: num_synsets = num_synsets + 1 num_senses = num_senses + len(synset.lemmas) print "polysemy(" + pos + ")=", (num_senses / num_synsets)
def branching_factor():
    """Average hyponym count over noun synsets that have at least one
    hyponym (i.e. internal nodes of the hierarchy)."""
    total = 0
    internal_nodes = 0  # renamed from `all`, which shadowed the builtin
    for synset in wn.all_synsets('n'):
        child_count = len(synset.hyponyms())
        if child_count > 0:
            total = total + float(child_count)
            internal_nodes += 1
    return total / float(internal_nodes)
def polysemy(pos):
    """Average number of senses per lemma occurrence for POS *pos*
    (total sense count divided by the number of synsets)."""
    senses = 0
    lemmas = []
    syns = list(wn.all_synsets(pos))
    for synset in syns:
        # NOTE(review): lemma_names is a method on modern nltk
        # (synset.lemma_names()); attribute access kept from the original.
        lemmas.extend(synset.lemma_names)
    for lemma in lemmas:
        senses = senses + len(wn.synsets(lemma, pos))
    # BUG FIX: plain `/` truncated the average under Python 2.
    return senses / float(len(syns))
def build_dictionary():
    """Dump {lowercased lemma name: [definitions...]} for all of WordNet
    to dictionary.json.gzip."""
    dictionary = dict()
    from nltk.corpus import wordnet
    for synset in wordnet.all_synsets():
        for lemma in synset.lemmas:
            # BUG FIX: the original membership test used the un-lowercased
            # name while entries were keyed lowercased, so a lemma whose
            # lowercase form was already present clobbered its list.
            key = lemma.name.lower()
            dictionary.setdefault(key, []).append(synset.definition)
    json.dump(dictionary, gzip.open('dictionary.json.gzip', 'w'))
self.off_to_description = {} self.description_to_off = {} with open(file, 'r') as f: for line in f: index = line.index(',') self.off_to_description[int(line[0:index])] = line[index + 1:].strip() for key in self.off_to_description: self.description_to_off[self.off_to_description[key]] = key print "synset reader successfully initialized" return self.off_to_description, self.description_to_off def get_description(self, offset): try: if isinstance(offset, int): return self.off_to_description[offset] if isinstance(offset, str): # ss_... return self.off_to_description[int(offset[3:])] except KeyError: print "synset offset ", offset, "not found" if __name__=="__main__": from nltk.corpus import wordnet as wn import sys print "writing synsets to {}".format(sys.argv[1]) with open(sys.argv[1], 'w') as f: for ss in list(wn.all_synsets()): f.write('{},{}\n'.format(ss.offset(), ss.definition()))
import discord
import asyncio
from discord.ext.commands import Bot
from discord.ext import commands
import json
from nltk.corpus import wordnet as wn
import random

# Full WordNet noun and adjective synset pools used for random art prompts.
nouns = list(wn.all_synsets(wn.NOUN))
adjectives = list(wn.all_synsets(wn.ADJ))

client = commands.Bot(description="Art Bot", command_prefix="!")

@client.event
async def on_ready():
    # Connection banner: bot identity plus server and user counts.
    print('Logged in as ' + client.user.name + ' (ID:' + client.user.id + ') | Connected to ' + str(len(client.servers)) + ' servers | Connected to ' + str(len(set(client.get_all_members()))) + ' users')
    print('--------')

@client.event
async def on_message(message):
    msg = message.content
    # Admins trigger a random prompt with the message "go" (case-insensitive);
    # the reply goes to a hard-coded channel id.
    if msg.lower() == 'go' and is_admin(message.author.id):
        await client.send_message(client.get_channel('545331566881144833'), get_random_options(nouns, adjectives))
from difflib import get_close_matches try: from nltk.corpus import wordnet as wn raise_lookuperror_if_wordnet_data_absent = wn.synsets("python") except LookupError: import nltk nltk.download("wordnet") from unipath import Path import inflect ALL_WORDNET_WORDS = set() for synset in list(wn.all_synsets()): for lemma in synset.lemmas(): ALL_WORDNET_WORDS.add(lemma.name()) verbs_fh = open(Path(__file__).ancestor(1).child("en-verbs.txt")) lines = verbs_fh.readlines() verbs_fh.close() CONJUGATED_VERB_LIST = [] for line in lines: if line[0] != ";": CONJUGATED_VERB_LIST.append( [string for string in line.strip().split(",") if string != ""]) ADJECTIVE_TO_ADVERB = {"good" : "well", "fast" : "fast", "hard" : "hard", "late" : "late", "early" : "early", "daily" : "daily", "straight" : "straight"} for ss in wn.all_synsets(pos = "r"): for lemma in ss.lemmas(): word = lemma.name()
set([word for (word, tag) in brown_rel_tagged if tag == "NOUN" and word.isalpha() and not word[0].isupper()]) ) romn_clean = sorted( set([word for (word, tag) in brown_rom_tagged if tag == "NOUN" and word.isalpha() and not word[0].isupper()]) ) # %% from nltk.corpus import wordnet as wn type = "n" # %% synsets = wn.all_synsets(type) # %% def find_polysemy(text): count = 0 for w in text: count += len(wn.synsets(w)) return count / len(text) n_rel = find_polysemy(rel_clean) print(n_rel)
from nltk.corpus import wordnet as wn

# Average polysemy of nouns: for every noun synset, count how many noun
# senses its head word participates in, then divide by the synset count.
noun_count = 0
total_noun_count = 0
for synset in wn.all_synsets('n'):
    # synset.name() looks like 'dog.n.01'; [:-5] strips the '.n.01' suffix.
    total_noun_count += 1
    noun_count += len(wn.synsets(synset.name()[:-5], 'n'))
print(noun_count, total_noun_count, noun_count / total_noun_count)
import numpy as np
from nltk.corpus import wordnet as wn
from vectorspace import VSM

def syn2sks(synset):
    """Return the deduplicated sensekeys of all lemmas in *synset*."""
    return list(set([lemma.key() for lemma in synset.lemmas()]))

# CLI: argv[1] = input sensekey-vector file, argv[2] = output synset-vector file.
sks_vecs_path = sys.argv[1]
syns_vecs_path = sys.argv[2]

print('Loading sensekey vecs ...')
sks_vsm = VSM()
sks_vsm.load_txt(sks_vecs_path)

print('Aggregating synset vecs ...')
# Group each synset's available sensekey vectors under the synset name.
syn_vecs = defaultdict(list)
for syn in wn.all_synsets():
    for sk in syn2sks(syn):
        if sk in sks_vsm.labels_set:
            syn_vecs[syn.name()].append(sks_vsm.get_vec(sk))

print('Writing synset vecs ...')
with open(syns_vecs_path, 'w') as syns_vecs_f:
    # NOTE(review): the loop target rebinds the name syn_vecs; it works
    # because items() is evaluated before the loop starts, but is fragile.
    for syn, syn_vecs in syn_vecs.items():
        # Mean-pool the sensekey vectors into one synset vector.
        syn_vec = np.array(syn_vecs).mean(axis=0)
        syn_vec_str = ' '.join([str(round(v, 6)) for v in syn_vec.tolist()])
        syns_vecs_f.write('%s %s\n' % (syn, syn_vec_str))
# https://medium.com/snips-ai/an-introduction-to-snips-nlu-the-open-source-library-behind-snips-embedded-voice-platform-b12b1a60a41a # # * To be very fair on our benchmarks and results, we used the same train and test set used by the other benchmarks and no cross validation or stratified splits were used. The test data was not used in any way to improve the results. The dataset used can be found here: # # https://github.com/Botfuel/benchmark-nlp-2018/tree/master/results # # # Spacy english dataset with vectors needs to be present. It can be downloaded using the following command: # # python -m spacy download en_core_web_lg # !python -m spacy download en_core_web_lg nlp = spacy.load('en_core_web_lg') print('Running') nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')} def get_synonyms(word, number=3): synonyms = [] for syn in wordnet.synsets(word): for l in syn.lemmas(): synonyms.append(l.name().lower().replace("_", " ")) synonyms = list(OrderedDict.fromkeys(synonyms)) return synonyms[:number] #Hyperparameters benchmark_dataset = '' # Choose from 'AskUbuntu', 'Chatbot' or 'WebApplication' oversample = False # Whether to oversample small classes or not. True in the paper
# Read all.csv and map each synset-offset class id to a readable class name.
split = []
ids = []
modelid = []
ids_dict = {}
# BUG FIX: synsetid was appended to below but never initialized (NameError).
synsetid = []
with open('all.csv', 'rb') as f:
    reader = csv.reader(f)  # csv read object file
    next(reader)  # skip the headers
    for row in reader:
        ids.append(row[0])
        synsetid.append(row[1])
        modelid.append(row[3])
        split.append(row[4])

# Map WordNet offsets to synsets, then resolve each distinct class id.
syns = list(wordnet.all_synsets())
offsets_list = [(s.offset(), s) for s in syns]
offsets_dict = dict(offsets_list)
class_ids = list(set(synsetid))
class_dict = {}
for id in class_ids:
    key = int(id)
    class_name = str(offsets_dict[key])
    # repr looks like "Synset('chair.n.01')": take the text before the
    # first '.' and drop the 8-character "Synset('" prefix.
    value = class_name.split('.')[0][8:]
    class_dict[key] = value
# text = [stemmer.stem(w) for w in text] text = [lemmatizer.lemmatize(w) for w in text] # now = time.time() # print('terminou lemmatize em:', int(now-start)) return ' '.join(text) def norm(vector): val = vector.dot(vector.transpose()) if vector.__class__ not in [np.matrix, np.ndarray]: val = val.toarray() return math.sqrt(val) # Então precisarei de lemma_namses. Quer ver se cobrem as definitions lemmas = [] defs = [] for s in wn.all_synsets(): defs.append(lemmatize(s.definition())) lemmas.append(lemmatize(' '.join(s.lemma_names()))) len(lemmas) len(defs) vectorizer = CountVectorizer() X = vectorizer.fit_transform(defs) vec2 = CountVectorizer() Y = vec2.fit_transform(lemmas) voc1 = vectorizer.get_feature_names() voc2 = vec2.get_feature_names() inter = list(set(voc1) & set(voc2)) print('voc1: {}; voc2: {}; inter: {}'.format(len(voc1), len(voc2), len(inter)))
def load_sk2syn(self):
    """Populate self.map_sk2syn with sensekey -> synset for all of WordNet."""
    for syn in wn.all_synsets():
        for lem in syn.lemmas():
            self.map_sk2syn[lem.key()] = syn
def get_all_syns(self):
    """Return a list of every synset in WordNet."""
    every_synset = wn.all_synsets()
    return list(every_synset)
data_test_500_rand1_unseen = json.load( open(os.path.join(data_path, 'data_test_500_rand1_unseen.json'))) data_desc_c = json.load(open(os.path.join(data_path, 'data_desc_c.json'))) word_list = [] for data in data_test_500_rand1_seen: word_list.append(data['word']) # word for data in data_test_500_rand1_unseen: word_list.append(data['word']) # word #lines = open(os.path.join(data_path, 'concept_words.txt')).readlines() #concept_words = [line.strip() for line in lines] concept_words = [value['word'] for value in data_desc_c] word_list = word_list + concept_words all_synsets = list(wn.all_synsets()) word_synset = {} for synset in all_synsets: # filter all multi-word phrases indicated by _ lemmas = [lemma for lemma in synset.lemmas() if "_" not in lemma.name()] if len(lemmas) == 0: continue for lemma in lemmas: wd = lemma.name().lower() if wd in word_list: if wd not in word_synset: word_synset[wd] = [] tmp = [le.name().lower() for le in lemmas] word_synset[wd].extend(tmp) word_syn = [] for wd in word_list:
#워드넷으로 명사, 동사, 형용사, 부사의 다의어 평균 계산 from nltk.corpus import wordnet as wn type = 'n' # 품사의 유형을 명사로 설정(n-명사, v-동사, r-부사, a-형용사) synsets = wn.all_synsets(type) # 존재하는 명사 유형 n의 모든 synset 반환 # lemma 리스트로 통합 lemmas = [] for synset in synsets: for lemma in synset.lemmas(): lemmas.append(lemma.name()) # 중복을 제거하고 개별 lemmas count lemmas = set(lemmas) #리스트를 집합으로 변환하면 중복제거 count = 0 for lemma in lemmas: count = count + len(wn.synsets(lemma, type)) print('개별 기본형 합계: ', len(lemmas)) print('총 뜻: ', count) print(type, '(명사)의 다의어 평균: ', count/len(lemmas))
import numpy as np
from nltk.corpus import wordnet as wn

# Dump every (head, relation, tail) triple for the listed WordNet
# relations as tab-separated lines on stdout.
RELATIONS = [
    'hyponyms', 'hypernyms', 'part_meronyms', 'substance_meronyms',
    'part_holonyms', 'substance_holonyms', 'entailments',
]
for head in wn.all_synsets():
    for rel in RELATIONS:
        for tail in getattr(head, rel)():
            print('\t'.join([head.name(), rel, tail.name()]))
from nltk.corpus import wordnet as wn from textstat.textstat import textstat """ Run with `python collect_naked.py > unfiltered_naked.txt` """ adjectives = list(wn.all_synsets('a')) + list(wn.all_synsets('s')) nakeds = [] # Collect all two-syllable adjectives for item in adjectives: for adj in item.lemmas(): syllables = round(textstat.syllable_count(adj.name())) if syllables == 2.0: naked = adj.name().replace("_", " ") nakeds.append(naked) # Uniques only nakeds = set(nakeds) for naked in nakeds: print naked.encode('utf8')
def get_concept_set():
    """Yield the name of every WordNet noun synset."""
    for synset in nlwn.all_synsets(pos='n'):
        yield synset.name()
'''
From WordNet extracts all nouns and writes them into a new txt file
'''
import nltk
from nltk.corpus import wordnet as wn

# Append one noun per line; synset.name() looks like 'dog.n.01', so the
# trailing '.n.NN' (5 characters) is stripped to leave just the word.
with open('English_nouns.txt', 'a') as out_file:
    for synset in wn.all_synsets('n'):
        out_file.write(synset.name()[:-5] + "\n")
# url = "https://api.weather.gov/gridpoints/LWX/89,65/forecast" # html = request.urlopen(url).read().decode('utf8') # print(html[2100:2200]) # ch3 ex22 # import re # response = request.urlopen('http://news.bbc.co.uk/') # raw = response.read().decode('utf8') # print(re.sub(r'(<.*?>|<\/.*?>)(?s)', '', raw)) # ch2 ex27 from nltk.corpus import wordnet as wn print("average polysemy of: ") # nouns synsets = wn.all_synsets("n") lemmas = set() for synset in synsets: for lemma in synset.lemmas(): lemmas.add(lemma.name()) count = 0 for lemma in lemmas: count = count + len(wn.synsets(lemma, "n")) print("nouns: %s" % (count / len(lemmas))) # verbs synsets = wn.all_synsets("v") lemmas = set() for synset in synsets: for lemma in synset.lemmas(): lemmas.add(lemma.name()) count = 0
#12. count_distinct = 0 dublettes = [] prev = '' for entry in nltk.corpus.cmudict.entries(): if ((entry[0] == prev) and (entry[0] not in dublettes)): dublettes.append(entry[0]) else: count_distinct = count_distinct + 1 prev = entry[0] print count_distinct print (len(dublettes) / count_distinct) * 100 #13. all_syns = list(wn.all_synsets('n')) no_hyponyms = [s for s in all_syns if len(s.hyponyms()) == 0] print (len(no_hyponyms) / len(all_syns)) * 100 #14. def supergloss(s): gloss = 'definition: ' + s.definition() + '\n\n' gloss = gloss + 'Hypernyms:\n' for hypernym in s.hypernyms(): gloss = gloss + hypernym.name() + ': ' + hypernym.definition() + '\n' gloss = gloss + '\nHyponyms:\n' for hyponym in s.hyponyms(): gloss = gloss + hyponym.name() + ': ' + hyponym.definition() + '\n' return gloss print superglosssuperglo (wn.synset('bicycle.n.01'))
def offset_pos_from_emb_key(str): return str[PREF:PREF + 8], str[PREF + 9:PREF + 10] def offstr(off): return '{0:0>8}'.format(off) def pos_of_set(set_name): return set_name.split('.')[-2] if __name__ == '__main__': # map synset names to offsets sset_offs = {s.name(): s.offset() for s in wn.all_synsets()} # load expected synset names in order (3.0) with open(MATRICES_DATA_W_SETS) as matrix_file: set_names = pickle.load(matrix_file)[1] # it's the same # as wn.all_synsets(), but let's be really careful and extensible. print('finished loading {} 3.0 synsets and offset map'.format( len(set_names))) # load embeddings syn_embs = {'n': {}, 'v': {}, 'r': {}, 'a': {}} with open(SYNSET_EMBEDDINGS_FILE) as embs_file: header_line = True for l in embs_file.readlines(): if header_line: # skip header_line = False
else: if len(def_words) <= 4 or ';' in definition: return definition.replace(',', '').split(';') else: return [] else: return [] # the whole process might take a long time for all POS, better deploy the process of each POS in separate machines # for noun synsets, we only deal with those that does not have example sentences for pos in ['n', 'v', 'a', 'r']: non_retreive = list() type2pos = {1: 'n', 2: 'v', 3: 'a', 4: 'r', 5: 'a'} all_synsets = [ i.name() for i in wn.all_synsets(pos) if len( wn.synsets(i.name().split('.')[0], type2pos[int( i.lemmas()[0].key().split('%')[1][0])])) > 0 ] if os.path.exists('./sentence_dict_%s_new' % pos): sentence_dict = { i: j for i, j in pickle.load(open('./sentence_dict_%s_new' % pos, 'rb')).items() } non_retreive = [i for i in sentence_dict.keys()] else: sentence_dict = defaultdict(list) non_retreive = all_synsets loop_bool = True
def get_all_synsets(self):
    """Return an iterator over every WordNet noun synset."""
    noun_synsets = wn.all_synsets('n')
    return noun_synsets
# WordNet walkthrough: synsets, lemmas and hyponym/hypernym navigation
# for 'car'/'motorcar'.
motorcar = wn.synsets('motorcar')
print('synsets that motorcar belongs to: ' + repr(motorcar))
cars = wn.synset('car.n.01')
print('synset of car sense 1: ' + str(cars))
print('car sense 1 lemma names: ' + repr(cars.lemma_names()))
print('car sense 1 definition: ' + cars.definition())
print('car sense 1 example sentences: ' + repr(cars.examples()))
car_lemmas = cars.lemmas()
print('car sense 1 lemmas: ' + repr(car_lemmas))
automobile = wn.lemma('car.n.01.automobile')
print('synset of automobile (car sense 1): ' + str(automobile.synset()))
print('name of the automobile lemma: ' + automobile.name())
all_noun_synsets = wn.all_synsets('n')
# NOTE: list() exhausts the generator; all_noun_synsets is spent after this.
print('number of noun synsets: ' + str(len(list(all_noun_synsets))))
car_synsets = wn.synsets('car')
print('synsets that car belongs to: ' + repr(car_synsets))
for synset in car_synsets:
    print(str(synset) + ' ' + repr(synset.lemma_names()))
print('synsets in which car is a lemma: ' + repr(wn.lemmas('car')))
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print('types of motorcars: ' + repr(types_of_motorcar))
print('types of motorcars (all words): ' + repr(sorted([lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()])))
print('motorcar hypernyms: ' + repr(motorcar.hypernyms()))
#6. What percentage of noun synsets have no hyponyms? You can get all noun synsets using wn.all_synsets('n')
from nltk.corpus import wordnet as wn

noun_synsets = set(wn.all_synsets('n'))
total = len(noun_synsets)
without_hyponyms = sum(1 for synset in noun_synsets if len(synset.hyponyms()) == 0)
percentage = round((without_hyponyms / total) * 100, 2)
print(percentage, "%")
def init_gloss_data(depth=2):
    """For every synset, build a bag of gloss/example words with their
    frequency and graph distance, expanding through related synsets for
    *depth* hops.

    Returns {synset_name: {word: {"freq": int, "graph_distance": int}}}.
    Relies on module-level synset_gloss / synset_example lookup tables
    (and synset_gloss_relation when use_glossdisambiguated is set).
    """
    gloss_data = {}
    for synset in tqdm.tqdm(list(wn.all_synsets())):
        gloss_data[synset.name()] = {}
        related_synsets = [synset]
        for d in range(depth):
            # Gather gloss and example words from the current frontier.
            gloss_words = []
            example_words = []
            for s in related_synsets:
                gloss_words += synset_gloss[s.name()]
                example_words += synset_example[s.name()]
            # First sighting records the distance; later sightings only
            # bump the frequency.
            for w in gloss_words:
                if w not in gloss_data[synset.name()]:
                    gloss_data[synset.name()][w] = {
                        "freq": 1,
                        "graph_distance": d,
                    }
                else:
                    gloss_data[synset.name()][w]["freq"] += 1
            # Example words count as one hop further than gloss words.
            for w in example_words:
                if w not in gloss_data[synset.name()]:
                    gloss_data[synset.name()][w] = {
                        "freq": 1,
                        "graph_distance": d + 1,
                    }
                else:
                    gloss_data[synset.name()][w]["freq"] += 1
            # Expand the frontier through every WordNet synset relation,
            # plus lemma-level derivations and pertainyms.
            new_related_synset = []
            for s in related_synsets:
                ns = s.also_sees() \
                    + s.attributes() \
                    + s.causes() \
                    + s.entailments() \
                    + s.hyponyms() \
                    + s.hypernyms() \
                    + s.instance_hypernyms() \
                    + s.instance_hyponyms() \
                    + s.member_meronyms() \
                    + s.member_holonyms() \
                    + s.part_holonyms() \
                    + s.part_meronyms() \
                    + s.region_domains() \
                    + s.substance_meronyms() \
                    + s.substance_holonyms() \
                    + s.topic_domains() \
                    + s.usage_domains() \
                    + s.verb_groups() \
                    + s.similar_tos()
                for l in s.lemmas():
                    ns += [
                        x.synset() for x in l.derivationally_related_forms()
                    ]
                    ns += [x.synset() for x in l.pertainyms()]
                if use_glossdisambiguated:
                    ns += synset_gloss_relation[s.name()]
                new_related_synset += ns
            # Deduplicate before the next hop.
            related_synsets = list(set(new_related_synset))
    return gloss_data
import django

django.setup()

# WordNet30 Populating
from ws_web.models import WordNet30
from nltk.corpus import wordnet as wn


def populate(x):
    """Insert the synset `x` into the WordNet30 table.

    Uses get_or_create so re-running the script is idempotent: an existing
    row with the same fields is reused rather than duplicated.

    Args:
        x: an nltk WordNet Synset.
    """
    d, created = WordNet30.objects.get_or_create(
        word=x.name().split('.')[0],  # lemma part of e.g. 'car.n.01'
        pos=x.pos(),
        offset=x.offset(),
        definition=x.definition(),
        examples=x.examples(),
        lemma_names=x.lemma_names(),
        name=x.name())
    print(d, created)


if __name__ == "__main__":
    # all_synsets() is already iterable — the original wrapped it in a
    # redundant iter() call and hand-counted; count as we go instead.
    count = 0
    for synset in wn.all_synsets():
        count += 1
        populate(synset)
    print(count)
for entry in test: for ingredient in entry["ingredients"]: words = ingredient.lower().replace("-","").split(" ") for i, word in enumerate(words): if word in adjDict and i<len(words)-1: adjDict[word] += 1 wordList = [] for key, value in adjDict.iteritems(): if value>5: wordList.append(key) nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} # adjs = {x.name().split('.', 1)[0] for x in wn.all_synsets(wn.ADJ)} finalList = [] seen = set() for word in wordList: if word in nouns: wordList.remove(word) else: finalList.append(word) seen.add(word) print finalList with open('excludeList.json', 'w') as outfile: json.dump(finalList, outfile)
# Gather input images and, if requested, prepare word lists for generating
# human-memorable names. Python 2 snippet; uses the pre-NLTK-3 attribute API
# (synset.lemmas / lemma.name as attributes, not methods) — presumably
# written against nltk < 3.0; TODO confirm before upgrading.
files = sorted( glob.glob( input_search_string ) )
print "Found {0} input images in {1}".format( len(files), input_search_string )

if len(files) > 0:
    #Only load names if there is something to name
    if generate_memorable_names:
        print 'Loading words for memorable name generation.'
        import nltk
        import random
        from nltk.corpus import wordnet
        # Seed based on input path so that names will be the same for multiple volumes
        random.seed( sbdm_string_hash( original_input_ids_path ) )
        # One synset list per part of speech, then paired into the two pools
        # a name is drawn from: adjective/adverb + noun/verb.
        nouns, verbs, adjectives, adverbs = [list(wordnet.all_synsets(pos=POS)) for POS in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]]
        nouns_verbs = nouns + verbs
        adjectives_adverbs = adjectives + adverbs
        # Drop synsets whose first lemma is multi-word or hyphenated.
        nouns_verbs = [x for x in nouns_verbs if not ( '_' in x.lemmas[0].name or '-' in x.lemmas[0].name )]
        adjectives_adverbs = [x for x in adjectives_adverbs if not ( '_' in x.lemmas[0].name or '-' in x.lemmas[0].name )]

        def make_memorable_name():
            # Rejection-sample an adjective/adverb lemma until a single,
            # unhyphenated word comes up.
            while True:
                word1 = random.choice(random.choice(adjectives_adverbs).lemmas).name
                #ignore hyphenated words
                if not ('_' in word1 or '-' in word1):
                    break
            # NOTE(review): snippet is truncated here in this chunk — the
            # second word's sampling loop continues beyond the visible source.
            while True:
#!/usr/bin/env python
import nltk
from nltk.corpus import wordnet as wn

# Print every WordNet noun lemma that the POS tagger labels as a proper noun
# (NNP) and that is longer than three characters, with underscores replaced
# by spaces.
for synset in wn.all_synsets(pos=wn.NOUN):
    lemma_names = [lemma.name() for lemma in synset.lemmas()]
    for word, tag in nltk.pos_tag(lemma_names):
        if tag == 'NNP' and len(word) > 3:
            print(word.replace('_', ' '))
def add_topic_features(data, test=False):
    """Add one binary column per discovered topic word to `data`.

    Train mode (test=False): derive topic words from the adjectives in
    'reviewText' via TF-IDF + truncated SVD, persist them to 'topics.txt',
    then flag each row whose review contains a topic word.
    Test mode (test=True): reload the topic words from 'topics.txt' and
    flag rows the same way.

    Assumes data['reviewText'] holds pre-tokenized word lists (the code
    iterates and set()s each cell directly) — TODO confirm against caller.
    Relies on module-level imports not visible here: wn, pd,
    TfidfVectorizer, TruncatedSVD.

    Args:
        data: pandas DataFrame with a 'reviewText' column.
        test: False for the training path, True for the test path.

    Returns:
        The DataFrame with the new topic columns added (0/1 flags).
    """
    print("Adding topic features")
    if not test:
        print("Train set registered, computing sentiment and topics")
        # Surface forms of every WordNet adjective, e.g. 'good' from 'good.a.01'.
        adjectives = set([
            synset.name().split('.')[0]
            for synset in list(wn.all_synsets(wn.ADJ))
        ])
        dataset = data
        documents = dataset['reviewText']
        new_df = pd.DataFrame({'document': documents})
        tokenized_doc = new_df['document']
        # Keep only the adjective tokens of each review.
        tokenized_doc = tokenized_doc.apply(
            lambda x: [item for item in x if item in adjectives])
        # de-tokenization: join tokens back into strings for the vectorizer
        detokenized_doc = []
        for i in range(len(new_df)):
            t = ' '.join(tokenized_doc[i])
            detokenized_doc.append(t)
        new_df['document'] = detokenized_doc
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=100,  # keep top 100 terms
            max_df=0.9,
            smooth_idf=True)
        X = vectorizer.fit_transform(new_df['document'])
        # SVD represents documents and terms in a shared latent vector space.
        svd_model = TruncatedSVD(n_components=25,
                                 algorithm='randomized',
                                 n_iter=100,
                                 random_state=122)
        svd_model.fit(X)
        terms = vectorizer.get_feature_names()
        topics = []
        # Take the single highest-weighted term of each SVD component as
        # that component's topic label (only sorted_terms[0] is used).
        for i, comp in enumerate(svd_model.components_):
            terms_comp = zip(terms, comp)
            sorted_terms = sorted(terms_comp, key=lambda x: x[1],
                                  reverse=True)[:7]
            topics.append(sorted_terms[0][0])
        # Deduplicate; different components may share a top term.
        cleaned_topics = list(set(topics))
        print("Writing topics")
        print(cleaned_topics)
        # Persist so the test path can reuse the exact same topic set.
        with open('topics.txt', 'w') as topic_file:
            for item in cleaned_topics:
                topic_file.write(item + '\n')
        # add features to df: one all-zero column per topic, then flag hits
        df = data
        df = pd.concat([df, pd.DataFrame(columns=cleaned_topics)], sort=False)
        df = df.fillna(int(0))
        print("Adding topics to train")
        print(cleaned_topics)
        for i, row in df.iterrows():
            intersect = set(row['reviewText']) & set(cleaned_topics)
            for word in intersect:
                df.at[i, word] = 1
        return df
    else:
        print("Test registered, writing topics to dataframe")
        cleaned_topics = []
        print("Opening topic file")
        # Reload the topic words written during training.
        with open('topics.txt', 'r') as topic_file:
            for line in topic_file.readlines():
                cleaned_topics.append(line.strip("\n"))
        #print(data.head(2))
        print("Adding test topics")
        print(cleaned_topics)
        df = data
        df = pd.concat([df, pd.DataFrame(columns=cleaned_topics)], sort=False)
        df = df.fillna(int(0))
        for i, row in df.iterrows():
            intersect = set(row['reviewText']) & set(cleaned_topics)
            for word in intersect:
                df.at[i, word] = 1
        return df