def nounify(verb_word):
    set_of_related_nouns = set()

    l1 = wn.morphy(verb_word, wn.VERB)
    if (l1 is not None):
        print(l1)
        for lemma in wn.lemmas(l1, pos="v"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)

    l1 = wn.morphy(verb_word, wn.ADJ)
    if (l1 is not None):
        print(l1)
        for lemma in wn.lemmas(l1, pos="a"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)
        for lemma in wn.lemmas(l1, pos="s"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)

    return set_of_related_nouns
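# Usage sketch (not part of the original snippet): nounify() above collects noun synsets that are
# derivationally related to a verb or adjective and that denote persons. Assuming
# `from nltk.corpus import wordnet as wn` and the WordNet data are installed, a call might look
# like this; the exact synsets returned depend on the WordNet version, but nounify('teach') is
# expected to include Synset('teacher.n.01').
related_person_nouns = nounify('teach')
for s in sorted(related_person_nouns, key=lambda s: s.name()):
    print(s.name(), '-', s.definition())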
def print_other_lexical_rel():
    good1 = wn.synset('good.a.01')
    wn.lemmas('good')
    print("Antonyms of 'good': " + str(good1.lemmas()[0].antonyms()))
    print("")
    print("Entailment of 'walk': " + str(wn.synset('walk.v.01').entailments()))
    print("")
def wordNet_features(lidx_start, lidx_end, ridx_start, ridx_end, pos_dict):
    all_tokens = [tok.lower() for tok, span in pos_dict.values()]
    features = []
    try:
        sims = set(wn.synsets(all_tokens[lidx_start])).intersection(
            wn.synsets(all_tokens[ridx_start]))
        if len(sims) > 0:
            features.append(1)
        else:
            features.append(0)
    except:
        features.append(0)
    try:
        lderiv = set(
            itertools.chain.from_iterable([
                lemma.derivationally_related_forms()
                for lemma in wn.lemmas(all_tokens[lidx_start])
            ]))
        rderiv = set(
            itertools.chain.from_iterable([
                lemma.derivationally_related_forms()
                for lemma in wn.lemmas(all_tokens[ridx_start])
            ]))
        if len(lderiv.intersection(rderiv)) > 0:
            features.append(1)
        else:
            features.append(0)
    except:
        features.append(0)
    return features
def isCompound(tok1, tok2):
    if tok1 is None or tok2 is None:
        return False
    l1 = tok1.stem
    l2 = tok2.stem
    ll = [l1, l2]
    return wn.lemmas(''.join(ll)) or wn.lemmas('_'.join(ll)) or wn.lemmas('-'.join(ll))
def path_similarity(term1, term2):
    '''
    Evaluates the path similarity between two Italian words using WordNet.

    :param term1: The first word. Must be a dictionary with the following keys:
        "lemma": the lemma, "pos": the POS tag
    :param term2: The second word. Must be a dictionary with the following keys:
        "lemma": the lemma, "pos": the POS tag
    :return: the path similarity
    :rtype: float
    '''
    if term1['pos'] in ['r', 'n', 'a', 'v']:
        lemma1 = wn.lemmas(term1['lemma'], lang='ita', pos=term1['pos'])
    else:
        lemma1 = wn.lemmas(term1['lemma'], lang='ita')
    if term2['pos'] in ['r', 'n', 'a', 'v']:
        lemma2 = wn.lemmas(term2['lemma'], lang='ita', pos=term2['pos'])
    else:
        lemma2 = wn.lemmas(term2['lemma'], lang='ita')

    if (len(lemma2) == 0 or len(lemma1) == 0) and term1['lemma'] == term2['lemma']:
        return 1
    if len(lemma1) == 0 or len(lemma2) == 0:
        return 0

    synset1 = lemma1[0].synset()
    synset2 = lemma2[0].synset()
    tmp = synset1.path_similarity(synset2)
    if tmp is None:
        return 0
    else:
        return tmp
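# Usage sketch (assumption, not from the original source): path_similarity() above expects plain
# dicts with "lemma" and "pos" keys and looks up Italian (lang='ita') WordNet lemmas, which
# requires the Open Multilingual WordNet data to be downloaded.
term_cane = {'lemma': 'cane', 'pos': 'n'}
term_gatto = {'lemma': 'gatto', 'pos': 'n'}
print(path_similarity(term_cane, term_gatto))  # a float in [0, 1], or 0 if no synsets are found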
def is_eventive_adverb(token):
    """
    Returns true if the given token is an eventive adverb.
    We use simple and conservative heuristics that detect adverbs originating
    from present or past participles of verbs as events, e.g., 'sparklingly'
    and 'unexpectedly'.
    """
    word = token.text.lower()

    # Remove a prefix, if any, to make verb conjugation and checking easier below.
    morphemes = get_morphemes(word)
    if morphemes and morphemes[0] in PREFIXES:
        word = word[len(morphemes[0]):]
    if len(word) == 0:
        return False

    # Pertainyms are relational adjectives
    lm_adjs = [
        lm_ptn for lm in wn.lemmas(word, pos=wn.ADV)
        for lm_ptn in lm.pertainyms()
    ]
    if word.endswith('ly'):
        lm_adjs.extend(wn.lemmas(word[:-2]))
    words = set([lm_adj.name() for lm_adj in lm_adjs])

    for word in words:
        conj_inf = conjugate(word, 'inf')
        if wn.lemmas(conj_inf, wn.VERB):
            if conjugate(conj_inf, 'ppart') == word:
                return True
            if conjugate(conj_inf, 'part') == word:
                return True
    return False
def func1():
    while True:
        word = input('Enter a word: ')

        print('Synsets:')
        for i, w in enumerate(wn.synsets(word)):
            print("{0}) '{1}' -- definition: '{2}'".format(i, w.name(), w.definition()))

        print('Lemmas:')
        for i, w in enumerate(wn.lemmas(word)):
            print("{0}) '{1}'".format(i, str(w)))
        print('')
        #continue

        lem_id = int(input('Enter an lemma ID: '))
        l = wn.lemmas(word)[lem_id]
        print(l, l.synset().definition())
        continue

        syn_id = int(input('Enter an synset ID: '))
        synset = wn.synsets(word)[syn_id]
        print("Lemmas in %s:" % str(synset))
        if len(wn.synsets(word)) > 0:
            for i, l in enumerate(synset.lemmas()):
                print("{0}) '{1}'".format(i, l))

        lem_id = int(input('Enter an lemma ID: '))
        lemma = synset.lemmas()[lem_id]
        print([x for x in dir(lemma) if not re.match('^_.+', x)])
        print("<%s> :" % str(lemma))
        print('synset - ', lemma.synset())
        print('hypernyms - ', lemma.hypernyms())
        print('hyponyms - ', lemma.hyponyms())
        print('pertainyms - ', lemma.pertainyms())
        print('antonyms - ', lemma.antonyms())
        print('usage_domains - ', lemma.usage_domains())
def lemma_builder(words):
    """ returns a list of lemmas from a list of words """
    lemmas = []
    for word in words:
        if wn.lemmas(word):
            lemmas.append(wn.lemmas(word)[0].name)
    return lemmas
def synonyms(wordlist):
    wlist = []
    for word in wordlist:
        print word
        if wordnet.lemmas(word):
            string = str(wordnet.lemmas(word)[0])
            wlist += re.findall("Lemma\(\'(.+?)\.\w", string)
            print re.findall("Lemma\(\'(.+?)\.\w", string)
        else:
            wlist.append(word)
    return wlist
def get_reasonable_synsets(word):
    lemmas = wordnet.lemmas(word)
    cnet_lemma, _pos = LEMMATIZER.lookup('en', word)
    if cnet_lemma != word:
        lemmas += wordnet.lemmas(cnet_lemma)
    good_synsets = []
    for lem in lemmas:
        syn = lem.synset()
        if syn.lemma_names()[0] == word:
            good_synsets.append(syn)
    if not good_synsets:
        return [lem.synset() for lem in lemmas]
    else:
        return good_synsets
def wordnet_vector(concept, conceptlist):
    from nltk.corpus import wordnet
    start_points = wordnet.lemmas(concept.replace(' ', '_'))
    if not start_points:
        return None
    results = divisi2.DenseVector(None, conceptlist)
    for concept2 in conceptlist:
        end_points = wordnet.lemmas(concept2.replace(' ', '_'))
        best_sim = 0.0
        for start_point in start_points:
            for end_point in end_points:
                sim = start_point.synset.wup_similarity(end_point.synset)
                if sim > best_sim:
                    best_sim = sim
        results[results.index(concept2)] = best_sim
    return results
def predict_nearest(self, context):
    lemma = context.lemma
    pos = context.pos
    similarity = {}
    LemmaList = wn.lemmas(lemma, pos=pos)
    for lemmaTemp in LemmaList:
        LemmaListNew = lemmaTemp.synset().lemmas()
        # print(LemmaListNew)
        for l in LemmaListNew:
            name = l.name()
            # print(name)
            if name != lemma:
                if name not in self.model.vocab:
                    continue
                #if "_" in name:
                #    name = name.replace("_", " ")
                #    continue
                if name not in similarity:
                    similarity[name] = self.model.similarity(name, lemma)
    SimilarityList = sorted(similarity.items(), key=lambda x: x[1], reverse=True)
    return SimilarityList[0][0]
def predict_nearest(self, context):
    stop_words = stopwords.words('english')
    target_lemma, target_pos = context.lemma, context.pos
    v1 = self.model.wv[target_lemma]
    possible_synonyms = {}
    for lexeme in wn.lemmas(target_lemma, pos=target_pos):
        synset = lexeme.synset()
        for w in synset.lemmas():
            synonym = w.name().replace('_', ' ')
            synonym_words = synonym.split()
            # if the synonym consists of only one word and it is embedded in wv,
            # calculate the cosine distance between the target and the synonym;
            # if the synonym is not in wv, use the 'UNK' embedding
            if len(synonym_words) == 1 and synonym_words[0] in self.model.wv:
                v2 = self.model.wv[synonym_words[0]]
                possible_synonyms[synonym] = self.cos(v1, v2)
            else:
                # if the synonym is composed of multiple words, e.g. "take up",
                # first get rid of the stop words, then find the first word in wv if available,
                # calculate the cosine distance between that first word and the target,
                # and use the first word as the prediction
                synonym_words = [w for w in synonym_words if w not in stop_words]
                if synonym_words[0] in self.model.wv:
                    v2 = self.model.wv[synonym_words[0]]
                    possible_synonyms[synonym_words[0]] = self.cos(v1, v2)
    possible_synonyms.pop(target_lemma, None)
    predictor = [
        k for k, v in possible_synonyms.items()
        if v == max(possible_synonyms.values())
    ][0]
    return predictor  # replace for part 4
def get_word_sense_vectors(candidate):
    vectors = {}
    try:
        candidate_vec = glove[candidate]
    except Exception:
        # print(candidate, "not found in glove")
        return None
    for sense in wn.lemmas(candidate):
        # if candidate == "bank":
        #     print("synonym of ", candidate, " is ", ss.lemmas()[0].name())
        #     print("key of ", candidate, " is ", ss.lemmas()[0].key())
        gloss = [sense.synset().definition()]
        gloss.extend(sense.synset().examples())
        word_vectors = []
        for sentence in gloss:
            tokens = nltk.word_tokenize(sentence)
            pos_tags = nltk.pos_tag(tokens)
            for gloss_pos, tag in pos_tags:
                if get_valid_pos_tag(tag):
                    try:
                        gloss_word_vec = glove[gloss_pos]
                    except Exception:
                        # print(gloss_pos, "not found in glove")
                        continue
                    cos_sim = dot(gloss_word_vec, candidate_vec) / (norm(gloss_word_vec) * norm(candidate_vec))
                    if cos_sim > cosine_sim_threshold:
                        word_vectors.append(gloss_word_vec)
        if len(word_vectors) == 0:
            continue
        sense_vector = average(word_vectors, 0)
        vectors[sense] = sense_vector
    return vectors
def get_w2v_embeddings_from_pretrained_googlenews_wordnet(pretrained_embedding_fpath, save_path):
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))

    tic = time.time()
    print('Please wait ... (it could take a while to load the file : {})'.format(
        pretrained_embedding_fpath))
    model = gensim.models.KeyedVectors.load_word2vec_format(
        pretrained_embedding_fpath, binary=True)
    print('Done. (time used: {:.1f}s)\n'.format(time.time() - tic))

    embedding_weights = {}
    found_cnt = 0
    for word in tqdm.tqdm(model.vocab, desc="Filtering Words Using WordNet"):
        if has_digits(word) or len(word) < 3:
            continue
        if len(wordnet.lemmas(word)) > 0:
            embedding_weights[word] = model.word_vec(word)
            found_cnt += 1
    save2pickle(save_path, embedding_weights)
def _lemmata_by_freq(query, pos):
    """Return lemmata for query, sorted descending by frequency."""
    lemmata = wordnet.lemmas(query, pos)
    return sorted(lemmata, key=lambda lemma: lemma.count(), reverse=True)
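# Usage sketch (hedged, not from the original source): _lemmata_by_freq() above sorts the Lemma
# objects for a word by their SemCor-based count(), most frequent first. Assuming
# `from nltk.corpus import wordnet`:
for lm in _lemmata_by_freq('bank', wordnet.NOUN):
    print(lm.name(), lm.count())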
def thesaurus(word):
    ss = wn.synsets(word)
    lms = wn.lemmas(word)
    if ss:
        pt = 200
        defl = ""
        for lem in lms:
            if lem.count() == 0:
                lem_count = 100
            else:
                lem_count = lem.count()
            if lem._lexname_index == 0:
                lem_lex_index = 1
            else:
                lem_lex_index = lem._lexname_index / 100
            if pt > lem_count + lem_lex_index:
                pt = lem_count + lem_lex_index
                defl = lem.synset().definition()
            # defl=defl+markgreen('['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>')
            # else:
            #     defl=defl+'['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>'
        # ssddf = ss[0].definition()
        # for ssitem in ss:
        #     curdegf=ssitem.lexname
        return defl  # ss[0].definition()
    else:
        return ""
def is_eventive_adjective(token):
    """
    Returns true if the given token is an eventive adjective.
    We use simple and conservative heuristics that detect adjectives originating
    from present or past participles of verbs as events, e.g., 'sparkling'
    and 'man-made'.
    """
    word = token.text.lower()
    if '-' in word:  # e.g., 'well-known'
        # Finding a head subword is difficult. First, we check if
        # there is a verb in subwords. If not, use the last token.
        verb = None
        for subword, pos in reversed(nltk.pos_tag(word.split('-'))):
            if pos.startswith('VB'):
                verb = subword  # found a verb
                break
        word = verb if verb is not None else word.split('-')[-1]

    # Remove a prefix, if any, to make verb conjugation and checking easier below.
    morphemes = get_morphemes(word)
    if morphemes and morphemes[0] in PREFIXES:
        word = word[len(morphemes[0]):]
    if len(word) == 0:
        return False

    conj_inf = conjugate(word, 'inf')
    if wn.lemmas(conj_inf, wn.VERB):
        if conjugate(conj_inf, 'ppart') == word:
            return True
        if conjugate(conj_inf, 'part') == word:
            return True
    return False
def best_synset(word_str, pos_tag='n'):
    if isinstance(word_str, Token):
        word_str, pos_tag = word_str.text.lower(), word_str.pos_
    assert isinstance(word_str, str)
    assert isinstance(pos_tag, str)
    lemma = lemmatize(word_str.lower())
    if lemma:
        lemma = lemma[0]
    tag = to_wordnet_tag(pos_tag)
    try:
        if lemma and pos_tag:
            synset = wn.synset('{}.{}.{}'.format(lemma, tag, '01'))
            if synset:
                return synset
        raise WordNetError
    except WordNetError:
        try:
            lemmas = wn.lemmas(lemma)
            if lemmas:
                synset = lemmas[0].synset()
                if synset:
                    return synset
            raise WordNetError
        except WordNetError:
            pass
def hyponyms(word):
    hyponyms = []
    for lemma in wn.lemmas(word):
        for hyponym in lemma.synset.hyponyms():
            for word in hyponym.lemma_names:
                hyponyms.append(word.replace('_', ' '))
    return hyponyms
def get_sinonimos(palavra: str, lang: str) -> list:
    """
    Runs the synonym lookup against WordNet.

    :param palavra: word whose synonyms should be looked up
    :param lang: language (abbreviation) into which the synonyms should be translated
    :return: list of strings with the synonyms returned by WordNet.
        Each string is in the format "^.*\.(a|v|n).([0-9][0-9])\..*$"
    """
    logger.info("Looking up synonyms: [palavra=%s, lang=%s]", palavra, lang)
    sinonimos = set()

    lemmasDaPalavra = wn.lemmas(palavra, lang=SinonimosConstantes.LEMMAS_LANG)
    logger.debug("Lemmas of word '%s' with lang=%s: %s", palavra,
                 SinonimosConstantes.LEMMAS_LANG, lemmasDaPalavra)

    for lemma in lemmasDaPalavra:
        synsetNome = lemma.synset().name()
        synsetLemmas = wn.synset(synsetNome).lemmas(lang)
        for synsetLemma in synsetLemmas:
            synsetLammaName = synsetLemma.name()
            synsetLemmaSynsetName = synsetLemma.synset().name()
            sinonimo = '.'.join([synsetLemmaSynsetName, synsetLammaName])
            sinonimos.add(sinonimo)
            logger.debug("[lemma=%s] = [synsetNome=%s, synsetLemmas=%s] = [synsetLammaName=%s, synsetLemmaSynsetName=%s] = [sinonimo=%s]",
                         lemma, synsetNome, synsetLemmas, synsetLammaName, synsetLemmaSynsetName, sinonimo)

    logger.info("Synonyms found: %s", str(sinonimos))
    return list(sinonimos)
def get_more_candidates(lemma, pos, depth=2):
    # Return solution as a set to make sure unique lemmas are returned
    possible_synonyms = set([])
    # Retrieve all lexemes for the particular lemma and pos
    lexemes = wn.lemmas(lemma, pos=pos)
    # Iterate over lexemes
    for lexeme in lexemes:
        # Get the synset for current lexeme
        synset = lexeme.synset()
        # Get the lexemes from the synset
        for candidate_lemma in synset.lemmas():
            # Retrieve the name from a lemma structure
            candidate_lemma_name = candidate_lemma.name()
            # Make sure we don't add input lemma as solution
            if candidate_lemma_name != lemma:
                # Check if lemma contains multiple words
                if len(candidate_lemma_name.split('_')) > 1:
                    # Replace '_' with ' ', e.g. 'turn_around' -> 'turn around'
                    candidate_lemma_name = candidate_lemma_name.replace('_', ' ')
                # Add lemma to the solution
                possible_synonyms.add(candidate_lemma_name)
    return possible_synonyms
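# Usage sketch (hedged, not from the original source): get_more_candidates() above gathers the
# lemma names that share a synset with the target lemma/POS, with underscores replaced by spaces
# for multi-word entries. Note that the `depth` parameter is accepted but unused in the snippet
# as written.
print(get_more_candidates('slow', 'a'))  # a set of adjectives sharing a synset with 'slow'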
def get_variant_words(cls, input_word):
    input_word = wn.lemmas(input_word)
    return set([
        form.name()
        for lem in input_word
        for form in lem.derivationally_related_forms()
    ])
def reconstruct(s_element):
    sentence = []
    wf_count = 1
    gnd_atoms = []
    for e in s_element.iter():
        if e.text is not None:
            sentence.append(e.text)
        if e.tag == 'wf':
            wf_count += 1
            word_const = '{}-{:f}'.format(e.text, wf_count)
            if e.get('pos', None) is not None:
                gnd_atoms.append('has_pos({},{})'.format(word_const, e.get('pos')))
            if e.get('lemma', None) is not None:
                lem = e.get('lemma')
                lexsn = e.get('lexsn')
                synset = None
                for l in wordnet.lemmas(lem):
                    # WordNet sense keys have the form 'lemma%lexsn'
                    if l.key == '{}%{}'.format(lem, lexsn):
                        synset = l.synset
                if synset is not None:
                    sid = 's-{}'.format(word_const)
                    gnd_atoms.append('has_sense({},{})'.format(word_const, sid))
                    gnd_atoms.append('is_a({},{})'.format(sid, synset.name))
    sentence = ' '.join(sentence).strip()
    return sentence, gnd_atoms
def wn_frequency_predictor(context):
    # Counter with lemma as key and its count as value
    synonyms_counter = Counter()
    # Retrieve all lexemes for the particular lemma and pos
    lexemes = wn.lemmas(context.lemma, pos=context.pos)
    # Iterate over lexemes
    for lexeme in lexemes:
        # Get the synset for current lexeme
        synset = lexeme.synset()
        # Get the lemmas from the synset
        for candidate_lemma in synset.lemmas():
            candidate_lemma_name = candidate_lemma.name()
            # Make sure we don't add input lemma as solution
            if candidate_lemma_name != context.lemma:
                # Check if lemma contains multiple words
                if len(candidate_lemma_name.split('_')) > 1:
                    # Replace '_' with ' ', e.g. 'turn_around' -> 'turn around'
                    candidate_lemma_name = candidate_lemma_name.replace('_', ' ')
                # Add to the solution, the same lemma can be added twice
                # if it appears together with input lemma in multiple synsets
                synonyms_counter[candidate_lemma_name] += candidate_lemma.count()
    # If there is a tie, pick an arbitrary lemma, whatever comes first
    # in the front of the list after sorted descendingly
    return synonyms_counter.most_common(1)[0][0]
def wn_simple_lesk_predictor(context: Context) -> str:
    stop_words = stopwords.words('english')
    overlap = defaultdict()
    ct = context.right_context + context.left_context
    ct = [i for i in ct if i not in stop_words and i not in string.punctuation]
    for i in wn.lemmas(context.lemma, context.pos):
        if i.synset().lemma_names() != [context.lemma]:
            defi = [j for j in i.synset().examples()]
            defi.append(i.synset().definition())
            for j in i.synset().hypernyms():
                defi = defi + j.examples()
                defi.append(j.definition())
            defi = [
                j for i in defi for j in tokenize(i)
                if j not in stop_words and j not in string.punctuation
            ]
            OV = [i in defi for i in ct]
            overlap[i] = sum(OV) / (len(OV) + 1)
    maximum = max(overlap, key=overlap.get)
    lemmadict = {
        maximum.synset().lemma_names()[i]: maximum.synset().lemmas()[i].count()
        for i in range(len(maximum.synset().lemmas()))
    }
    del lemmadict[context.lemma]
    maximum = max(lemmadict, key=lemmadict.get)
    return maximum.replace('_', ' ')
def convert_pos_by_lemmas(word, from_pos, to_pos):
    """ Transform words given from/to POS tags """
    lemmas = wn.lemmas(word, pos=from_pos)
    # Word not found
    if not lemmas:
        return []

    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]

    # filter only the desired pos (consider 'a' and 's' equivalent)
    related_noun_lemmas = []
    for drf in derivationally_related_forms:
        for l in drf[1]:
            if l.synset().name().split('.')[1] == to_pos or \
                    (to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)
                     and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)):
                related_noun_lemmas += [l]

    # Extract the words from the lemmas
    related_words = [l.name() for l in related_noun_lemmas]
    return list(set(related_words))
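# Usage sketch (hedged, not from the original source): convert_pos_by_lemmas() above maps a word
# from one POS to another via derivationally related forms. WN_ADJECTIVE and
# WN_ADJECTIVE_SATELLITE are assumed to be the usual WordNet tags 'a' and 's' defined elsewhere
# in the source module.
print(convert_pos_by_lemmas('die', 'v', 'n'))  # expected to include nouns such as 'death'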
def make_gerund(word):
    """
    Create gerund form of word if it's possible
    code -> coding
    :param word:
    :return:
    """
    # now = time.time()
    is_found = False
    gerund = word
    for lem in wn.lemmas(word):
        for related_form in lem.derivationally_related_forms():
            if related_form.name().endswith('ing'):
                gerund = related_form.name()
                GerundWords.objects.get_or_create(word=gerund)
                is_found = True
                break
        if is_found:
            break
    if not is_found:
        if word.endswith('e'):
            g_word = GerundWords.objects.filter(word='%sing' % gerund[:-1])
            if g_word:
                return g_word[0].word
        else:
            g_word = GerundWords.objects.filter(word='%sing' % gerund)
            if g_word:
                return g_word[0].word
    # logger.debug("Time get_gerund: {} {}".format(time.time() - now, word))
    return gerund
def antonyms(word):
    antonyms = []
    for lemma in wn.lemmas(word):
        for antonym in lemma.antonyms():
            for word in antonym.synset.lemma_names:
                antonyms.append(word.replace('_', ' '))
    return antonyms
def get_synonyms(term):
    """
    Function that finds the synonyms of a term

    Args:
        term: the term whose synonyms should be looked up

    Returns:
        the synonyms of the input term
    """
    term_lemmas = wn.lemmas(term, lang="ita")
    if term_lemmas == []:
        return ["Non sono stati trovati sinonimi per questo termine"]
    else:
        synonyms_lemma = term_lemmas[0].synset()
        synonyms_lemmas_ita = synonyms_lemma.lemmas(lang="ita")
        synonyms = list()
        for lemma in synonyms_lemmas_ita:
            if lemma.name() == term:
                pass
            else:
                lemma_str = str(lemma.name())
                lemma = lemma_str.replace('_', ' ')
                synonyms.append(lemma)
        if synonyms == []:
            return ["Non sono stati trovati sinonimi per questo termine"]
        return synonyms
def print_syn_lemmas(word):
    ## Synsets and Lemmas
    print("1. Synsets and Lemmas")
    print("Word: " + word)
    print("")

    print("Synsets:")
    [print(s) for s in wn.synsets(word)]
    print("")

    first_synset = wn.synsets(word)[0]
    print("First synset: " + str(first_synset))
    print("")
    #word_synset = wn.synset("dog.n.01")

    print("Lemma names: ")
    [print(l) for l in first_synset.lemma_names()]
    print("")

    last_lemma = first_synset.lemmas()[len(first_synset.lemma_names()) - 1]
    #word_lemma = wn.lemma("dog.n.01.domestic_dog")
    print("Last lemmas: " + str(last_lemma))
    print("")
    print("Synset of Last lemmas: " + str(last_lemma.synset()))
    print("")

    for synset in wn.synsets(word):
        print(str(synset) + ": lemma_name" + str(synset.lemma_names()))
    print("")

    print("Lemmas of {}:".format(word))
    [print(l) for l in wn.lemmas(word)]
    print("")
    print("")
def get_candidates(lemma, pos) -> List[str]:
    # Part 1
    all_lemma = []
    for i in wn.lemmas(lemma, pos):
        all_lemma = all_lemma + i.synset().lemma_names()
    all_lemma = set([i.replace('_', ' ') for i in all_lemma if i != lemma])
    return all_lemma
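# Usage sketch (hedged, not from the original source): get_candidates() above returns the lemma
# names that co-occur with the target lemma in any of its synsets for the given POS (the
# annotation says List[str], but the snippet actually returns a set). Assuming
# `from nltk.corpus import wordnet as wn`:
print(get_candidates('bright', 'a'))  # e.g. candidates such as 'brilliant' or 'smart', depending on the WordNet version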
def pairs(words):
    "Find hyponym/hypernym pairs from a set of words"
    # For each word, find its most common sense
    senses = []
    sense_map = collections.defaultdict(lambda: [])
    for word in words:
        lemmas = wordnet.lemmas(word)
        nouns = [l for l in lemmas if l.synset.pos == 'n']
        counts = [l.count() for l in nouns]
        #print counts
        #print nouns
        best = max(nouns, key=lambda x: x.count())
        #print best, best.synset
        sense_map[best.synset.name].append(word)
        senses.append(best.synset)
    #print max([len(x) for x in sense_map.values()])

    # Take out "entity" as it's too general
    sense_set = {s.name for s in senses} - set(['entity.n.01'])
    for word, sense in zip(words, senses):
        hypernyms = list(sense.closure(lambda x: x.hypernyms()))
        common = {h.name for h in hypernyms} & sense_set
        if len(common) > 0:
            for c in common:
                for w in sense_map[c]:
                    print word, w
def add_and_initialize_category_associations(word_synset_dict, path_to_wordnet_categories, no_mapping_count=1):
    wordnet_categories = load_as_json(path_to_wordnet_categories)
    for wordnet_category, wordnet_subcategories in wordnet_categories.items():
        for wordnet_word in wordnet_subcategories.keys():
            for lemma in wordnet.lemmas(wordnet_word):
                lemma_name = lemma.name()
                lemma_synset = lemma.synset()
                synset_name = lemma_synset.name()
                if lemma_name not in word_synset_dict:
                    word_synset_dict[lemma_name] = dict()
                if synset_name not in word_synset_dict[lemma_name]:
                    word_synset_dict[lemma_name][synset_name] = dict()
                    word_synset_dict[lemma_name][synset_name]["count"] = no_mapping_count
                    word_synset_dict[lemma_name][synset_name]["definition"] = lemma_synset.definition()
                    word_synset_dict[lemma_name][synset_name]["category"] = wordnet_category
                    print(lemma_name, synset_name)
                if word_synset_dict[lemma_name][synset_name]["definition"] != lemma_synset.definition():
                    print("wrong")
    return word_synset_dict
def SenseNumber(lemma):
    """Get WordNet sense number for lemma."""
    i = 1
    for l in wn.lemmas(lemma.name):
        if l is lemma:
            return i
        if l.synset.pos == lemma.synset.pos:
            i = i + 1
    return 9999
def generate_mono_list(lemmalist):
    print "Number of WN lemmas = "+str(len(lemmalist))
    monolist = []
    mwcount = 0
    singlesense = 0
    lowfreq = 0
    predcount = 0
    nonpred = 0
    for lemma in lemmalist[0:]:
        if notmultiword(lemma):
            #print lemma, wn.synsets(lemma,pos=wn.NOUN)
            if len(wn.synsets(lemma, pos=wn.NOUN)) == 1:
                freq = 0
                for form in wn.lemmas(lemma, wn.NOUN):
                    freq += form.count()
                if freq > 0:
                    singlesense += 1
                    monolist.append(lemma)
                else:
                    lowfreq += 1
            else:
                total = 0
                max = 0
                for form in wn.lemmas(lemma, wn.NOUN):
                    thiscount = form.count()
                    #print form,thiscount
                    total += thiscount
                    if thiscount > max:
                        max = thiscount
                if max > (total)/2:
                    predcount += 1
                    monolist.append(lemma)
                    #print "Accepting "+lemma
                else:
                    nonpred += 1
        else:
            #print "lemma "+lemma+" not added as multiword"
            mwcount += 1
    print "Accepted single sense words "+str(singlesense)
    print "Accepted predominant sense words "+str(predcount)
    print "TOTAL accepted "+str(singlesense+predcount)
    print "Rejected single sense words with no freq info "+str(lowfreq)
    print "Rejected non predominant sense words "+str(nonpred)
    print "Rejected multiwords "+str(mwcount)
    return monolist
def hypernyms(word):
    hypernyms = []
    for lemma in wn.lemmas(word):
        for hypernym in lemma.synset.hypernyms():
            for word in hypernym.lemma_names:
                if '_' not in word:
                    hypernyms.append(word)
    return hypernyms
def _check_antonyms_existence(word, word_sets, wn_pos):
    for le in wn.lemmas(word, pos=wn_pos):
        for antonym in le.antonyms():
            name = antonym.name()
            if name in word_sets or wn.morphy(name) in word_sets or lemma(name) in word_sets:
                return True
    return False
def extract_lemmas(sent):
    """Extracts the WordNet lemmas of every token in a given sentence"""
    lemmas = []
    tokens = word_tokenize(sent)
    pos_tagged = pos_tag(tokens)
    for word in pos_tagged:
        lemmas.extend(wn.lemmas(word[0]))
    return lemmas
def wordnet():
    wn.synsets('motorcar')
    wn.synset('car.n.01').lemma_names
    wn.synset('car.n.01').definition
    wn.synset('car.n.01').examples
    wn.synset('car.n.01').lemmas
    wn.lemma('car.n.01.automobile')
    wn.lemma('car.n.01.automobile').synset
    wn.lemma('car.n.01.automobile').name
    wn.synsets('car')
    for synset in wn.synsets('car'):
        print synset.lemma_names
    wn.lemmas('car')
def get_word_synsets(word, only_nouns=True):
    lemmas = wn.lemmas(word)
    # if no lemmas are found, try again with the lemmatized word
    if len(lemmas) == 0:
        wnl = nltk.WordNetLemmatizer()
        lemmatized_word = wnl.lemmatize(word)
        lemmas = wn.lemmas(lemmatized_word)
        if len(lemmas) == 0:
            return []
    # some distance measures don't handle non-noun words
    synsets = [lemma.synset for lemma in lemmas]
    if only_nouns:
        return [synset for synset in synsets if synset.name.split('.')[1] == 'n']
    else:
        return synsets
def antonyms(word):
    """ return all antonyms of word """
    lemmas = [lemma for lemma in wn.lemmas(word)]
    antonyms = set()
    for lemma in lemmas:
        for antonym in lemma.antonyms():
            antonyms.add(antonym.name)
    return antonyms
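# Usage sketch (hedged, not from the original source): the antonyms() helper above uses the older
# NLTK API where Lemma.name is an attribute; on NLTK 3.x the equivalent call is antonym.name().
# With that adjustment, antonyms('hot') is expected to contain 'cold'.
print(antonyms('hot'))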
def synonyms(word):
    synonyms = []
    for lemma in wn.lemmas(word):
        group = lemma.synset.lemma_names
        if word and (word in group):
            group.remove(word)
            group.insert(0, word)
        if len(group) > 1:
            for synonym in group[1:]:
                if '_' not in synonym:
                    synonyms.append(synonym)
    return synonyms
def lesk_ESA_similarity(filename):
    document_similarity = Document_Relatedness()
    print "Lesk ESA"
    f = open(filename, 'w')
    tree = etree.parse(path_string)
    dictionary = get_wsd_input_data(tree)
    print "Computing Lesk Similarity \n"
    for key in sorted(dictionary.iterkeys()):
        word = wn.morphy(dictionary[key].lower())
        sentence = get_sentence(tree, key)
        context_original = set([])
        for w in get_context_words(sentence):
            if w.isdigit() == False:
                context_original.add(w)
                if wn.lemmas(w) != []:
                    def_sentence = wn.lemmas(w)[0].synset.definition
                    for mmm in get_context_words(def_sentence):
                        context_original.add(mmm)
        best_sense = wn.lemmas(word)[0].synset  # best sense
        # most frequent sense is a default one
        max_overlap = 0
        for lemma in wn.lemmas(word):
            context_int = set([])
            for example in lemma.synset.examples:
                for w in get_context_words(example):
                    context_int.add(w)
            for w in get_context_words(lemma.synset.definition):
                context_int.add(w)
            overlap = document_similarity.ESA_sentence_similarity(context_original, context_int)
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = lemma.synset
        l = split_syn_dots(key)
        answer_line = l[0]+" "+key+" eng-30-"+str(best_sense.offset)+"-"+best_sense.pos+"\n"
        f.write(answer_line)
    print "Finished"
def trans_verb_list():
    '''Generate a list of transitive verbs.'''
    transitive_verbs = []
    for word in wordnet.all_lemma_names('v'):
        frame_ids = set()
        for lem in wordnet.lemmas(word, 'v'):
            frame_ids.update(lem.frame_ids())
        # Verbs with these frames make sense for our sentences.
        if frame_ids.intersection({8, 9, 10, 11}):
            transitive_verbs.append(word)
    # Remove duplicates by converting to set and back in case of
    # malicious WordNet.
    return list(set(transitive_verbs))
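# Background sketch (hedged, not from the original source): trans_verb_list() above relies on
# Lemma.frame_ids(), which lists the WordNet verb sentence frames a lemma participates in; frames
# 8-11 are the transitive "Somebody/Something ----s something/somebody" patterns. A quick way to
# inspect them for a single verb:
for lem in wordnet.lemmas('eat', 'v'):
    print(lem.name(), lem.frame_ids(), lem.frame_strings())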
def unambiguous(words):
    "Get a list of (mainly) unambiguous terms"
    for word in words:
        lemmas = wordnet.lemmas(word)
        counts = [x.count() for x in lemmas if x.synset.pos == 'n']
        if len(counts) == 0:
            continue
        m = max(counts)
        if m == 0:
            continue
        p = float(m)/sum(counts)
        if m > 2 and p > 0.8:
            print word
def isantonym(word1, word2):
    """ judge whether two words are antonyms """
    w1_lemma = [lemma for lemma in wn.lemmas(word1)]
    w1_antonyms = set()
    for lemma in w1_lemma:
        for antonym in lemma.antonyms():
            w1_antonyms.add(antonym.name)
    if word2 in w1_antonyms:
        return True
    else:
        return False
def process(sentences):
    result = []
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    for review in tokenized_sentences:
        new_review = []
        for token in review:
            new_token = regex.sub(u'', token)
            if not new_token == u'' and not new_token in stop_words:
                if len(wordnet.lemmas(new_token)) > 0:
                    new_review.append(lemmatizer.lemmatize(new_token))
            # new_token = regex.sub('', token)
            # if not new_token == '' and not new_token in stop_words:
            #     new_review.append(new_token)
        result.append(new_review)
    return result
def WORDNET(Words):
    '''Whether the Word is present in the WORDNET'''
    '''-------------------------------Feature 15------------------------------'''
    global FeatureDict
    global StopList
    for wrd in Words:
        if wrd.isalpha():
            lem = wn.lemmas(str(wrd))
            if len(lem) > 0:
                temp = wrd
                temp = temp.lower()
                temp = temp.rstrip()
                temp = temp.lstrip()
                if temp not in StopList:
                    wrd = wrd.lower()
                    if wrd in FeatureDict:
                        FeatureDict[str(wrd)][15] = 1
                    else:
                        INITFeatureDictionary(str(wrd))
                        FeatureDict[str(wrd)][15] = 1
def getsynsets(user_input):
    for synset in (wn.synsets(user_input)):
        print synset
        nyms = ['hypernyms', 'hyponyms', 'meronyms', 'holonyms', 'part_meronyms',
                'sister_terms', 'troponyms', 'inherited_hypernyms']
        for i in nyms:
            try:
                print getattr(synset, i)()
            except AttributeError as e:
                print e
                pass
    for lemma in (wn.lemmas(user_input)):
        print lemma
        lemmanyms = ['antonyms', 'derivationally_related_forms', 'pertainyms']
        for y in lemmanyms:
            try:
                print getattr(lemma, y)()
            except AttributeError as e:
                print e
                pass
def wordnet_data(word: str) -> [str]:
    """returns a list of semi-formatted wordnet word data"""
    # definitions, parts of speech, synonyms, antonyms, related words
    result = [[], [], [], set(), [], []]
    word_info = wordnet.synsets(word)
    if len(word_info) > 0:
        word_info = word_info[0]
    else:
        return result
    lemmas = wordnet.lemmas(word)
    for synset in wordnet.synsets(word):
        result[0].append(synset.definition())
        result[1].append(synset.pos())  # how to access part of speech?
    result[2] = word_info.lemma_names()
    for lemma in lemmas:
        for word in lemma.antonyms():
            result[3].add(word.name())
    result[4] = (word.name().split(".")[0] for word in word_info.hyponyms())
    result[5] = (word.name().split(".")[0] for word in word_info.hypernyms())
    # how to find similar words? Doesn't the corpus analysis do this? Hyponyms?
    return result
def words_of_type(word_type, min_frequency=4):
    '''
    Generate a list of words of WordNet word_type that have a total
    frequency of at least min_frequency times across all senses of the
    word with word_type.
    '''
    try:
        with open(word_type + '.' + str(min_frequency), 'r') as file:
            return file.read().split('\n')
    except:
        words = []
        for word in wordnet.all_lemma_names(wordnet.__getattribute__(word_type)):
            counts = [lem.count() for lem in wordnet.lemmas(word, wordnet.__getattribute__(word_type))]
            if sum(counts) >= min_frequency:
                words.append(word)
        words = [item for item in words if not item.isdigit()]
        with open(word_type + '.' + str(min_frequency), 'w') as file:
            # Remove duplicates by converting to set and back in case of
            # malicious WordNet.
            file.write('\n'.join(list(set(words))))
        return words
def estimate(self, word):
    # find lemmas whose surface is the same as a given word
    word = word.lower()
    lemmas = wn.lemmas(word, pos=wn.NOUN)

    abstractness_list = []
    for lemma in lemmas:
        # find all the hypernyms
        tree = lemma.synset.tree(lambda s: s.hypernyms())
        hypernyms = self._flatten(tree)

        # count physical_entity and abstraction synsets
        concrete = self._count_synset(hypernyms, self.CONCRETE_NAME)
        abstract = self._count_synset(hypernyms, self.ABSTRACT_NAME)

        # abstractness = #abst / (#abst + #conc)
        if (concrete + abstract) != 0:
            abstractness = float(abstract) / (abstract + concrete)
            abstractness_list.append(abstractness)

    # take the average (0 if no sense)
    if len(abstractness_list) != 0:
        result = sum(abstractness_list) / len(abstractness_list)
    else:
        result = 0.0
    return result
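# Background sketch (hedged, not from the original source): estimate() above walks each noun
# lemma's hypernym tree and scores abstractness as abstract / (abstract + concrete). Note it uses
# the older lemma.synset attribute (a method in NLTK 3.x). The hypernym closure it relies on can
# be inspected directly:
for synset in wn.synsets('freedom', pos=wn.NOUN)[:1]:
    print([h.name() for h in synset.closure(lambda s: s.hypernyms())])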