def max_similarity(context_sentence: str, ambiguous_word: str, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None,
                   best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarity between the
    possible synsets of all words in the context sentence and the possible
    synsets of the ambiguous word (see https://ibin.co/4gG9zUlejUUA.png):

        \\argmax_{synset(a)} \\sum_{i}^{n} \\max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :param option: String, the similarity measure passed to `sim`, e.g. "path".
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :param best: Boolean, if True return only the best Synset.
    :return: If best, returns only the best Synset, else returns the sorted
             list of (score, Synset) tuples.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]

    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i, k, option))
            result[i] += max(_result)

    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)

    try:
        return result[0][1] if best else result
    except IndexError:
        return None
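# Usage sketch (an assumed example, not part of this module): relies only on the
# signature above and on NLTK's WordNet data being installed.
#
#     sent = "I went to the bank to deposit my money"
#     best = max_similarity(sent, "bank", option="path", pos="n")
#     # `best` is a wn.Synset, or None if the lemma were missing from WordNet;
#     # passing best=False instead returns the full sorted (score, Synset) list.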
def adapted_lesk(context_sentence: str, ambiguous_word: str,
                 pos: str = None, lemma=True, stem=False, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False,
                 from_cache=True) -> "wn.Synset":
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet hierarchies to
    generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word, pos=pos, hyperhypo=hyperhypo, adapted=True,
                         remove_stopwords=stop, to_lemmatize=lemma,
                         remove_numbers=True, lowercase=True, to_stem=stem,
                         from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
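# Usage sketch (assumed example, based only on the signature above):
#
#     sent = "I went to the bank to deposit my money"
#     best = adapted_lesk(sent, "bank", pos="n")
#     # `best` is the Synset whose (hyper/hypo/holo/meronym-expanded) signature
#     # overlaps most with the lemmatized context.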
def cosine_lesk(context_sentence: str, ambiguous_word: str,
                pos: str = None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, from_cache=True) -> "wn.Synset":
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is Lesk-like.

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]
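# Usage sketch (assumed example; `cos_sim` and `simple_signatures` are the
# module helpers referenced above):
#
#     best = cosine_lesk("I went to the bank to deposit my money", "bank")
#     # nbest=True returns the full list of (cosine score, Synset) pairs,
#     # sorted from most to least similar.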
def simple_lesk(tokens: list, ambiguous_word: str,
                pos: str = None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False,
                from_cache=True) -> "wn.Synset":
    """
    Simple Lesk sits between the original Lesk algorithm (1986) and the
    Adapted Lesk (Banerjee and Pedersen, 2002): it uses more signature words
    than the former but fewer than the latter.

    :param tokens: List of strings, the tokenized context sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    # Disambiguate the sense in context. The tokens are used as given,
    # i.e. lemmatization of the context is left to the caller.
    context_sentence = tokens
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
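# Usage sketch (assumed example): unlike the other entry points, this variant
# takes a pre-tokenized context.
#
#     tokens = "I went to the bank to deposit my money".split()
#     best = simple_lesk(tokens, "bank", pos="n")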
def original_lesk(context_sentence: str, ambiguous_word: str,
                  dictionary=None, from_cache=True) -> "wn.Synset":
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :return: A Synset for the estimated best sense.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WordNet definitions.
        dictionary = signatures(ambiguous_word, original_lesk=True, from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
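# Usage sketch (assumed example): with no dictionary supplied, the WordNet
# definitions gathered by `signatures(..., original_lesk=True)` are used.
#
#     best = original_lesk("I went to the bank to deposit my money", "bank")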
def synset_signatures(ss: "wn.Synset", hyperhypo=True, adapted=False,
                      remove_stopwords=True, to_lemmatize=True,
                      remove_numbers=True, lowercase=True,
                      original_lesk=False, from_cache=True) -> set:
    """
    Takes a Synset and returns its signature words.

    :param ss: An instance of wn.Synset.
    :return: A set of signature strings
    """
    if from_cache:
        return synset_signatures_from_cache(ss, hyperhypo, adapted, original_lesk)

    # Collects the signatures from WordNet.
    signature = []
    # Adds the definition.
    signature += word_tokenize(ss.definition())
    # If the original Lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)
    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    if adapted:
        # Includes lemma_names from holonyms, meronyms and similar_tos.
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() +
                             ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() +
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature
    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS) if remove_stopwords else signature

    # Lemmatized context is preferred over stemmed context.
    if to_lemmatize:
        signature = [lemmatize(s)  # Lowercasing (if requested) was already applied above.
                     for s in signature
                     # Only discard a token if both remove_numbers is set and the token is a digit.
                     if not (remove_numbers and s.isdigit())]

    # Keep only the unique bag-of-words.
    return set(signature)
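# Usage sketch (assumed example, requires NLTK's WordNet corpus):
#
#     from nltk.corpus import wordnet as wn
#     ss = wn.synsets("bank")[0]
#     sig = synset_signatures(ss, hyperhypo=True, adapted=True, from_cache=False)
#     # `sig` is a set of lowercased, lemmatized signature tokens for the sense.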