def longest_common_subsequence(s1, s2):
    """Score two sentences by the character-level LCS of their content lemmas.

    Both sentences are lower-cased and passed through ``lemmatize_sentence``;
    lemmas found in ``stop_words`` are dropped and the rest are joined with
    single spaces. The longest common subsequence is then computed over the
    characters of the two joined strings (the joining spaces included).

    :param s1: first sentence (string)
    :param s2: second sentence (string)
    :return: LCS length divided by the length of the shorter joined string,
        or 0 when either joined string is empty
    """
    lemmas_sentence_1, _ = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, _ = lemmatize_sentence(s2.lower())
    ss1 = ' '.join(w for w in lemmas_sentence_1 if w not in stop_words)
    ss2 = ' '.join(w for w in lemmas_sentence_2 if w not in stop_words)
    m = len(ss1)
    n = len(ss2)
    if m == 0 or n == 0:
        return 0

    # Bottom-up DP: after processing ss1[:i], previous[j] holds the LCS length
    # of ss1[:i] and ss2[:j]. Each row only depends on the row above it, so
    # two rows are enough instead of the full (m + 1) x (n + 1) table.
    previous = [0] * (n + 1)
    for i in range(1, m + 1):
        current = [0] * (n + 1)
        char_1 = ss1[i - 1]
        for j in range(1, n + 1):
            if char_1 == ss2[j - 1]:
                current[j] = previous[j - 1] + 1
            else:
                current[j] = max(previous[j], current[j - 1])
        previous = current

    # float() forces true division; plain int / int truncates to 0 or 1 on
    # interpreters where "/" is integer division.
    return previous[n] / float(min(m, n))
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk is somewhere in between using more than the
    original Lesk algorithm (1986) and using less signature
    words than adapted Lesk (Banerjee and Pederson, 2002)

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``
    :param stop: accepted but not read by this function
    :param context_is_lemmatized: when true the context is only
        whitespace-split instead of being run through ``lemmatize_sentence``
    :param nbest, keepscore, normalizescore: forwarded to ``compare_overlaps``
    :return: None when WordNet has no synsets for the lemmatized word,
        otherwise whatever ``compare_overlaps`` returns
    """
    target = lemmatize(ambiguous_word)

    # Words unknown to WordNet cannot be disambiguated.
    if not wn.synsets(target):
        return None

    # One signature (bag of words) per candidate synset.
    signatures = simple_signature(target, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context = context_sentence.split()
    else:
        context = lemmatize_sentence(context_sentence)

    return compare_overlaps(context, signatures,
                            nbest=nbest, keepscore=keepscore,
                            normalizescore=normalizescore)
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, option=False, lemma=True, hyperhypo=True,
                 stop=True):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies and to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: sentence the ambiguous word occurs in; it is
        passed through ``lemmatize_sentence`` before comparison
    :param ambiguous_word: word to disambiguate (used as given, it is not
        lemmatized by this variant)
    :param lemma, hyperhypo: forwarded by keyword to ``simple_signature``
    :param pos, option, stop: accepted but not read by this function
    :return: whatever ``compare_overlaps`` returns for the context and the
        extended signatures
    """
    # NOTE: lemmatization of ambiguous_word is intentionally left disabled
    # here, as in the original code.

    # Get the signatures for each synset. The caller's lemma/hyperhypo
    # choices are honoured instead of being overridden with True.
    ss_sign = simple_signature(ambiguous_word, lemma=lemma,
                               hyperhypo=hyperhypo)

    # Build the stop-word set once rather than once per candidate token.
    english_stopwords = set(stopwords.words('english'))

    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))

        signature = []
        for sense in related_senses:
            # lemma_names is read either as a method or as a plain
            # attribute, whichever the synset object provides; checking
            # callable() avoids hiding unrelated errors behind a bare except.
            names = sense.lemma_names
            if callable(names):
                names = names()
            signature.extend(name for name in names
                             if name not in english_stopwords)
        ss_sign[ss] += signature

    context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign)
    return best_sense
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    Adapted Lesk (Banerjee and Pederson, 2002): every sense signature is
    enlarged with the lemma names of semantically related senses (holonyms,
    meronyms and similar-tos) before overlaps with the context are counted.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``; ``lemma`` and ``stem`` also control how the
        added signature words are normalized
    :param stop: accepted but not read by this function
    :param context_is_lemmatized: when true the context is only
        whitespace-split instead of being run through ``lemmatize_sentence``
    :param nbest, keepscore, normalizescore: forwarded to ``compare_overlaps``
    :return: None when WordNet has no synsets for the lemmatized word,
        otherwise whatever ``compare_overlaps`` returns
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None

    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    # Holonyms first, then meronyms, then similar_tos.
    relation_names = ('member_holonyms', 'part_holonyms',
                      'substance_holonyms',
                      'member_meronyms', 'part_meronyms',
                      'substance_meronyms',
                      'similar_tos')

    for ss in ss_sign:
        neighbours = []
        for relation in relation_names:
            neighbours += synset_properties(ss, relation)
        related_senses = list(set(neighbours))

        signature = []
        for sense in related_senses:
            for name in synset_properties(sense, 'lemma_names'):
                if name not in EN_STOPWORDS:
                    signature.append(name)

        # Lemmatized context is preferred over stemmed context.
        # (Compared against True explicitly: truthy values that are not
        # equal to True do not enable these steps.)
        if lemma == True:
            signature = [lemmatize(token) for token in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(token) for token in signature]

        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign,
                            nbest=nbest, keepscore=keepscore,
                            normalizescore=normalizescore)
def _ic_sentence_synsets(lemmas, tagged):
    """Pick one synset per (lemma, tagged word) pair, skipping words with none.

    ``lesk`` is tried first with the whole lemma list as context; when it
    returns None the first synset listed by ``wordnet.synsets`` for the same
    part of speech is used instead.
    """
    synsets = []
    for lemma, word_tag in zip(lemmas, tagged):
        pos = wordnet_pos_code(word_tag[1])
        synset = lesk(lemmas, lemma, pos)
        if synset is None:
            found = wordnet.synsets(lemma, pos)
            if not found:
                continue
            synset = found[0]
        synsets.append(synset)
    return synsets


def information_content_similarity(s1, s2):
    """
    Compute the sentence similarity using information content from wordnet
    (words are disambiguated first to Synsets by means of Lesk algorithm)

    For every synset of the first sentence the best Lin similarity (with
    ``brown_ic``) against the synsets of the second sentence is taken, and
    those best scores are averaged.

    :param s1: first sentence (string)
    :param s2: second sentence (string)
    :return: the averaged score, or 0.0 when no pair could be scored
    """
    lemmas_sentence_1, tagged_sentence_1 = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, tagged_sentence_2 = lemmatize_sentence(s2.lower())

    # Disambiguate words and create list of synsets
    synsets_sentence_1 = _ic_sentence_synsets(lemmas_sentence_1,
                                              tagged_sentence_1)
    synsets_sentence_2 = _ic_sentence_synsets(lemmas_sentence_2,
                                              tagged_sentence_2)

    score, count = 0.0, 0
    # For each word in the first sentence
    for synset in synsets_sentence_1:
        similarities = []
        for ss in synsets_sentence_2:
            try:
                similarities.append(synset.lin_similarity(ss, brown_ic))
            except Exception:
                # Pairs that cannot be compared are skipped on purpose;
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                continue
        if similarities:
            score += max(similarities)
            count += 1

    # Average the values
    if count > 0:
        score /= count
    return score
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    Implementation of the Adapted Lesk algorithm described in Banerjee and
    Pederson (2002). Lexical items from semantically related senses within
    the wordnet hierarchies are used to generate more lexical items for
    each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``; ``lemma`` and ``stem`` also control how the
        added signature words are normalized
    :param stop: accepted but not read by this function
    :param context_is_lemmatized: when true the context is only
        whitespace-split instead of being run through ``lemmatize_sentence``
    :param nbest, keepscore, normalizescore: forwarded to ``compare_overlaps``
    :return: None when WordNet has no synsets for the lemmatized word,
        otherwise whatever ``compare_overlaps`` returns
    """
    word = lemmatize(ambiguous_word)
    if not wn.synsets(word):
        # Not in WordNet: nothing to disambiguate.
        return None

    ss_sign = simple_signature(word, pos, lemma, stem, hyperhypo)

    for ss in ss_sign:
        # Holonyms, then meronyms, then similar_tos, de-duplicated.
        related_senses = list(set(chain.from_iterable(
            synset_properties(ss, relation) for relation in (
                'member_holonyms', 'part_holonyms', 'substance_holonyms',
                'member_meronyms', 'part_meronyms', 'substance_meronyms',
                'similar_tos'))))

        extension = [
            name
            for name in chain.from_iterable(
                synset_properties(sense, 'lemma_names')
                for sense in related_senses)
            if name not in EN_STOPWORDS
        ]

        # Lemmatized context is preferred over stemmed context.
        # (Explicit comparison with True is kept so that only values equal
        # to True switch these steps on.)
        if lemma == True:
            extension = [lemmatize(name) for name in extension]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            extension = [porter.stem(name) for name in extension]

        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += extension

    # Disambiguate the sense in context.
    context = (context_sentence.split() if context_is_lemmatized
               else lemmatize_sentence(context_sentence))
    best_sense = compare_overlaps(context, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
def synsets_similarity(s1, s2):
    """
    Jaccard similarity between the synsets of two sentences, where each
    non-stop-word is disambiguated in its sentence context with the lesk
    algorithm.

    :param s1: first sentence (string)
    :param s2: second sentence (string)
    :return: ``1 - jaccard_distance`` of the two synset sets, or 0 when
        either sentence yields no synsets
    """
    def pick_synsets(lemmas, tagged):
        # One synset per content word: the lesk result if there is one,
        # otherwise the first synset WordNet lists for that part of speech.
        chosen = []
        for lemma, word_tag in zip(lemmas, tagged):
            if lemma in stop_words:
                continue
            sense = lesk(lemmas, lemma, wordnet_pos_code(word_tag[1]))
            if sense is None:
                candidates = wordnet.synsets(lemma,
                                             wordnet_pos_code(word_tag[1]))
                if len(candidates) > 0:
                    chosen.append(candidates[0])
            else:
                chosen.append(sense)
        return chosen

    lemmas_sentence_1, tagged_sentence_1 = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, tagged_sentence_2 = lemmatize_sentence(s2.lower())

    synsets_sentence_1 = pick_synsets(lemmas_sentence_1, tagged_sentence_1)
    synsets_sentence_2 = pick_synsets(lemmas_sentence_2, tagged_sentence_2)

    # Nothing to compare when either side is empty.
    if not synsets_sentence_1 or not synsets_sentence_2:
        return 0

    return 1 - jaccard_distance(set(synsets_sentence_1),
                                set(synsets_sentence_2))
def disambiguate(sentence, algorithm=simple_lesk,
                 context_is_lemmatized=False, similarity_option='path',
                 keepLemmas=False, prefersNone=True, similarity_data=None):
    """Tag every token of a sentence with a sense chosen by ``algorithm``.

    :param sentence: sentence (string) to disambiguate
    :param algorithm: WSD function; ``original_lesk`` and ``max_similarity``
        are called with their own argument sets, any other callable is
        called with ``pos=`` and ``context_is_lemmatized=True``
    :param context_is_lemmatized: when true, ``sentence`` is taken to be
        already lemmatized and is only whitespace-split
    :param similarity_option: passed as ``option`` to ``max_similarity``
    :param keepLemmas: emit ``(word, lemma, tag)`` instead of ``(word, tag)``
    :param prefersNone: replace the ``#NOT_IN_WN#`` and
        ``#STOPWORD/PUNCTUATION#`` markers with None
    :param similarity_data: passed as ``data`` to ``max_similarity``
    :return: list of tuples, one per token
    """
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence
        # The tokens double as surface words and lemmas. No POS information
        # is available here, so None is passed on to the algorithm. Without
        # these assignments the loop below failed with a NameError.
        lemmas = sentence.split()
        surface_words = lemmas
        morphy_poss = [None] * len(lemmas)

    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                # Raises IndexError when WordNet has no synset for the lemma.
                wn.synsets(lemma)[0]
                if algorithm == original_lesk:
                    # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       option=similarity_option,
                                       data=similarity_data)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       context_is_lemmatized=True)
            except Exception:
                # Best effort: any failure (typically the word not being in
                # WordNet) is reported through the marker below.
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'

        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))

    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone:
        tagged_sentence = [
            entry[:-1] + (None,) if str(entry[-1]).startswith('#') else entry
            for entry in tagged_sentence
        ]
    return tagged_sentence
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    Lesk-like disambiguation that scores each sense signature (aka 'sense
    paraphrase') against the context with cosine similarity, in line with
    vector space models, instead of raw overlap counts.

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``; ``lemma`` and ``stem`` also control how the
        signature tokens are normalized
    :param stop: drop tokens found in ``EN_STOPWORDS`` from the signatures
    :param context_is_lemmatized: when true the context is only
        whitespace-normalized instead of lemmatized
    :param nbest: return the full ranking instead of the single best synset
    :return: None when WordNet has no synsets for the lemmatized word; the
        top-scoring synset when ``nbest`` is false; otherwise a list of
        ``(synset, score)`` pairs sorted by descending score
    """
    target = lemmatize(ambiguous_word)
    if not wn.synsets(target):
        # Not in WordNet: nothing to disambiguate.
        return None

    synsets_signatures = simple_signature(target, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_tokens = context_sentence.split()
    else:
        context_tokens = lemmatize_sentence(context_sentence)
    context = " ".join(context_tokens)

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        text = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        tokens = [tok for tok in word_tokenize(text)
                  if tok not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            tokens = [tok for tok in tokens if tok not in EN_STOPWORDS]
        # Optional: lemmatize the tokens (only for values equal to True).
        if lemma == True:
            tokens = [lemmatize(tok) for tok in tokens]
        # Optional: stem the tokens.
        if stem:
            tokens = [porter.stem(tok) for tok in tokens]
        scores.append((cos_sim(context, " ".join(tokens)), ss))

    ranked = sorted(scores, reverse=True)
    if nbest:
        return [(ss, score) for score, ss in ranked]
    return ranked[0][1]
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies and to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``; ``lemma`` and ``stem`` also control how the
        added signature words are normalized
    :param stop: accepted but not read by this function
    :param context_is_lemmatized: when true the context is only
        whitespace-split instead of being run through ``lemmatize_sentence``
    :param nbest, keepscore, normalizescore: forwarded to ``compare_overlaps``
    :return: whatever ``compare_overlaps`` returns
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    # Build the stop-word set once rather than once per candidate token.
    english_stopwords = set(stopwords.words('english'))

    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))

        signature = []
        for sense in related_senses:
            # lemma_names is read either as a method or as a plain
            # attribute, whichever the synset object provides; checking
            # callable() avoids hiding unrelated errors behind a bare except.
            names = sense.lemma_names
            if callable(names):
                names = names()
            signature.extend(name for name in names
                             if name not in english_stopwords)
        # (A leftover, syntactically invalid debug print that re-checked
        # the stop-word filter was removed here.)

        # Lemmatized context is preferred over stemmed context
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    In line with vector space models, cosine similarity replaces raw overlap
    counts when comparing the context with each sense signature (aka 'sense
    paraphrase'); the use of signatures is what makes this lesk-like.

    :param context_sentence: sentence (string) the ambiguous word occurs in
    :param ambiguous_word: word to disambiguate; it is lemmatized first
    :param pos, lemma, stem, hyperhypo: forwarded positionally to
        ``simple_signature``; ``lemma`` and ``stem`` also control how the
        signature tokens are normalized
    :param stop: drop tokens found in ``EN_STOPWORDS`` from the signatures
    :param context_is_lemmatized: when true the context is only
        whitespace-normalized instead of lemmatized
    :param nbest: return the full ranking instead of the single best synset
    :return: None when WordNet has no synsets for the lemmatized word; the
        top-scoring synset when ``nbest`` is false; otherwise a list of
        ``(synset, score)`` pairs sorted by descending score
    """
    def normalize(signature):
        # Lowercase, turn "_" into spaces, tokenize and drop punctuation.
        flat = " ".join(map(str, signature)).lower().replace("_", " ")
        words = [w for w in word_tokenize(flat)
                 if w not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            words = [w for w in words if w not in EN_STOPWORDS]
        # Optional: lemmatize (only for values equal to True).
        if lemma == True:
            words = [lemmatize(w) for w in words]
        # Optional: stem.
        if stem:
            words = [porter.stem(w) for w in words]
        return " ".join(words)

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        scores.append((cos_sim(context_sentence, normalize(signature)), ss))

    # Highest cosine first.
    scores.sort(reverse=True)
    if not nbest:
        return scores[0][1]
    return [(ss, score) for score, ss in scores]
def disambiguate(sentence, algorithm=simple_lesk,
                 context_is_lemmatized=False, similarity_option='path',
                 keepLemmas=False, prefersNone=True):
    """Tag every token of a sentence with a sense chosen by ``algorithm``.

    :param sentence: sentence (string) to disambiguate
    :param algorithm: WSD function; ``original_lesk`` and ``max_similarity``
        are called with their own argument sets, any other callable is
        called with ``pos=`` and ``context_is_lemmatized=True``
    :param context_is_lemmatized: when true, ``sentence`` is taken to be
        already lemmatized and is only whitespace-split
    :param similarity_option: passed as ``option`` to ``max_similarity``
    :param keepLemmas: emit ``(word, lemma, tag)`` instead of ``(word, tag)``
    :param prefersNone: replace the ``#NOT_IN_WN#`` and
        ``#STOPWORD/PUNCTUATION#`` markers with None
    :return: list of tuples, one per token
    """
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence
        # The tokens double as surface words and lemmas. No POS information
        # is available here, so None is passed on to the algorithm. Without
        # these assignments the loop below failed with a NameError.
        lemmas = sentence.split()
        surface_words = lemmas
        morphy_poss = [None] * len(lemmas)

    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                # Raises IndexError when WordNet has no synset for the lemma.
                wn.synsets(lemma)[0]
                if algorithm == original_lesk:
                    # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       context_is_lemmatized=True)
            except Exception:
                # Best effort: any failure (typically the word not being in
                # WordNet) is reported through the marker below.
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'

        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))

    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone:
        tagged_sentence = [
            entry[:-1] + (None,) if str(entry[-1]).startswith('#') else entry
            for entry in tagged_sentence
        ]
    return tagged_sentence
def disambiguate_new(sentence, algorithm=simple_lesk, extra_words=None,
                     context_is_lemmatized=False, similarity_option='path',
                     keepLemmas=False, prefersNone=True, similarity_data=None):
    """Tag every token of a sentence with a sense, skipping WSD when trivial.

    Compared with a plain per-word run of ``algorithm``:

    * ``extra_words`` (e.g. from LDA output, optional) are appended to the
      context sentence handed to the algorithm;
    * a lemma containing a synset name such as ``dog.n.01`` is looked up
      directly with ``wn.synset``;
    * a lemma with no synsets yields None and a lemma with exactly one
      synset yields that synset, without running the algorithm.

    :param sentence: sentence (string) to disambiguate
    :param algorithm: WSD function; ``original_lesk`` and ``max_similarity``
        are called with their own argument sets, any other callable is
        called with ``pos=`` and ``context_is_lemmatized=True``
    :param extra_words: optional iterable of strings added to the context
    :param context_is_lemmatized: when true, ``sentence`` is taken to be
        already lemmatized and is only whitespace-split
    :param similarity_option: passed as ``option`` to ``max_similarity``
    :param keepLemmas: emit ``(word, lemma, tag)`` instead of ``(word, tag)``
    :param prefersNone: replace the ``#NOT_IN_WN#`` and
        ``#STOPWORD/PUNCTUATION#`` markers with None
    :param similarity_data: passed as ``data`` to ``max_similarity``
    :return: list of tuples, one per token
    """
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence
        # The tokens double as surface words and lemmas. No POS information
        # is available here, so None is passed on to the algorithm. Without
        # these assignments the loop below failed with a NameError.
        lemmas = sentence.split()
        surface_words = lemmas
        morphy_poss = [None] * len(lemmas)

    if extra_words:
        # Extend the context only; the tokens being tagged are unchanged.
        lemma_sentence = lemma_sentence.rstrip('.') + ' ' + " ".join(
            extra_words)

    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                if re.search(r'[a-z]+\.[nvsar]\.[0-9]{2}', lemma) is not None:
                    # lemma is already disambiguated
                    synset = wn.synset(lemma)
                else:
                    syns = wn.synsets(lemma)
                    if len(syns) == 0:
                        synset = None
                    elif len(syns) == 1:
                        synset = syns[0]
                    elif algorithm == original_lesk:
                        # Note: Original doesn't care about lemmas
                        synset = algorithm(lemma_sentence, lemma)
                    elif algorithm == max_similarity:
                        synset = algorithm(lemma_sentence, lemma, pos=pos,
                                           option=similarity_option,
                                           data=similarity_data)
                    else:
                        synset = algorithm(lemma_sentence, lemma, pos=pos,
                                           context_is_lemmatized=True)
            except Exception:
                # Best effort: report the failure and fall back to the
                # marker. Single-argument print(...) produces the same
                # output under both Python 2 and Python 3.
                print("threw error on:  %s %s %s" % (word, lemma, pos))
                print(traceback.format_exc())
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'

        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))

    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone:
        tagged_sentence = [
            entry[:-1] + (None,) if str(entry[-1]).startswith('#') else entry
            for entry in tagged_sentence
        ]
    return tagged_sentence
def extract_overlap_pen(s1, s2):
    """
    Overlap score between the content lemmas of two sentences.

    Both sentences are lower-cased, lemmatized and stripped of
    ``stop_words``. For every remaining lemma of the first sentence its
    number of occurrences in the second sentence is counted (so repeated
    lemmas count repeatedly).

    :param s1: first sentence (string)
    :param s2: second sentence (string)
    :return: overlap_pen score, ``2 * overlap / (len1 + len2 + .001)``
    """
    lemmas_1, _ = lemmatize_sentence(s1.lower())
    lemmas_2, _ = lemmatize_sentence(s2.lower())
    content_1 = [w for w in lemmas_1 if w not in stop_words]
    content_2 = [w for w in lemmas_2 if w not in stop_words]

    ovlp_cnt = sum(content_2.count(w) for w in content_1)

    # The .001 keeps the denominator non-zero when both lists are empty.
    return 2 * ovlp_cnt / (len(content_1) + len(content_2) + .001)


# def sif_embeddings(sentences, alpha=1e-3):
#     """Compute the SIF embeddings for a list of sentences
#     Parameters
#     ----------
#     sentences : list
#         The sentences to compute the embeddings for
#     model : `~gensim.models.base_any2vec.BaseAny2VecModel`
#         A gensim model that contains the word vectors and the vocabulary
#     alpha : float, optional
#         Parameter which is used to weigh each individual word based on its probability p(w).
#     Returns
#     -------
#     numpy.ndarray
#         SIF sentence embedding matrix of dim len(sentences) * dimension
#     """
#     global glove_model
#     vlookup = glove_model.wv.vocab  # Gives us access to word index and count
#     vectors = glove_model.wv  # Gives us access to word vectors
#     size = glove_model.vector_size  # Embedding size
#     Z = 0
#     for k in vlookup:
#         Z += vlookup[k].count  # Compute the normalization constant Z
#     output = []
#     # Iterate all sentences
#     for s in sentences:
#         count = 0
#         v = numpy.zeros(size, dtype=REAL)  # Summary vector
#         # Iterate all words
#         for w in s:
#             # A word must be present in the vocabulary
#             if w in vlookup:
#                 for i in range(size):
#                     v[i] += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w][i]
#                 count += 1
#         if count > 0:
#             for i in range(size):
#                 v[i] *= 1/count
#         output.append(v)
#     return numpy.vstack(output)