# Assumes pywsd is installed; pylesk is pywsd's lesk module.
import pywsd.lesk as pylesk


def ourLesk(sentence, word, pos1, forceResponse=False):
    """Disambiguate `word` in `sentence` by intersecting the lemma sets of
    the cosine-Lesk and adapted-Lesk senses."""
    leskList = []
    if pos1 is not None:
        possibility1 = pylesk.cosine_lesk(sentence, word, pos1)
        possibility2 = pylesk.adapted_lesk(sentence, word)
    else:
        possibility1 = pylesk.cosine_lesk(sentence, word)
        possibility2 = pylesk.adapted_lesk(sentence, word)
    if possibility1 is not None and possibility2 is not None:
        possibility1 = [str(lemma.name()) for lemma in possibility1.lemmas()]
        possibility2 = [str(lemma.name()) for lemma in possibility2.lemmas()]
        leskList = set(possibility1).intersection(possibility2)
    else:
        if possibility1 is None:
            if possibility2 is not None:
                leskList = [str(lemma.name()) for lemma in possibility2.lemmas()]
            else:
                return None
        else:
            leskList = [str(lemma.name()) for lemma in possibility1.lemmas()]
    if len(leskList) > 0:
        print "-------"
        print word
        print leskList
        return list(leskList)
    else:
        return None
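# A minimal usage sketch for ourLesk; the example sentence and the 'n'
# (noun) POS tag are illustrative assumptions, not part of the original code.
sent = "I went to the bank to deposit my money"
print ourLesk(sent, "bank", "n")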
from pywsd.lesk import cosine_lesk


def disambiguateWordSenses3(sentence, word, stanfordPOS, senti_db):
    # Disambiguation with cosine_lesk; nbest=True returns the candidate
    # synsets of `word` as (score, synset) pairs, ranked by score.
    result_list = cosine_lesk(sentence, word, nbest=True)
    result = None
    print word, stanfordPOS
    if result_list:
        for score, ss in result_list:
            pos = ss.pos()
            if pos == u's':  # treat satellite adjectives as adjectives
                pos = u'a'
            if pos == stanfordPOS:
                result = ss
                break
    if result:
        pos = result.pos()
        if pos == u's':
            pos = u'a'
        offset = result.offset()
        pos_score = 0.0
        neg_score = 0.0
        if (pos, offset) in senti_db:
            pos_score, neg_score = senti_db[(pos, offset)]
        obj = 1.0 - (pos_score + neg_score)
    else:
        obj = 1.0
        pos = None
        pos_score = 0.0
        neg_score = 0.0
    return obj, pos, pos_score, neg_score
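# A hedged sketch of how the senti_db argument could be built from NLTK's
# SentiWordNet corpus; keys are (pos, offset) pairs matching the lookup above.
# This is an assumption about the caller, not part of the original code.
from nltk.corpus import sentiwordnet as swn

senti_db = {}
for senti_synset in swn.all_senti_synsets():
    ss = senti_synset.synset
    pos = ss.pos()
    if pos == u's':  # fold satellite adjectives into 'a', as the lookup does
        pos = u'a'
    senti_db[(pos, ss.offset())] = (senti_synset.pos_score(),
                                    senti_synset.neg_score())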
def disambiguateWordSenses3(self, sentence, word, stanfordPOS):
    # Disambiguation with cosine_lesk (simple_lesk is an alternative:
    # result = simple_lesk(sentence, word)).
    # Requires: from pywsd.lesk import cosine_lesk
    print word, stanfordPOS
    # result_list holds the candidate synsets of `word` as (score, synset)
    # pairs, ranked by score.
    result_list = cosine_lesk(sentence, word, nbest=True)
    result = None
    if result_list:
        for score, ss in result_list:
            pos = ss.pos()
            if pos == u's':  # treat satellite adjectives as adjectives
                pos = u'a'
            if pos == stanfordPOS:
                result = ss
                print "matched"
                break
    if result:
        pos = result.pos()
        if pos == u's':
            pos = u'a'
        offset = result.offset()
        pos_score = 0.0
        neg_score = 0.0
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
        obj = 1.0 - (pos_score + neg_score)
    else:
        obj = 1.0
        pos = None
        pos_score = 0.0
        neg_score = 0.0
    return obj, pos, pos_score, neg_score
from math import ceil

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from pywsd.lesk import cosine_lesk


def wsd(text, quary):
    # Find the first sentence that contains the query word.
    sentences = sent_tokenize(text)
    find_sent = ''
    tag = None
    for sent in sentences:
        if quary in sent:
            find_sent = sent
            break
    synonyms = []
    # Map Penn Treebank tags to WordNet POS tags.
    tag2tag = {'NN': 'n', 'NNS': 'n',
               'RB': 'r',
               'VB': 'v', 'VBP': 'v', 'VBD': 'v', 'VBZ': 'v', 'VBG': 'v',
               'JJ': 'a'}
    if find_sent != "":
        tags = pos_tag(word_tokenize(find_sent))
        tag = [x[1] for x in tags if x[0] == quary][0]
        try:
            tag = tag2tag[tag]
        except KeyError:
            tag = None
        answer = cosine_lesk(find_sent, quary, pos=tag,
                             context_is_lemmatized=True, nbest=True)
        for syn in wordnet.synsets(quary):
            for l in syn.lemmas():
                synonyms.append(l.name())
        synonyms = list(set(synonyms))
        print("Synonyms: {}".format(', '.join(synonyms)))
        print("The best definition: {}".format(answer[0][1].definition()))
        print()
        definitions = {}
        for ans in answer:
            # Round the similarity score up to two decimal places.
            definitions[ans[1].definition()] = ceil(ans[0] * 100) / 100
            # print("Definition: {0}, The similarity is {1}".format(
            #     ans[1].definition(), ans[0]))
        return ', '.join(synonyms), answer[0][1].definition(), definitions
    else:
        return ''
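# Illustrative call; the input text and query word are assumptions, not from
# the original code. wsd() returns (synonyms, best definition, score dict).
text = ("I went to the bank to deposit my money. "
        "The teller was very helpful.")
synonyms, best_definition, definitions = wsd(text, "bank")
print(definitions)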
from pywsd.lesk import original_lesk, simple_lesk, adapted_lesk, cosine_lesk


def get_synset(metode, word, text):
    # Dispatch to the pywsd disambiguation routine named by `metode`.
    synset = ""
    if metode == "original_lesk":
        synset = original_lesk(text, word)
    elif metode == "simple_lesk":
        synset = simple_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "cosine_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path":
    #     synset = max_similarity(text, word, "path")
    # elif metode == "wup":
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "lin":
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "res":
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
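# A small usage sketch for the dispatcher above; the sentence is illustrative.
sentence = "I went to the bank to deposit my money"
for metode in ("original_lesk", "simple_lesk", "adapted_lesk", "cosine_lesk"):
    print(metode, get_synset(metode, "bank", sentence))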
def disambiguateWordSenses3(self, sentence, word):
    # Disambiguation with cosine_lesk (simple_lesk is an alternative:
    # result = simple_lesk(sentence, word)).
    # Requires: from pywsd.lesk import cosine_lesk
    # Without nbest, result is the single best synset for the word.
    result = cosine_lesk(sentence, word)
    if result:
        pos = result.pos()
        if pos == u's':  # treat satellite adjectives as adjectives
            pos = u'a'
        offset = result.offset()
        pos_score = 0.0
        neg_score = 0.0
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
        obj = 1.0 - (pos_score + neg_score)
    else:
        obj = 1.0
        pos = None
        pos_score = 0.0
        neg_score = 0.0
    return obj, pos, pos_score, neg_score
import json

import requests
from nltk.corpus import wordnet as wn
from pywsd.lesk import simple_lesk, cosine_lesk

# nlp, remove_notalpha, find_token, and check_def are project-local helpers
# defined elsewhere in the original program; a sketch of pos_convert follows
# after this function.


def get_def(word, context, lang):
    # job = json.loads(injob.text)
    # lang = job.lang
    # context = job.context
    # word = job.word
    meaning = ""
    # Remove non-alphanumeric chars.
    context = remove_notalpha(context)
    doc = nlp(context)
    if lang != 'eng':
        # Call Glosbe for a translation into English.
        getstr = ("https://glosbe.com/gapi/translate?from=" + lang +
                  "&dest=eng&format=json&phrase=" + word + "&pretty=true")
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc)
    else:
        for token in doc:
            if word == token.text:
                word = token
                break
    # Do two separate Lesks and keep what we hope is the better answer.
    answer = simple_lesk(context, word.text, pos_convert(word.pos_))
    cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))
    if (check_def(context, cosans.definition()) >
            check_def(context, answer.definition())):
        answer = cosans
    sense = str(answer)
    sense = sense.split("'")[1].split(".")
    # Fall back to the first sense if the chosen sense looks unlikely.
    if ((sense[0] != word.lemma_ or int(sense[2]) > 4)
            and word.pos_ != 'PROPN'):
        try:
            answer = wn.synset(word.lemma_ + '.' +
                               pos_convert(word.pos_) + '.01')
        except Exception:
            pass
    if lang != 'eng':
        if lang == 'spa':
            lang = 'es'
        if lang == 'arb':
            lang = 'ar'
        # This should use the spa or arb word given.
        if len(indef['tuc']) > 0:
            for tuc in indef['tuc']:
                try:
                    if tuc['phrase']['text'] == word.lemma_:
                        for m in tuc['meanings']:
                            if (m['language'] == lang and
                                    len(m['text']) > len(meaning)):
                                meaning = m['text']
                except KeyError:
                    pass
    else:
        # Needs to look for beginning of sentence.
        if word.pos_ == 'PROPN':
            meaning = word.text + " is a proper noun."
        elif answer:
            meaning = answer.definition()
    return meaning
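# pos_convert is a project-local helper that is not shown in these snippets.
# A plausible minimal sketch, assuming it maps spaCy coarse POS tags to the
# single-letter WordNet POS tags pywsd expects (this definition is
# hypothetical, not the original implementation):
def pos_convert(spacy_pos):
    mapping = {'NOUN': 'n', 'PROPN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}
    return mapping.get(spacy_pos, 'n')  # default to noun for other tags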
def get_def(injob):
    lang = injob['language']
    context = injob['context'].lower()
    word = injob['word'].lower()
    # Map language names onto ISO 639 codes.
    if lang == 'English':
        lang = 'eng'
    if lang == 'Spanish':
        lang = 'spa'
    if lang == 'Arabic':
        lang = 'arb'
    if lang == 'French':
        lang = 'fra'
    doc = nlp(context)
    if lang != 'eng':
        # flp and slp are presumably the French and Spanish NLP pipelines.
        if lang == 'fra':
            stoken = flp(word)
        if lang == 'spa':
            stoken = slp(word)
        for token in stoken:
            print(token.lemma_)
            word = token.lemma_.lower()
        # Call Glosbe for a translation into English.
        getstr = ("https://glosbe.com/gapi/translate?from=" + lang +
                  "&dest=eng&format=json&phrase=" + word + "&pretty=true")
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc, lang)
        if isinstance(word, str):
            return word
    else:
        for token in doc:
            if word == token.text:
                word = token
                break
    if word and (word.is_stop or word.text == 'I'):
        if lang != 'eng':
            return find_def(indef, lang, word)
        else:
            if word.text == 'I':
                response = "Singular first person pronoun."
            else:
                # o is presumably a dictionary API client.
                try:
                    a = o.get_info_about_word(word.lemma_).json()
                except Exception:
                    a = o.get_info_about_word(word.text).json()
                response = a['results'][0]['lexicalEntries'][0][
                    'entries'][0]['senses'][0]['definitions'][0]
            return response
    if word:
        meaning = None
        # Do two separate Lesks and keep what we hope is the better answer.
        answer = simple_lesk(context, word.text, pos_convert(word.pos_))
        cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))
        if (check_def(context, cosans.definition()) >
                check_def(context, answer.definition())):
            answer = cosans
        sense = str(answer)
        sense = sense.split("'")[1].split(".")
        if ((sense[0] != word.lemma_ or int(sense[2]) > 4)
                and word.pos_ != 'PROPN'):
            try:
                answer = wn.synset(word.lemma_ + '.' +
                                   pos_convert(word.pos_) + '.01')
            except Exception:
                pass
        # Probably broken now; the stemmer had problems with capitalization.
        if word.pos_ == 'PROPN':
            meaning = word.text + " is a proper noun."
        elif lang != 'eng' and len(indef['tuc']) > 0:
            # This should use the spa or arb word given.
            meaning = find_def(indef, lang, word)
        elif answer:
            meaning = answer.definition()
        if meaning:
            print("meaning: " + meaning)
            return meaning
        elif lang == 'eng':
            return "Sorry, I don't know that definition:("
        elif lang == 'spa':
            return "Lo siento, no sé esa definición:("
        elif lang == 'fra':
            return "Désolé, je ne connais pas cette définition:("
    elif lang == 'eng':
        return "Sorry, I don't know that definition:("
    elif lang == 'spa':
        return "Lo siento, no sé esa definición:("
    elif lang == 'fra':
        return "Désolé, je ne connais pas cette définition:("
print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank','n', True, \ nbest=True, keepscore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] try: definition = best_sense.definition() except: definition = best_sense.definition print "Definition:", definition print print "======== TESTING cosine_lesk ===========\n" from pywsd.lesk import cosine_lesk print "#TESTING cosine_lesk() ..." print "Context:", bank_sents[0] answer = cosine_lesk(bank_sents[0],'bank') print "Sense:", answer try: definition = answer.definition() except: definition = answer.definition print "Definition:", definition print print "#TESTING cosine_lesk() with nbest results..." print "Context:", bank_sents[0] answer = cosine_lesk(bank_sents[0],'bank', nbest=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][0] try: definition = best_sense.definition() except: definition = best_sense.definition print "Definition:", definition print
print "#TESTING adapted_lesk() with pos, stem, nbest and scores." print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank','n', True, \ nbest=True, keepscore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] definition = best_sense.definition() print "Definition:", definition print print "======== TESTING cosine_lesk ===========\n" from pywsd.lesk import cosine_lesk print "#TESTING cosine_lesk() ..." print "Context:", bank_sents[0] answer = cosine_lesk(bank_sents[0], 'bank') print "Sense:", answer definition = answer.definition() print "Definition:", definition print print "#TESTING cosine_lesk() with nbest results..." print "Context:", bank_sents[0] answer = cosine_lesk(bank_sents[0], 'bank', nbest=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] definition = best_sense.definition() print "Definition:", definition print print "======== TESTING baseline ===========\n"
import itertools

from nltk.corpus import wordnet
from pywsd.lesk import cosine_lesk


def get_wordnet_related_words_from_word(word, context, synonyms=1, hypernyms=0, hyponyms=0):
    """Generate a list of words related to the input word through the WordNet
    ontology and resource. The correct sense of the input word within WordNet
    is picked by disambiguation from the PyWSD package, which takes the
    surrounding text (or whatever text is provided as context) into account.
    All synonyms, hypernyms, and hyponyms are considered related words here.

    Args:
        word (str): The word for which we want to find related words.
        context (str): Text used for word-sense disambiguation, usually the sentence the word is in.
        synonyms (int, optional): Set to 1 to include synonyms in the set of related words.
        hypernyms (int, optional): Set to 1 to include hypernyms in the set of related words.
        hyponyms (int, optional): Set to 1 to include hyponyms in the set of related words.

    Returns:
        list: The list of related words that were found; may be empty if nothing was found.
    """
    # To get the list of synsets for this word if not using disambiguation:
    list_of_possible_s = wordnet.synsets(word)

    # Disambiguation of synsets (https://github.com/alvations/pywsd).
    # Requires installation of the non-conda package PyWSD from pip ("pip install pywsd").
    # The disambiguation methods supported by this package include
    # simple_lesk, original_lesk, adapted_lesk, cosine_lesk, and others.
    s = cosine_lesk(context, word)
    try:
        # Generate related words using WordNet: synonyms, hypernyms, and hyponyms.
        # The hypernym and hyponym lists must be flattened because they are
        # lists of lists drawn from synsets.
        # definition() yields a string; lemma_names() yields a list of strings;
        # hypernyms() and hyponyms() yield lists of synsets.
        synset_definition = s.definition()
        synonym_lemmas = s.lemma_names()
        hypernym_lemmas_nested_list = [x.lemma_names() for x in s.hypernyms()]
        hyponym_lemmas_nested_list = [x.lemma_names() for x in s.hyponyms()]

        # Flatten those lists of lists.
        hypernym_lemmas = list(
            itertools.chain.from_iterable(hypernym_lemmas_nested_list))
        hyponym_lemmas = list(
            itertools.chain.from_iterable(hyponym_lemmas_nested_list))

        related_words = []
        if synonyms == 1:
            related_words.extend(synonym_lemmas)
        if hypernyms == 1:
            related_words.extend(hypernym_lemmas)
        if hyponyms == 1:
            related_words.extend(hyponym_lemmas)
        return related_words
    except AttributeError:
        return []
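# A brief usage sketch; the context sentence is an illustrative assumption.
context = "He sat on the bank of the river and watched the water flow"
related = get_wordnet_related_words_from_word(
    "bank", context, synonyms=1, hypernyms=1, hyponyms=1)
print(related)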
# Excerpt from inside a loop over spaCy tokens; `con`, `split_syn`,
# `check_def`, and `pos_convert` are defined elsewhere in the original
# program, and the outer loop below is assumed (the excerpt begins mid-loop).
for token in doc:
    for word in con:
        if word == token.text:
            context = token.text + " "
    try:
        answer = simple_lesk(context, token.text, pos_convert(token.pos_))
        print(answer)
        if not answer:
            continue
    except Exception:
        continue
    sense = split_syn(answer)
    print(sense[0] + " " + token.lemma_)
    if ((sense[0] != token.lemma_ or int(sense[2]) > 4)
            and token.pos_ != 'PROPN'):
        try:
            cosans = cosine_lesk(context, token.text, pos_convert(token.pos_))
            if (check_def(context, cosans.definition()) >
                    check_def(context, answer.definition())):
                answer = cosans
            if ((sense[0] != token.lemma_ or int(sense[2]) > 4)
                    and token.pos_ != 'PROPN'):
                answer = wn.synset(token.lemma_ + '.' +
                                   pos_convert(token.pos_) + '.01')
                print("unlikely sense detected - new sense:")
                print(answer)
        except Exception:
            pass
    # needs to look for beginning of sentence
    if token.pos_ == 'PROPN':
        print(token.text + " is a proper noun.")
    elif answer:
from timeit import default_timer as timer
import logging

import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
import pywsd.lesk as pylesk  # assuming pylesk aliases pywsd's lesk module

# find_wordnet_pos and check_for_special_word are project-local helpers
# defined elsewhere in the original program.


def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using the original sentence as
    context and different Lesk algorithms from the nltk and pywsd packages.
    Algorithm choices are: 1. nltk's lesk, 2. pywsd simple_lesk,
    3. pywsd adapted_lesk (labelled "advanced" in the output columns),
    4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        # Try the whole phrase joined with underscores first.
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string,
                                              pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0],
                                              pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i],
                                                  combined_word_string,
                                                  find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i],
                                                  word[0],
                                                  find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(
                                        non_tokenized_sentences[i],
                                        combined_word_string,
                                        find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(
                                        non_tokenized_sentences[i], word[0],
                                        find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(
                                        non_tokenized_sentences[i],
                                        combined_word_string,
                                        find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(
                                        non_tokenized_sentences[i], word[0],
                                        find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(
                                        non_tokenized_sentences[i],
                                        combined_word_string,
                                        find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(
                                        non_tokenized_sentences[i], word[0],
                                        find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(
                                    aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(
                                    aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(
                    aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(
                    opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []

    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(
        full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(
        full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(
        full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(
        full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
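# Hypothetical call, assuming a DataFrame with the columns wsd_lesk expects
# ("aspect_tags", "opinion_tags", "tokenized_sentence", "original_text");
# `annotated_reviews_df` is an illustrative name, not from the original code.
result_df = wsd_lesk(annotated_reviews_df, algorithm_choice=4)  # cosine_lesk
print(result_df["pywsd_cosine_lesk_aspect_definition"].head())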
for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
        print eachword + ":" + answer.definition() + "\n"
    else:
        print eachword + ": " + eachword + "\n"
        adaptedlesk_answer.append(eachword)

print "\nDisambiguating your sentence word by word using Cosine Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
    if has_synset(eachword):
        answer = cosine_lesk(raw_sentence, eachword)
        cosinelesk_answer.append(answer)
        print "Sense :", answer
        print eachword + ":" + answer.definition() + "\n"
    else:
        print eachword + ": " + eachword + "\n"
        cosinelesk_answer.append(eachword)

print "Word Definition Comparison\n====================================\n"

for i in range(len(simplelesk_answer)):  # assuming the lists are of the same length
    print "\n============================================================"
    print "\nWord being compared is: " + words[i]
    if simplelesk_answer[i] == adaptedlesk_answer[i] == cosinelesk_answer[i]:
        print "\nSame definition in all algorithms."