def get_lesk_answers(senseval_data):
    time_start = time.clock()
    # Getting answers from the lesk algorithms
    original_lesk_answers = {}
    simple_lesk_answers = {}
    adapted_lesk_answers = {}
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, word = phrase["headword"]
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        for word_id, word in sentence_data["test_words"].iteritems():
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        sys.stdout.write(".")
    lesk_answers_list = []
    lesk_answers_list.append((original_lesk_answers, "original lesk"))
    lesk_answers_list.append((simple_lesk_answers, "simple lesk"))
    lesk_answers_list.append((adapted_lesk_answers, "adapted lesk"))
    time_end = time.clock()
    print "\nlesk took " + str(time_end - time_start) + " seconds"
    return lesk_answers_list
def getCommonWord(str1, str2):
    # str1 = "Some scoff at the notion that movies do anything more than entertain ."
    # str2 = "Some are wrong ."
    ml1 = str1.lower().split()
    ml1 = ml1[2:]
    ml1[0] = ml1[0][0].upper() + ml1[0][1:]
    ml2 = str2.lower().split()
    ml2 = ml2[2:]
    ml2[0] = ml2[0][0].upper() + ml2[0][1:]
    print ml1
    print ml2
    ml3 = set(ml1).intersection(ml2)
    # print ml3
    file = open("stpwrds.txt", "r+")
    stpwrds = [x.rstrip('\n').rstrip(')') for x in file.readlines() if x.strip()]
    file.close()
    ml4 = set(ml3).difference(stpwrds)
    # print ml4
    ml4 = list(ml4)
    print ml4
    # Keep only words whose disambiguated sense is the same in both sentences;
    # only advance the index when nothing was deleted.
    i = 0
    while i < len(ml4):
        if simple_lesk(str1, ml4[i]) != simple_lesk(str2, ml4[i]):
            del ml4[i]
        else:
            i += 1
    t2 = nltk.pos_tag(ml4)
    # Reorder: verbs, nouns, PRP* pronouns, cardinal numbers, foreign words, then the rest.
    ml = []
    for x in t2:
        if x[1][0] == 'V':
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'N':
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'P' and x[1][1] == 'R':
            ml.append(x[0])
    for x in t2:
        if x[1] == "CD":
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'F':
            ml.append(x[0])
    for x in t2:
        if x[0] not in ml:
            ml.append(x[0])
    # print t2
    # print ml
    print "After WSD and POS reordering:\n", ml
    i = 0
    while i < len(ml):
        ml[i] = ml[i].lower()
        i += 1
    return ml
def test_simple_lesk_default(self):
    bank_sents = [('I went to the bank to deposit my money',
                   'depository_financial_institution.n.01'),
                  ('The river bank was full of dead fishes',
                   'bank.n.01')]
    plant_sents = [('The workers at the industrial plant were overworked',
                    'plant.n.01'),
                   ('The plant was no longer bearing flowers',
                    'plant.v.01')]
    for sent, synset_name in bank_sents:
        self.assertEqual(simple_lesk(sent, 'bank').name(), synset_name)
    for sent, synset_name in plant_sents:
        self.assertEqual(simple_lesk(sent, 'plant').name(), synset_name)
def wordSenseDisambiguation(self, sentence):
    # Disambiguate each candidate word using the full sentence as context.
    pos = self.identifyWordsForComparison(sentence)
    sense = []
    for p in pos:
        sense.append(simple_lesk(sentence, p[0], pos=p[1][0].lower()))
    return set(sense)
def simple_lesk_algo():
    sent = 'How much deposit i can deposit to my deposit?'
    ambiguous = 'deposit'
    answer = simple_lesk(sent, ambiguous, pos='v')
    print(answer)               # the verb synset chosen for "deposit"
    print(answer.definition())
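# A minimal sketch (not part of the snippet above) showing how the `pos`
# argument of pywsd's simple_lesk() restricts the candidate senses: the same
# ambiguous word can resolve to a noun or a verb synset depending on `pos`.
from pywsd.lesk import simple_lesk

sent = 'I went to the bank to deposit my money'
print(simple_lesk(sent, 'deposit', pos='n'))  # a noun sense of "deposit"
print(simple_lesk(sent, 'deposit', pos='v'))  # a verb sense of "deposit"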
def get_event_guesses(self, link):
    tokens = link['sentence']
    pos_tokens = nltk.pos_tag(tokens)
    guess, guess_backup, guess_all = [], [], []
    for i in range(len(tokens)):
        guess_all.append(i)
        word = tokens[i].lower()
        word_lem_n = Word(word).lemmatize('n')
        word_lem_v = Word(word).lemmatize('v')
        lesk_syn = simple_lesk((" ").join(tokens), word)
        if ((word == '.\n') or (word in stopwords.words('english'))):
            continue  # EOL or stop word; ignore
        guess_backup.append(i)
        if ('VB' in pos_tokens[i][1] or
                word in noun_events or word_lem_n in noun_events or
                word_lem_v in noun_events):
            # verb (or known noun event), so add it in
            guess.append(i)
        elif ('NN' in pos_tokens[i][1] and lesk_syn):
            lesk_list = lesk_syn.name().split('.')
            if ((len(lesk_list) == 3) and
                    ([lesk_list[0], int(lesk_list[2])] in noun_senses)):
                guess.append(i)  # disambiguated word matches a sense in the noun sense list
    # if (len(guess) == 0):
    #     # print('no guesses made here')
    #     # print('link sentence:> ', tokens)
    #     # print('pos tags:> ', pos_tokens)
    #     # print('actual event:> ', tokens[link['start_index']:link['end_index']+1])
    #     # print('link:> ', link)
    #     # print()
    #     # print()
    #     return guess_backup
    return guess_all
def disambiguateWordSenses3(self, sentence, word, stanfordPOS):
    # disambiguation with simple_lesk
    print word, stanfordPOS
    result_list = simple_lesk(sentence, word, nbest=True)  # result is a list of synsets of word
    print result_list
    result = None
    print word, stanfordPOS
    if result_list:
        for ss in result_list:
            pos = ss.pos()
            if (pos == u's'):
                pos = u'a'
            if pos == stanfordPOS:
                result = ss
                print "matched"
                break
    if result:
        pos = result.pos()
        if (pos == u's'):
            pos = u'a'
        offset = result.offset()
        pos_score = 0.0
        neg_score = 0.0
        if (pos, offset) in self.db:
            # print word, pos, offset
            pos_score, neg_score = self.db[(pos, offset)]
        obj = 1.0 - (pos_score + neg_score)
        # print "%%%%%%%%%%"
        # print pos_score, neg_score, obj
    else:
        obj = 1.0
        pos = None
        pos_score = 0.0
        neg_score = 0.0
    return obj, pos, pos_score, neg_score
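# Quick illustration of the nbest flag used above: per the comment in that
# snippet, simple_lesk(..., nbest=True) returns a ranked list of candidate
# synsets instead of a single best synset, which is what lets the function
# pick the first candidate whose POS matches the Stanford tag.
from pywsd.lesk import simple_lesk

ranked = simple_lesk('I went to the bank to deposit my money', 'bank', nbest=True)
for ss in ranked:
    print(ss.pos(), ss.name())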
def wsd():
    definition = ""
    if request.method == "POST":
        word = request.form["word"]
        sentence = request.form["sentence"]
        if word and sentence and word.lower() in sentence.lower():
            definition = simple_lesk(sentence, word).definition()
    return render_template("main.html", definition=definition)
def provide_synset(word, context):
    try:
        answer = simple_lesk(context, word, pos='n')
        return answer
    except IndexError:
        # print("PYWSD DOES NOT LIKE THIS WORD BECAUSE OF A BUG:")
        # print(word)
        return None
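# Small usage sketch (assumes provide_synset() above is in scope). Because the
# wrapper returns None when pywsd raises IndexError, callers should guard
# before using the synset.
syn = provide_synset('bank', 'I went to the bank to deposit my money')
if syn is not None:
    print(syn.name(), '-', syn.definition())
else:
    print('no noun sense found')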
def get_semantic_score_with_context(token, nlp_review):
    word = token.lower_  # lowercased token text
    position = token.i   # token index in the document (token.idx is a character offset, not usable for Doc slicing)
    pos = posTag_to_wordNetTag(token.pos_)  # POS of token, for better word sense disambiguation

    # define how many tokens around the token of interest we look at
    num_surrounding_words = 10

    # careful if there are fewer than num_surrounding_words before or after our token
    leftmost_word_idx = max(0, position - num_surrounding_words)
    rightmost_word_idx = min(len(nlp_review), position + num_surrounding_words)
    surrounding_text = nlp_review[leftmost_word_idx:rightmost_word_idx].text

    # determine the WordNet synset with the closest sense
    # print(word, "....", surrounding_text, pos)
    try:
        word_with_closest_sense = simple_lesk(surrounding_text, word, pos=pos)
    except:
        word_with_closest_sense = simple_lesk(surrounding_text, word)
    # print(word, pos, word_with_closest_sense)

    # look up the sentiment scores of the synset we found in SentiWordNet
    if word_with_closest_sense:
        sentiword = swn.senti_synset(word_with_closest_sense.name())
        sent_scores = {
            "objective": sentiword.obj_score(),
            "positive": sentiword.pos_score(),
            "negative": sentiword.neg_score()
        }
        sentiment = max(sent_scores, key=sent_scores.get)
        return sentiment
    else:
        return 'no_sentiment_assigned'
def disambiguate_lesk(self, sentence, ambiguous, pos):
    """
    @param sentence  : I went to the bank to deposit my money
    @param ambiguous : bank
    @param pos       : n
    @return          : Synset('depository_financial_institution.n.01')
    """
    from pywsd.lesk import simple_lesk
    return simple_lesk(sentence, ambiguous, pos)
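# Minimal sketch of the call the wrapper above makes, using the values from
# its docstring (assumes pywsd and the WordNet corpus are installed).
from pywsd.lesk import simple_lesk

synset = simple_lesk('I went to the bank to deposit my money', 'bank', pos='n')
print(synset)               # Synset('depository_financial_institution.n.01') per the docstring
print(synset.definition())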
def wrapper(doc):
    head_word = func(doc)
    head_word_synset = simple_lesk(str(doc), str(head_word))
    if not head_word_synset:
        return ""
    max_similarity = -1
    max_class_synset = ""
    for category in FINE_CLASSES_SYNSETS:
        class_synset = wn.synset(category)
        similarity = wn.path_similarity(head_word_synset, class_synset)
        if similarity and similarity > max_similarity:
            max_class_synset = class_synset
            max_similarity = similarity
    return max_class_synset
def disambiguate_word_senses(self, sentence):
    """
    Disambiguating word senses for nouns and verbs using the LESK algorithm
    """
    # Extract nouns and verbs
    pos_tags = self.extract_nouns_and_verbs(sentence)
    sense = []
    for tag in pos_tags:
        # Fetch the correct synset for each tag based on surrounding context
        disambiguated_term = simple_lesk(sentence, tag[0], pos=tag[2][0].lower())
        if disambiguated_term is not None:
            sense.append(disambiguated_term)
    return set(sense)
def predictwsd() -> Response:  # pylint: disable=unused-variable
    """make a prediction using the specified model and return the results"""
    if request.method == "OPTIONS":
        return Response(response="", status=200)

    data = request.get_json()
    sentence = data["sentence"]
    word = data["word"]
    answer = simple_lesk(sentence, word)
    hypernym = ""
    for synset in answer.hypernyms():
        lemma_list = synset.lemmas()
        hypernym = lemma_list[0].name()
        break
    log_blob = {"hypernym": hypernym}
    return jsonify(log_blob)
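# Hypothetical client for the endpoint above (the URL path and port are
# assumptions; the route decorator is not shown in the snippet). The handler
# reads "sentence" and "word" from the JSON body and returns the first
# hypernym lemma of the disambiguated synset.
import requests

resp = requests.post(
    "http://localhost:5000/predictwsd",
    json={"sentence": "I went to the bank to deposit my money", "word": "bank"},
)
print(resp.json())  # e.g. {"hypernym": "financial_institution"}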
def get_event_guesses(link, nsenses, nevents, punctuation):
    tokens = link['sentence']
    cv_events = list(link['caevo_event'])
    guess, guess2, guess3 = [], [], []  # backups to avoid an empty return
    for i in range(len(tokens)):
        guess3.append(i)
        word = tokens[i].rstrip('\n').lower().replace("'", "").replace("\"", "")
        # print('main word:> ', word)
        wordS = Word(word).stem()
        wordLN = Word(word).lemmatize('n')
        wordLV = Word(word).lemmatize('v')
        if (word in punctuation or word in stopwords.words('english')):
            continue
        guess2.append(i)  # all words except punctuation & stop words

        # 1. check with caevo events
        for cv in cv_events:
            if (word == cv.decode('utf8').rstrip('\n').lower().replace("'", "").replace("\"", "")):
                guess.append(i)
                continue

        # 2. check with noun events
        n, nstem, nlemn, nlemv = (nevents['nevents'], nevents['nevents_stem'],
                                  nevents['nevents_lem_n'], nevents['nevents_lem_v'])
        if (word in n or word in nstem or word in nlemn or word in nlemv or
                wordS in n or wordS in nstem or wordS in nlemn or wordS in nlemv or
                wordLN in n or wordLN in nstem or wordLN in nlemn or wordLN in nlemv or
                wordLV in n or wordLV in nstem or wordLV in nlemn or wordLV in nlemv):
            guess.append(i)
            continue

        # 3. check with noun senses
        n, nstem, nlemn, nlemv = (nsenses['nsenses'], nsenses['nsenses_stem'],
                                  nsenses['nsenses_lem_n'], nsenses['nsenses_lem_v'])
        lesk_syn = simple_lesk((" ").join(tokens), word)
        if (lesk_syn):
            lesk_list = lesk_syn.name().split('.')
            if (len(lesk_list) == 3):
                lsen = int(lesk_list[2])
                if ((word, lsen) in n or (wordS, lsen) in n or (wordLN, lsen) in n or (wordLV, lsen) in n or
                        (wordS, lsen) in nstem or (wordLN, lsen) in nlemn or (wordLV, lsen) in nlemv):
                    guess.append(i)
                    continue
    return guess
def wrapper(doc):
    head_word = func(doc)
    hypernyms = []
    if head_word:
        # print("question: " + str(doc))
        # print("head word: " + str(head_word) + " pos=" + str(head_word.pos_))
        synset = simple_lesk(str(doc), str(head_word))
        if synset:
            unvisited_hypernyms = synset.hypernyms()
            for i in range(5):
                for hypernym in unvisited_hypernyms:
                    unvisited_hypernyms = unvisited_hypernyms + hypernym.hypernyms()
                    unvisited_hypernyms.remove(hypernym)
                    hypernyms.append(hypernym)
            hypernyms.append(synset)
            # print(str(hypernyms))
    return hypernyms
def get_synonyms(self, sentence, word):
    from pywsd.lesk import simple_lesk
    synonyms = set()
    if isinstance(sentence, str):
        sentence = sentence.decode('utf-8')
    if isinstance(word, str):
        word = word.decode('utf-8')
    synset = simple_lesk(sentence, word)
    if synset is not None:
        for synonym in synset.lemma_names():
            synonyms.add(synonym.replace('_', ' '))
    # for idx, synset in enumerate(wordnet.synsets(word)):
    #     for synonym in synset.lemma_names():
    #         synonyms.add(synonym.replace('_', ' '))
    return list(synonyms)
def tagging(self):
    if len(self.question1) <= 0 or len(self.question2) <= 0:
        return NULL
    else:
        stemmer1 = SnowballStemmer("english")
        stemmer2 = SnowballStemmer("english")
        # for self.w in :
        self.words1 = pos_tag(word_tokenize(self.question1))
        for i in range(0, len(self.words1)):
            # To create a list of lists instead of a read-only tuple
            self.words1stem.append([])
            self.words1stem[i].append(stemmer1.stem((self.words1[i])[0]))
            self.words1stem[i].append((self.words1[i])[1])
        for word in self.words1stem:
            print word
            print(simple_lesk(self.question1, word[0]))
            # print self.answer1
            # if self.w.lower() not in stops:
            #     print stemmer1.stem(self.w)
            #     self.words1.append(stemmer.stem(self.w))
            #     self.words1.append(nltk.pos_tag(self.w))
            #     temp = stemmer1.stem(self.w)
            #     print WORD1[0]
            #     self.answer1.append(simple_lesk(self.question1, WORD1[0], WORD1[1]))
            # answer = simple_lesk
            # self.words1pos = nltk.pos_tag(self.words1)
            # print self.answer1
            # words1 = stem(words1)
        for self.w2 in self.question2.split():
            if self.w2.lower() not in stops:
                # self.words2.append(stemmer2.stem(self.w2))
                # self.words2.append(self.w2)
                WORD2 = nltk.pos_tag(stemmer2.stem(self.w2))
def get_synset(metode, word, text):
    synset = ""
    if metode == "original_lesk":
        synset = simple_lesk(text, word)
    elif metode == "simple_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path":
    #     synset = max_similarity(text, word, "path")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
async def extract_wsd(request, target):
    """
    $ curl -d '{"sents":"The sheet is twenty centimeters."}' \
      -H "Content-Type: application/json" -X POST \
      localhost:1700/en/wsd/default | json

    :param request:
    :return:
    """
    from pywsd import disambiguate
    from pywsd.similarity import max_similarity as maxsim
    from pywsd.lesk import simple_lesk

    rd = request.json
    sents = rd['sents']

    extract_syn = lambda r: (r[0], r[1].name(), r[1].definition())

    def extract_sents():
        rs = disambiguate(sents)
        return [extract_syn(r) for r in rs if r[1]]

    fn_map = {
        'default': lambda: extract_sents(),
        'maxsim': lambda: [
            extract_syn(r)
            for r in disambiguate(sents,
                                  algorithm=maxsim,
                                  similarity_option='wup',
                                  keepLemmas=False)
            if r[1]
        ],
        'lesk': lambda: simple_lesk(sents, rd['word']),
    }
    result = fn_map[target]() if target in fn_map else []
    return json(result)
from pywsd.utils import has_synset

simplelesk_answer = []
adaptedlesk_answer = []
cosinelesk_answer = []

print "\nSentence Context Disambiguation\n==============================\n"
raw_sentence = "Some people are happy this sentence."
words = nltk.word_tokenize(raw_sentence)

print "\nChecking synsets of each word . . .\n==========================================\n"
print(disambiguate(raw_sentence))

print "\nDisambiguating your sentence word by word using Simple Lesk algorithm. Hold on.\n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print "Sense :", answer
        print eachword + ":" + answer.definition() + "\n"
    else:
        print eachword + ": " + eachword + "\n"
        simplelesk_answer.append(eachword)

"""
print "\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on.\n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
from string import punctuation

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize, penn2morphy
from pywsd.allwords_wsd import disambiguate

"""
This module tests for consistency between using disambiguate() and
individually calling the wsd functions.
"""

for sentence in brown.sents()[:100]:
    # Retrieves a tokenized text from the brown corpus.
    sentence = " ".join(sentence)
    # Uses POS info when WSD-ing.
    _, poss = zip(*pos_tag(word_tokenize(sentence)))
    tagged_sent = disambiguate(sentence, prefersNone=True, keepLemmas=True)
    for word_lemma_semtag, pos in zip(tagged_sent, poss):
        word, lemma, semtag = word_lemma_semtag
        if semtag is not None:
            # Changes POS to morphy POS
            pos = penn2morphy(pos, returnNone=True)
            # WSD on lemma
            assert simple_lesk(sentence, lemma, pos=pos) == semtag
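# Small illustration, grounded in the loop above: with keepLemmas=True,
# disambiguate() yields (surface_word, lemma, synset_or_None) triples, which
# is why the test unpacks three values per token before comparing against
# simple_lesk() on the lemma.
from pywsd.allwords_wsd import disambiguate

for word, lemma, synset in disambiguate('I went to the bank to deposit my money',
                                        prefersNone=True, keepLemmas=True):
    if synset is not None:
        print(word, lemma, synset.name())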
sent_id = {}
for i in data:
    i = i.split()
    print(i)
    sent_id[i[0]] = i[1]
file.close()

file = open("raw_sent2.txt", "r")
data = file.read()
data = data.split("\n")

sent_dict = {}
counter = 1
for i in data:
    sent_dict["hom_" + str(counter)] = i
    counter += 1

print(len(sent_id))
print(len(sent_dict))

synsets = {}
list = []
for i in sent_id:
    print sent_dict[i], sent_id[i]
    synsets[sent_id[i]] = simple_lesk(sent_dict[i], sent_id[i])
    list.append((sent_id[i], simple_lesk(sent_dict[i], sent_id[i])))

file = open("interpretation.txt", "w")
for i in list:
    file.write(str(i))
    file.write("\n")
print list
def predict(sent, ambiguous):
    try:
        return simple_lesk(sent, ambiguous)
    except:
        return None
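# Small usage sketch (assumes predict() above is in scope): run the wrapper
# over a batch of (sentence, word) pairs and skip the ones that failed.
pairs = [('I went to the bank to deposit my money', 'bank'),
         ('The river bank was full of dead fishes', 'bank')]
for sent, word in pairs:
    syn = predict(sent, word)
    if syn is not None:
        print(word, '->', syn.name())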
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


for i in get_pos_tags(sentences[0]):
    try:
        dict[i[0]] = simple_lesk(sentences[0], i[0], get_wordnet_pos(i[1]))
    except IndexError:
        print sentences[0]
        dict[i[0]] = simple_lesk(sentences[0], i[0])
        continue

flag = 0
s = []
for i in trigrams:
    for j in get_pos_tags(i):
        try:
def RecursiveGlossOverlap_Classify(text):
    definitiongraphedges = defaultdict(list)
    definitiongraphedgelabels = defaultdict(list)

    #---------------------------------------------------------------------------------
    #2.Compute intrinsic merit (either using linear or quadratic overlap)
    #---------------------------------------------------------------------------------
    tokenized = nltk.word_tokenize(text)
    fdist1 = FreqDist(tokenized)
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = stopwords + [u' ',u'or',u'and',u'who',u'he',u'she',u'whom',u'well',u'is',u'was',u'were',u'are',u'there',u'where',u'when',u'may',
        u'The', u'the', u'In',u'in',u'A',u'B',u'C',u'D',u'E',u'F',u'G',u'H',u'I',u'J',u'K',u'L',u'M',u'N',u'O',u'P',u'Q',u'R',u'S',u'T',u'U',u'V',u'W',u'X',u'Y',u'Z']
    puncts = [u' ',u'.', u'"', u',', u'{', u'}', u'+', u'-', u'*', u'/', u'%', u'&', u'(', ')', u'[', u']', u'=', u'@', u'#', u':', u'|', u';',u'\'s']
    #at present tfidf filter is not applied
    #freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
    freqterms1 = [w.decode("utf-8") for w in fdist1.keys() if w not in stopwords and w not in puncts]

    current_level = 1
    nodewithmaxparents = ''
    noofparents = 0
    maxparents = 0
    relatedness = 0
    first_convergence_level = 1
    tokensofthislevel = []
    convergingterms = []
    convergingparents = []
    tokensofprevlevel = []
    prevlevelsynsets = []
    commontokens = []
    vertices = 0
    edges = 0
    overlap = 0
    iter = 0
    from nltk.corpus import wordnet as wn

    #recurse down to required depth and update intrinsic merit score
    #relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2) also called convergence factor
    while current_level < 3:
        #crucial - gather nodes which converge/overlap (have more than 1 parent)
        if current_level > 1:
            print current_level
            for x in freqterms1:
                for y in parents(x, prevlevelsynsets):
                    ylemmanames = y.lemma_names()
                    #for yl in ylemmanames:
                    #    definitiongraphedges[x].append(yl)
                    definitiongraphedges[x].append(ylemmanames[0])
                    definitiongraphedgelabels[x + " - " + ylemmanames[0]].append(" is a subinstance of ")
                    definitiongraphedgelabels[ylemmanames[0] + " - " + x].append(" is a superinstance of ")
            convergingterms = [w for w in freqterms1 if len(parents(w, prevlevelsynsets)) > 1]
            for kw in freqterms1:
                convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
            for kw in freqterms1:
                noofparents = len(parents(kw, prevlevelsynsets))
                if noofparents > maxparents:
                    maxparents = noofparents
                    nodewithmaxparents = kw
        for keyword in freqterms1:
            #WSD - invokes Lesk's algorithm adapted to recursive gloss overlap - best_matching_synset()
            #disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
            if use_pywsd_lesk:
                disamb_synset = simple_lesk(" ".join(freqterms1), keyword)
            elif use_nltk_lesk:
                disamb_synset = lesk(freqterms1, keyword)
            else:
                disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
            prevlevelsynsets = prevlevelsynsets + [disamb_synset]
            if len(wn.synsets(keyword)) != 0:
                disamb_synset_def = disamb_synset.definition()
                tokens = nltk.word_tokenize(disamb_synset_def)
                fdist_tokens = FreqDist(tokens)
                #at present frequency filter is not applied
                #if keyword in convergingterms:
                tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
        listcount = len(tokensofthislevel)
        setcount = len(set(tokensofthislevel))
        overlap = listcount - setcount
        if overlap > 0 and iter == 0:
            first_convergence_level = current_level
            iter = 1
        #choose between two relatedness/convergence criteria :-
        #1) simple linear overlap or 2) zipf distributed quadratic overlap
        #relatedness = relatedness + len(convergingparents)*overlap
        relatedness = relatedness + overlap + len(convergingparents)
        #relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1)
        #find out common tokens of this and previous level so that same token does not get grasped again -
        #relatedness must be increased since repetition of keywords in two successive levels is a sign of
        #interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and edges
        #corresponding to common tokens
        commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
        tokensofthislevel = set(tokensofthislevel).difference(commontokens)
        relatedness = relatedness + len(commontokens)
        #decrease the vertices count to address common tokens removed above - edges should remain same since they
        #would just point elsewhere
        vertices = vertices + setcount - len(commontokens)
        edges = edges + listcount
        current_level = current_level + 1
        freqterms1 = set(tokensofthislevel)
        tokensofprevlevel = tokensofthislevel
        tokensofthislevel = []

    intrinsic_merit = vertices * edges * relatedness / first_convergence_level

    print definitiongraphedges

    nxg = nx.DiGraph()
    pos = nx.spring_layout(nxg)
    #pos = nx.shell_layout(nxg)
    #pos = nx.random_layout(nxg)
    #pos = nx.spectral_layout(nxg)
    #nx.draw_graphviz(nxg, prog="neato")
    for k, v in definitiongraphedges.iteritems():
        for l in v:
            nxg.add_edge(k, l)
            nxg.add_edge(l, k)
    #nx.draw_networkx(nxg)
    #plt.show()
    nxg.remove_edges_from(nxg.selfloop_edges())

    #print "Core number =", nx.core_number(nxg)
    sorted_core_nxg = sorted(nx.core_number(nxg).items(), key=operator.itemgetter(1), reverse=True)
    print "Core number (sorted) :", sorted_core_nxg
    print "============================================================================================================="
    print "Unsupervised Classification based on top percentile Core numbers of the definition graph (subgraph of WordNet)"
    print "============================================================================================================="
    no_of_classes = len(nx.core_number(nxg))
    top_percentile = 0
    max_core_number = 0
    max_core_number_class = ""
    for n in sorted_core_nxg:
        print "This document belongs to class:", n[0], ", core number=", n[1]
        if top_percentile < no_of_classes * 0.50:
            top_percentile += 1
        else:
            break
        if n[1] > max_core_number:
            max_core_number = n[1]
            max_core_number_class = n[0]
    print "    max_core_number", max_core_number

    print "==================================================================="
    print "Betweenness Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    bc = nx.betweenness_centrality(nxg)
    sorted_bc = sorted(bc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_bc

    print "==================================================================="
    print "Closeness Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    cc = nx.closeness_centrality(nxg)
    sorted_cc = sorted(cc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_cc

    print "==================================================================="
    print "Degree Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    dc = nx.degree_centrality(nxg)
    sorted_dc = sorted(dc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_dc

    print "==================================================================="
    print "Page Rank of the vertices of RGO Definition Graph (a form of Eigenvector Centrality)"
    print "==================================================================="
    sorted_pagerank_nxg = sorted(nx.pagerank(nxg).items(), key=operator.itemgetter(1), reverse=True)
    print sorted_pagerank_nxg
    return (sorted_core_nxg, sorted_pagerank_nxg)
def get_def(word, context, lang):
    # job = json.loads(injob.text)
    # lang = job.lang
    # context = job.context
    # word = job.word

    # remove non-alphanumeric chars
    context = remove_notalpha(context)
    doc = nlp(context)
    if lang != 'eng':
        # call for translation to the proper lang
        getstr = ("https://glosbe.com/gapi/translate?from=" + lang +
                  "&dest=eng&format=json&phrase=" + word + "&pretty=true")
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc)
    else:
        for token in doc:
            if word == token.text:
                word = token
                break

    # do two separate lesks
    answer = simple_lesk(context, word.text, pos_convert(word.pos_))
    cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))
    # find what we hope is the better answer
    if (check_def(context, cosans.definition()) >
            check_def(context, answer.definition())):
        answer = cosans
    sense = str(answer)
    sense = sense.split("'")[1].split(".")
    if ((sense[0] != word.lemma_ or int(sense[2]) > 4) and word.pos_ != 'PROPN'):
        try:
            answer = wn.synset(word.lemma_ + '.' + pos_convert(word.pos_) + '.01')
        except Exception:
            pass

    if lang != 'eng':
        if lang == 'spa':
            lang = 'es'
        if lang == 'arb':
            lang = 'ar'
        # this should use the spa or arb word given
        if len(indef['tuc']) > 0:
            meaning = ""
            for tuc in indef['tuc']:
                try:
                    if tuc['phrase']['text'] == word.lemma_:
                        esptemp = ""
                        for m in tuc['meanings']:
                            if m['language'] == lang and len(m['text']) > len(meaning):
                                meaning = m['text']
                except KeyError:
                    pass
    else:
        # needs to look for beginning of sentence
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif answer:
            meaning = answer.definition()
    return meaning
# importing libraries
import nltk
import re
from pywsd.lesk import simple_lesk

# Downloading the popular NLTK data packages (includes stopwords)
nltk.download('popular')

# Taking the user input
sent = input("Enter the sentence: ")

# Tokenizing the input into words
sent2 = nltk.word_tokenize(sent)

# Tagging parts of speech
tagged_word = nltk.pos_tag(sent2)
print(tagged_word)

# Ask which word the user wants disambiguated, then disambiguate every
# occurrence of that word using its POS tag
ambiguous = input('Enter the word you want to disambiguate: ')
sense_word_list = []
for i in range(len(tagged_word)):
    if tagged_word[i][0].lower() == ambiguous:
        synset = simple_lesk(sent, tagged_word[i][0], tagged_word[i][1][0].lower())
        sense = synset.lemmas()[0].name()
        sense = re.sub(r"_", " ", sense)
        print("Sense of", tagged_word[i][0], "is:", sense)
        print("Definition of", tagged_word[i][0], "is:", synset.definition())
def get_def(injob):
    lang = injob['language']
    context = injob['context'].lower()
    word = injob['word'].lower()

    # map language names onto ISO codes
    if lang == 'English':
        lang = 'eng'
    if lang == 'Spanish':
        lang = 'spa'
    if lang == 'Arabic':
        lang = 'arb'
    if lang == 'French':
        lang = 'fra'

    # remove non-alphanumeric chars
    doc = nlp(context)
    if lang != 'eng':
        if lang == 'fra':
            stoken = flp(word)
        if lang == 'spa':
            stoken = slp(word)
        for token in stoken:
            print(token.lemma_)
            word = token.lemma_.lower()
        # call for translation to the proper lang
        getstr = ("https://glosbe.com/gapi/translate?from=" + lang +
                  "&dest=eng&format=json&phrase=" + word + "&pretty=true")
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc, lang)
        if isinstance(word, str):
            return word
    else:
        for token in doc:
            if word == token.text:
                word = token
                break

    if word and (word.is_stop or word.text == 'I'):
        if lang != 'eng':
            return find_def(indef, lang, word)
        else:
            if word.text == 'I':
                response = "Singular first person pronoun."
            else:
                try:
                    a = o.get_info_about_word(word.lemma_).json()
                except Exception:
                    a = o.get_info_about_word(word.text).json()
                response = a['results'][0]['lexicalEntries'][0]['entries'][0]['senses'][0]['definitions'][0]
            return response

    if word:
        # do two separate lesks
        answer = simple_lesk(context, word.text, pos_convert(word.pos_))
        cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))
        # find what we hope is the better answer
        if (check_def(context, cosans.definition()) >
                check_def(context, answer.definition())):
            answer = cosans
        sense = str(answer)
        sense = sense.split("'")[1].split(".")
        if ((sense[0] != word.lemma_ or int(sense[2]) > 4) and word.pos_ != 'PROPN'):
            try:
                answer = wn.synset(word.lemma_ + '.' + pos_convert(word.pos_) + '.01')
            except Exception:
                pass

        # probably broken now; the stemmer had problems with capitalization
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif lang != 'eng' and len(indef['tuc']) > 0:
            # this should use the spa or arb word given
            meaning = find_def(indef, lang, word)
        elif answer:
            meaning = answer.definition()
        if meaning:
            print("meaning: " + meaning)
            return meaning
        elif lang == 'eng':
            return "Sorry, I don't know that definition:("
        elif lang == 'spa':
            return "Lo siento, no sé esa definición:("
        elif lang == 'fra':
            return "Désolé, je ne connais pas cette définition:("
    elif lang == 'eng':
        return "Sorry, I don't know that definition:("
    elif lang == 'spa':
        return "Lo siento, no sé esa definición:("
    elif lang == 'fra':
        return "Désolé, je ne connais pas cette définition:("
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using the original sentence as
    context and different lesk algorithms from the nltk- and pywsd-packages.
    Algorithm choices are: 1. nltk's lesk, 2. pywsd simple_lesk,
    3. pywsd adapted_lesk, 4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string,
                                              pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i],
                                                  combined_word_string,
                                                  find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0],
                                                  find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i],
                                                                combined_word_string,
                                                                find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i],
                                                                word[0],
                                                                find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i],
                                                                 combined_word_string,
                                                                 find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i],
                                                                 word[0],
                                                                 find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i],
                                                                combined_word_string,
                                                                find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i],
                                                                word[0],
                                                                find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []

    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
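# Hedged usage sketch for wsd_lesk() above. The DataFrame below is a made-up
# example; the function only requires the columns it reads
# ("tokenized_sentence", "original_text", "aspect_tags", "opinion_tags"),
# where the tag columns hold lists of (token, POS) pairs. It also assumes the
# helpers from the original module (check_for_special_word, find_wordnet_pos)
# are defined.
import pandas as pd

review_df = pd.DataFrame({
    "original_text": ["The plant was no longer bearing flowers"],
    "tokenized_sentence": [["The", "plant", "was", "no", "longer", "bearing", "flowers"]],
    "aspect_tags": [[("plant", "NN")]],
    "opinion_tags": [[("bearing", "VBG")]],
})
tagged_df = wsd_lesk(review_df, algorithm_choice=2)  # 2 = pywsd simple_lesk
print(tagged_df["pywsd_simple_lesk_aspect_definition"].iloc[0])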
        imp_pos.append("n")
    elif (i[1][1] in pa):
        imp_pos.append("a")
    elif (i[1][1] in pv):
        imp_pos.append("v")
    elif (i[1][1] in pav):
        imp_pos.append("r")
    else:
        imp_pos.append("none")
'''

imp_synset_definition = []
for i in range(len(imp_key)):
    # imp_synset_definition.append(simple_lesk(sent, imp_key[i], pos=imp_pos[i]).definition())
    imp_synset_definition.append(
        simple_lesk(sent, imp_key[i], pos=None).definition())

dic_for_context = {}
kw_extractor = yake.KeywordExtractor(lan="en", n=1, windowsSize=2, top=10)
imp_key_from_definition = []
for i in range(len(imp_key)):
    temp = []
    a = kw_extractor.extract_keywords(imp_synset_definition[i])
    for j in range(len(a)):
        imp_key_from_definition.append(a[j][1])
        temp.append(a[j][1])
    dic_for_context[imp_key[i]] = temp

'''
Weight Distribution
'''
'''
#
# Copyright (C) 2014 alvations
# URL:
# For license information, see LICENSE.md

bank_sents = ['I went to the bank to deposit my money',
              'The river bank was full of dead fishes']

plant_sents = ['The workers at the industrial plant were overworked',
               'The plant was no longer bearing flowers']

print "======== TESTING simple_lesk ===========\n"
from pywsd.lesk import simple_lesk

print "#TESTING simple_lesk() ..."
print "Context:", bank_sents[0]
answer = simple_lesk(bank_sents[0], 'bank')
print "Sense:", answer
try:
    definition = answer.definition()
except:
    definition = answer.definition  # Using older version of NLTK.
print "Definition:", definition
print

print "#TESTING simple_lesk() with POS ..."
print "Context:", bank_sents[1]
answer = simple_lesk(bank_sents[1], 'bank', 'n')
print "Sense:", answer
try:
    definition = answer.definition()
except:
    definition = answer.definition  # Using older version of NLTK.
print "Definition:", definition
print
# for s in sentence:
#     answers = disambiguate(s, adapted_lesk, keepLemmas=False)  # what I was doing before

context = remove_notalpha(context)
# lemmatize the context
doc = nlp(context)
for token in doc:
    print("")
    if ' ' not in token.text and not token.is_stop and token.pos_ != '-PRON-':
        try:
            con = ''.split(context)
            for word in con:
                if word == token.text:
                    context = token.text + " "
            answer = simple_lesk(context, token.text, pos_convert(token.pos_))
            print(answer)
            if not answer:
                continue
        except Exception:
            continue
        sense = split_syn(answer)
        print(sense[0] + " " + token.lemma_)
        if ((sense[0] != token.lemma_ or int(sense[2]) > 4) and token.pos_ != 'PROPN'):
            try:
                cosans = cosine_lesk(context, token.text, pos_convert(token.pos_))
                if (check_def(context, cosans.definition()) >
                        check_def(context, answer.definition())):
bank_sents = ['I went to the bank to deposit my money',
              'The river bank was full of dead fishes']

plant_sents = ['The workers at the industrial plant were overworked',
               'The plant was no longer bearing flowers']

print "======== TESTING simple_lesk ===========\n"
from pywsd.lesk import simple_lesk

print "#TESTING simple_lesk() ..."
print "Context:", bank_sents[0]
answer = simple_lesk(bank_sents[0], 'bank')
print "Sense:", answer
definition = answer.definition()
# except: definition = answer.definition  # Using older version of NLTK.
print "Definition:", definition
print ''

print "#TESTING simple_lesk() with POS ..."
print "Context:", bank_sents[1]
answer = simple_lesk(bank_sents[1], 'bank', 'n')
print "Sense:", answer
definition = answer.definition()
# except: definition = answer.definition  # Using older version of NLTK.
print "Definition:", definition
print

print "#TESTING simple_lesk() with POS and stems ..."
print "Context:", plant_sents[0]
answer = simple_lesk(plant_sents[0], 'plant', 'n', True)
print "Sense:", answer
definition = answer.definition()