def getRightSyns2(word, tokenized, pos1, sentence, fdist):
    """Return thesaurus synonyms of ``word`` that ``ourLesk`` resolves like ``word``.

    The first two characters of ``pos1`` select a Lin thesaurus file
    (``VB`` -> simV.lsp, ``JJ``/``RB`` -> simA.lsp, ``NN`` -> simN.lsp).
    The nine highest-scored synonyms from that file are kept only when
    ``ourLesk(sentence, synonym, None)`` returns a non-None value equal to
    ``ourLesk(sentence, word, None)``.

    ``tokenized`` and ``fdist`` are accepted but never read.

    Returns:
        A non-empty list of synonym strings, or ``None`` when the tag prefix
        is not supported, ``ourLesk`` returns ``None`` for ``word``, or no
        synonym passes the filter.
    """
    section_by_tag = {
        'VB': "simV.lsp",
        'JJ': "simA.lsp",
        'RB': "simA.lsp",
        'NN': "simN.lsp",
    }
    section = section_by_tag.get(pos1[0:2])
    if section is None:
        return None

    word_sense = ourLesk(sentence, word, None)
    if word_sense is None:
        return None

    ranked = sorted(
        thes.scored_synonyms(word, fileid=section),
        key=lambda pair: pair[1],
        reverse=True,
    )

    matches = []
    for candidate in ranked[0:9]:
        candidate_sense = ourLesk(sentence, candidate[0], None)
        if candidate_sense is None:
            continue
        if word_sense == candidate_sense:
            matches.append(candidate[0])

    # An empty result is reported as None, not as an empty list.
    return matches if matches else None
def GetSynonyms(self, word):
    """Return the top-scoring thesaurus synonyms for ``word``.

    Results are cached in ``self.__found_synonyms``, keyed first by the
    value of ``self._get_configuration_key()`` and then by ``word``.

    On a cache miss the word (lemmatized first when ``self.__use_lemma`` is
    set) is looked up in the thesaurus file ``self.__fileId``.  Synonyms are
    kept when their score is within ``self.__similarity`` of the best score,
    lower-cased, and truncated to ``self.__max_words`` entries when that
    limit is positive.  ``word`` itself is appended if its lower-cased form
    is not already present.

    Returns:
        A list of strings.  It is ``[word]`` when the thesaurus has no entry
        or when any exception is raised during the lookup (best effort).
    """
    try:
        # Check whether the result is already cached for this configuration.
        current_key = self._get_configuration_key()
        cache = self.__found_synonyms.setdefault(current_key, {})
        if word in cache:
            return cache[word]

        # Not cached, so query the thesaurus.
        wordLemma = word
        if self.__use_lemma:
            wordLemma = self.__lemmatizer.lemmatize(word)

        results = thes.scored_synonyms(wordLemma, fileid=self.__fileId)
        if len(results) == 0:
            # No synonym found; this outcome is deliberately not cached.
            return [word]

        sorted_results = sorted(results, key=lambda cell: cell[1], reverse=True)
        best_score = sorted_results[0][1]
        final_results = [
            w[0].lower()
            for w in sorted_results
            if (best_score - w[1]) <= self.__similarity
        ]
        if self.__max_words > 0 and len(final_results) > self.__max_words:
            final_results = final_results[:self.__max_words]
        if word.lower() not in final_results:
            final_results.append(word)

        cache[word] = final_results
        return final_results
    except Exception:
        # Best effort: any failure degrades to "the word is its own synonym".
        # KeyboardInterrupt / SystemExit are intentionally not swallowed.
        return [word]
def prevalent_sense(word, pos=wn.NOUN):
    """Return the best-scoring WordNet sense of ``word`` as ``(name, score)``.

    The ``k`` highest-scored Lin thesaurus neighbours of ``word`` each vote
    for the senses of ``word``: a neighbour's ``wnss`` similarity to every
    sense is normalised by that neighbour's total over all senses, weighted
    by the neighbour's thesaurus score, and added to the sense's running
    total.

    Args:
        word: the word whose senses are ranked.
        pos: ``'a'`` selects simA.lsp, ``'v'`` selects simV.lsp, anything
            else selects simN.lsp; it is also passed to ``wn.synsets`` and
            ``wnss``.

    Returns:
        The ``(sense name, score)`` pair with the highest score.

    Raises:
        IndexError: if ``wn.synsets(word, pos)`` is empty.

    Progress information is printed to stdout throughout.
    """
    # determine the prevalence
    if pos == 'a':
        lin_type = 'simA.lsp'
    elif pos == 'v':
        lin_type = 'simV.lsp'
    else:
        lin_type = 'simN.lsp'

    distthes = lin.scored_synonyms(word, fileid=lin_type)
    # NOTE(review): `k` is not defined in this block; presumably a
    # module-level constant giving the number of neighbours to use.
    sortedthes = sorted(distthes, key=operator.itemgetter(1), reverse=True)[0:k]

    # The senses of `word` do not change per neighbour, so look them up once.
    senses = wn.synsets(word, pos)

    # Scores for each sense, contributed to by each neighbour.
    # NOTE(review): `wnsynset.name` is read as an attribute; newer NLTK
    # releases expose `name()` as a method -- confirm against the installed
    # version.
    scores = {}
    for wnsynset in senses:
        print("wnsynset.name")  # e.g. bow.n.01
        print(wnsynset.name)
        scores[wnsynset.name] = 0

    print("sortedthes:(lin scored_synonyms)")
    print(sortedthes)

    for (neigh, dss) in sortedthes:
        print("neigh:%r" % neigh)
        if len(wn.synsets(neigh)) > 0:
            # Neighbour is in WordNet (for some part of speech).
            total = 0  # sum of wnss scores for this neighbour over all senses
            neighscores = {}  # wnss score per sense for this neighbour
            print("pos%r" % pos)
            print("word%r" % word)
            for wnsynset in senses:
                wnss_score = wnss(neigh, wnsynset, pos=pos)
                total += wnss_score
                neighscores[wnsynset.name] = wnss_score
            print("neighscores,sum,dss")
            print("%s %s %s" % (neighscores, total, dss))

            if total == 0:
                # The membership test above ignores `pos`, so a neighbour can
                # be in WordNet yet have zero similarity to every sense.
                # Normalising would divide by zero, and the neighbour carries
                # no evidence anyway, so skip it.
                print("Warning: ignoring distributional neighbour " + neigh
                      + " : zero similarity to every sense")
                continue

            # Second pass: the total is only known after the first loop and
            # differs per neighbour, so it cannot be treated as a constant.
            for wnsynset in senses:
                # Weight the score for each sense by the neighbour's dss and
                # inversely by the neighbour's total wnss, then add it to the
                # total for this sense.
                scores[wnsynset.name] += dss * neighscores[wnsynset.name] / total
        else:
            # Likely when distributional neighbours are proper nouns (see the
            # 'hull' example).  Ideally the top k neighbours would be chosen
            # after excluding words that are not in WordNet.
            print("Warning: ignoring distributional neighbour " + neigh
                  + " : not in WordNet as noun")

    print("scores.tiems")
    print(scores.items())
    sortedscores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    print("sortedscores")
    print(sortedscores)
    return sortedscores[0]
def get_similar_words(self, term):
    """Return up to two highest-scored simN.lsp synonyms of ``term``.

    Answers are memoised in ``self.terms_dict``; a term that is already a
    key is returned from there without consulting the thesaurus.
    """
    cache = self.terms_dict
    if term in cache:
        return cache[term]

    ranked = sorted(
        lin_thesaurus.scored_synonyms(term, fileid="simN.lsp"),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_two = [pair[0] for pair in ranked[:2]]
    cache[term] = top_two
    return top_two
def do_thesaurus(query, min_score=0.21, max_per_word=4):
    """Collect thesaurus expansion terms for the words of ``query``.

    Every query word is lower-cased and looked up with
    ``thes.scored_synonyms(word)[1][1]``, i.e. the second per-file result
    (NOTE(review): presumably the noun file -- confirm the reader's file
    order).  For each word, at most ``max_per_word`` entries are accepted,
    in the order the thesaurus yields them, provided their score exceeds
    ``min_score`` and the entry is not itself a query word.  Multi-word
    entries are split on whitespace and only the parts that are not query
    words are kept.

    Args:
        query: iterable of word strings.
        min_score: exclusive lower bound on the similarity score.
        max_per_word: maximum number of thesaurus entries accepted per word.

    Returns:
        A list of lower-cased expansion terms (order unspecified, since the
        terms are gathered in a set).
    """
    lowered = [word.lower() for word in query]
    query_words = set(lowered)  # O(1) membership tests inside the loops
    toAdd = set()

    for word in lowered:
        accepted = 0
        scored = dict(thes.scored_synonyms(word)[1][1])
        for candidate, score in scored.items():
            if accepted >= max_per_word:
                # Quota for this word is used up; nothing further can match.
                break
            if score <= min_score or candidate in query_words:
                continue
            # The quota counts accepted thesaurus entries, even when every
            # part of a multi-word entry turns out to be a query word.
            accepted += 1
            if ' ' in candidate:
                toAdd.update(
                    part for part in candidate.split() if part not in query_words
                )
            else:
                toAdd.add(candidate)

    return [term.lower() for term in toAdd]
def expand_query(self, parsed_query) -> list:
    """
    Expand a query with synonyms taken from the thesaurus package.

    For every term, the entries of ``thes.scored_synonyms(term)[1][1]`` are
    ordered by descending score and the first 17 names are collected.

    :param parsed_query: list of query terms
    :return: the collected synonyms followed by the original query terms
    """
    expansion = []
    for term in parsed_query:
        ranked = sorted(
            thes.scored_synonyms(term)[1][1],
            key=lambda pair: pair[1],
            reverse=True,
        )
        names = [name for name, _ in ranked]
        expansion.extend(names[:17])
    return expansion + parsed_query
def getRightSyns3(word, tokenized, pos1, sentence, fdist):
    """Return the four highest-scored Lin thesaurus synonyms of ``word``.

    The first two characters of ``pos1`` select the thesaurus file
    (``VB`` -> simV.lsp, ``JJ``/``RB`` -> simA.lsp, ``NN`` -> simN.lsp).
    ``tokenized``, ``sentence`` and ``fdist`` are accepted but never read.

    Returns:
        A list of up to four synonym strings, or ``None`` when the tag
        prefix is not supported or the thesaurus yields nothing.
    """
    section_by_tag = {
        'VB': "simV.lsp",
        'JJ': "simA.lsp",
        'RB': "simA.lsp",
        'NN': "simN.lsp",
    }
    section = section_by_tag.get(pos1[0:2])
    if section is None:
        return None

    ranked = sorted(
        thes.scored_synonyms(word, fileid=section),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best = ranked[0:4]
    if not best:
        return None
    return [pair[0] for pair in best]
def demo():
    """Print a short tour of the Lin thesaurus reader.

    Shows plain and scored synonyms of "business", both across all files
    and restricted to the simN.lsp subsection, followed by the similarity
    between "business" and "enterprise".
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # Scored counterpart of the noun-subsection lookup; this stanza used to
    # repeat the unscored lookup verbatim.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
def demo():
    """Print a short tour of the Lin thesaurus reader.

    Shows plain and scored synonyms of "business", both across all files
    and restricted to the simN.lsp subsection, followed by the similarity
    between "business" and "enterprise".
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # Scored counterpart of the noun-subsection lookup; this stanza used to
    # repeat the unscored lookup verbatim.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
def get_lin_terms(self, term, n, pos):
    """Return at most ``n`` scored Lin thesaurus synonyms of ``term``.

    ``thes.scored_synonyms(term)`` yields one ``(label, scored synonyms)``
    entry per thesaurus section; only sections whose label equals ``pos``
    contribute.  When more than ``n`` synonyms are gathered, the ``n`` with
    the highest score are returned (best first); otherwise all of them are
    returned in the order they were gathered.
    """
    gathered = []
    for section in thes.scored_synonyms(term):
        if section[0] != pos:
            continue
        gathered.extend(section[1])

    if len(gathered) <= n:
        # Few enough synonyms: no ranking step is applied.
        return gathered
    return heapq.nlargest(n, gathered, key=lambda pair: pair[1])
def demo():
    """Store scored Lin thesaurus synonyms for frequent keywords in MySQL.

    Reads every ``key_word`` from ``email_trie`` whose ``occur_time`` exceeds
    30, looks each one up with ``thes.scored_synonyms`` and inserts one row
    per (keyword, thesaurus section, synonym, score) into ``synonyms``.
    A failed insert is reported on stdout and skipped so that the remaining
    rows are still processed.
    """
    sql_handler = mysql_handler('scam', 'scam', 'test')
    sql_handler.connect()
    result = sql_handler.do_query(
        'select key_word from email_trie where occur_time > 30', True)

    def quote(text):
        # Escape backslashes and double quotes so a value cannot terminate
        # the SQL string literal it is embedded in.
        # NOTE(review): prefer bound parameters if mysql_handler.do_query
        # supports them -- its signature is not visible from here.
        return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'

    for line in result:
        kw = line[0]
        print("searching for synonyms for key_word:  %s" % kw)
        for field in thes.scored_synonyms(kw):
            field_name = field[0]
            for syn_score in field[1]:
                syn = syn_score[0]
                score = syn_score[1]
                param = '(' + ','.join(
                    [quote(kw), quote(field_name), quote(syn), str(score)]) + ')'
                statement = ("insert into synonyms (key_word, field_name, syn, score) values "
                             + param)
                print("executing: " + statement)
                try:
                    sql_handler.do_query(statement)
                except Exception as err:
                    # Best effort: keep going, but do not hide the failure.
                    print("insert failed, skipping: %s" % err)
                    continue
def lin_synonyms(word, pos):
    """Return the Lin thesaurus synonyms of ``word``, best score first.

    Args:
        word: the word to look up.
        pos: part-of-speech letter; upper-cased and inserted into the file
            name ``sim<POS>.lsp``.  ``None`` falls back to ``'N'``
            (simN.lsp) instead of raising ``AttributeError``.

    Returns:
        A list of synonym strings ordered by descending score.
    """
    suffix = 'N' if pos is None else pos.upper()
    fileid = 'sim%s.lsp' % suffix
    thes_entry = lin.scored_synonyms(word, fileid=fileid)
    thes_entry = sorted(thes_entry, key=(lambda x: x[1]), reverse=True)
    # return words ordered by score
    return [syn for syn, score in thes_entry]
import nltk
from nltk.corpus import wordnet
from nltk.corpus import lin_thesaurus as cs

# For each thesaurus section, print the section label followed by the
# highest-scored synonym of 'pillage' and its score.  The search starts from
# ("", 0.0), so only strictly positive scores can win.
for section in cs.scored_synonyms('pillage'):
    print(section[0])
    best_word, best_score = "", 0.0
    for candidate in section[1]:
        if candidate[1] > best_score:
            best_word, best_score = candidate[0], candidate[1]
    print("%s %s" % (best_word, best_score))

# Finally dump the complete scored result for reference.
print(cs.scored_synonyms('pillage'))
__author__ = 'arkanath'

from nltk.corpus import lin_thesaurus as thes

# Dump everything the Lin thesaurus returns as scored synonyms of 'scorn'.
scorn_synonyms = thes.scored_synonyms('scorn')
print(scorn_synonyms)
def lin_synonyms(word, pos):
    """Return the Lin thesaurus synonyms of ``word``, best score first.

    ``pos`` is upper-cased and inserted into the file name ``sim<POS>.lsp``;
    ``None`` selects ``simN.lsp``.
    """
    if pos is None:
        suffix = 'N'
    else:
        suffix = pos.upper()
    fileid = 'sim%s.lsp' % suffix

    ranked = sorted(
        lin_thesaurus.scored_synonyms(word, fileid=fileid),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [name for name, _ in ranked]