def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.semantic.sparql import EntityFeatures
    from collections import Counter
    # fetch DBpedia features (including the abstract) for an entity
    tom = EntityFeatures().features('http://dbpedia.org/resource/Tom_Cruise')
    # extract candidate words from the abstract and deduplicate their lemmas
    words = Extraction().extract_words_sent(tom['abstract'])
    words = list(set(lemmatization(words)))
    wns = WordNetSimilarity()
    # build a word similarity graph and score the words with PageRank
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()
    # keep the ten highest-scoring words
    words, scores = zip(*Counter(word_scores).most_common(10))
    assert words is not None
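# The test above treats keyword extraction as graph ranking: the distinct
# lemmas become nodes, WordNet similarity supplies the edge weights, and
# PageRank scores the nodes. Below is a minimal standalone sketch of the same
# SimGraph usage, assuming only the APIs already exercised in the test; the
# hand-picked word list is a hypothetical toy input:
def demo_word_rank():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from collections import Counter
    words = ['actor', 'film', 'producer', 'director', 'award']  # toy input
    wns = WordNetSimilarity()
    # nodes are words; edge weights come from pairwise WordNet similarity
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()  # maps each word to a PageRank score
    return Counter(word_scores).most_common(3)  # three highest-ranked words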
def disambiguate_graph(self, sentence):
    words_origin = word_tokenize(sentence)
    # keep words that have a synset in WordNet; currently supports NOUN
    words = [w for w in words_origin if self._wn_sim.word2synset(w)]
    # map each word to its candidate synsets
    words_synsets = {w: self._wn_sim.word2synset(w) for w in words}
    # flatten the candidate synsets into a single list
    synsets = list(itertools.chain.from_iterable([words_synsets[w] for w in words]))
    # remove duplicate synsets
    synsets = list(set(synsets))
    # define the semantic similarity metric
    sim_metric = lambda x, y: self._wn_sim.similarity(x, y, self._sim_name)
    # construct the similarity graph over synsets
    sim_graph = SimGraph(synsets, sim_metric)
    # get PageRank scores of synsets
    rank_scores = sim_graph.page_rank()
    results = []
    for w in words_origin:
        if w in words:
            # pick the candidate synset with the highest PageRank score
            candidate_scores = {s: rank_scores[s] for s in words_synsets[w]}
            results.append((w, Counter(candidate_scores).most_common(1)[0][0]))
        else:
            results.append((w, None))
    return results
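# A usage sketch for disambiguate_graph. The enclosing class is not shown
# above, so the GraphWSD wrapper, its constructor, and the default 'wup'
# metric name are assumptions; word_tokenize is taken from NLTK to match the
# name used in the method, and the remaining imports are the ones the method
# body already relies on:
import itertools
from collections import Counter
from nltk.tokenize import word_tokenize
from sematch.semantic.graph import SimGraph
from sematch.semantic.similarity import WordNetSimilarity

class GraphWSD:  # hypothetical wrapper supplying _wn_sim and _sim_name
    def __init__(self, sim_name='wup'):
        self._wn_sim = WordNetSimilarity()
        self._sim_name = sim_name
    # reuse the module-level method defined above
    disambiguate_graph = disambiguate_graph

# tagged = GraphWSD().disambiguate_graph('Tom Cruise is an American actor.')
# each item is (word, best_synset), or (word, None) when the word has no
# WordNet noun synset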