Example #1
def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using Li method
    assert wns.word_similarity('dog', 'cat',
                               'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa',
                                     'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn',
                                     'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng',
                                      'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn',
                                      'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng',
                                      'wpath') is not None  # 0.593666388463
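For quick experimentation outside a test runner, the same API can be exercised directly. A minimal sketch assuming sematch and the NLTK WordNet corpora are installed; the expected values are the ones recorded in the test comments above.

from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()
# monolingual similarity: word pair, (optional) language code, method name
print(wns.word_similarity('dog', 'cat', 'li'))                           # ~0.449
print(wns.monol_word_similarity('perro', 'gato', 'spa', 'lin'))          # ~0.877
# cross-lingual similarity: word pair, two language codes, method name
print(wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res'))   # ~7.912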
Example #2
import itertools
import random
from collections import Counter

from nltk.tokenize import word_tokenize

from sematch.semantic.similarity import WordNetSimilarity
# NOTE: the SimGraph and lemmatization import paths are assumed from the
# sematch package layout; adjust them if your installed version differs.
from sematch.semantic.graph import SimGraph
from sematch.nlp import lemmatization


class WSD:

    def __init__(self, wsd_method='maxsim', sim_name='wpath'):
        '''
        wsd_method options: ['random_sense', 'first', 'frequent', 'maxsim', 'graph', 'lesk', 'naive']
        sim_name options: ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
        '''
        self._method = wsd_method
        self._sim_name = sim_name
        self._wn_sim = WordNetSimilarity()

    def disambiguate_graph(self, sentence):
        words_origin = word_tokenize(sentence)
        # extract the words that have a synset in WordNet; currently only NOUN is supported
        words = [w for w in words_origin if self._wn_sim.word2synset(w)]
        # map words to synsets
        words_synsets = {w:self._wn_sim.word2synset(w) for w in words}
        # collect the candidate synsets of all words into a single list
        synsets = list(itertools.chain.from_iterable([words_synsets[w] for w in words]))
        # remove duplicate synsets
        synsets = list(set(synsets))
        # define semantic similarity metric
        sim_metric = lambda x, y: self._wn_sim.similarity(x, y, self._sim_name)
        # construct the similarity graph over the candidate synsets
        sim_graph = SimGraph(synsets, sim_metric)
        # rank the synsets by PageRank score; better-connected senses rank higher
        rank_scores = sim_graph.page_rank()
        results = []
        for w in words_origin:
            if w in words:
                candidate_scores = {s:rank_scores[s] for s in words_synsets[w]}
                results.append((w, Counter(candidate_scores).most_common(1)[0][0]))
            else:
                results.append((w, None))
        return results

    def classify(self, featureset):
        context = featureset['context']
        senses = featureset['senses']
        return self.max_senses(context, senses)

    def context2words(self, sent):
        words = word_tokenize(sent.lower())
        words = [w for w in words if len(w) > 2]
        return lemmatization(words)

    def random_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return random.choice(senses)

    def first_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return senses[0]

    def word_sense_similarity(self, word, sense):
        word_senses = self._wn_sim.word2synset(word)
        scorer = lambda x: self._wn_sim.similarity(x, sense, self._sim_name)
        # list(...) is needed on Python 3, where map() returns an iterator
        sim_scores = list(map(scorer, word_senses)) + [0.0]
        return max(sim_scores)

    def max_senses(self, context, senses):
        if len(senses) == 1:
            return senses[0]
        context_words = self.context2words(context)
        result = {}
        for ss in senses:
            scorer = lambda x: self.word_sense_similarity(x, ss)
            sim_score = sum(map(scorer, context_words))
            result[ss] = sim_score
        return Counter(result).most_common(1)[0][0]

    def max_sim(self, context, word):
        senses = self._wn_sim.word2synset(word)
        return self.max_senses(context, senses)

    def lesk(self, context, word):
        from nltk.wsd import lesk as nltk_lesk
        context_words = self.context2words(context)
        return nltk_lesk(context_words, word, 'n')
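A minimal usage sketch for the WSD class above, assuming the imports at the top of this example resolve and the NLTK WordNet data is available; the sentence and target word are illustrative only.

wsd = WSD(wsd_method='maxsim', sim_name='wpath')

# pick the sense of 'bank' that best fits the surrounding context words
sense = wsd.max_sim('I went fishing along the river bank', 'bank')
print(sense)  # a WordNet Synset, e.g. Synset('bank.n.01')

# graph-based disambiguation of every known noun in a sentence
print(wsd.disambiguate_graph('the dog chased the cat'))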