예제 #1
0
    def test_wordnet(self):
        """Exercise the ``wordnet`` helpers: languages, lookups, morphy, similarity."""
        # Language listing: must be a list and contain the "tha" code.
        self.assertIsInstance(wordnet.langs(), list)
        self.assertIn("tha", wordnet.langs())

        # Synset lookups.
        spy_lemma_names = wordnet.synset("spy.n.01").lemma_names("tha")
        self.assertEqual(spy_lemma_names, ["สปาย", "สายลับ"])
        bird_synsets = wordnet.synsets("นก")
        self.assertIsNotNone(bird_synsets)
        adjective_synsets = wordnet.all_synsets(pos=wn.ADJ)
        self.assertIsNotNone(adjective_synsets)

        # Lemma lookups.
        bird_lemmas = wordnet.lemmas("นก")
        self.assertIsNotNone(bird_lemmas)
        adverb_lemma_names = wordnet.all_lemma_names(pos=wn.ADV)
        self.assertIsNotNone(adverb_lemma_names)
        cat_lemma = wordnet.lemma("cat.n.01.cat")
        self.assertIsNotNone(cat_lemma)

        # Morphological reduction to the base form.
        base_form = wordnet.morphy("dogs")
        self.assertEqual(base_form, "dog")

        # Each module-level similarity helper must agree with the method of
        # the same name on the synset object.
        bird_synset = wordnet.synset("bird.n.01")
        mouse_synset = wordnet.synset("mouse.n.01")
        for metric in ("path_similarity", "wup_similarity", "lch_similarity"):
            via_module = getattr(wordnet, metric)(bird_synset, mouse_synset)
            via_synset = getattr(bird_synset, metric)(mouse_synset)
            self.assertEqual(via_module, via_synset)

        # A lemma key taken from a lookup must resolve back to a lemma.
        first_cat_synset = wordnet.synsets("แมว")[0]
        first_cat_lemma = first_cat_synset.lemmas()[0]
        cat_key = first_cat_lemma.key()
        self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
예제 #2
0
    def test_wordnet(self):
        """Exercise the ``wordnet`` helpers: languages, lookups, morphy, similarity."""
        # Language listing only has to be available.
        available_langs = wordnet.langs()
        self.assertIsNotNone(available_langs)

        # Synset lookups.
        spy_lemma_names = wordnet.synset("spy.n.01").lemma_names("tha")
        self.assertEqual(spy_lemma_names, ["สปาย", "สายลับ"])
        bird_synsets = wordnet.synsets("นก")
        self.assertIsNotNone(bird_synsets)
        adjective_synsets = wordnet.all_synsets(pos=wn.ADJ)
        self.assertIsNotNone(adjective_synsets)

        # Lemma lookups.
        bird_lemmas = wordnet.lemmas("นก")
        self.assertIsNotNone(bird_lemmas)
        adverb_lemma_names = wordnet.all_lemma_names(pos=wn.ADV)
        self.assertIsNotNone(adverb_lemma_names)
        cat_lemma = wordnet.lemma("cat.n.01.cat")
        self.assertIsNotNone(cat_lemma)

        # Morphological reduction to the base form.
        base_form = wordnet.morphy("dogs")
        self.assertEqual(base_form, "dog")

        # Each module-level similarity helper must agree with the method of
        # the same name on the synset object.
        bird_synset = wordnet.synset("bird.n.01")
        mouse_synset = wordnet.synset("mouse.n.01")
        for metric in ("path_similarity", "wup_similarity"):
            via_module = getattr(wordnet, metric)(bird_synset, mouse_synset)
            via_synset = getattr(bird_synset, metric)(mouse_synset)
            self.assertEqual(via_module, via_synset)

        # A lemma key taken from a lookup must resolve back to a lemma.
        first_cat_synset = wordnet.synsets("แมว")[0]
        first_cat_lemma = first_cat_synset.lemmas()[0]
        cat_key = first_cat_lemma.key()
        self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
예제 #3
0
def compute_wordnet_path_scores(pairs):
    """
        Compute a WordNet path-similarity score for every word pair in ``pairs``.

        Each pair is read as ``pair[0]`` / ``pair[1]`` and both words are looked
        up with ``wordnet.synsets``. The module-level setting
        ``WORDNET_PATH_SIMILARITY_TYPE`` selects how the score is derived:

        * ``'first_synset'`` -- similarity of the first synset of each word
        * ``'most_similar'`` -- highest similarity over all synset combinations

        Note: Thai WordNet has 3 methods to compute a similarity value:
        wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity.
        lch_similarity cannot be used here; path_similarity seems to give
        better results than wup_similarity.

        If no path is found between the two words, ``None`` is added to the
        result list. A pair in which either word has no synsets also
        contributes ``None`` and is counted as an OOV pair.

        @returns: the list of similarity scores, and the number of OOV word pairs
        @raises RuntimeError: when a pair has synsets for both words but
            ``WORDNET_PATH_SIMILARITY_TYPE`` is neither supported value
    """

    oov_pair_count = 0
    scores = []

    for pair in pairs:
        first_synsets = wordnet.synsets(pair[0])
        second_synsets = wordnet.synsets(pair[1])

        # Guard: a word without any synset makes the whole pair OOV.
        if len(first_synsets) == 0 or len(second_synsets) == 0:
            scores.append(None)
            oov_pair_count += 1
            continue

        if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
            # Only the first synset of each word is compared.
            score = wordnet.path_similarity(first_synsets[0], second_synsets[0])
        elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
            # Scan every synset combination and keep the largest truthy score;
            # -1 marks "nothing found yet".
            best = -1
            candidates = (
                wordnet.path_similarity(left, right)
                for left in first_synsets
                for right in second_synsets
            )
            for candidate in candidates:
                if candidate and candidate > best:
                    best = candidate
            # No usable path at all: report None instead of the marker.
            score = None if best == -1 else best
        else:
            raise RuntimeError(
                'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')

        scores.append(score)

    return scores, oov_pair_count
def compute_wordnet_path_scores(pairs):
    """
        Compute WordNet path similarity for a list of input word pairs.

        Each pair is read as ``pair[0]`` / ``pair[1]`` and both words are looked
        up with ``wordnet.synsets``. The module-level setting
        ``WORDNET_PATH_SIMILARITY_TYPE`` selects how the score is derived:

        * ``'first_synset'`` -- similarity of the first synset of each word
        * ``'most_similar'`` -- highest similarity over all synset combinations

        Note: Thai WordNet has 3 methods to compute a similarity value:
        wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity.
        lch_similarity cannot be used because it requires the same
        part-of-speech for both words; path_similarity seems to give better
        results than wup_similarity.

        If no path is found between the two words, ``None`` is added to the
        result list. A pair in which either word has no synsets also
        contributes ``None`` and is counted as an OOV pair.

        @returns: the list of similarity scores, and the number of OOV word pairs
        @raises RuntimeError: when a pair has synsets for both words but
            ``WORDNET_PATH_SIMILARITY_TYPE`` is neither supported value
    """
    # NOTE(review): leftover debug output, kept so the observable behavior
    # stays unchanged -- consider replacing it with the logging module.
    print("DEBUG: starting compute_wordnet_path_scores")
    from pythainlp.corpus import wordnet

    # Counts pairs for which at least one word has no synsets at all.
    structed_oov_pairs = 0
    wn_scores = []

    for pair in pairs:

        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if w1 and w2:
            if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
                # Just use the first synset of each term.
                path = wordnet.path_similarity(w1[0], w2[0])
            elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
                # Keep the highest similarity over all synset combinations;
                # stays None when no combination yields a usable score.
                path = None
                for syn1 in w1:
                    for syn2 in w2:
                        tmppath = wordnet.path_similarity(syn1, syn2)
                        if tmppath and (path is None or tmppath > path):
                            path = tmppath
            else:
                # Without this branch `path` would be unbound on the first
                # pair, or silently reuse the previous pair's score.
                raise RuntimeError(
                    'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')

            wn_scores.append(path)
        else:
            wn_scores.append(None)
            structed_oov_pairs += 1

    return wn_scores, structed_oov_pairs