Пример #1
0
    def test_compute_similar_words_from_word(self) -> None:
        """Check the neighbours returned for a word from the sample embedding.

        Querying "てすと" is expected to yield exactly one (word, score) pair:
        "サンプル" with the reference score below.
        """
        file_path = str(
            Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
        )
        word_embeds = WordEmbedding(file_path)

        word = "てすと"
        expected = [("サンプル", 0.7506074093675397)]
        actual = word_embeds.compute_similar_words_from_word(word)

        # Words (and therefore the number and order of results) must match
        # exactly.
        self.assertEqual([w for w, _ in actual], [w for w, _ in expected])
        # Scores are computed floats: compare with a tolerance rather than
        # bit-for-bit so the test does not depend on platform rounding.
        for (_, actual_score), (_, expected_score) in zip(actual, expected):
            self.assertAlmostEqual(actual_score, expected_score)
Пример #2
0
 def test_property_shape(self) -> None:
     """Check the size accessors of the sample word2vec embedding.

     The loaded embedding is expected to report 2 entries, a dimension of
     10, and a shape of (2, 10).
     """
     samples_dir = Path(__file__).resolve().parent.joinpath("samples")
     embedding = WordEmbedding(str(samples_dir / "sample.word2vec.txt"))

     # Entry count, vector dimension, and the combined shape.
     self.assertEqual(len(embedding), 2)
     self.assertEqual(embedding.dim, 10)
     self.assertEqual(embedding.shape, (2, 10))
Пример #3
0
    def test_error_unknown_words(self) -> None:
        """Looking up a word list that contains an unknown word raises ValueError."""
        file_path = str(
            Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
        )
        word_embeds = WordEmbedding(file_path)

        # "unk" is the entry expected to trigger the error.
        words = ["てすと", "unk"]
        # NOTE(review): `msg` is only the text unittest reports when no
        # ValueError is raised; it does not check the exception's own message.
        # If the message text should be verified, switch to assertRaisesRegex
        # once the exact wording raised by WordEmbedding is confirmed.
        with self.assertRaises(ValueError, msg="unknown word: 'unk'"):
            word_embeds(words)
Пример #4
0
    def test_extract_word_embeddings(self) -> None:
        """Calling the embedding with words returns the expected vectors.

        Covers both call forms used here: a list of words (compared against a
        2-row array) and a single word (compared against a single vector).
        """
        sample_path = Path(__file__).resolve().parent.joinpath(
            "samples", "sample.word2vec.txt"
        )
        embedding = WordEmbedding(str(sample_path))

        # Expected vectors for the two words queried below.
        tesuto_vector = [
            0.31882,
            0.89289,
            0.90071,
            0.45753,
            0.37083,
            0.64955,
            0.34075,
            0.70048,
            0.89085,
            0.13621,
        ]
        sample_vector = [
            0.79375,
            0.44464,
            0.07644,
            0.35242,
            0.03996,
            0.68827,
            0.97103,
            0.77324,
            0.72781,
            0.69158,
        ]

        # List input: one row per requested word, in the requested order.
        batch_result = embedding(["てすと", "サンプル"])
        np.testing.assert_almost_equal(
            batch_result, np.array([tesuto_vector, sample_vector])
        )

        # Single-word input: compared against the first row on its own.
        single_result = embedding("てすと")
        np.testing.assert_almost_equal(single_result, np.array(tesuto_vector))
Пример #5
0
    def test_compute_cosine_similarity(self) -> None:
        """compute_cosine_similarity returns the expected scores for a query vector.

        The result is compared, with tolerance, against two reference values.
        """
        sample_path = Path(__file__).resolve().parent.joinpath(
            "samples", "sample.word2vec.txt"
        )
        embedding = WordEmbedding(str(sample_path))

        # Query vector passed to the method under test.
        query = np.array(
            [
                0.29902,
                0.90019,
                0.89964,
                0.50753,
                0.38001,
                0.59495,
                0.29175,
                0.69909,
                0.90185,
                0.09687,
            ]
        )
        scores = embedding.compute_cosine_similarity(query)

        reference = np.array([0.9987114080207757, 0.7286216119815097])
        np.testing.assert_almost_equal(scores, reference)
Пример #6
0
    def test_compute_similar_words_from_vec(self) -> None:
        """Check the (word, score) pairs returned for a query vector.

        The result is expected to list "てすと" first and "サンプル" second,
        each paired with the reference score below.
        """
        file_path = str(
            Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
        )
        word_embeds = WordEmbedding(file_path)

        embed = np.array(
            [
                0.29902,
                0.90019,
                0.89964,
                0.50753,
                0.38001,
                0.59495,
                0.29175,
                0.69909,
                0.90185,
                0.09687,
            ]
        )
        expected = [("てすと", 0.9987114080207757), ("サンプル", 0.7286216119815097)]
        actual = word_embeds.compute_similar_words_from_vec(embed)

        # Words (and therefore the number and order of results) must match
        # exactly.
        self.assertEqual([w for w, _ in actual], [w for w, _ in expected])
        # Scores are computed floats: compare with a tolerance rather than
        # bit-for-bit so the test does not depend on platform rounding.
        for (_, actual_score), (_, expected_score) in zip(actual, expected):
            self.assertAlmostEqual(actual_score, expected_score)
Пример #7
0
    def test_property_vocab(self) -> None:
        """Check the vocabulary accessors of the sample word2vec embedding.

        Exercises ``vocab``, ``to_word``, ``to_index`` and ``is_known``.
        """
        sample_path = Path(__file__).resolve().parent.joinpath(
            "samples", "sample.word2vec.txt"
        )
        embedding = WordEmbedding(str(sample_path))

        # The vocabulary must contain exactly these two words.
        self.assertSetEqual(embedding.vocab, {"てすと", "サンプル"})

        # Index <-> word lookups.
        self.assertEqual(embedding.to_word(1), "サンプル")
        self.assertEqual(embedding.to_index("てすと"), 0)

        # Membership checks for a known and an unknown word.
        self.assertTrue(embedding.is_known("てすと"))
        self.assertFalse(embedding.is_known("test"))
Пример #8
0
 def test_load_glove_format_file(self) -> None:
     """Constructing WordEmbedding from the GloVe sample file yields an instance."""
     samples_dir = Path(__file__).resolve().parent.joinpath("samples")
     embedding = WordEmbedding(str(samples_dir / "sample.glove.txt"))

     # Construction completing and returning the right type is the whole check.
     self.assertIsInstance(embedding, WordEmbedding)