class TestTokenEmbeddingSimilarity(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file
        self.config["keep_extended_vocab_only"] = True

    def get_dot(self, vec_a, vec_b):
        # Reference dot product computed directly with NumPy.
        return np.sum(vec_a * vec_b)

    def get_cosine(self, vec_a, vec_b):
        # Reference cosine similarity built from the dot product above.
        return self.get_dot(vec_a, vec_b) / np.sqrt(
            self.get_dot(vec_a, vec_a) * self.get_dot(vec_b, vec_b))

    def get_random_word_vec(self, vocab_list):
        # Sample two (possibly identical) words and fetch their embeddings.
        vocab_size = len(vocab_list)
        ids = np.random.randint(vocab_size, size=2)
        word_a, word_b = vocab_list[ids[0]], vocab_list[ids[1]]
        vec_a, vec_b = self.embedding.search([word_a, word_b])
        return word_a, word_b, vec_a, vec_b

    def test_cosine_sim(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.cosine_sim(word_a, word_b)
        expected_result = self.get_cosine(vec_a, vec_b)
        self.check_output_equal(result, expected_result)

    def test_dot(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.dot(word_a, word_b)
        expected_result = self.get_dot(vec_a, vec_b)
        self.check_output_equal(result, expected_result)
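
A minimal sketch of invoking this suite directly with the standard library runner (assuming the surrounding module defines no custom test entry point):

import unittest

if __name__ == "__main__":
    unittest.main()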
# Example #2
import jiagu

text = '厦门明天会不会下雨'  # sample input; any Chinese sentence works here

words = jiagu.seg(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

from paddlenlp.datasets import ChnSentiCorp

train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(['train', 'dev', 'test'])
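
As a quick sanity check on the loaded splits, a hedged sketch: it assumes these map-style datasets support len() and indexing, and the exact record layout may vary with the paddlenlp version.

print(len(train_ds), len(dev_ds), len(test_ds))  # split sizes
print(train_ds[0])  # one raw record, e.g. review text plus sentiment label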

from paddlenlp.embeddings import TokenEmbedding

wordemb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(wordemb.cosine_sim("苹果", "香蕉"))

wordemb.cosine_sim("艺术", "火车")

wordemb.cosine_sim("狗", "香蕉")

for token1 in ['狗', '猫', '香蕉']:
    for token2 in ['狗', '猫', '香蕉']:
        print(token1, token2, wordemb.cosine_sim(token1, token2))

vv = wordemb.search(['狗', '猫', '香蕉'])  # vectors for three separate tokens

vv2 = wordemb.search('狗猫香蕉')  # a single string is looked up as one token (here likely OOV)
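
As a quick check on what search returns, a sketch assuming the dim300 model loaded above, so each row is a 300-dimensional vector:

print(vv.shape)   # expected (3, 300): one row per token in the list
print(vv2.shape)  # expected (1, 300): the whole string treated as one token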