Exemplo n.º 1
0
    def test_cache_xlu_embeds(self):
        """Caching multi-dialect embeddings and reloading the cache must
        round-trip the vocab, the string-to-index map, and every vector."""
        reference = PretrainedEmbedding()

        # Accumulate several dialects into one embedding object.
        for dialect in ("en_US", "en_UK", "es_XX"):
            reference.load_pretrained_embeddings(
                EMBED_RAW_PATH, append=True, dialect=dialect
            )

        # Serialize to a temp file, then hydrate a fresh instance from it.
        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".{}".format("cached")) as cached_path:
            reference.cache_pretrained_embeddings(cached_path.name)
            reloaded = PretrainedEmbedding()
            reloaded.load_cached_embeddings(cached_path.name)

        np.testing.assert_array_equal(
            sorted(reloaded.stoi.keys()), sorted(reference.stoi.keys())
        )
        np.testing.assert_array_equal(
            reloaded.embed_vocab, reference.embed_vocab
        )
        np.testing.assert_array_equal(
            sorted(reloaded.stoi.values()),
            sorted(reference.stoi.values()),
        )
        # Vectors compared almost-equal to allow float serialization noise.
        for word_idx in reference.stoi.values():
            np.testing.assert_array_almost_equal(
                reloaded.embedding_vectors[word_idx],
                reference.embedding_vectors[word_idx],
            )
Exemplo n.º 2
0
 def test_assing_pretrained_weights(self):
     """Weights built from cached embeddings for a tiny vocab must have
     shape (len(VOCAB), EMBED_DIM) with known words' rows copied from
     the cache.

     NOTE(review): 'assing' in the method name is a typo for 'assign';
     kept unchanged so test IDs stay stable.
     """
     cached = PretrainedEmbedding()
     cached.load_cached_embeddings(EMBED_CACHED_PATH)

     VOCAB = ["UNK", "aloha", "the"]
     EMBED_DIM = 5
     UNK_IDX = 0

     # Map each cached word to its slot in VOCAB; anything outside VOCAB
     # collapses onto the UNK slot.
     embed_vocab_to_idx = {
         word: VOCAB.index(word) if word in VOCAB else UNK_IDX
         for word in cached.embed_vocab
     }

     weights = cached.initialize_embeddings_weights(
         embed_vocab_to_idx, UNK_IDX, len(VOCAB), EMBED_DIM,
         EmbedInitStrategy.RANDOM)

     rows, cols = weights.shape
     assert rows == len(VOCAB)
     assert cols == EMBED_DIM
     # Spot-check rows that should be verbatim cache vectors.
     np.testing.assert_array_almost_equal(
         weights[1].numpy(),
         [-0.43124, 0.014934, -0.50635, 0.60506, 0.56051],
     )  # embedding vector for 'aloha'
     np.testing.assert_array_almost_equal(
         weights[2].numpy(),
         [-0.39153, -0.19803, 0.2573, -0.18617, 0.25551],
     )  # embedding vector for 'the'
Exemplo n.º 3
0
 def test_assign_pretrained_weights(self):
     """Weights initialized via the UNK-token API must have shape
     (len(VOCAB), EMBED_DIM) and carry the cached vectors for known
     words.  EMBED_DIM here is the module-level constant."""
     cached = PretrainedEmbedding()
     cached.load_cached_embeddings(EMBED_CACHED_PATH)

     VOCAB = ["UNK", "aloha", "the"]
     # Identity mapping: each token gets its position in VOCAB.
     embed_vocab_to_idx = dict(zip(VOCAB, range(len(VOCAB))))

     weights = cached.initialize_embeddings_weights(
         embed_vocab_to_idx, "UNK", EMBED_DIM, EmbedInitStrategy.RANDOM)

     rows, cols = weights.shape
     assert rows == len(VOCAB)
     assert cols == EMBED_DIM
     np.testing.assert_array_almost_equal(
         weights[1].numpy(),
         [-0.43124, 0.014934, -0.50635, 0.60506, 0.56051],
     )  # embedding vector for 'aloha'
     np.testing.assert_array_almost_equal(
         weights[2].numpy(),
         [-0.39153, -0.19803, 0.2573, -0.18617, 0.25551],
     )  # embedding vector for 'the'
Exemplo n.º 4
0
    def test_load_pretrained_embeddings(self):
        """Constructing from a raw embedding file must populate the vocab
        list, the stoi map, and the vector matrix with consistent sizes
        and the expected token ordering."""
        emb = PretrainedEmbedding(EMBED_RAW_PATH)

        # Vocabulary: size plus spot-checked tokens at fixed positions.
        self.assertEqual(len(emb.embed_vocab), VOCAB_SIZE)
        self.assertEqual(emb.embed_vocab[0], "</s>")
        self.assertEqual(emb.embed_vocab[2], "to")

        # stoi must invert embed_vocab for the same spot-checked tokens.
        self.assertEqual(len(emb.stoi), VOCAB_SIZE)
        self.assertEqual(emb.stoi["</s>"], 0)
        self.assertEqual(emb.stoi["to"], 2)

        # One EMBED_DIM-wide vector per vocab entry.
        self.assertEqual(emb.embedding_vectors.size(0), VOCAB_SIZE)
        self.assertEqual(emb.embedding_vectors.size(1), EMBED_DIM)