def load_embedding(emb_dim, emb_file, emb_type, vocab):
    """Load a pre-trained embedding and combine it with the given vocabulary.

    :param int emb_dim: the dimension of the embedding. Must match the pre-trained embedding.
    :param str emb_file: the pre-trained embedding file path.
    :param str emb_type: the pre-trained embedding format; only "glove" is supported for now.
    :param Vocabulary vocab: a mapping from word to index, provided by the user or built from the pre-trained embedding.
    :return (embedding_tensor, vocab):
        embedding_tensor - Tensor of shape (len(vocab), emb_dim)
        vocab - the input vocab, or the vocab built from the pre-trained embedding
    """
    pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
    if vocab is None:
        # build vocabulary from the pre-trained embedding
        vocab = Vocabulary()
        for w in pretrain.keys():
            vocab.add(w)
    # words not covered by the pre-trained file keep a random initialization
    embedding_tensor = torch.randn(len(vocab), emb_dim)
    for w, v in pretrain.items():
        if len(v.shape) > 1 or emb_dim != v.shape[0]:
            raise ValueError(
                "Pretrained embedding dim is {}. Dimension mismatched. Required {}".format(v.shape, (emb_dim,)))
        if vocab.has_word(w):
            embedding_tensor[vocab[w]] = v
    return embedding_tensor, vocab
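# Hedged usage sketch (not part of the original module): shows how load_embedding
# might be called, first letting the loader build the vocabulary and then passing
# a user-built one. The file path "glove.6B.50d.txt" and the helper name
# _example_load_embedding are hypothetical; EmbedLoader and Vocabulary are assumed
# to be importable as used in the function above.
def _example_load_embedding():
    # Build the vocabulary from the pre-trained file by passing vocab=None.
    embedding, vocab = EmbedLoader.load_embedding(
        emb_dim=50, emb_file="glove.6B.50d.txt", emb_type="glove", vocab=None)
    assert embedding.shape == (len(vocab), 50)

    # Alternatively, pass a user-built vocabulary; words absent from the
    # pre-trained file keep their randomly initialized rows from torch.randn.
    my_vocab = Vocabulary()
    my_vocab.update(["the", "quick", "brown", "fox"])
    embedding, my_vocab = EmbedLoader.load_embedding(
        emb_dim=50, emb_file="glove.6B.50d.txt", emb_type="glove", vocab=my_vocab)
    return embedding, my_vocab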
def test_additional_update(self):
    vocab = Vocabulary(max_size=None, min_freq=None)
    vocab.update(text)

    # looking up a word builds the index, so no rebuild is pending
    _ = vocab["well"]
    self.assertEqual(vocab.rebuild, False)

    # adding a new word marks the index as needing a rebuild
    vocab.add("hahaha")
    self.assertEqual(vocab.rebuild, True)

    # the next lookup triggers the rebuild and clears the flag
    _ = vocab["hahaha"]
    self.assertEqual(vocab.rebuild, False)
    self.assertTrue("hahaha" in vocab)
def test_add(self):
    vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
    for word in text:
        vocab.add(word)
    self.assertEqual(vocab.word_count, counter)
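# Hedged sketch of the module-level fixtures the tests above rely on. The exact
# contents of `text` in the original test file are not shown here, so this list
# is illustrative only (it includes "well", which test_additional_update looks
# up); `counter` mirrors it via collections.Counter, matching the
# vocab.word_count assertion in test_add.
from collections import Counter

text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well"]
counter = Counter(text)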