def test_character_token_embedder(self): vocab = Dictionary() vocab.add_symbol('hello') vocab.add_symbol('there') embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2) test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']] max_len = max(len(s) for s in test_sents) input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad()) for i in range(len(test_sents)): input[i][0] = vocab.eos() for j in range(len(test_sents[i])): input[i][j + 1] = vocab.index(test_sents[i][j]) input[i][j + 2] = vocab.eos() embs = embedder(input) assert embs.size() == (len(test_sents), max_len + 2, 5) self.assertAlmostEqual(embs[0][0], embs[1][0]) self.assertAlmostEqual(embs[0][0], embs[0][-1]) self.assertAlmostEqual(embs[0][1], embs[2][1]) self.assertAlmostEqual(embs[0][3], embs[1][1]) embs.sum().backward() assert embedder.char_embeddings.weight.grad is not None
def _get_test_data(self): vocab = Dictionary() vocab.add_symbol("he@@") vocab.add_symbol("llo") vocab.add_symbol("how") vocab.add_symbol("are") vocab.add_symbol("y@@") vocab.add_symbol("ou") vocab.add_symbol("n@@") vocab.add_symbol("ew") vocab.add_symbol("or@@") vocab.add_symbol("k") src_tokens = [ ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"], ["how", "are", "y@@", "ou"], ] src_len = [len(x) for x in src_tokens] x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad()) for i in range(len(src_tokens)): for j in range(len(src_tokens[i])): x[i][j] = vocab.index(src_tokens[i][j]) x[i][j + 1] = vocab.eos() x = x.transpose(1, 0) return vocab, x, torch.LongTensor([i + 1 for i in src_len])
def _get_test_data(self, append_eos=True): vocab = Dictionary() vocab.add_symbol("he@@") vocab.add_symbol("llo") vocab.add_symbol("how") vocab.add_symbol("are") vocab.add_symbol("y@@") vocab.add_symbol("ou") vocab.add_symbol("n@@") vocab.add_symbol("ew") vocab.add_symbol("or@@") vocab.add_symbol("k") src_tokens = [ ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"], ["how", "are", "y@@", "ou"], ] src_len = [len(x) for x in src_tokens] # If we have to append EOS, we include EOS in counting src length if append_eos: src_len = [length + 1 for length in src_len] x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad()) for i in range(len(src_tokens)): for j in range(len(src_tokens[i])): x[i][j] = vocab.index(src_tokens[i][j]) if append_eos: x[i][j + 1] = vocab.eos() x = x.transpose(1, 0) return vocab, x, torch.LongTensor(src_len)
def assert_word_shuffle_matches_expected( self, x, x_len, max_shuffle_distance: int, vocab: Dictionary, expected_shufle_maps: List[Dict[int, int]], expect_eos_at_end: bool, bpe_end_marker=None, ): """ This verifies that with a given x, x_len, max_shuffle_distance, and vocab, we get the expected shuffle result. Args: x: Tensor of shape (T x B) = (sequence_length, batch_size) x_len: Tensor of length B = batch_size max_shuffle_distance: arg to pass to noising expected_shuffle_maps: List[mapping] where mapping is a Dict[old_index, new_index], mapping x's elements from their old positions in x to their new positions in x. expect_eos_at_end: if True, check the output to make sure there is an EOS at the end. bpe_end_marker: str denoting the BPE end token. If this is not None, we set the BPE cont token to None in the noising classes. """ bpe_cont_marker = None if bpe_end_marker is None: bpe_cont_marker = "@@" with data_utils.numpy_seed(1234): word_shuffle = noising.WordShuffle( vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker ) x_noised, l_noised = word_shuffle.noising( x, x_len, max_shuffle_distance=max_shuffle_distance ) # For every example, we have a different expected shuffle map. We check # that each example is shuffled as expected according to each # corresponding shuffle map. for i in range(len(expected_shufle_maps)): shuffle_map = expected_shufle_maps[i] for k, v in shuffle_map.items(): self.assertEqual(x[k][i], x_noised[v][i]) # Shuffling should not affect the length of each example for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised): self.assertEqual(pre_shuffle_length, post_shuffle_length) if expect_eos_at_end: self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def _convert_src_tokens_to_tensor( self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool ): src_len = [len(x) for x in src_tokens] # If we have to append EOS, we include EOS in counting src length if append_eos: src_len = [length + 1 for length in src_len] x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad()) for i in range(len(src_tokens)): for j in range(len(src_tokens[i])): x[i][j] = vocab.index(src_tokens[i][j]) if append_eos: x[i][j + 1] = vocab.eos() x = x.transpose(1, 0) return x, torch.LongTensor(src_len)