Exemplo n.º 1
0
class VocabTest(unittest.TestCase):
    def setUp(self):
        vocab_list = ["UNK", "a", "b", "c", "d"]
        self.vocab = Vocabulary(vocab_list)

    def test_vocab_lookup(self):
        # There are bugs with just making this a script, eventually these can be simpler
        class LookupWord(jit.ScriptModule):
            def __init__(self, vocab):
                super().__init__()
                self.vocab = vocab

            @jit.script_method
            def forward(self, word: str):
                return self.vocab.idx[word]

        lookup_word = LookupWord(self.vocab)

        self.assertEqual(1, lookup_word("a"))
        self.assertEqual(3, lookup_word("c"))
        with self.assertRaises(Exception):
            lookup_word("notaword")

    def test_vocab_idx_lookup(self):
        # There are bugs with just making this a script, eventually these can be simpler
        class LookupIndex(jit.ScriptModule):
            def __init__(self, vocab):
                super().__init__()
                self.vocab = vocab

            @jit.script_method
            def forward(self, i: int):
                return self.vocab.vocab[i]

        lookup_idx = LookupIndex(self.vocab)

        self.assertEqual("UNK", lookup_idx(0))
        self.assertEqual("b", lookup_idx(2))
        with self.assertRaises(Exception):
            lookup_idx(20)

    def test_lookup_1d(self):
        self.assertEqual([1, 0, 3, 4],
                         self.vocab.lookup_indices_1d(["a", "e", "c", "d"]))
        self.assertEqual([], self.vocab.lookup_indices_1d([]))

    def test_lookup_2d(self):
        self.assertEqual(
            [[1, 0, 3, 4], [], [2]],
            self.vocab.lookup_indices_2d([["a", "e", "c", "d"], [], ["b"]]),
        )
        self.assertEqual([], self.vocab.lookup_indices_2d([]))

    def test_custom_unk(self):
        vocab_list = ["a", "UNK", "b", "c", "d"]
        vocab = Vocabulary(vocab_list, unk_idx=1)
        self.assertEqual([0, 1, 3, 4],
                         vocab.lookup_indices_1d(["a", "e", "c", "d"]))
Exemplo n.º 2
0
        class ModelWithDenseFeat(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = Vocabulary(input_vocab,
                                        unk_idx=input_vocab.idx[UNK])
                self.max_byte_len = jit.Attribute(max_byte_len, int)
                self.byte_offset_for_non_padding = jit.Attribute(
                    byte_offset_for_non_padding, int)
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
                self.model = traced_model
                self.output_layer = output_layer

            @jit.script_method
            def forward(self, tokens: List[List[str]],
                        dense_feat: List[List[float]]):
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                token_bytes, _ = make_byte_inputs(
                    tokens, self.max_byte_len,
                    self.byte_offset_for_non_padding)
                logits = self.model(
                    torch.tensor(word_ids),
                    token_bytes,
                    torch.tensor(seq_lens),
                    torch.tensor(dense_feat),
                )
                return self.output_layer(logits)
Exemplo n.º 3
0
        class Model(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = Vocabulary(input_vocab,
                                        unk_idx=input_vocab.idx[UNK])
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)

            @jit.script_method
            def forward(self, tokens: List[List[str]]):
                word_ids = self.vocab.lookup_indices_2d(tokens)

                seq_lens = jit.annotate(List[int], [])

                for sentence in word_ids:
                    seq_lens.append(len(sentence))
                pad_to_length = list_max(seq_lens)
                for sentence in word_ids:
                    for _ in range(pad_to_length - len(sentence)):
                        sentence.append(self.pad_idx)

                logits = self.model(torch.tensor(word_ids),
                                    torch.tensor(seq_lens))
                return self.output_layer(logits)
Exemplo n.º 4
0
        class Model(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = Vocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)

            @jit.script_method
            def forward(self, tokens: List[List[str]]):
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens))
                return self.output_layer(logits)
Exemplo n.º 5
0
        class ModelWithDenseFeat(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = Vocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
                self.normalizer = tensorizers["dense"].normalizer
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)

            @jit.script_method
            def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]):
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                dense_feat = self.normalizer.normalize(dense_feat)
                logits = self.model(
                    torch.tensor(word_ids),
                    torch.tensor(seq_lens),
                    torch.tensor(dense_feat, dtype=torch.float),
                )
                return self.output_layer(logits)