예제 #1
0
    def test_create_byte_token_tensors(self):
        """Check the tensors ``ByteTokenTensorizer`` builds for a two-row batch.

        The tensorizer is constructed with ``max_seq_len=4`` and
        ``max_byte_len=5``; the expectations below therefore keep at most four
        tokens per row and at most five bytes per token ("coffee" is expected
        as "coffe", and "today" is expected to be dropped), with zero padding
        everywhere else.
        """
        tensorizer = ByteTokenTensorizer(
            text_column="text", max_seq_len=4, max_byte_len=5
        )
        # not initializing because initializing is a no-op for this tensorizer

        s1 = "I want some coffee today"
        s2 = "Turn it up"

        def ords(word, pad_to):
            # Pad by the *encoded* length: a multi-byte character occupies
            # several entries, so len(word) would under-count and over-pad.
            encoded = word.encode()
            return list(encoded) + [0] * (pad_to - len(encoded))

        batch = [{"text": s1}, {"text": s2}]
        # Note that the tokenizer lowercases here
        expected = [
            [ords("i", 5), ords("want", 5), ords("some", 5), ords("coffe", 5)],
            [ords("turn", 5), ords("it", 5), ords("up", 5), ords("", 5)],
        ]
        expected_token_lens = [4, 3]
        expected_byte_lens = [[1, 4, 4, 5], [4, 2, 2, 0]]

        # Named byte_ids rather than ``bytes`` so the builtin is not shadowed.
        byte_ids, token_lens, byte_lens = tensorizer.tensorize(
            [tensorizer.numberize(row) for row in batch]
        )
        self.assertIsInstance(byte_ids, torch.LongTensor)
        self.assertIsInstance(token_lens, torch.LongTensor)
        self.assertIsInstance(byte_lens, torch.LongTensor)
        self.assertEqual((2, 4, 5), byte_ids.size())
        self.assertEqual((2,), token_lens.size())
        self.assertEqual((2, 4), byte_lens.size())
        self.assertEqual(expected, byte_ids.tolist())
        self.assertEqual(expected_token_lens, token_lens.tolist())
        self.assertEqual(expected_byte_lens, byte_lens.tolist())
예제 #2
0
 def test_tokens_dictfeat_contextual(self):
     """Smoke-test TorchScript export with dict and contextual inputs.

     Builds a ``Seq2SeqModel`` whose inputs include a ``dict_feat`` and a
     ``contextual_token_embedding`` tensorizer, converts it with
     ``torchscriptify()`` and checks that calling the result returns
     something other than ``None``.
     """
     # TODO (T65593688): this should be removed after
     # https://github.com/pytorch/pytorch/pull/33645 is merged.
     with torch.no_grad():
         # Sub-configs are created in the same order as the keyword
         # arguments they are passed to below.
         src_embedding = WordEmbedding.Config(embed_dim=512)
         tgt_embedding = WordEmbedding.Config(embed_dim=512)
         model_inputs = Seq2SeqModel.Config.ModelInput(
             dict_feat=GazetteerTensorizer.Config(
                 text_column="source_sequence"
             ),
             contextual_token_embedding=ByteTokenTensorizer.Config(),
         )
         rnn_config = RNNModel.Config(
             encoder=LSTMSequenceEncoder.Config(embed_dim=619)
         )
         dict_embedding_config = DictEmbedding.Config()
         contextual_config = ContextualTokenEmbedding.Config(embed_dim=7)

         model_config = Seq2SeqModel.Config(
             source_embedding=src_embedding,
             target_embedding=tgt_embedding,
             inputs=model_inputs,
             encoder_decoder=rnn_config,
             dict_embedding=dict_embedding_config,
             contextual_token_embedding=contextual_config,
         )
         tensorizers = get_tensorizers(
             add_dict_feat=True, add_contextual_feat=True
         )
         model = Seq2SeqModel.from_config(model_config, tensorizers)
         model.eval()
         scripted = model.torchscriptify()

         source_tokens = ["call", "mom"]
         # presumably the dict-feature triple for the two tokens -- TODO confirm
         dict_feat_input = (["call", "mom"], [0.42, 0.17], [4, 3])
         # 14 floats; presumably embed_dim (7) values per token -- TODO confirm
         contextual_input = [0.42] * (7 * 2)

         res = scripted(source_tokens, dict_feat_input, contextual_input)
         assert res is not None
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    """Create and initialize the tensorizers used by the seq2seq tests.

    A source and a target ``TokenTensorizer`` are always built and
    initialized from ``data_source.train`` of a ``TSVDataSource`` that reads
    ``TEST_FILE_NAME``. Optional tensorizers are initialized separately, each
    through its own ``initialize_tensorizers`` call.

    Args:
        add_dict_feat: when truthy, add a ``GazetteerTensorizer`` under the
            key ``"dict_feat"``.
        add_contextual_feat: when truthy, add a ``ByteTokenTensorizer`` under
            the key ``"contextual_token_embedding"``.

    Returns:
        dict mapping tensorizer name to tensorizer instance.
    """
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    source_config = TSVDataSource.Config(
        train_filename=TEST_FILE_NAME,
        field_names=["source_sequence", "dict_feat", "target_sequence"],
    )
    data_source = TSVDataSource.from_config(source_config, schema)

    def make_token_tensorizer(column):
        # Both token tensorizers share the same BOS/EOS settings.
        token_config = TokenTensorizer.Config(
            column=column, add_eos_token=True, add_bos_token=True
        )
        return TokenTensorizer.from_config(token_config)

    tensorizers = {
        "src_seq_tokens": make_token_tensorizer("source_sequence"),
        "trg_seq_tokens": make_token_tensorizer("target_sequence"),
    }
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        gazetteer_tensorizer = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        tensorizers["dict_feat"] = gazetteer_tensorizer
        # Only the newly added tensorizer is passed to this call.
        initialize_tensorizers(
            {"dict_feat": gazetteer_tensorizer}, data_source.train
        )

    if add_contextual_feat:
        byte_tensorizer = ByteTokenTensorizer.from_config(
            ByteTokenTensorizer.Config(column="source_sequence")
        )
        tensorizers["contextual_token_embedding"] = byte_tensorizer
        initialize_tensorizers(
            {"contextual_token_embedding": byte_tensorizer},
            data_source.train,
        )

    return tensorizers
 def test_tokens_contextual(self):
     """Smoke-test TorchScript export with a contextual token embedding.

     Builds a ``Seq2SeqModel`` whose inputs include a
     ``contextual_token_embedding`` tensorizer, converts it with
     ``torchscriptify()`` and checks that calling the result returns
     something other than ``None``.
     """
     # Sub-configs are created in the same order as the keyword arguments
     # they are passed to below.
     src_embedding = WordEmbedding.Config(embed_dim=512)
     tgt_embedding = WordEmbedding.Config(embed_dim=512)
     model_inputs = Seq2SeqModel.Config.ModelInput(
         contextual_token_embedding=ByteTokenTensorizer.Config()
     )
     contextual_config = ContextualTokenEmbedding.Config(embed_dim=7)
     rnn_config = RNNModel.Config(
         encoder=LSTMSequenceEncoder.Config(embed_dim=519)
     )

     model_config = Seq2SeqModel.Config(
         source_embedding=src_embedding,
         target_embedding=tgt_embedding,
         inputs=model_inputs,
         contextual_token_embedding=contextual_config,
         encoder_decoder=rnn_config,
     )
     tensorizers = get_tensorizers(add_contextual_feat=True)
     model = Seq2SeqModel.from_config(model_config, tensorizers)
     model.eval()
     scripted = model.torchscriptify()

     source_tokens = ["call", "mom"]
     # 14 floats; presumably embed_dim (7) values per token -- TODO confirm
     contextual_input = [0.42] * (7 * 2)

     res = scripted(source_tokens, contextual_token_embedding=contextual_input)
     assert res is not None
 def test_tokens_dictfeat_contextual(self):
     """Smoke-test TorchScript export with dict and contextual inputs.

     Builds a ``Seq2SeqModel`` whose inputs include a ``dict_feat`` and a
     ``contextual_token_embedding`` tensorizer, converts it with
     ``torchscriptify()`` and checks that calling the result returns
     something other than ``None``.
     """
     # Sub-configs are created in the same order as the keyword arguments
     # they are passed to below.
     src_embedding = WordEmbedding.Config(embed_dim=512)
     tgt_embedding = WordEmbedding.Config(embed_dim=512)
     model_inputs = Seq2SeqModel.Config.ModelInput(
         dict_feat=GazetteerTensorizer.Config(text_column="source_sequence"),
         contextual_token_embedding=ByteTokenTensorizer.Config(),
     )
     rnn_config = RNNModel.Config(
         encoder=LSTMSequenceEncoder.Config(embed_dim=619)
     )
     dict_embedding_config = DictEmbedding.Config()
     contextual_config = ContextualTokenEmbedding.Config(embed_dim=7)

     model_config = Seq2SeqModel.Config(
         source_embedding=src_embedding,
         target_embedding=tgt_embedding,
         inputs=model_inputs,
         encoder_decoder=rnn_config,
         dict_embedding=dict_embedding_config,
         contextual_token_embedding=contextual_config,
     )
     tensorizers = get_tensorizers(
         add_dict_feat=True, add_contextual_feat=True
     )
     model = Seq2SeqModel.from_config(model_config, tensorizers)
     model.eval()
     scripted = model.torchscriptify()

     source_tokens = ["call", "mom"]
     # presumably the dict-feature triple for the two tokens -- TODO confirm
     dict_feat_input = (["call", "mom"], [0.42, 0.17], [4, 3])
     # 14 floats; presumably embed_dim (7) values per token -- TODO confirm
     contextual_input = [0.42] * (7 * 2)

     res = scripted(source_tokens, dict_feat_input, contextual_input)
     assert res is not None
예제 #6
0
 class ByteModelInput(DocModel.Config.ModelInput):
     """Model input declaring a ``token_bytes`` tensorizer config."""

     # Defaults to a ByteTokenTensorizer config built with no arguments.
     token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
예제 #7
0
 class ByteModelInput(Model.Config.ModelInput):
     """Model input declaring ``token_bytes`` and ``labels`` tensorizer configs."""

     # Characters should be supported too, but CharacterTokenTensorizer
     # cannot add characters to the vocab yet, so bytes are used here.
     token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
     # Defaults to a SlotLabelTensorizer config built with no arguments.
     labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()
예제 #8
0
 class ByteModelInput(WordTaggingModel.Config.ModelInput):
     """Model input whose ``tokens`` field is a ByteTokenTensorizer config."""

     # Characters should be supported too, but CharacterTokenTensorizer
     # cannot add characters to the vocab yet, so bytes are used here.
     tokens: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()