Example #1
0
    def test_create_word_character_tensors(self):
        """Tensorize two sentences into per-word character-ordinal tensors,
        zero-padded to the widest word, plus per-word length tensors."""
        # Initialization is a no-op for WordCharacterTensorizer, so skip it.
        tensorizer = WordCharacterTensorizer(column="text")

        first = "I want some coffee"
        second = "Turn it up"

        def char_codes(token, width):
            # Ordinal of each character, right-padded with zeros to `width`.
            return [ord(ch) for ch in token] + [0] * (width - len(token))

        batch = [{"text": types.Text(first)}, {"text": types.Text(second)}]
        # The tokenizer lowercases, hence "i"/"turn"/... below.
        expected = [
            [
                char_codes("i", 6),
                char_codes("want", 6),
                char_codes("some", 6),
                char_codes("coffee", 6),
            ],
            [
                char_codes("turn", 6),
                char_codes("it", 6),
                char_codes("up", 6),
                char_codes("", 6),
            ],
        ]
        # Per-word character counts; the short row is padded with 0.
        expected_lens = [[1, 4, 4, 6], [4, 2, 2, 0]]

        chars, seq_lens = tensorizer.tensorize(
            tensorizer.numberize(row) for row in batch)
        self.assertIsInstance(chars, torch.LongTensor)
        self.assertIsInstance(seq_lens, torch.LongTensor)
        self.assertEqual((2, 4, 6), chars.size())
        self.assertEqual((2, 4), seq_lens.size())
        self.assertEqual(expected, chars.tolist())
        self.assertEqual(expected_lens, seq_lens.tolist())
Example #2
0
    def test_create_byte_tensors(self):
        """numberize() maps each character of a row to its ordinal and
        returns the sequence length alongside; no padding at this stage."""
        # Not initializing because initializing is a no-op for ByteTensorizer.
        tensorizer = ByteTensorizer(column="text", lower=False)

        sentences = ["I want some coffee", "Turn it up"]
        rows = [{"text": types.Text(s)} for s in sentences]
        expected = [[ord(ch) for ch in s] for s in sentences]

        numberized = (tensorizer.numberize(row) for row in rows)
        for sentence, want in zip(sentences, expected):
            chars, seq_len = next(numberized)
            self.assertEqual(len(sentence), len(chars))
            self.assertEqual(want, chars)
            self.assertEqual(len(sentence), seq_len)
Example #3
0
    def test_create_word_tensors(self):
        """create_training_tensors() should yield padded token-id and
        sequence-length LongTensors for a two-row batch."""
        tensorizer = WordTensorizer(column="text")
        # Prime the vocabulary-building coroutine, feed it every training
        # row, then close it to finalize the vocab.
        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in self.data.train:
            init.send(row)
        init.close()

        batch = [
            {"text": types.Text(sentence)}
            for sentence in ("I want some coffee", "Turn it up")
        ]

        tokens, seq_lens = tensorizer.create_training_tensors(batch)
        self.assertIsInstance(tokens, torch.LongTensor)
        self.assertIsInstance(seq_lens, torch.LongTensor)
        self.assertEqual((2, 4), tokens.size())
        self.assertEqual((2,), seq_lens.size())
        # Row two holds 3 real tokens (seq_len 3); the trailing 1 is
        # presumably the pad index — padded up to the longest row.
        self.assertEqual([[24, 0, 0, 0], [13, 47, 9, 1]], tokens.tolist())
        self.assertEqual([4, 3], seq_lens.tolist())
Example #4
0
    def test_create_character_tensors(self):
        """Each sentence becomes a row of character ordinals, zero-padded
        on the right to the longest sentence in the batch."""
        # Not initializing because initializing is a no-op for CharacterTensorizer.
        tensorizer = CharacterTensorizer(column="text")

        sentences = ["I want some coffee", "Turn it up"]
        longest = max(len(s) for s in sentences)

        batch = [{"text": types.Text(s)} for s in sentences]
        expected = [
            [ord(ch) for ch in s] + [0] * (longest - len(s))
            for s in sentences
        ]

        chars, seq_lens = tensorizer.create_training_tensors(batch)
        self.assertIsInstance(chars, torch.LongTensor)
        self.assertIsInstance(seq_lens, torch.LongTensor)
        self.assertEqual((2, longest), chars.size())
        self.assertEqual((2,), seq_lens.size())
        self.assertEqual(expected, chars.tolist())
        self.assertEqual([len(s) for s in sentences], seq_lens.tolist())
Example #5
0
    def test_create_word_tensors(self):
        """numberize() yields (token_ids, seq_len) per row, unpadded."""
        tensorizer = WordTensorizer(column="text")
        # Prime the vocabulary-building coroutine with the training rows,
        # then close it to finalize the vocab.
        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in self.data.train:
            init.send(row)
        init.close()

        rows = [
            {"text": types.Text("I want some coffee")},
            {"text": types.Text("Turn it up")},
        ]
        numberized = (tensorizer.numberize(row) for row in rows)

        tokens, seq_len = next(numberized)
        self.assertEqual([24, 0, 0, 0], tokens)
        self.assertEqual(4, seq_len)

        tokens, seq_len = next(numberized)
        self.assertEqual([13, 47, 9], tokens)
        self.assertEqual(3, seq_len)
Example #6
0
File: tsv.py  Project: kwikBioInc/pytext
def load_text(s):
    """Return the raw string *s* wrapped in a ``types.Text`` instance."""
    return types.Text(s)