Example #1
    def test_preprocess(self):
        # Default case.
        field = data.Field()
        assert field.preprocess("Test string.") == ["Test", "string."]

        # Test that lowercase is properly applied.
        field_lower = data.Field(lower=True)
        assert field_lower.preprocess("Test string.") == ["test", "string."]

        # Test that custom preprocessing pipelines are properly applied.
        preprocess_pipeline = data.Pipeline(lambda x: x + "!")
        field_preprocessing = data.Field(preprocessing=preprocess_pipeline,
                                         lower=True)
        assert field_preprocessing.preprocess("Test string.") == [
            "test!", "string.!"
        ]

        # Test that non-sequential data is properly handled.
        field_not_sequential = data.Field(sequential=False,
                                          lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess(
            "Test string.") == "test string.!"

        # Non-regression test: don't try to decode a string that is already unicode.
        field_not_sequential = data.Field(sequential=False,
                                          lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess(
            "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
Example #2
    def test_subword_trec(self):
        TEXT = data.SubwordField()
        LABEL = data.Field(sequential=False)
        RAW = data.Field(sequential=False, use_vocab=False)
        raw, _ = TREC.splits(RAW, LABEL)
        cooked, _ = TREC.splits(TEXT, LABEL)
        LABEL.build_vocab(cooked)
        TEXT.build_vocab(cooked, max_size=100)
        TEXT.segment(cooked)
        print(cooked[0].text)
        batch = next(iter(data.Iterator(cooked, 1, shuffle=False)))
        self.assertEqual(TEXT.reverse(batch.text.data)[0], raw[0].text)
Example #3
    def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False,
                                 use_vocab=False,
                                 dtype=torch.float)
        tsv_fields = [("int", int_field), ("float", float_field),
                      ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path,
            format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        assert_allclose(numericalized_int.data.numpy(), [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data)
        assert_allclose(numericalized_float.data.numpy(),
                        [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(
            sequential=False,
            use_vocab=False,
            postprocessing=lambda arr, _: [x + 1 for x in arr])
        float_field = data.Field(
            sequential=False,
            use_vocab=False,
            dtype=torch.float,
            postprocessing=lambda arr, _: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field),
                      ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path,
            format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        assert_allclose(numericalized_int.data.numpy(), [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data)
        assert_allclose(numericalized_float.data.numpy(),
                        [0.55, 0.05, 1.955, 0.1, 5.1])
Example #4
    def test_init_with_nested_field_as_nesting_field(self):
        nesting_field = data.NestedField(data.Field())

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field must not be another NestedField" in str(
            excinfo.value)
Example #5
    def test_numericalize_postprocessing(self):
        self.write_test_ppid_dataset(data_format="tsv")

        def reverse_postprocess(arr, vocab):
            return [list(reversed(sentence)) for sentence in arr]

        question_field = data.Field(sequential=True,
                                    postprocessing=reverse_postprocess)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [
            ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
            ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
            ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"]
        ]
        reversed_test_example_data = [
            list(reversed(sentence)) for sentence in test_example_data
        ]

        postprocessed_numericalized = question_field.numericalize(
            test_example_data)
        verify_numericalized_example(question_field,
                                     reversed_test_example_data,
                                     postprocessed_numericalized)
Example #6
    def test_init_when_nesting_field_has_include_lengths_equal_true(self):
        nesting_field = data.Field(include_lengths=True)

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field cannot have include_lengths=True" in str(
            excinfo.value)
Example #7
    def test_pad_when_pad_first_is_true(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 pad_first=True)
        minibatch = [
            [list("john"), list("loves"),
             list("mary")],
            [list("mary"), list("cries")],
        ]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                    [
                        ["<cpad>"] * 7,
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                        ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                        ["<w>"] + list("cries") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                    ]]

        assert CHARS.pad(minibatch) == expected

        # test include_length
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 include_lengths=True,
                                 pad_first=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]]
Example #8
    def test_preprocess(self):
        nesting_field = data.Field(
            tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs])
        field = data.NestedField(nesting_field,
                                 preprocessing=lambda xs: reversed(xs))
        preprocessed = field.preprocess("john loves mary")

        assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")]
Example #9
    def test_pad_when_nesting_field_has_fix_length(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>",
                                   fix_length=5)
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>")
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
            ["<w>"] + list("joh") + ["</w>"],
            ["<w>"] + list("lov") + ["</w>"],
            ["<w>"] + list("mar") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
        ],
                    [
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
                        ["<w>"] + list("mar") + ["</w>"],
                        ["<w>"] + list("cri") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
                        ["<cpad>"] * 5,
                    ]]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>",
                                   fix_length=5)
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]]
Example #10
    def test_build_vocab_from_iterable(self):
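        # The vocab can be built directly from nested iterables of tokens,
        # without wrapping them in a Dataset first.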
        nesting_field = data.Field(unk_token="<cunk>", pad_token="<cpad>")
        CHARS = data.NestedField(nesting_field)
        CHARS.build_vocab(
            [[list("aaa"), list("bbb"), ["c"]], [list("bbb"),
                                                 list("aaa")]],
            [[list("ccc"), list("bbb")], [list("bbb")]],
        )

        expected = "a b c <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi
Example #11
    def test_pad_when_fix_length_is_not_none(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 fix_length=3)
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                    [
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                        ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                    ]]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 include_lengths=True,
                                 fix_length=3)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [3, 3]
        assert words_len == [[3, 6, 3], [3, 6, 3]]
Example #12
    def test_batch_with_missing_field(self):
        # smoke test to see if batches with missing attributes are shown properly
        with open(self.test_missing_field_dataset_path, "wt") as f:
            f.write("text,label\n1,0")

        dst = data.TabularDataset(path=self.test_missing_field_dataset_path,
                                  format="csv",
                                  skip_header=True,
                                  fields=[("text",
                                           data.Field(use_vocab=False,
                                                      sequential=False)),
                                          ("label", None)])
        itr = data.Iterator(dst, batch_size=64)
        str(next(iter(itr)))
Example #13
    def test_wikitext2(self):
        # smoke test to ensure wikitext2 works properly
        ds = WikiText2
        TEXT = data.Field(lower=True, batch_first=True)
        train, valid, test = ds.splits(TEXT)
        TEXT.build_vocab(train)
        train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
            (train, valid, test), batch_size=3, bptt_len=30)

        train_iter, valid_iter, test_iter = ds.iters(batch_size=4, bptt_len=30)

        # Delete the dataset after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            datafile = os.path.join(self.project_root, ".data", "wikitext-2")
            conditional_remove(datafile)
Example #14
    def test_pad_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>")
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [
            ["<s>", "john", "loves", "mary", "</s>"],
            ["<s>", "mary", "cries", "</s>", "<pad>"],
        ]

        assert CHARS.pad(minibatch) == expected
Example #15
    def test_build_vocab_from_dataset(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>")
        ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)])
        ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)])
        dataset = data.Dataset([ex1, ex2], [("chars", CHARS)])

        CHARS.build_vocab(dataset, min_freq=2)

        expected = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi
Example #16
    def test_numericalize_basic(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [
            ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
            ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
            ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"]
        ]

        # Test default
        default_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field, test_example_data,
                                     default_numericalized)
Example #17
    def test_pad_when_no_init_and_eos_tokens(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field)
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [[
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
        ],
                    [
                        ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                        ["<w>"] + list("cries") + ["</w>"],
                        ["<cpad>"] * 7,
                    ]]

        assert CHARS.pad(minibatch) == expected
Example #18
    def test_build_vocab(self):
        nesting_field = data.Field(tokenize=list,
                                   init_token="<w>",
                                   eos_token="</w>")

        field = data.NestedField(nesting_field,
                                 init_token='<s>',
                                 eos_token='</s>',
                                 include_lengths=True,
                                 pad_first=True)

        sources = [[['a'], ['s', 'e', 'n', 't', 'e', 'n', 'c', 'e'],
                    ['o', 'f'], ['d', 'a', 't', 'a'], ['.']],
                   [['y', 'e', 't'], ['a', 'n', 'o', 't', 'h', 'e', 'r']],
                   [['o', 'n', 'e'], ['l', 'a', 's', 't'],
                    ['s', 'e', 'n', 't']]]

        field.build_vocab(sources,
                          vectors='glove.6B.50d',
                          unk_init=init.xavier_normal,
                          vectors_cache=".vector_cache")
Example #19
    def test_init_minimal(self):
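        # With no arguments, NestedField should fall back to sensible defaults
        # and inherit unk/pad/lower from its nesting field.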
        nesting_field = data.Field()
        field = data.NestedField(nesting_field)

        assert isinstance(field, data.Field)
        assert field.nesting_field is nesting_field
        assert field.sequential
        assert field.use_vocab
        assert field.init_token is None
        assert field.eos_token is None
        assert field.unk_token == nesting_field.unk_token
        assert field.fix_length is None
        assert field.dtype is torch.long
        assert field.preprocessing is None
        assert field.postprocessing is None
        assert field.lower == nesting_field.lower
        assert field.tokenize("a b c") == "a b c".split()
        assert not field.include_lengths
        assert field.batch_first
        assert field.pad_token == nesting_field.pad_token
        assert not field.pad_first
Example #20
    def test_process(self):
        raw_field = data.RawField()
        field = data.Field(sequential=True, use_vocab=False, batch_first=True)

        # Test tensor-like batch data which is accepted by both RawField and Field
        batch = [[1, 2, 3], [2, 3, 4]]
        batch_tensor = torch.LongTensor(batch)

        raw_field_processed = raw_field.process(batch)
        field_processed = field.process(batch)

        assert raw_field_processed == batch
        assert field_processed.data.equal(batch_tensor)

        # Test non-tensor data which is only accepted by RawField
        any_obj = [object() for _ in range(5)]

        raw_field_processed = raw_field.process(any_obj)
        assert any_obj == raw_field_processed

        with pytest.raises(TypeError):
            field.process(any_obj)
Example #21
    def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                              format="tsv",
                                              fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [
                ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
                ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
                ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"]
            ]
            question_field.numericalize(test_example_data)
Example #22
    def test_init_full(self):
        nesting_field = data.Field()
        field = data.NestedField(
            nesting_field,
            use_vocab=False,
            init_token="<s>",
            eos_token="</s>",
            fix_length=10,
            dtype=torch.float,
            preprocessing=lambda xs: list(reversed(xs)),
            postprocessing=lambda xs: [x.upper() for x in xs],
            tokenize=list,
            pad_first=True,
        )

        assert not field.use_vocab
        assert field.init_token == "<s>"
        assert field.eos_token == "</s>"
        assert field.fix_length == 10
        assert field.dtype is torch.float
        assert field.preprocessing("a b c".split()) == "c b a".split()
        assert field.postprocessing("a b c".split()) == "A B C".split()
        assert field.tokenize("abc") == ["a", "b", "c"]
        assert field.pad_first
Example #23
    def test_vocab_size(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Copied from test_build_vocab with minor changes
        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)

        # Skipping json dataset as we can rely on the original build vocab test
        label_field.build_vocab(tsv_dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert label_field.vocab.itos == expected_itos
Example #24
File: snli.py  Project: iclementine/text
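# Load SNLI, build the vocabularies, then batch the data two ways: via
# data.BucketIterator.splits and via the datasets.SNLI.iters shortcut.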
from pytext import data
from pytext import datasets

TEXT = data.Field()
LABEL = data.Field(sequential=False)

train, val, test = datasets.SNLI.splits(TEXT, LABEL)

print(train.fields)
print(len(train))
print(vars(train[0]))

TEXT.build_vocab(train)
LABEL.build_vocab(train)

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

batch = next(iter(train_iter))
print(batch.premise)
print(batch.hypothesis)
print(batch.label)

train_iter, val_iter, test_iter = datasets.SNLI.iters(batch_size=4)

batch = next(iter(train_iter))
print(batch.premise)
print(batch.hypothesis)
print(batch.label)
Example #25
File: imdb.py  Project: iclementine/text
from pytext import data
from pytext import datasets
from pytext.vocab import GloVe

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                   batch_size=3,
                                                   device="cuda:0")

# print batch information
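# The snippet breaks off here in the source; a plausible completion,
# mirroring the SNLI and WikiText2 examples in this collection:
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)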
Example #26
from pytext import data
from pytext import datasets
from pytext.vocab import GloVe

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, batch_first=True)

# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))

# make iterator for splits
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test), batch_size=3, bptt_len=30, device="cuda:0")

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)
Example #27
    def test_numericalize(self):
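        # Numericalizing pre-padded nested examples should yield a 3-D tensor
        # of shape (batch, words, chars).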
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]
        numericalized = field.numericalize(examples_data)

        assert numericalized.dim() == 3
        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data,
                                                  numericalized):
            verify_numericalized_example(field,
                                         example,
                                         numericalized_example,
                                         batch_first=True)

        # test include_lengths
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field, include_lengths=True)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]

        numericalized, seq_len, word_len = field.numericalize(
            (examples_data, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]))

        assert numericalized.dim() == 3
        assert len(seq_len) == 2
        assert len(word_len) == 2

        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data,
                                                  numericalized):
            verify_numericalized_example(field,
                                         example,
                                         numericalized_example,
                                         batch_first=True)
Example #28
    def test_build_vocab(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)

        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)

        # Write JSON dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="json")
        json_fields = {
            "question1": ("q1", question_field),
            "question2": ("q2", question_field),
            "label": ("label", label_field)
        }
        json_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                           format="json",
                                           fields=json_fields)

        # Test build_vocab default
        question_field.build_vocab(tsv_dataset, json_dataset)
        assert question_field.vocab.freqs == Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        expected_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9,
            'you': 10,
            '"&"': 11,
            '"and"?': 12,
            '2+2': 13,
            '2+2=?': 14,
            'Abraham': 15,
            'What': 16,
            'Where': 17,
            'Which': 18,
            'is': 19,
            'location': 20,
            'し?': 21,
            'シ': 22
        }
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert question_field.vocab.itos == expected_itos

        label_field.build_vocab(tsv_dataset, json_dataset)
        assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
        expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert label_field.vocab.itos == expected_itos

        # Test build_vocab with extra kwargs passed to Vocab
        question_field.build_vocab(tsv_dataset,
                                   json_dataset,
                                   max_size=8,
                                   min_freq=3)
        assert question_field.vocab.freqs == Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        expected_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9
        }
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert question_field.vocab.itos == expected_itos
Example #29
    def test_init_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False)
        field = data.NestedField(nesting_field)

        assert field.pad_token == "<pad>"
Example #30
    def test_pad(self):
        # Default case.
        field = data.Field()
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["a", "sentence", "of", "data", "."],
                                     [
                                         "yet", "another", "<pad>", "<pad>",
                                         "<pad>"
                                     ],
                                     ["one", "last", "sent", "<pad>", "<pad>"]]
        expected_lengths = [5, 2, 3]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test fix_length properly truncates and pads.
        field = data.Field(fix_length=3)
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["a", "sentence", "of"],
                                     ["yet", "another", "<pad>"],
                                     ["one", "last", "sent"]]
        expected_lengths = [3, 2, 3]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(fix_length=3, include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)
        field = data.Field(fix_length=3, truncate_first=True)
        expected_padded_minibatch = [["of", "data", "."],
                                     ["yet", "another", "<pad>"],
                                     ["one", "last", "sent"]]
        assert field.pad(minibatch) == expected_padded_minibatch

        # Test init_token is properly handled.
        field = data.Field(fix_length=4, init_token="<bos>")
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["<bos>", "a", "sentence", "of"],
                                     ["<bos>", "yet", "another", "<pad>"],
                                     ["<bos>", "one", "last", "sent"]]
        expected_lengths = [4, 3, 4]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(fix_length=4,
                           init_token="<bos>",
                           include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test init_token and eos_token are properly handled.
        field = data.Field(init_token="<bos>", eos_token="<eos>")
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [
            ["<bos>", "a", "sentence", "of", "data", ".", "<eos>"],
            ["<bos>", "yet", "another", "<eos>", "<pad>", "<pad>", "<pad>"],
            ["<bos>", "one", "last", "sent", "<eos>", "<pad>", "<pad>"]
        ]
        expected_lengths = [7, 4, 5]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test that non-sequential data is properly handled.
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           sequential=False)
        minibatch = [["contradiction"], ["neutral"], ["entailment"]]
        assert field.pad(minibatch) == minibatch
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           sequential=False,
                           include_lengths=True)
        assert field.pad(minibatch) == minibatch