def test_preprocess(self):
    """Check ``Field.preprocess`` with default, lowercase, pipeline and
    non-sequential configurations."""
    sample = "Test string."

    # Default case.
    assert data.Field().preprocess(sample) == ["Test", "string."]

    # Test that lowercase is properly applied.
    lowered = data.Field(lower=True)
    assert lowered.preprocess(sample) == ["test", "string."]

    # Test that custom preprocessing pipelines are properly applied.
    append_bang = data.Pipeline(lambda x: x + "!")
    piped = data.Field(preprocessing=append_bang, lower=True)
    assert piped.preprocess(sample) == ["test!", "string.!"]

    # Test that non-sequential data is properly handled.
    flat = data.Field(sequential=False,
                      lower=True,
                      preprocessing=append_bang)
    assert flat.preprocess(sample) == "test string.!"

    # Non-regression test that we do not try to decode unicode strings to unicode
    flat_unicode = data.Field(sequential=False,
                              lower=True,
                              preprocessing=append_bang)
    assert flat_unicode.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
def test_subword_trec(self):
    """Segment TREC with a ``SubwordField`` and check that reversing the
    first batch gives back the first raw example's text."""
    subword_text = data.SubwordField()
    label = data.Field(sequential=False)
    raw_text = data.Field(sequential=False, use_vocab=False)

    # The same split is loaded twice: once untouched, once through the
    # subword field.
    raw_train, _ = TREC.splits(raw_text, label)
    cooked_train, _ = TREC.splits(subword_text, label)

    label.build_vocab(cooked_train)
    subword_text.build_vocab(cooked_train, max_size=100)
    subword_text.segment(cooked_train)
    print(cooked_train[0].text)

    first_batch = next(iter(data.Iterator(cooked_train, 1, shuffle=False)))
    restored = subword_text.reverse(first_batch.text.data)
    self.assertEqual(restored[0], raw_train[0].text)
def test_numerical_features_no_vocab(self):
    """Numericalize int and float columns on ``use_vocab=False`` fields,
    first without and then with a postprocessing hook."""
    self.write_test_numerical_features_dataset()

    def check(int_field, float_field, expected_ints, expected_floats):
        # Load the TSV with the given fields, then numericalize fixed
        # string inputs and compare against the expected numbers.
        columns = [("int", int_field), ("float", float_field),
                   ("string", None)]
        dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path,
            format="tsv",
            fields=columns)
        int_field.build_vocab(dataset)
        float_field.build_vocab(dataset)

        raw_ints = ["1", "0", "1", "3", "19"]
        raw_floats = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        int_tensor = int_field.numericalize(raw_ints)
        assert_allclose(int_tensor.data.numpy(), expected_ints)
        float_tensor = float_field.numericalize(raw_floats)
        assert_allclose(float_tensor.data.numpy(), expected_floats)

    # Test basic usage
    check(
        data.Field(sequential=False, use_vocab=False),
        data.Field(sequential=False, use_vocab=False, dtype=torch.float),
        [1, 0, 1, 3, 19],
        [1.1, 0.1, 3.91, 0.2, 10.2],
    )

    # Test with postprocessing applied
    check(
        data.Field(sequential=False,
                   use_vocab=False,
                   postprocessing=lambda arr, _: [x + 1 for x in arr]),
        data.Field(sequential=False,
                   use_vocab=False,
                   dtype=torch.float,
                   postprocessing=lambda arr, _: [x * 0.5 for x in arr]),
        [2, 1, 2, 4, 20],
        [0.55, 0.05, 1.955, 0.1, 5.1],
    )
def test_init_with_nested_field_as_nesting_field(self):
    """Wrapping a ``NestedField`` inside another ``NestedField`` must raise
    ``ValueError`` with an explanatory message."""
    inner = data.NestedField(data.Field())

    with pytest.raises(ValueError) as excinfo:
        data.NestedField(inner)

    message = str(excinfo.value)
    assert "nesting field must not be another NestedField" in message
def test_numericalize_postprocessing(self):
    """Check that a postprocessing hook which reverses every sentence is
    applied by ``numericalize``."""
    self.write_test_ppid_dataset(data_format="tsv")

    def reverse_postprocess(arr, vocab):
        # Flip each sentence; the vocab argument is not needed.
        return [list(reversed(row)) for row in arr]

    questions = data.Field(sequential=True,
                           postprocessing=reverse_postprocess)
    columns = [("id", None), ("q1", questions), ("q2", questions),
               ("label", None)]
    dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                  format="tsv",
                                  fields=columns)
    questions.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    # The reference is the same batch with every sentence reversed.
    reversed_batch = [sentence[::-1] for sentence in padded_batch]

    numericalized = questions.numericalize(padded_batch)
    verify_numericalized_example(questions, reversed_batch, numericalized)
def test_init_when_nesting_field_has_include_lengths_equal_true(self):
    """A nesting field built with ``include_lengths=True`` must be rejected
    by ``NestedField`` with a ``ValueError``."""
    inner = data.Field(include_lengths=True)

    with pytest.raises(ValueError) as excinfo:
        data.NestedField(inner)

    message = str(excinfo.value)
    assert "nesting field cannot have include_lengths=True" in message
def test_pad_when_pad_first_is_true(self):
    """Check ``NestedField.pad`` with ``pad_first=True``, with and without
    ``include_lengths``."""
    char_options = dict(tokenize=list,
                        unk_token="<cunk>",
                        pad_token="<cpad>",
                        init_token="<w>",
                        eos_token="</w>")

    def marker(token):
        # Row for a sentence-level <s>/</s> token, padded to width 7.
        return ["<w>", token, "</w>"] + ["<cpad>"] * 4

    def word(chars, pad=0):
        # Row for one word wrapped in <w>...</w> plus trailing char padding.
        return ["<w>"] + list(chars) + ["</w>"] + ["<cpad>"] * pad

    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>",
                                   pad_first=True)
    batch = [
        [list("john"), list("loves"), list("mary")],
        [list("mary"), list("cries")],
    ]
    expected = [
        [
            marker("<s>"),
            word("john", pad=1),
            word("loves"),
            word("mary", pad=1),
            marker("</s>"),
        ],
        [
            # The shorter sentence gets its all-pad row at the front.
            ["<cpad>"] * 7,
            marker("<s>"),
            word("mary", pad=1),
            word("cries"),
            marker("</s>"),
        ],
    ]
    assert chars_field.pad(batch) == expected

    # test include_length
    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>",
                                   include_lengths=True,
                                   pad_first=True)
    arr, seq_len, words_len = chars_field.pad(batch)
    assert arr == expected
    assert seq_len == [5, 4]
    assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]]
def test_preprocess(self):
    """Check that ``NestedField.preprocess`` applies both the outer
    (word-reversing) and the inner (upper-casing) preprocessing hooks."""
    upper_chars = data.Field(
        tokenize=list,
        preprocessing=lambda xs: [x.upper() for x in xs])
    nested = data.NestedField(upper_chars,
                              preprocessing=lambda xs: reversed(xs))

    result = nested.preprocess("john loves mary")

    expected = [list(word) for word in ("MARY", "LOVES", "JOHN")]
    assert result == expected
def test_pad_when_nesting_field_has_fix_length(self):
    """Check ``NestedField.pad`` when the nesting field has
    ``fix_length=5``, with and without ``include_lengths``."""
    char_options = dict(tokenize=list,
                        unk_token="<cunk>",
                        pad_token="<cpad>",
                        init_token="<w>",
                        eos_token="</w>",
                        fix_length=5)

    def marker(token):
        # Row for a sentence-level <s>/</s> token, padded to width 5.
        return ["<w>", token, "</w>"] + ["<cpad>"] * 2

    def word(chars):
        # Row for a (three-character) word prefix wrapped in <w>...</w>.
        return ["<w>"] + list(chars) + ["</w>"]

    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>")
    batch = [["john", "loves", "mary"], ["mary", "cries"]]
    expected = [
        [
            marker("<s>"),
            word("joh"),
            word("lov"),
            word("mar"),
            marker("</s>"),
        ],
        [
            marker("<s>"),
            word("mar"),
            word("cri"),
            marker("</s>"),
            ["<cpad>"] * 5,
        ],
    ]
    assert chars_field.pad(batch) == expected

    # test include length
    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>",
                                   include_lengths=True)
    arr, seq_len, words_len = chars_field.pad(batch)
    assert arr == expected
    assert seq_len == [5, 4]
    assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]]
def test_build_vocab_from_iterable(self):
    """Build a ``NestedField`` vocab from plain nested lists and check the
    resulting symbol set."""
    chars_field = data.NestedField(
        data.Field(unk_token="<cunk>", pad_token="<cpad>"))

    chars_field.build_vocab(
        [[list("aaa"), list("bbb"), ["c"]], [list("bbb"), list("aaa")]],
        [[list("ccc"), list("bbb")], [list("bbb")]],
    )

    expected_symbols = ["a", "b", "c", "<cunk>", "<cpad>"]
    assert len(chars_field.vocab) == len(expected_symbols)
    for symbol in expected_symbols:
        assert symbol in chars_field.vocab.stoi
def test_pad_when_fix_length_is_not_none(self):
    """Check ``NestedField.pad`` when the nested field itself has
    ``fix_length=3``, with and without ``include_lengths``."""
    char_options = dict(tokenize=list,
                        unk_token="<cunk>",
                        pad_token="<cpad>",
                        init_token="<w>",
                        eos_token="</w>")

    def marker(token):
        # Row for a sentence-level <s>/</s> token, padded to width 7.
        return ["<w>", token, "</w>"] + ["<cpad>"] * 4

    def word(chars):
        # Row for a four-letter word plus one trailing char pad.
        return ["<w>"] + list(chars) + ["</w>", "<cpad>"]

    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>",
                                   fix_length=3)
    batch = [["john", "loves", "mary"], ["mary", "cries"]]
    # Only the first word of each sentence appears between <s> and </s>.
    expected = [
        [marker("<s>"), word("john"), marker("</s>")],
        [marker("<s>"), word("mary"), marker("</s>")],
    ]
    assert chars_field.pad(batch) == expected

    # test include length
    chars_field = data.NestedField(data.Field(**char_options),
                                   init_token="<s>",
                                   eos_token="</s>",
                                   include_lengths=True,
                                   fix_length=3)
    arr, seq_len, words_len = chars_field.pad(batch)
    assert arr == expected
    assert seq_len == [3, 3]
    assert words_len == [[3, 6, 3], [3, 6, 3]]
def test_batch_with_missing_field(self):
    """Smoke test: a batch from a dataset with a dropped column can be
    converted to a string."""
    # smoke test to see if batches with missing attributes are shown properly
    csv_path = self.test_missing_field_dataset_path
    with open(csv_path, "wt") as handle:
        handle.write("text,label\n1,0")

    text_field = data.Field(use_vocab=False, sequential=False)
    dataset = data.TabularDataset(path=csv_path,
                                  format="csv",
                                  skip_header=True,
                                  fields=[("text", text_field),
                                          ("label", None)])

    batches = data.Iterator(dataset, batch_size=64)
    first_batch = next(iter(batches))
    # Only the conversion is exercised; the resulting text is discarded.
    str(first_batch)
def test_wikitext2(self):
    """Smoke test: WikiText2 can be split, vocabularised and iterated
    through both ``BPTTIterator.splits`` and ``WikiText2.iters``."""
    # smoke test to ensure wikitext2 works properly
    dataset_cls = WikiText2
    text_field = data.Field(lower=True, batch_first=True)

    train, valid, test = dataset_cls.splits(text_field)
    text_field.build_vocab(train)

    # Both calls are only checked for running and yielding three iterators.
    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test), batch_size=3, bptt_len=30)
    train_iter, valid_iter, test_iter = dataset_cls.iters(batch_size=4,
                                                          bptt_len=30)

    # Delete the dataset after we're done to save disk space on CI
    running_on_travis = os.environ.get("TRAVIS") == "true"
    if running_on_travis:
        conditional_remove(
            os.path.join(self.project_root, ".data", "wikitext-2"))
def test_pad_when_nesting_field_is_not_sequential(self):
    """Check ``NestedField.pad`` when the nesting field is non-sequential:
    the expected batch is flat and padded with ``"<pad>"``."""
    flat_inner = data.Field(sequential=False,
                            unk_token="<cunk>",
                            pad_token="<cpad>",
                            init_token="<w>",
                            eos_token="</w>")
    words_field = data.NestedField(flat_inner,
                                   init_token="<s>",
                                   eos_token="</s>")

    batch = [["john", "loves", "mary"], ["mary", "cries"]]
    padded = words_field.pad(batch)

    assert padded == [
        ["<s>", "john", "loves", "mary", "</s>"],
        ["<s>", "mary", "cries", "</s>", "<pad>"],
    ]
def test_build_vocab_from_dataset(self):
    """Build a ``NestedField`` vocab from a ``Dataset`` with ``min_freq=2``
    and check the resulting symbol set."""
    char_field = data.Field(tokenize=list,
                            unk_token="<cunk>",
                            pad_token="<cpad>",
                            init_token="<w>",
                            eos_token="</w>")
    chars_field = data.NestedField(char_field,
                                   init_token="<s>",
                                   eos_token="</s>")

    first = data.Example.fromlist(["aaa bbb c"], [("chars", chars_field)])
    second = data.Example.fromlist(["bbb aaa"], [("chars", chars_field)])
    corpus = data.Dataset([first, second], [("chars", chars_field)])
    chars_field.build_vocab(corpus, min_freq=2)

    # "c" is not among the expected symbols.
    expected_symbols = [
        "a", "b", "<w>", "</w>", "<s>", "</s>", "<cunk>", "<cpad>"
    ]
    assert len(chars_field.vocab) == len(expected_symbols)
    for symbol in expected_symbols:
        assert symbol in chars_field.vocab.stoi
def test_numericalize_basic(self):
    """Numericalize a padded batch with a default sequential field and
    verify it against the field's vocabulary."""
    self.write_test_ppid_dataset(data_format="tsv")

    questions = data.Field(sequential=True)
    columns = [("id", None), ("q1", questions), ("q2", questions),
               ("label", None)]
    dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                  format="tsv",
                                  fields=columns)
    questions.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test default
    numericalized = questions.numericalize(padded_batch)
    verify_numericalized_example(questions, padded_batch, numericalized)
def test_pad_when_no_init_and_eos_tokens(self):
    """Check ``NestedField.pad`` when the nested field has no sentence-level
    init/eos tokens."""
    char_field = data.Field(tokenize=list,
                            unk_token="<cunk>",
                            pad_token="<cpad>",
                            init_token="<w>",
                            eos_token="</w>")
    chars_field = data.NestedField(char_field)

    def word(chars, pad=0):
        # Row for one word wrapped in <w>...</w> plus trailing char padding.
        return ["<w>"] + list(chars) + ["</w>"] + ["<cpad>"] * pad

    batch = [["john", "loves", "mary"], ["mary", "cries"]]
    expected = [
        [word("john", pad=1), word("loves"), word("mary", pad=1)],
        # The shorter sentence ends with an all-pad row.
        [word("mary", pad=1), word("cries"), ["<cpad>"] * 7],
    ]
    assert chars_field.pad(batch) == expected
def test_build_vocab(self):
    """Smoke test: ``NestedField.build_vocab`` accepts raw nested lists
    together with vector-loading keyword arguments."""
    char_field = data.Field(tokenize=list,
                            init_token="<w>",
                            eos_token="</w>")
    nested = data.NestedField(char_field,
                              init_token='<s>',
                              eos_token='</s>',
                              include_lengths=True,
                              pad_first=True)

    # Three sentences, each a list of words split into characters.
    corpus = [
        [['a'], ['s', 'e', 'n', 't', 'e', 'n', 'c', 'e'], ['o', 'f'],
         ['d', 'a', 't', 'a'], ['.']],
        [['y', 'e', 't'], ['a', 'n', 'o', 't', 'h', 'e', 'r']],
        [['o', 'n', 'e'], ['l', 'a', 's', 't'], ['s', 'e', 'n', 't']],
    ]

    # No assertions follow; the call just has to complete.
    nested.build_vocab(corpus,
                       vectors='glove.6B.50d',
                       unk_init=init.xavier_normal,
                       vectors_cache=".vector_cache")
def test_init_minimal(self):
    """Check the attributes of a ``NestedField`` built with only a nesting
    field."""
    inner = data.Field()
    nested = data.NestedField(inner)

    # Identity and basic flags.
    assert isinstance(nested, data.Field)
    assert nested.nesting_field is inner
    assert nested.sequential
    assert nested.use_vocab

    # Tokens and sizing.
    assert nested.init_token is None
    assert nested.eos_token is None
    assert nested.unk_token == inner.unk_token
    assert nested.fix_length is None
    assert nested.dtype is torch.long

    # Processing hooks and tokenization.
    assert nested.preprocessing is None
    assert nested.postprocessing is None
    assert nested.lower == inner.lower
    assert nested.tokenize("a b c") == ["a", "b", "c"]

    # Batching and padding.
    assert not nested.include_lengths
    assert nested.batch_first
    assert nested.pad_token == inner.pad_token
    assert not nested.pad_first
def test_process(self):
    """Compare ``RawField.process`` and ``Field.process`` on tensor-like
    and on arbitrary-object batches."""
    raw = data.RawField()
    tensor_field = data.Field(sequential=True,
                              use_vocab=False,
                              batch_first=True)

    # Test tensor-like batch data which is accepted by both RawField and Field
    rows = [[1, 2, 3], [2, 3, 4]]
    rows_as_tensor = torch.LongTensor(rows)
    raw_result = raw.process(rows)
    field_result = tensor_field.process(rows)
    assert raw_result == rows
    assert field_result.data.equal(rows_as_tensor)

    # Test non-tensor data which is only accepted by RawField
    opaque = [object() for _ in range(5)]
    raw_result = raw.process(opaque)
    assert opaque == raw_result
    with pytest.raises(TypeError):
        tensor_field.process(opaque)
def test_errors(self):
    """Numericalizing bare data on an ``include_lengths=True`` field must
    raise ``ValueError``.

    Such a field expects a tuple of (data, lengths); this test passes only
    the data.
    """
    # Fixture setup is deliberately kept OUTSIDE the assertRaises block: a
    # ValueError raised while writing the dataset, loading it or building
    # the vocabulary must surface as a test error instead of being mistaken
    # for the failure this test is looking for.
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format="tsv",
                                      fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)
    test_example_data = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test that passing a non-tuple (of data and length) to numericalize
    # with Field.include_lengths = True raises an error.
    with self.assertRaises(ValueError):
        question_field.numericalize(test_example_data)
def test_init_full(self):
    """Check that every keyword passed to ``NestedField`` ends up on the
    corresponding attribute."""
    inner = data.Field()
    nested = data.NestedField(
        inner,
        use_vocab=False,
        init_token="<s>",
        eos_token="</s>",
        fix_length=10,
        dtype=torch.float,
        preprocessing=lambda xs: list(reversed(xs)),
        postprocessing=lambda xs: [x.upper() for x in xs],
        tokenize=list,
        pad_first=True,
    )

    # Plain attributes.
    assert not nested.use_vocab
    assert nested.init_token == "<s>"
    assert nested.eos_token == "</s>"
    assert nested.fix_length == 10
    assert nested.dtype is torch.float

    # Callables are checked by calling them on a small sample.
    sample = ["a", "b", "c"]
    assert nested.preprocessing(sample) == ["c", "b", "a"]
    assert nested.postprocessing(["a", "b", "c"]) == ["A", "B", "C"]
    assert nested.tokenize("abc") == ["a", "b", "c"]

    assert nested.pad_first
def test_vocab_size(self):
    """Check that a ``LabelField`` vocabulary holds only the observed
    labels (no ``<unk>`` entry)."""
    # Set up fields
    questions = data.Field(sequential=True)
    labels = data.LabelField()

    # Copied from test_build_vocab with minor changes
    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    columns = [("id", None), ("q1", questions), ("q2", questions),
               ("label", labels)]
    dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                  format="tsv",
                                  fields=columns)

    # Skipping json dataset as we can rely on the original build vocab test
    labels.build_vocab(dataset)
    assert labels.vocab.freqs == Counter({'1': 2, '0': 1})

    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(labels.vocab.stoi) == expected_stoi

    # Turn the stoi dictionary into an itos list
    expected_itos = sorted(expected_stoi, key=expected_stoi.get)
    assert labels.vocab.itos == expected_itos
# Example script: load SNLI, build vocabularies and print one batch from each
# of two ways of creating iterators.
from pytext import data
from pytext import datasets

# One field for the sentence columns, one non-sequential field for the label.
TEXT = data.Field()
LABEL = data.Field(sequential=False)

# Load the three dataset splits.
train, val, test = datasets.SNLI.splits(TEXT, LABEL)

# Print information about the training split.
print(train.fields)
print(len(train))
print(vars(train[0]))

# Build both vocabularies from the training split.
TEXT.build_vocab(train)
LABEL.build_vocab(train)

# First way: build the iterators from the splits loaded above.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

batch = next(iter(train_iter))
print(batch.premise)
print(batch.hypothesis)
print(batch.label)

# Second way: let the dataset class create the iterators in a single call.
train_iter, val_iter, test_iter = datasets.SNLI.iters(batch_size=4)

batch = next(iter(train_iter))
print(batch.premise)
print(batch.hypothesis)
print(batch.label)
# Example script: load IMDB, build vocabularies with GloVe vectors and create
# bucketed iterators.
import torch

from pytext import data
from pytext import datasets
from pytext.vocab import GloVe

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# example also runs on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                   batch_size=3,
                                                   device=DEVICE)

# print batch information
# Example script: load WikiText2, build a vocabulary with GloVe vectors and
# print one BPTT batch.
import torch

from pytext import data
from pytext import datasets
from pytext.vocab import GloVe

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# example also runs on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, batch_first=True)

# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))

# make iterator for splits
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test), batch_size=3, bptt_len=30, device=DEVICE)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)
def test_numericalize(self):
    """Numericalize a pre-padded nested batch with a ``NestedField``, with
    and without ``include_lengths``, and verify each example."""

    def marker(token):
        # Row for a sentence-level <s>/</s> token, padded to width 7.
        return ["<w>", token, "</w>"] + ["<cpad>"] * 4

    def word(chars, pad=0):
        # Row for one word wrapped in <w>...</w> plus trailing char padding.
        return ["<w>"] + list(chars) + ["</w>"] + ["<cpad>"] * pad

    def padded_examples():
        # A fresh copy of the two-sentence padded batch for each scenario.
        return [
            [
                marker("<s>"),
                word("john", pad=1),
                word("loves"),
                word("mary", pad=1),
                marker("</s>"),
            ],
            [
                marker("<s>"),
                word("mary", pad=1),
                word("cries"),
                marker("</s>"),
                ["<cpad>"] * 7,
            ],
        ]

    def fitted_field(**nested_options):
        # Build a NestedField and fit its vocab on a two-example dataset.
        nested = data.NestedField(data.Field(batch_first=True),
                                  **nested_options)
        first = data.Example.fromlist(["john loves mary"],
                                      [("words", nested)])
        second = data.Example.fromlist(["mary cries"], [("words", nested)])
        corpus = data.Dataset([first, second], [("words", nested)])
        nested.build_vocab(corpus)
        return nested

    words_field = fitted_field()
    examples = padded_examples()
    tensor = words_field.numericalize(examples)
    assert tensor.dim() == 3
    assert tensor.size(0) == len(examples)
    for example, row in zip(examples, tensor):
        verify_numericalized_example(words_field,
                                     example,
                                     row,
                                     batch_first=True)

    # test include_lengths
    words_field = fitted_field(include_lengths=True)
    examples = padded_examples()
    tensor, seq_len, word_len = words_field.numericalize(
        (examples, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]))
    assert tensor.dim() == 3
    assert len(seq_len) == 2
    assert len(word_len) == 2
    assert tensor.size(0) == len(examples)
    for example, row in zip(examples, tensor):
        verify_numericalized_example(words_field,
                                     example,
                                     row,
                                     batch_first=True)
def test_build_vocab(self):
    """Build vocabularies from a TSV and a JSON dataset together and check
    frequencies, ``stoi`` and ``itos``, with default and restricted
    settings."""
    # Set up fields
    questions = data.Field(sequential=True)
    labels = data.Field(sequential=False)

    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=[("id", None), ("q1", questions), ("q2", questions),
                ("label", labels)])

    # Write JSON dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="json")
    json_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="json",
        fields={
            "question1": ("q1", questions),
            "question2": ("q2", questions),
            "label": ("label", labels),
        })

    # Expected token counts over both datasets; the same for every build.
    question_freqs = Counter({
        'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, 'of': 4,
        'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, 'し?': 2, 'Where': 2,
        'What': 2, 'is': 2, '2+2': 2, '"&"': 2, '"and"?': 2, 'Which': 2,
        'location': 2, 'Abraham': 2, '2+2=?': 2
    })
    # Expected index -> token table for the unrestricted vocabulary.
    question_itos = [
        '<unk>', '<pad>', 'Lincoln', 'When', 'born?', 'do', 'instead', 'of',
        'use', 'was', 'you', '"&"', '"and"?', '2+2', '2+2=?', 'Abraham',
        'What', 'Where', 'Which', 'is', 'location', 'し?', 'シ'
    ]
    label_freqs = Counter({'1': 4, '0': 2})
    label_itos = ['<unk>', '1', '0']

    def check_vocab(field, freqs, itos):
        # stoi must be the exact inverse of the expected itos list.
        stoi = {token: index for index, token in enumerate(itos)}
        assert field.vocab.freqs == freqs
        assert dict(field.vocab.stoi) == stoi
        assert field.vocab.itos == itos

    # Test build_vocab default; the build-and-check sequence runs twice in
    # a row.
    for _ in range(2):
        questions.build_vocab(tsv_dataset, json_dataset)
        check_vocab(questions, question_freqs, question_itos)

        labels.build_vocab(tsv_dataset, json_dataset)
        check_vocab(labels, label_freqs, label_itos)

    # Test build_vocab with extra kwargs passed to Vocab
    questions.build_vocab(tsv_dataset,
                          json_dataset,
                          max_size=8,
                          min_freq=3)
    # Frequencies are unchanged; only the first ten entries are expected.
    check_vocab(questions, question_freqs, question_itos[:10])
def test_init_when_nesting_field_is_not_sequential(self):
    """A ``NestedField`` over a non-sequential field has ``"<pad>"`` as its
    pad token."""
    flat_inner = data.Field(sequential=False)
    nested = data.NestedField(flat_inner)

    assert nested.pad_token == "<pad>"
def test_pad(self):
    """Check ``Field.pad`` for default padding, ``fix_length``,
    ``truncate_first``, init/eos tokens and non-sequential data."""

    def sentences():
        # A fresh copy of the three-sentence input batch.
        return [["a", "sentence", "of", "data", "."], ["yet", "another"],
                ["one", "last", "sent"]]

    # Default case.
    field = data.Field()
    batch = sentences()
    padded = [
        ["a", "sentence", "of", "data", "."],
        ["yet", "another", "<pad>", "<pad>", "<pad>"],
        ["one", "last", "sent", "<pad>", "<pad>"],
    ]
    lengths = [5, 2, 3]
    assert field.pad(batch) == padded
    field = data.Field(include_lengths=True)
    assert field.pad(batch) == (padded, lengths)

    # Test fix_length properly truncates and pads.
    field = data.Field(fix_length=3)
    batch = sentences()
    padded = [
        ["a", "sentence", "of"],
        ["yet", "another", "<pad>"],
        ["one", "last", "sent"],
    ]
    lengths = [3, 2, 3]
    assert field.pad(batch) == padded
    field = data.Field(fix_length=3, include_lengths=True)
    assert field.pad(batch) == (padded, lengths)
    # With truncate_first the tail of the long sentence is expected.
    field = data.Field(fix_length=3, truncate_first=True)
    padded = [
        ["of", "data", "."],
        ["yet", "another", "<pad>"],
        ["one", "last", "sent"],
    ]
    assert field.pad(batch) == padded

    # Test init_token is properly handled.
    field = data.Field(fix_length=4, init_token="<bos>")
    batch = sentences()
    padded = [
        ["<bos>", "a", "sentence", "of"],
        ["<bos>", "yet", "another", "<pad>"],
        ["<bos>", "one", "last", "sent"],
    ]
    lengths = [4, 3, 4]
    assert field.pad(batch) == padded
    field = data.Field(fix_length=4,
                       init_token="<bos>",
                       include_lengths=True)
    assert field.pad(batch) == (padded, lengths)

    # Test init_token and eos_token are properly handled.
    field = data.Field(init_token="<bos>", eos_token="<eos>")
    batch = sentences()
    padded = [
        ["<bos>", "a", "sentence", "of", "data", ".", "<eos>"],
        ["<bos>", "yet", "another", "<eos>", "<pad>", "<pad>", "<pad>"],
        ["<bos>", "one", "last", "sent", "<eos>", "<pad>", "<pad>"],
    ]
    lengths = [7, 4, 5]
    assert field.pad(batch) == padded
    field = data.Field(init_token="<bos>",
                       eos_token="<eos>",
                       include_lengths=True)
    assert field.pad(batch) == (padded, lengths)

    # Test that non-sequential data is properly handled.
    field = data.Field(init_token="<bos>",
                       eos_token="<eos>",
                       sequential=False)
    label_batch = [["contradiction"], ["neutral"], ["entailment"]]
    assert field.pad(label_batch) == label_batch
    field = data.Field(init_token="<bos>",
                       eos_token="<eos>",
                       sequential=False,
                       include_lengths=True)
    assert field.pad(label_batch) == label_batch