def test_multiple_output_for_input_list(expected_values):
    """Mapping one input column to several output fields keeps raw + tokenized
    values for each derived field (original, lowercased, uppercased)."""
    lowercased = Field("Lowercase_name", keep_raw=True)
    lowercased.add_pretokenize_hook(str.lower)
    uppercased = Field("Uppercase_name", keep_raw=True)
    uppercased.add_pretokenize_hook(str.upper)

    # Replace the first field with a tuple: one column feeds three fields.
    fields = list(field_list)
    fields[0] = (fields[0], lowercased, uppercased)

    example = ExampleFactory(fields).from_list(expected_values)
    name = expected_values[0]

    raw, tokenized = example["Name"]
    assert raw == name
    assert tokenized == name.split()

    raw, tokenized = example["Lowercase_name"]
    assert raw == name.lower()
    assert tokenized == name.lower().split()

    raw, tokenized = example["Uppercase_name"]
    assert raw == name.upper()
    assert tokenized == name.upper().split()

    raw, _ = example["Score"]
    assert raw == expected_values[1]

    raw, _ = example["Favorite_food"]
    assert raw == expected_values[2]
def test_cache_data_field_from_list(expected_values):
    """Every field's value ends up on the example, reachable both by
    membership test and as an attribute named after the field."""
    example = ExampleFactory(field_list).from_list(expected_values)
    for field in field_list:
        assert field.name in example
        assert hasattr(example, field.name)
def dataset_with_upper_field(fields):
    """Fixture: a finalized Dataset whose second column is uppercased via a
    pretokenize hook and numericalized with a fresh Vocab."""
    upper_name = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    ds_fields = [fields[0], upper_name]
    factory = ExampleFactory(ds_fields)
    ds = Dataset([factory.from_list(row) for row in TEST_DATA], ds_fields)
    ds.finalize_fields()
    return ds
def test_ignore_values_list(expected_values):
    """Columns mapped to None are ignored; the remaining column still parses."""
    factory = ExampleFactory([None, None, favorite_food_field])
    example = factory.from_list(expected_values)

    assert "Favorite_food" in example
    assert hasattr(example, "Favorite_food")
    raw, _ = example["Favorite_food"]
    assert raw == expected_values[2]
def test_from_dataset(data, fields):
    """Round-trip check: converting a Dataset to a DiskBackedDataset preserves
    every example's ``number`` and ``tokens`` values.

    Fix: ``delete_cache()`` now runs in a ``finally`` block so the on-disk
    arrow cache is removed even when an assertion fails mid-loop (the original
    leaked the cache directory on test failure).
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)
    try:
        for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
            assert ds_ex.number == arrow_ex.number
            assert ds_ex.tokens == arrow_ex.tokens
    finally:
        pyarrow_dataset.delete_cache()
def test_concat_view_fail_no_field_intersection(dataset):
    """Concatenating datasets that share no fields must raise ValueError."""
    upper_name = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [None, upper_name]
    factory = ExampleFactory(other_fields)
    other_dataset = Dataset(
        [factory.from_list(row) for row in TEST_DATA], other_fields
    )
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
def test_create_from_list(expected_values):
    """``from_list`` parses each positional value with its matching field."""
    example = ExampleFactory(field_list).from_list(expected_values)

    _, tokenized = example["Name"]
    assert tokenized == expected_values[0].split()

    raw, _ = example["Score"]
    assert raw == expected_values[1]

    raw, _ = example["Favorite_food"]
    assert raw == expected_values[2]
def test_from_examples(data, fields):
    """A DiskBackedDataset built via ``from_examples`` exposes the original
    raw/tokenized values for both the ``number`` and ``tokens`` columns.

    Fix: ``delete_cache()`` now runs in a ``finally`` block so the on-disk
    cache is removed even when an assertion fails (the original leaked it).
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(ex) for ex in data]
    ad = DiskBackedDataset.from_examples(fields, examples)
    try:
        for (raw, tokenized), (num, _) in zip(ad.number, data):
            assert raw == num
            assert tokenized is num
        for (raw, tokenized), (_, tok) in zip(ad.tokens, data):
            assert raw == tok
            assert tokenized == tok.split(" ")
    finally:
        ad.delete_cache()
def create_dataset():
    """Build a small Dataset with a whitespace-tokenized ``text`` field and a
    character-tokenized ``source`` field from the tabular test constants."""
    fields = (
        Field("text", numericalizer=Vocab()),
        Field("source", numericalizer=Vocab(), tokenizer=list),
    )
    factory = ExampleFactory(fields)
    examples = [
        factory.from_list(pair) for pair in zip(TABULAR_TEXT, TABULAR_SOURCES)
    ]
    return Dataset(examples, fields)
def test_concat_view_override_fields_eager(dataset, fields):
    """An eager override field is finalized by the concat view, and its vocab
    is the union of the two source datasets' ``name`` vocabularies."""
    upper_name = Field(
        "name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [fields[0], upper_name]
    factory = ExampleFactory(other_fields)
    other_dataset = Dataset(
        [factory.from_list(row) for row in TEST_DATA], other_fields
    )
    other_dataset.finalize_fields()

    override_field = Field(
        "override_name_field", numericalizer=Vocab(eager=True)
    )
    dataset_concat = DatasetConcatView(
        [dataset, other_dataset], field_overrides={"name": override_field}
    )

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    expected_itos = set(dataset.field_dict["name"].vocab.itos) | set(
        other_dataset.field_dict["name"].vocab.itos
    )
    assert set(concat_vocab.itos) == expected_itos
def test_iterator_batch_as_list():
    """With ``disable_batch_matrix=True`` each batch is a plain list of
    per-example arrays instead of a padded matrix.

    Fix: with 3 examples and ``batch_size=2`` the iterator yields exactly two
    batches (i == 0 and i == 1). The original checked ``if i == 2``, a branch
    that never executed, so the final one-example batch was never verified.
    """
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])
        if i == 1:
            # Last (partial) batch: only one example remains.
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
def dataset(fields) -> DatasetBase:
    """Fixture: a finalized Dataset built from the TEST_DATA rows."""
    factory = ExampleFactory(fields)
    ds = Dataset([factory.from_list(row) for row in TEST_DATA], fields)
    ds.finalize_fields()
    return ds