Example #1
def test_multiple_output_for_input_list(expected_values):
    lower_case_name_field = Field("Lowercase_name", keep_raw=True)
    lower_case_name_field.add_pretokenize_hook(str.lower)

    upper_case_name_field = Field("Uppercase_name", keep_raw=True)
    upper_case_name_field.add_pretokenize_hook(str.upper)

    test_field_list = list(field_list)

    test_field_list[0] = (
        test_field_list[0],
        lower_case_name_field,
        upper_case_name_field,
    )

    example_factory = ExampleFactory(test_field_list)
    example = example_factory.from_list(expected_values)

    raw, tokenized = example["Name"]
    assert raw == expected_values[0]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Lowercase_name"]
    assert raw == expected_values[0].lower()
    assert tokenized == expected_values[0].lower().split()

    raw, tokenized = example["Uppercase_name"]
    assert raw == expected_values[0].upper()
    assert tokenized == expected_values[0].upper().split()

    raw, tokenized = example["Score"]
    assert raw == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
Example #2
def test_cache_data_field_from_list(expected_values):
    example_factory = ExampleFactory(field_list)
    example = example_factory.from_list(expected_values)

    for field in field_list:
        field_name = field.name

        assert field_name in example
        assert hasattr(example, field_name)
Example #3
def dataset_with_upper_field(fields):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
Example #4
def test_ignore_values_list(expected_values):
    fields = [None, None, favorite_food_field]
    example_factory = ExampleFactory(fields)
    example = example_factory.from_list(expected_values)

    assert "Favorite_food" in example
    assert hasattr(example, "Favorite_food")

    raw, _ = example["Favorite_food"]
    assert raw == expected_values[2]
Example #5
def test_from_dataset(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
        assert ds_ex.number == arrow_ex.number
        assert ds_ex.tokens == arrow_ex.tokens

    pyarrow_dataset.delete_cache()
Example #6
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
Example #7
def test_create_from_list(expected_values):
    example_factory = ExampleFactory(field_list)
    example = example_factory.from_list(expected_values)

    raw, tokenized = example["Name"]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Score"]
    assert raw == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
Example #8
def test_from_examples(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(ex) for ex in data]
    ad = DiskBackedDataset.from_examples(fields, examples)

    for (raw, tokenized), (num, _) in zip(ad.number, data):
        assert raw == num
        assert tokenized is num

    for (raw, tokenized), (_, tok) in zip(ad.tokens, data):
        assert raw == tok
        assert tokenized == tok.split(" ")

    ad.delete_cache()
Example #9
    def create_dataset():
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        example_factory = ExampleFactory(fields)

        examples = [
            example_factory.from_list(data)
            for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]

        dataset = Dataset(examples, fields)
        return dataset
Example #10
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field("name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView([dataset, other_dataset],
                                       field_overrides={"name": new_field})

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    assert set(concat_vocab.itos) == set(dataset_vocab.itos) | set(other_vocab.itos)
Example #11
def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])

        if i == 1:  # second (and last) batch holds the single remaining example
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
Example #12
def dataset(fields) -> DatasetBase:
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
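
The snippets above rely on module-level fixtures such as field_list, favorite_food_field, and TEST_DATA that are defined elsewhere in the test suite. Below is a minimal sketch of how they might look, inferred from the field names and assertions in the tests; the import paths, constructor arguments, and sample values are assumptions, not taken from the source.

# Hypothetical fixtures matching the field names used in the tests above.
# Import paths are assumptions and may differ between Podium versions.
from podium import Field, Vocab
from podium.datasets import ExampleFactory

name_field = Field("Name", keep_raw=True, numericalizer=Vocab())
score_field = Field("Score", keep_raw=True, tokenizer=None)  # untokenized numeric column (assumption)
favorite_food_field = Field("Favorite_food", keep_raw=True)
field_list = [name_field, score_field, favorite_food_field]

# Rows laid out as (name, score, favorite food); the values are made up.
TEST_DATA = [
    ["Mark Dark", 5, "hawaiian pizza"],
    ["Stephen Maystone", 42, "fried squid"],
]

# Each example maps a field name to a (raw, tokenized) pair, as the assertions show.
example_factory = ExampleFactory(field_list)
example = example_factory.from_list(TEST_DATA[0])
raw, tokenized = example["Name"]  # ("Mark Dark", ["Mark", "Dark"])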