Пример #1
0
def test_field_preprocess_eager():
    """Preprocessing through a field backed by an eager vocab populates the vocab."""
    eager_vocab = MockVocab(eager=True)
    field = Field(name="F", numericalizer=eager_vocab)

    field.preprocess("some text")

    # the eager vocab must have picked up values during preprocessing
    assert len(eager_vocab.values) > 0
Пример #2
0
def test_field_applies_specials():
    """Check which vocab specials end up in the tokenized output of a field.

    BOS/EOS are expected around the tokens, while an empty specials tuple and
    the core specials (PAD, UNK) are expected to leave the tokens unchanged.
    """
    raw_text = "asd 123 BLA"

    def preprocess_with(specials):
        # Build a fresh vocab + field pair for the given specials and return
        # the (raw, tokenized) pair produced for `raw_text`.
        field = Field(
            name="F",
            tokenizer="split",
            numericalizer=Vocab(specials=specials),
            keep_raw=True,
        )
        _, processed = field.preprocess(raw_text)[0]
        return processed

    bos, eos = BOS(), EOS()
    processed = preprocess_with((bos, eos))
    assert processed == ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])

    # Test with empty specials
    processed = preprocess_with(())
    assert processed == ("asd 123 BLA", ["asd", "123", "BLA"])

    # Test core specials are a no-op
    processed = preprocess_with((PAD(), UNK()))
    assert processed == ("asd 123 BLA", ["asd", "123", "BLA"])
Пример #3
0
def test_field_custom_numericalization_no_tokenization():
    """An identity numericalizer returns pre-tokenized input unchanged."""
    field = Field("bla", numericalizer=lambda x: x, tokenizer=None)

    # (pre-tokenized input, expected numericalization)
    cases = (
        ([1, 2, 3], [1, 2, 3]),
        ([3, 2, 1], [3, 2, 1]),
        ([3, 4, 5, 6], [3, 4, 5, 6]),
        ([2, 3, 6], [2, 3, 6]),
    )

    # all instances are preprocessed before the field is finalized
    processed = []
    for tokens, _ in cases:
        _, data = field.preprocess(tokens)[0]
        processed.append(data)

    field.finalize()

    for data, (_, expected) in zip(processed, cases):
        assert np.all(field.numericalize(data) == np.array(expected))
Пример #4
0
def test_field_custom_numericalization_vocab_non_string():
    """A vocab-backed field numericalizes non-string tokens like the vocab itself."""
    vocab = Vocab(specials=())
    field = Field("bla", numericalizer=vocab, tokenizer=None)

    samples = ((1, 2, 3), (3, 2, 1), (3, 4, 5, 6), (2, 3, 6))

    # all instances are preprocessed before the field is finalized
    processed = []
    for sample in samples:
        _, data = field.preprocess(list(sample))[0]
        processed.append(data)

    field.finalize()

    for data, sample in zip(processed, samples):
        expected = vocab.numericalize(list(sample))
        assert np.all(field.numericalize(data) == expected)
Пример #5
0
def test_field_custom_numericalization_no_tokenization_2():
    """A dict lookup (`dict.get`) can serve as the numericalizer of a field."""
    label_indexer = {"one": 1, "two": 2, "three": 3, "four": 4}
    field = Field("bla", numericalizer=label_indexer.get, tokenizer=None)

    # (pre-tokenized labels, expected indices)
    cases = (
        (["one", "two", "three"], [1, 2, 3]),
        (["three", "two", "one"], [3, 2, 1]),
        (["three", "four", "four", "two"], [3, 4, 4, 2]),
        (["two", "three", "one"], [2, 3, 1]),
    )

    # all instances are preprocessed before the field is finalized
    processed = []
    for labels, _ in cases:
        _, data = field.preprocess(labels)[0]
        processed.append(data)

    field.finalize()

    for data, (_, indices) in zip(processed, cases):
        assert np.all(field.numericalize(data) == np.array(indices))
Пример #6
0
def test_field_repeated_hooks():
    """A posttokenize hook registered twice is applied twice, in registration order."""
    lower_calls = 0

    def to_lower_hook(raw, tokenized):
        # keep track of how many times this hook has been invoked
        nonlocal lower_calls
        lower_calls += 1

        return raw, (token.lower() for token in tokenized)

    def replace_tag_hook(raw, tokenized):
        return raw, (token.replace("<tag>", "ABC") for token in tokenized)

    field = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    # Hook chain: TAG -> tag, then <tag> -> ABC, then ABC -> abc
    for hook in (to_lower_hook, replace_tag_hook, to_lower_hook):
        field.add_posttokenize_hook(hook)

    _, processed = field.preprocess("BLA <TAG> bla")[0]

    assert processed == ("BLA <TAG> bla", ["bla", "abc", "bla"])

    # the hook that was added twice must also have been called twice
    assert lower_calls == 2
Пример #7
0
def test_field_posttokenize_hooks_detach():
    """Removing the posttokenize hooks leaves raw and tokenized data unmodified."""
    field = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        stripped_tokens = (token.replace("<tag>", "") for token in tokenized)
        return raw.replace("<tag>", ""), stripped_tokens

    def to_upper_hook(raw, tokenized):
        return raw.upper(), (token.upper() for token in tokenized)

    for hook in (remove_tags_hook, to_upper_hook):
        field.add_posttokenize_hook(hook)

    # detach every hook again before preprocessing
    field.remove_posttokenize_hooks()

    text = "asd 123<tag> B<tag>LA"
    _, processed = field.preprocess(text)[0]

    assert processed == ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])
Пример #8
0
def test_hook_returning_iterable():
    """Hooks may return lazy iterables; the field still yields a list or tuple."""
    data = "1,2,3,4"
    expected_tokens = [3, 5, 7, 9]

    def split_to_ints(raw):
        return list(map(int, raw.split(",")))

    def multiply_by_two_hook(raw, tokens):
        return raw, map(lambda value: value * 2, tokens)

    def add_one_hook(raw, tokens):
        # the previous hook's lazy output must reach this hook unmaterialized
        assert not isinstance(tokens, (list, tuple))
        return raw, map(lambda value: value + 1, tokens)

    field = Field(
        "Iterator_hook_test_field",
        tokenizer=split_to_ints,
        numericalizer=id,
        keep_raw=True,
    )
    field.add_posttokenize_hook(multiply_by_two_hook)
    field.add_posttokenize_hook(add_one_hook)

    _, (raw, tokens) = field.preprocess(data)[0]

    assert raw == data
    assert isinstance(tokens, (list, tuple))
    assert tokens == expected_tokens
Пример #9
0
def test_missing_symbol_index_vocab():
    """Missing data on a vocab-backed field maps to None, with default value -1."""
    field = Field(
        name="test_field",
        tokenizer="split",
        keep_raw=False,
        numericalizer=Vocab(),
        allow_missing_data=True,
    )

    field.preprocess("a b c d")

    # preprocessing a missing value must yield exactly one (name, data) pair
    [(_, missing)] = field.preprocess(None)
    assert missing == (None, None)

    field.finalize()

    assert field.numericalize((None, None)) is None
    assert field.get_default_value() == -1
Пример #10
0
def test_field_get_tokenizer_spacy_ok():
    """A field created with tokenizer="spacy" tokenizes through the spacy module.

    ``spacy`` is replaced in ``sys.modules`` by ``MockSpacy`` while the test
    runs. The patch is applied as a context manager so it is reverted even
    when field construction or the assertion fails; a bare ``start()`` /
    ``stop()`` pair would leave the mock installed for later tests.
    """
    with patch.dict("sys.modules", spacy=MockSpacy()):
        f = Field(name="F", numericalizer=MockVocab(), tokenizer="spacy")
        _, data = f.preprocess("bla blu")[0]
        assert data == (None, ["bla", "blu"])
Пример #11
0
def test_missing_values_default_sequential():
    """A tokenizing field that allows missing data passes None through as None."""
    field = Field(
        name="bla",
        keep_raw=False,
        tokenizer="split",
        numericalizer=hash,
        allow_missing_data=True,
    )

    _, missing = field.preprocess(None)[0]
    _, present = field.preprocess("data_string")[0]

    assert missing == (None, None)
    assert present == (None, ["data_string"])

    field.finalize()

    assert field.numericalize(missing) is None

    expected = np.array([hash("data_string")])
    assert np.all(field.numericalize(present) == expected)
Пример #12
0
def test_field_get_tokenizer_callable():
    """A user-supplied callable can be used as the field's tokenizer."""

    def my_tokenizer(string):
        # split into the first character and the remainder
        head, tail = string[0], string[1:]
        return [head, tail]

    field = Field(name="F", numericalizer=MockVocab(), tokenizer=my_tokenizer)

    _, processed = field.preprocess("asd dsa")[0]

    assert processed == (None, ["a", "sd dsa"])
Пример #13
0
def test_missing_values_custom_numericalize():
    """Missing data stays None with a custom numericalizer and no tokenizer."""
    field = Field(
        name="test_field",
        keep_raw=True,
        tokenizer=None,
        numericalizer=int,
        allow_missing_data=True,
    )

    _, missing = field.preprocess(None)[0]
    _, present = field.preprocess("404")[0]

    assert missing == (None, None)
    assert present == ("404", "404")

    field.finalize()

    assert field.numericalize(missing) is None

    expected = np.array([404])
    assert np.all(field.numericalize(present) == expected)
Пример #14
0
def test_field_preprocess_raw_sequential(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value
):
    """Check the raw and tokenized output for a keep_raw / tokenizer combination.

    The arguments are supplied by the caller (presumably a parametrize
    decorator that is not part of this snippet).
    """
    if tokenize:
        tokenizer = "split"
    else:
        tokenizer = None

    field = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    # exactly one (name, (raw, tokenized)) pair is expected
    [(_, processed)] = field.preprocess(value)
    raw_value, tokenized_value = processed

    assert raw_value == expected_raw_value
    assert tokenized_value == expected_tokenized_value
Пример #15
0
def test_field_pretokenize_hooks():
    """Pretokenize hooks rewrite the raw string before it is tokenized."""

    def replacing(old, new):
        # build a hook that replaces every `old` with `new` in the raw string
        return lambda text: text.replace(old, new)

    field = Field(name="F", tokenizer="split", keep_raw=True)

    hooks = (
        str.lower,
        replacing("bla", "blu"),
        replacing(";", " "),
        replacing(",", " "),
    )
    for hook in hooks:
        field.add_pretokenize_hook(hook)

    _, processed = field.preprocess("asd;123,BLA")[0]

    assert processed == ("asd 123 blu", ["asd", "123", "blu"])
Пример #16
0
def test_field_vocab_no_tokenization():
    """Pre-tokenized input is numericalized the same way by the field and its vocab."""
    vocab = Vocab(eager=True)

    # (pre-tokenized input, expected indices)
    cases = (
        (["word", "words", "uttering"], [2, 3, 4]),
        (["word", "words"], [2, 3]),
        (["word"], [2]),
        (["word", "uttering"], [2, 4]),
    )

    field = Field("test_field", tokenizer=None, numericalizer=vocab)

    # all instances are preprocessed before the field is finalized
    processed = []
    for tokens, _ in cases:
        _, data = field.preprocess(tokens)[0]
        processed.append(data)

    field.finalize()

    for data, (_, indices) in zip(processed, cases):
        expected = np.array(indices)
        _, tokenized = data
        assert np.all(vocab.numericalize(tokenized) == expected)
        assert np.all(field.numericalize(data) == expected)
Пример #17
0
def test_field_pretokenize_hooks_detach():
    """After removing the pretokenize hooks the raw string is left untouched."""
    field = Field(name="F", tokenizer="split", keep_raw=True)

    hooks = (
        str.lower,
        lambda text: text.replace(";", " "),
        lambda text: text.replace(",", " "),
    )
    for hook in hooks:
        field.add_pretokenize_hook(hook)

    # detach every hook again before preprocessing
    field.remove_pretokenize_hooks()

    raw_str = "asd;123,BLA"
    _, processed = field.preprocess(raw_str)[0]

    # the input contains no whitespace, so "split" yields one token
    assert processed == (raw_str, [raw_str])
Пример #18
0
def test_field_pickle_spacy_tokenizer(tmpdir):
    """A field using the "spacy" tokenizer keeps working after a dill round-trip.

    ``spacy`` is replaced in ``sys.modules`` by ``MockSpacy`` while the test
    runs. The patch is applied as a context manager so it is reverted even
    when an assertion or the (de)serialization fails; a bare ``start()`` /
    ``stop()`` pair would leave the mock installed for later tests.

    Args:
        tmpdir: directory the pickled field is written to (presumably
            pytest's ``tmpdir`` fixture).
    """
    with patch.dict("sys.modules", spacy=MockSpacy()):
        fld = Field(name="F", numericalizer=None, tokenizer="spacy")
        _, data = fld.preprocess("bla blu")[0]
        assert data == (None, ["bla", "blu"])

        field_file = os.path.join(tmpdir, "field.pkl")

        with open(field_file, "wb") as fdata:
            dill.dump(fld, fdata)

        with open(field_file, "rb") as fdata:
            loaded_fld = dill.load(fdata)

        # the tokenizer argument must survive serialization by name
        assert loaded_fld._tokenizer_arg_string == "spacy"

        _, data = loaded_fld.preprocess("bla blu")[0]
        assert data == (None, ["bla", "blu"])
Пример #19
0
def test_field_posttokenize_hooks():
    """Posttokenize hooks may modify both the raw string and the tokens."""
    field = Field(name="F", tokenizer="split", keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        stripped_tokens = (token.replace("<tag>", "") for token in tokenized)
        return raw.replace("<tag>", ""), stripped_tokens

    def to_upper_hook(raw, tokenized):
        return raw.upper(), (token.upper() for token in tokenized)

    # hooks are registered in the order: strip tags, then upper-case
    for hook in (remove_tags_hook, to_upper_hook):
        field.add_posttokenize_hook(hook)

    _, processed = field.preprocess("asd 123<tag> B<tag>LA")[0]

    assert processed == ("ASD 123 BLA", ["ASD", "123", "BLA"])
Пример #20
0
def test_posttokenize_hooks_in_field_no_tokenization_single_execution(mocker):
    """A posttokenize hook on a field without a tokenizer is called exactly once.

    Args:
        mocker: object providing ``spy`` (presumably the pytest-mock fixture).
    """
    field = Field(name="F", tokenizer=None)

    def hk(data, tokenized):
        # replace every token with a label describing its casing
        labels = []
        for token in tokenized:
            labels.append("lowercase" if token.islower() else "uppercase")

        return data, labels

    # the spy wraps the hook's __call__ so invocations can be counted
    spied_hook = mocker.spy(hk, "__call__")
    field.add_posttokenize_hook(spied_hook)

    pretokenized = ["Upper", "lower"]
    _, processed = field.preprocess(pretokenized)[0]

    assert processed == (None, ["uppercase", "lowercase"])
    spied_hook.assert_called_once()
Пример #21
0
def test_field_pickle_tokenized(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value, tmpdir
):
    """A field preprocesses identically before and after a dill round-trip.

    The arguments are supplied by the caller (presumably a parametrize
    decorator that is not part of this snippet); ``tmpdir`` is the directory
    the pickled field is written to.
    """
    if tokenize:
        tokenizer = "split"
    else:
        tokenizer = None

    original = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    # exactly one (name, (raw, tokenized)) pair is expected
    [(_, processed)] = original.preprocess(value)
    raw_before, tokenized_before = processed

    assert raw_before == expected_raw_value
    assert tokenized_before == expected_tokenized_value

    pickle_path = os.path.join(tmpdir, "field.pkl")

    with open(pickle_path, "wb") as sink:
        dill.dump(original, sink)

    with open(pickle_path, "rb") as source:
        restored = dill.load(source)
        [(_, processed)] = restored.preprocess(value)
        raw_after, tokenized_after = processed

        assert raw_after == expected_raw_value
        assert tokenized_after == expected_tokenized_value
        assert restored.name == "F"
        assert restored._keep_raw == store_raw
Пример #22
0
def test_field_get_tokenizer_default():
    """Without an explicit tokenizer argument the field splits "asd dsa" in two."""
    field = Field(name="F", numericalizer=MockVocab())

    processed = field.preprocess("asd dsa")
    _, data = processed[0]

    # no raw value is kept, only the tokens
    assert data == (None, ["asd", "dsa"])
Пример #23
0
def test_missing_values_fail():
    """Preprocessing None raises ValueError when allow_missing_data is not passed."""
    field = Field(
        name="bla",
        keep_raw=True,
        tokenizer=None,
        numericalizer=hash,
    )

    with pytest.raises(ValueError):
        field.preprocess(None)