Example #1
def test_multiple_output_for_input_list(expected_values):
    lower_case_name_field = Field("Lowercase_name", keep_raw=True)
    lower_case_name_field.add_pretokenize_hook(str.lower)

    upper_case_name_field = Field("Uppercase_name", keep_raw=True)
    upper_case_name_field.add_pretokenize_hook(str.upper)

    # `field_list` (and the `expected_values` argument) are provided elsewhere in
    # the test module, e.g. by fixtures/parametrization; `field_list` is assumed
    # to contain Fields named "Name", "Score" and "Favorite_food", matching the
    # assertions below.
    test_field_list = list(field_list)

    # Wrapping several Fields in a tuple at one input position makes every field
    # in the tuple consume the same input value, producing multiple outputs.
    test_field_list[0] = (
        test_field_list[0],
        lower_case_name_field,
        upper_case_name_field,
    )

    example_factory = ExampleFactory(test_field_list)
    example = example_factory.from_list(expected_values)

    raw, tokenized = example["Name"]
    assert raw == expected_values[0]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Lowercase_name"]
    assert raw == expected_values[0].lower()
    assert tokenized == expected_values[0].lower().split()

    raw, tokenized = example["Uppercase_name"]
    assert raw == expected_values[0].upper()
    assert tokenized == expected_values[0].upper().split()

    raw, tokenized = example["Score"]
    assert raw == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
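The tuple trick above fans a single input value out to several output fields. As a library-independent illustration (the function, names and data below are invented for this sketch and are not part of the test suite or the Field API), the same fan-out can be written in plain Python:

# Sketch of the fan-out behaviour exercised above: one raw input value is
# routed to several output "fields", each applying its own pretokenize
# transform before tokenizing. All names here are illustrative only.
def fan_out(raw_value, transforms):
    # transforms: mapping of output-field name -> pretokenize callable
    outputs = {}
    for name, transform in transforms.items():
        processed = transform(raw_value)
        outputs[name] = (processed, processed.split())
    return outputs


example = fan_out(
    "Mark Dark",
    {
        "Name": lambda s: s,          # untouched, like the original "Name" field
        "Lowercase_name": str.lower,  # like lower_case_name_field
        "Uppercase_name": str.upper,  # like upper_case_name_field
    },
)
assert example["Uppercase_name"] == ("MARK DARK", ["MARK", "DARK"])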
Example #2
def test_text_clean_up(kwargs, data, expected_output):
    # kwargs, data and expected_output are supplied by pytest parametrization
    # or fixtures (not shown here).
    pytest.importorskip("cleantext")

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(TextCleanUp(**kwargs))
    example = ExampleFactory([field]).from_list([data])

    assert expected_output == example["data"][1]
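The importorskip call suggests TextCleanUp wraps the optional clean-text package. A minimal sketch of calling clean-text's clean() directly; exactly which keyword arguments TextCleanUp(**kwargs) forwards is an assumption here:

# Direct use of the third-party `cleantext` package (import name matches the
# pytest.importorskip above). The specific options are illustrative.
from cleantext import clean

text = "Visit HTTPS://EXAMPLE.COM for more!"
# `lower`, `no_urls` and `replace_with_url` are documented clean-text options;
# TextCleanUp is presumed to forward options like these to clean().
print(clean(text, lower=True, no_urls=True, replace_with_url="<URL>"))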
Example #3
def test_regex_replace():
    data = "This item costs 100$."
    field = Field(name="data", tokenizer=None, keep_raw=True)
    regex_replace = RegexReplace([(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")])
    field.add_pretokenize_hook(regex_replace)
    example = ExampleFactory([field]).from_list([data])

    expected_raw = "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."
    assert expected_raw == example["data"][1]
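For comparison, the two substitutions can be reproduced with the standard-library re module alone; this yields exactly the expected_raw string above and shows that the replacement pairs are applied in order:

# Plain re.sub equivalent of the RegexReplace hook used above.
import re

data = "This item costs 100$."
replacements = [(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")]
for pattern, replacement in replacements:
    data = re.sub(pattern, replacement, data)

assert data == "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."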
Example #4
def test_truecase():
    pytest.importorskip("truecase")

    data = "hey how are you"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(truecase())
    example = ExampleFactory([field]).from_list([data])

    assert "Hey how are you" == example["data"][0]
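The importorskip call points at the third-party truecase package, whose documented entry point is get_true_case. A direct call, presumably what the truecase() hook wraps (that wrapping is an assumption):

# Direct use of the `truecase` package skipped above when not installed.
import truecase

print(truecase.get_true_case("hey how are you"))  # e.g. "Hey how are you"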
Example #5
def test_moses_normalizer():
    pytest.importorskip("sacremoses")

    data = "What's    up!"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    normalizer = MosesNormalizer()
    field.add_pretokenize_hook(normalizer)
    example = ExampleFactory([field]).from_list([data])

    assert "What's up!" == example["data"][1]
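The skip marker here points at sacremoses. A Moses-style normalizer typically wraps MosesPunctNormalizer, which collapses repeated whitespace and normalizes punctuation; whether MosesNormalizer delegates to exactly this call is an assumption:

# Direct use of sacremoses, the package required by the test above.
from sacremoses import MosesPunctNormalizer

normalizer = MosesPunctNormalizer()
print(normalizer.normalize("What's    up!"))  # expected: "What's up!"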
Example #6
def test_field_pretokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    f.add_pretokenize_hook(str.lower)
    f.add_pretokenize_hook(lambda x: x.replace(";", " "))
    f.add_pretokenize_hook(lambda x: x.replace(",", " "))

    # detach (remove) every pretokenization hook registered above
    f.remove_pretokenize_hooks()

    raw_str = "asd;123,BLA"

    _, received = f.preprocess(raw_str)[0]

    expected = (raw_str, [raw_str])

    assert received == expected
Example #7
def test_field_pretokenize_hooks():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    f.add_pretokenize_hook(str.lower)
    f.add_pretokenize_hook(lambda x: x.replace("bla", "blu"))
    f.add_pretokenize_hook(lambda x: x.replace(";", " "))
    f.add_pretokenize_hook(lambda x: x.replace(",", " "))

    raw_str = "asd;123,BLA"

    _, received = f.preprocess(raw_str)[0]
    expected = ("asd 123 blu", ["asd", "123", "blu"])

    assert received == expected
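To see why the expected tuple is ("asd 123 blu", ["asd", "123", "blu"]), the hook pipeline can be traced by hand in plain Python: hooks run in the order they were added, on the raw string, and tokenization ("split") happens afterwards:

# Manual trace of the pretokenize pipeline from the test above (no Field used).
hooks = [
    str.lower,                          # "asd;123,BLA" -> "asd;123,bla"
    lambda x: x.replace("bla", "blu"),  # -> "asd;123,blu"
    lambda x: x.replace(";", " "),      # -> "asd 123,blu"
    lambda x: x.replace(",", " "),      # -> "asd 123 blu"
]

raw = "asd;123,BLA"
for hook in hooks:
    raw = hook(raw)

assert (raw, raw.split()) == ("asd 123 blu", ["asd", "123", "blu"])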