Example #1
    def get_default_fields():
        """
        Returns the default Cornell Movie Dialogs fields: statement and reply.
        Both fields share the same vocabulary.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field name to field.
        """
        vocabulary = Vocab()
        statement = Field(
            name="statement",
            numericalizer=vocabulary,
            tokenizer="split",
            keep_raw=False,
            is_target=False,
        )
        reply = Field(
            name="reply",
            numericalizer=vocabulary,
            tokenizer="split",
            keep_raw=False,
            is_target=True,
        )
        fields = {"statement": statement, "reply": reply}
        return fields
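A minimal usage sketch, following the ExampleFactory/Dataset pattern from Example #5 below; the dialog pair here is made-up placeholder data:

fields = get_default_fields()
example_factory = ExampleFactory(fields)

# Hypothetical dialog pair; the real loader reads Cornell Movie Dialogs data.
examples = [example_factory.from_dict({"statement": "how are you", "reply": "fine, thanks"})]

ds = Dataset(examples, fields)
ds.finalize_fields()  # builds the single vocabulary shared by both fields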
Example #2
def test_field_pad_to_length_custom_pad(row, length, expected_row):
    f = Field(name="F", numericalizer=None)

    row_arr = np.array(row)
    received_row = f._pad_to_length(row_arr, length, custom_pad_symbol=CUSTOM_PAD)

    assert received_row.tolist() == expected_row
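The (row, length, expected_row) arguments come from a pytest parametrize decorator that is not shown. One plausible parametrization, assuming padding is appended on the right by default (values are illustrative):

@pytest.mark.parametrize(
    "row, length, expected_row",
    [
        ([1, 2, 3], 5, [1, 2, 3, CUSTOM_PAD, CUSTOM_PAD]),  # padded on the right
        ([1, 2, 3, 4, 5], 5, [1, 2, 3, 4, 5]),              # already at target length
    ],
)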
Example #3
def tabular_dataset_fields(fixed_length=None,
                           disable_numericalize_caching=False,
                           include_lengths=False):
    text = Field(
        "text",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=False,
        include_lengths=include_lengths,
        disable_numericalize_caching=disable_numericalize_caching,
    )
    text_missing = Field(
        "text_with_missing_data",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=True,
    )
    rating = LabelField("rating", numericalizer=float)

    fields = {
        "text": text,
        "text_with_missing_data": text_missing,
        "rating": rating
    }

    return fields
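This helper works as a test fixture factory. A short usage sketch, under the assumption that fixed_length pads or truncates every example to the given size and include_lengths makes batches carry (data, lengths) pairs:

fields = tabular_dataset_fields(fixed_length=20, include_lengths=True)
text_field = fields["text"]  # eager Vocab: updated as soon as data is preprocessed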
Example #4
    def get_default_fields():
        """
        Returns the three main SNLI fields: gold_label, sentence1, and
        sentence2.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field names to respective Fields.
        """

        gold_label = LabelField(
            name=SNLISimple.GOLD_LABEL_FIELD_NAME, numericalizer=Vocab(specials=())
        )
        sentence_vocab = Vocab()
        sentence1 = Field(
            name=SNLISimple.SENTENCE1_FIELD_NAME,
            numericalizer=sentence_vocab,
            tokenizer="split",
            keep_raw=False,
        )
        sentence2 = Field(
            name=SNLISimple.SENTENCE2_FIELD_NAME,
            numericalizer=sentence_vocab,
            tokenizer="split",
            keep_raw=False,
        )
        fields = {
            SNLISimple.GOLD_LABEL_FIELD_NAME: gold_label,
            SNLISimple.SENTENCE1_FIELD_NAME: sentence1,
            SNLISimple.SENTENCE2_FIELD_NAME: sentence2,
        }
        return fields
Example #5
def get_dataset():
    data = [
        {
            "Name": "Mark Dark",
            "Score": 5
        },
        {
            "Name": "Stephen Smith",
            "Score": 10
        },
        {
            "Name": "Ann Mann",
            "Score": 15
        },
    ]

    name_field = Field("Name",
                       numericalizer=Vocab(),
                       keep_raw=True,
                       tokenizer="split")

    score_field = Field("Score",
                        numericalizer=int,
                        keep_raw=True,
                        tokenizer=None,
                        is_target=True)

    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
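A brief follow-up sketch; attribute-style access to field values mirrors the from_pandas examples below (#24-#27), where each stored value is a (raw, tokenized) pair:

ds = get_dataset()
for raw, tokenized in ds.Name:
    print(raw, tokenized)  # e.g. "Mark Dark", ["Mark", "Dark"]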
Example #6
def test_field_preprocess_eager():
    vocab = MockVocab(eager=True)
    f = Field(name="F", numericalizer=vocab)
    f.preprocess("some text")

    # vocab was updated
    assert len(vocab.values) > 0
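For contrast, a sketch of the non-eager path, assuming the convention (visible in Example #5) that non-eager numericalizers are only filled when finalize_fields() runs on a dataset, not during preprocess:

vocab = MockVocab(eager=False)
f = Field(name="F", numericalizer=vocab)
f.preprocess("some text")

# vocab not updated yet; it would be filled by dataset.finalize_fields()
assert len(vocab.values) == 0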
Example #7
def test_text_clean_up(kwargs, data, expected_output):
    pytest.importorskip("cleantext")

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(TextCleanUp(**kwargs))
    example = ExampleFactory([field]).from_list([data])

    assert expected_output == example["data"][1]
Example #8
def test_field_is_target():
    f1 = Field(name="text", is_target=False)
    f2 = Field(name="label", is_target=True)
    f3 = Field(name="bla")

    assert not f1.is_target
    assert f2.is_target
    assert not f3.is_target
Example #9
def fields():
    num_field = Field("number", tokenizer=None)
    name_field = Field("name", numericalizer=Vocab(), is_target=True)
    name_chars_field = Field("name_chars",
                             tokenizer=list,
                             numericalizer=Vocab(),
                             is_target=True)
    return [num_field, (name_field, name_chars_field)]
Example #10
def test_remove_stopwords():
    data = "I'll tell you a joke"
    field = Field(name="data")
    field.add_posttokenize_hook(remove_stopwords("en"))
    example = ExampleFactory([field]).from_list([data])

    assert "you" not in example["data"][1]
    assert "a" not in example["data"][1]
Example #11
    def get_default_fields():
        """
        Returns a dict of the default CoNLL-U fields.

        Returns
        -------
        fields : Dict[str, Field]
            Dict containing all default CoNLL-U fields.
        """

        id = Field(name="id", tokenizer=None, numericalizer=None)

        form = Field(name="form",
                     tokenizer=None,
                     numericalizer=Vocab(specials=()))

        lemma = Field(name="lemma",
                      tokenizer=None,
                      numericalizer=Vocab(specials=()))

        upos = Field(
            name="upos",
            tokenizer=None,
            numericalizer=Vocab(specials=()),
        )

        xpos = Field(
            name="xpos",
            tokenizer=None,
            numericalizer=Vocab(specials=()),
        )

        feats = Field(name="feats", tokenizer=None, numericalizer=None)

        head = Field(
            name="head",
            tokenizer=None,
            numericalizer=int,
        )

        deprel = Field(name="deprel", tokenizer=None)

        deps = Field(name="deps", tokenizer=None, numericalizer=None)

        misc = Field(name="misc", tokenizer=None, numericalizer=None)

        return {
            "id": id,
            "form": form,
            "lemma": lemma,
            "upos": upos,
            "xpos": xpos,
            "feats": feats,
            "head": head,
            "deprel": deprel,
            "deps": deps,
            "misc": misc,
        }
Example #12
def test_field_pad_to_length(row, length, expected_row, pad_left, truncate_left):
    vocab = MockVocab()
    f = Field(name="F", numericalizer=vocab)

    received_row = f._pad_to_length(
        np.array(row), length, pad_left=pad_left, truncate_left=truncate_left
    )

    assert received_row.tolist() == expected_row
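As in Example #2, the parameters come from an omitted parametrize decorator. A plausible set, assuming pad_left prepends padding and truncate_left drops tokens from the start; PAD stands in for whatever padding index the mock vocab reports (a hypothetical constant here):

@pytest.mark.parametrize(
    "row, length, expected_row, pad_left, truncate_left",
    [
        ([1, 2, 3], 5, [PAD, PAD, 1, 2, 3], True, False),    # pad on the left
        ([1, 2, 3, 4, 5, 6], 4, [3, 4, 5, 6], False, True),  # truncate from the left
    ],
)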
Example #13
def test_field_get_tokenizer_spacy_ok():
    mp = patch.dict("sys.modules", spacy=MockSpacy())
    mp.start()

    f = Field(name="F", numericalizer=MockVocab(), tokenizer="spacy")
    _, data = f.preprocess("bla blu")[0]
    assert data == (None, ["bla", "blu"])

    mp.stop()
Example #14
def test_truecase():
    pytest.importorskip("truecase")

    data = "hey how are you"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(truecase())
    example = ExampleFactory([field]).from_list([data])

    assert "Hey how are you" == example["data"][0]
Example #15
def test_regex_replace():
    data = "This item costs 100$."
    field = Field(name="data", tokenizer=None, keep_raw=True)
    regex_replace = RegexReplace([(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")])
    field.add_pretokenize_hook(regex_replace)
    example = ExampleFactory([field]).from_list([data])

    expected_raw = "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."
    assert expected_raw == example["data"][1]
Example #16
def test_keyword_extractor(alg, alg_pkg_name):
    pytest.importorskip(alg_pkg_name)

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    text_ = TEXT.lower()
    assert all(kw in text_ for kws in example["data"][1] for kw in kws.lower().split())
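The (alg, alg_pkg_name) pairs also come from an omitted parametrize decorator. A plausible shape, assuming RAKE and YAKE backends; both the algorithm identifiers and the package names here are assumptions:

@pytest.mark.parametrize(
    "alg, alg_pkg_name",
    [
        ("rake", "rake_nltk"),  # assumed identifier and package name
        ("yake", "yake"),
    ],
)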
Example #17
def test_moses_normalizer():
    pytest.importorskip("sacremoses")

    data = "What's    up!"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    normalizer = MosesNormalizer()
    field.add_pretokenize_hook(normalizer)
    example = ExampleFactory([field]).from_list([data])

    assert "What's up!" == example["data"][1]
Example #18
def test_field_preprocess_raw_sequential(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value
):
    tokenizer = "split" if tokenize else None
    f = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    ((_, (received_raw_value, received_tokenized_value)),) = f.preprocess(value)

    assert received_raw_value == expected_raw_value
    assert received_tokenized_value == expected_tokenized_value
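A plausible parametrization, grounded in behaviour shown elsewhere in these examples: with keep_raw=False the raw slot is None (Example #13), and with tokenizer=None the tokenized slot is the raw string itself (Examples #14 and #17):

@pytest.mark.parametrize(
    "value, store_raw, tokenize, expected_raw_value, expected_tokenized_value",
    [
        ("some text", True, True, "some text", ["some", "text"]),
        ("some text", False, True, None, ["some", "text"]),
        ("some text", True, False, "some text", "some text"),
    ],
)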
Example #19
def test_field_get_tokenizer_callable():
    vocab = MockVocab()

    def my_tokenizer(string):
        return [string[0], string[1:]]

    f = Field(name="F", numericalizer=vocab, tokenizer=my_tokenizer)

    _, data = f.preprocess("asd dsa")[0]
    assert data == (None, ["a", "sd dsa"])
Example #20
def test_field_applies_specials():
    bos, eos = BOS(), EOS()
    vocab = Vocab(specials=(bos, eos))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])

    assert received == expected

    # Test with empty specials
    vocab = Vocab(specials=())
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected

    # Test core specials are a no-op
    vocab = Vocab(specials=(PAD(), UNK()))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected
Example #21
def test_field_repeated_hooks():
    def replace_tag_hook(raw, tokenized):
        replaced_tags = map(lambda s: s.replace("<tag>", "ABC"), tokenized)

        return raw, replaced_tags

    def to_lower_hook(raw, tokenized):
        # keep track of the function call count
        to_lower_hook.call_count += 1

        tokenized = map(str.lower, tokenized)

        return raw, tokenized

    to_lower_hook.call_count = 0

    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    # TAG -> tag
    f.add_posttokenize_hook(to_lower_hook)

    # <tag> -> ABC
    f.add_posttokenize_hook(replace_tag_hook)

    # ABC -> abc
    f.add_posttokenize_hook(to_lower_hook)

    _, received = f.preprocess("BLA <TAG> bla")[0]

    expected = ("BLA <TAG> bla", ["bla", "abc", "bla"])

    assert received == expected

    # check that the hook that was added twice was also called twice
    assert to_lower_hook.call_count == 2
Example #22
def test_field_posttokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)

        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)

        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    # detaching the hooks
    f.remove_posttokenize_hooks()

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])

    assert received == expected
Example #23
def test_missing_symbol_index_custom_numericalize():
    fld = Field(
        name="test_field",
        keep_raw=True,
        tokenizer=None,
        numericalizer=int,
        allow_missing_data=True,
    )

    fld.finalize()
    assert fld.get_default_value() == -1
Example #24
def test_from_pandas_field_list(data):
    df = pd.DataFrame(data)
    fields = [
        Field("text", keep_raw=True, tokenizer="split"),
        Field("number", tokenizer=None),
    ]

    ds = Dataset.from_pandas(df, fields)

    for original, (raw, _) in zip(data, ds.text):
        assert original[0] == raw
Example #25
def test_from_pandas_field_dict(data):
    df = pd.DataFrame(data, columns=["text", "number"])
    fields = {
        "text": Field("text_field", keep_raw=True, tokenizer="split"),
        "number": Field("number_field", tokenizer=None),
    }

    ds = Dataset.from_pandas(df, fields)
    assert set(ds.field_dict) == set(["text_field", "number_field"])

    for original, (raw, _) in zip(data, ds.text_field):
        assert original[0] == raw
Example #26
def test_from_pandas_index(data):
    df = pd.DataFrame([[x[0]] for x in data], index=[x[1] for x in data])
    fields = [Field("text", keep_raw=True, tokenizer="split")]

    ds = Dataset.from_pandas(df,
                             fields,
                             index_field=Field("numbers",
                                               tokenizer=None,
                                               keep_raw=True))

    for original, (raw, _) in zip(data, ds.numbers):
        assert original[1] == raw
Example #27
def test_from_pandas_field_dict(data):
    import pandas as pd

    df = pd.DataFrame(data, columns=["number", "text"])
    fields = {
        "number": Field("number", tokenizer=None),
        "text": Field("text", keep_raw=True, tokenizer="split"),
    }

    ds = DiskBackedDataset.from_pandas(df, fields)

    for original, (raw, _) in zip(data, ds.text):
        assert original[1] == raw
Example #28
def fields():
    number_field = Field(
        "number", keep_raw=True, numericalizer=int, tokenizer=None, is_target=True
    )

    token_field = Field(
        "tokens",
        keep_raw=True,
        numericalizer=Vocab(keep_freqs=True),
        tokenizer=partial(str.split, sep=" "),
    )

    return [number_field, token_field]
Example #29
def test_field_pad_custom_numericalize():
    custom_padding_token = -999
    f = Field(
        "test_field",
        numericalizer=int,
        padding_token=custom_padding_token,
        tokenizer="split",
    )
    mock_numericalization = np.array([1, 2, 3, 4])
    expected_numericalization = np.array([1, 2, 3, 4] + [custom_padding_token] * 6)

    padded = f._pad_to_length(mock_numericalization, 10, pad_left=False)
    assert np.all(padded == expected_numericalization)
Example #30
def test_from_pandas_index(data):
    import pandas as pd

    df = pd.DataFrame([[x[1]] for x in data], index=[x[0] for x in data])
    fields = [Field("text_field", keep_raw=True, tokenizer="split")]

    ds = DiskBackedDataset.from_pandas(
        df, fields, index_field=Field("number_field", tokenizer=None, keep_raw=True)
    )

    assert set(ds.field_dict) == set(["text_field", "number_field"])
    for original, (raw, _) in zip(data, ds.number_field):
        assert original[0] == raw