Example #1
File: conftest.py Project: TakeLab/podium
def tabular_dataset_fields(fixed_length=None,
                           disable_numericalize_caching=False,
                           include_lengths=False):
    text = Field(
        "text",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=False,
        include_lengths=include_lengths,
        disable_numericalize_caching=disable_numericalize_caching,
    )
    text_missing = Field(
        "text_with_missing_data",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=True,
    )
    rating = LabelField("rating", numericalizer=float)

    fields = {
        "text": text,
        "text_with_missing_data": text_missing,
        "rating": rating
    }

    return fields
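
A minimal usage sketch (assumed wiring, mirroring the ExampleFactory/Dataset pattern of Examples #8 and #14; the raw rows are hypothetical) showing how these fixture fields feed a dataset:

fields = tabular_dataset_fields()
example_factory = ExampleFactory(fields)
raw = [
    {"text": "a b c", "text_with_missing_data": "a b", "rating": "4.5"},
    {"text": "d e", "text_with_missing_data": None, "rating": "3.0"},
]
examples = [example_factory.from_dict(row) for row in raw]
ds = Dataset(examples, fields)
ds.finalize_fields()  # builds the (eager) vocabs so numericalization can run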
Example #2
File: snli.py Project: TakeLab/podium
    def get_default_fields():
        """
        Method returns the three main SNLI fields in the following order:
        gold_label, sentence1, sentence2.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field names to respective Fields.
        """

        gold_label = LabelField(
            name=SNLISimple.GOLD_LABEL_FIELD_NAME, numericalizer=Vocab(specials=())
        )
        sentence_vocab = Vocab()
        sentence1 = Field(
            name=SNLISimple.SENTENCE1_FIELD_NAME,
            numericalizer=sentence_vocab,
            tokenizer="split",
            keep_raw=False,
        )
        sentence2 = Field(
            name=SNLISimple.SENTENCE2_FIELD_NAME,
            numericalizer=sentence_vocab,
            tokenizer="split",
            keep_raw=False,
        )
        fields = {
            SNLISimple.GOLD_LABEL_FIELD_NAME: gold_label,
            SNLISimple.SENTENCE1_FIELD_NAME: sentence1,
            SNLISimple.SENTENCE2_FIELD_NAME: sentence2,
        }
        return fields
Example #3
def test_field_applies_specials():
    bos, eos = BOS(), EOS()
    vocab = Vocab(specials=(bos, eos))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])

    assert received == expected

    # Test with empty specials
    vocab = Vocab(specials=())
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected

    # Test core specials are a no-op
    vocab = Vocab(specials=(PAD(), UNK()))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected
Example #4
def fields():
    num_field = Field("number", tokenizer=None)
    name_field = Field("name", numericalizer=Vocab(), is_target=True)
    name_chars_field = Field("name_chars",
                             tokenizer=list,
                             numericalizer=Vocab(),
                             is_target=True)
    return [num_field, (name_field, name_chars_field)]
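
A hedged sketch (assumed semantics, following the from_list usage in Example #8; the row is hypothetical): the (name_field, name_chars_field) tuple maps the single "name" column onto two fields, tokenized once with the field's default tokenizer and once into characters via tokenizer=list.

example_factory = ExampleFactory(fields())
example = example_factory.from_list(["1", "Ann Mann"])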
Example #5
    def get_default_fields():
        """
        Method returns a dict of default CoNLL-U fields.

        Returns
        -------
        fields : Dict[str, Field]
            Dict containing all default CoNLL-U fields.
        """

        id = Field(name="id", tokenizer=None, numericalizer=None)

        form = Field(name="form",
                     tokenizer=None,
                     numericalizer=Vocab(specials=()))

        lemma = Field(name="lemma",
                      tokenizer=None,
                      numericalizer=Vocab(specials=()))

        upos = Field(
            name="upos",
            tokenizer=None,
            numericalizer=Vocab(specials=()),
        )

        xpos = Field(
            name="xpos",
            tokenizer=None,
            numericalizer=Vocab(specials=()),
        )

        feats = Field(name="feats", tokenizer=None, numericalizer=None)

        head = Field(
            name="head",
            tokenizer=None,
            numericalizer=int,
        )

        deprel = Field(name="deprel", tokenizer=None)

        deps = Field(name="deps", tokenizer=None, numericalizer=None)

        misc = Field(name="misc", tokenizer=None, numericalizer=None)

        return {
            "id": id,
            "form": form,
            "lemma": lemma,
            "upos": upos,
            "xpos": xpos,
            "feats": feats,
            "head": head,
            "deprel": deprel,
            "deps": deps,
            "misc": misc,
        }
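
A short sketch of the Vocab lifecycle these fields rely on (assumed behaviour, following Examples #6 and #10; the tag list is hypothetical): a Vocab accumulates token frequencies via +=, becomes usable after finalize(), and then maps tokens to indices, which is how the "upos" field above indexes tags.

vocab = Vocab(specials=())
vocab += ["NOUN", "VERB", "NOUN"]
vocab.finalize()
indices = vocab.numericalize(["NOUN", "VERB"])  # indices into vocab.itos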
Example #6
def test_count_matrix_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer._init_special_indexes()

    assert len(count_vectorizer._special_indexes) == 2
    for i in specials:
        assert vocab.stoi[i] in count_vectorizer._special_indexes
Example #7
def test_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    assert len(tfidf._special_indexes) == 2
    for i in specials:
        assert vocab.stoi[i] in tfidf._special_indexes
Example #8
    def create_dataset():
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        example_factory = ExampleFactory(fields)

        examples = [
            example_factory.from_list(data)
            for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]

        dataset = Dataset(examples, fields)
        return dataset
Example #9
def test_build_count_matrix_costum_specials_vocab_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab_words = ["this", "is", "the", "first", "document"]
    vocab += vocab_words
    vocab.finalize()
    tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    expected = np.array([[0, 1, 1, 1], [1, 1, 1, 2], [3, 1, 1, 0],
                         [0, 1, 1, 1]])
    assert np.all(count_matrix == expected)
Example #10
def test_field_custom_numericalization_vocab_non_string():
    vocab = Vocab(specials=())
    tfield = Field("bla", numericalizer=vocab, tokenizer=None)

    _, data1 = tfield.preprocess([1, 2, 3])[0]
    _, data2 = tfield.preprocess([3, 2, 1])[0]
    _, data3 = tfield.preprocess([3, 4, 5, 6])[0]
    _, data4 = tfield.preprocess([2, 3, 6])[0]

    tfield.finalize()

    assert np.all(tfield.numericalize(data1) == vocab.numericalize([1, 2, 3]))
    assert np.all(tfield.numericalize(data2) == vocab.numericalize([3, 2, 1]))
    assert np.all(tfield.numericalize(data3) == vocab.numericalize([3, 4, 5, 6]))
    assert np.all(tfield.numericalize(data4) == vocab.numericalize([2, 3, 6]))
Example #11
def test_build_count_matrix_costum_specials_vocab_without_specials():
    vocab = Vocab(specials=())
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()
    tfidf = TfIdfVectorizer(
        vocab=vocab,
        specials=["the", "first", "second", "one", "third", "and"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    expected = np.array([[1, 1, 1], [1, 1, 2], [1, 1, 0], [1, 1, 1]])
    assert np.all(count_matrix == expected)
Example #12
def test_label_field():
    vocab = Vocab(specials=())
    data = ["label_1", "label_2", "label_3"]

    vocab += data
    vocab.finalize()

    label_field = LabelField("test_label_field", numericalizer=vocab)

    preprocessed_data = [label_field.preprocess(label) for label in data]

    for x in preprocessed_data:
        _, data = x[0]
        _, tokenized = data
        assert label_field.numericalize(data) == vocab.stoi[tokenized]
Example #13
def test_multilabel_field_specials_in_vocab_fail():
    with pytest.raises(ValueError):
        MultilabelField(
            name="bla",
            numericalizer=Vocab(specials=(UNK(),)),  # note the comma: a one-element tuple
            num_of_classes=10,
        )
Example #14
def get_dataset():
    data = [
        {
            "Name": "Mark Dark",
            "Score": 5
        },
        {
            "Name": "Stephen Smith",
            "Score": 10
        },
        {
            "Name": "Ann Mann",
            "Score": 15
        },
    ]

    name_field = Field("Name",
                       numericalizer=Vocab(),
                       keep_raw=True,
                       tokenizer="split")

    score_field = Field("Score",
                        numericalizer=int,
                        keep_raw=True,
                        tokenizer=None,
                        is_target=True)

    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
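
A minimal batching sketch (assumed API, as used in Example #29): once finalized, the dataset can be batched with Iterator, and batch fields are exposed as attributes named after the fields.

ds = get_dataset()
for batch in Iterator(ds, batch_size=2, shuffle=False):
    print(batch.Name, batch.Score)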
Example #15
    def get_default_fields():
        """
        Method returns the default Cornell Movie Dialogs fields: statement and reply.
        Both fields share the same vocabulary.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field name to field.
        """
        vocabulary = Vocab()
        statement = Field(
            name="statement",
            numericalizer=vocabulary,
            tokenizer="split",
            keep_raw=False,
            is_target=False,
        )
        reply = Field(
            name="reply",
            numericalizer=vocabulary,
            tokenizer="split",
            keep_raw=False,
            is_target=True,
        )
        fields = {"statement": statement, "reply": reply}
        return fields
Example #16
def test_count_vectorizer_transform_tokens_tensor():
    vocab = Vocab(specials=())
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()
    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer.fit(dataset=None, field=None)

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    bow = count_vectorizer.transform(numericalized_data).todense()
    expected = np.array([
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 2, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
    ])
    assert np.allclose(a=bow, b=expected, rtol=0, atol=1.0e-6)
Example #17
File: imdb.py Project: TakeLab/podium
    def get_default_fields():
        """
        Method returns default Imdb fields: text and label.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field name to field.
        """
        text = Field(
            name=IMDB.TEXT_FIELD_NAME,
            numericalizer=Vocab(),
            tokenizer="spacy",
        )
        label = LabelField(name=IMDB.LABEL_FIELD_NAME,
                           numericalizer=Vocab(specials=()))
        return {IMDB.TEXT_FIELD_NAME: text, IMDB.LABEL_FIELD_NAME: label}
Example #18
def test_build_count_matrix_from_tensor_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()
    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    expected = np.array([
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 2, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
    ])
    assert np.all(count_matrix == expected)
Example #19
def test_multilabel_too_many_classes_in_data_exception():
    vocab = Vocab(specials=(), eager=True)
    field = MultilabelField(name="test_field", num_of_classes=3, numericalizer=vocab)

    for data in "cls1", "cls2", "cls3", "cls4":
        field.preprocess(data)

    with pytest.raises(ValueError):
        field.finalize()
Example #20
def dataset_with_upper_field(fields):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
Example #21
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field("name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView([dataset, other_dataset],
                                       field_overrides={"name": new_field})

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    assert set(concat_vocab.itos) == set(dataset_vocab.itos) | set(other_vocab.itos)
Example #22
def test_hierarchical_dataset_finalize_fields(hierarchical_dataset_parser):
    name_vocab = Vocab()
    number_vocab = Vocab()
    name_field = Field("name",
                       keep_raw=True,
                       tokenizer=None,
                       numericalizer=name_vocab)
    number_field = Field("number",
                         keep_raw=True,
                         tokenizer=None,
                         numericalizer=number_vocab)

    fields = {"name": name_field, "number": number_field}
    dataset = HierarchicalDataset.from_json(
        dataset=HIERARCHIAL_DATASET_JSON_EXAMPLE,
        fields=fields,
        parser=hierarchical_dataset_parser,
    )
    dataset.finalize_fields()
    assert name_vocab.is_finalized
    assert number_vocab.is_finalized
Example #23
def test_missing_datatype_exception(data, fields, tmpdir):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    with pytest.raises(RuntimeError):
        DiskBackedDataset.from_examples(fields_null, examples, cache_path=tmpdir)
Example #24
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
Example #25
def fields():
    number_field = Field(
        "number", keep_raw=True, numericalizer=int, tokenizer=None, is_target=True
    )

    token_field = Field(
        "tokens",
        keep_raw=True,
        numericalizer=Vocab(keep_freqs=True),
        tokenizer=partial(str.split, sep=" "),
    )

    return [number_field, token_field]
Example #26
def test_multilabel_field_vocab_numericalization(tokens):
    vocab = Vocab(specials=())
    vocab += tokens

    field = MultilabelField("test field", num_of_classes=5, numericalizer=vocab)
    ((_, preprocessed),) = field.preprocess(tokens)
    field.finalize()

    multilabel_from_vocab = np.zeros(5, dtype=bool)  # np.bool was removed in NumPy >= 1.24
    for token in tokens:
        multilabel_from_vocab[vocab.stoi[token]] = 1

    multilabel_from_field = field.numericalize(preprocessed)

    assert np.all(multilabel_from_field == multilabel_from_vocab)
Example #27
def test_missing_symbol_index_vocab():
    vocab = Vocab()
    fld = Field(
        name="test_field",
        tokenizer="split",
        keep_raw=False,
        numericalizer=vocab,
        allow_missing_data=True,
    )

    fld.preprocess("a b c d")
    ((_, data),) = fld.preprocess(None)
    assert data == (None, None)

    fld.finalize()
    assert fld.numericalize((None, None)) is None
    assert fld.get_default_value() == -1
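
A follow-up sketch (assumed parameter, as used in Example #29): missing_data_token overrides the default placeholder returned for missing rows when batches are assembled.

fld_custom = Field(
    name="test_field_custom",
    tokenizer="split",
    numericalizer=Vocab(),
    allow_missing_data=True,
    missing_data_token=-99,
)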
Example #28
def test_multilabel_field_class_count():
    vocab = Vocab(specials=(), eager=True)
    field = MultilabelField(name="test field", num_of_classes=None, numericalizer=vocab)

    example_1 = ["class1", "class2", "class3", "class4"]
    example_2 = ["class1", "class2", "class3"]

    ((_, data_1),) = field.preprocess(example_1)
    ((_, data_2),) = field.preprocess(example_2)
    field.finalize()

    assert field._num_of_classes == 4

    numericalized = field.numericalize(data_1)
    assert len(numericalized) == 4

    numericalized = field.numericalize(data_2)
    assert len(numericalized) == 4
Example #29
def test_iterator_missing_data_in_batch(json_file_path):
    missing_data_default_value = -99
    fields = tabular_dataset_fields()
    missing_value_field = Field(
        "missing_value_field",
        tokenizer="split",
        numericalizer=Vocab(),
        allow_missing_data=True,
        keep_raw=True,
        missing_data_token=missing_data_default_value,
    )
    fields["text_with_missing_data"] = missing_value_field
    ds = create_tabular_dataset_from_json(fields, json_file_path)

    for batch in Iterator(ds, batch_size=len(ds), shuffle=False):
        # test if the value we know is missing is correctly filled out
        missing_value_row = batch.missing_value_field[3]
        assert np.all(missing_value_row == missing_data_default_value)
Example #30
def test_datatype_definition(data, fields):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    datatypes = {"null_field": (pa.string(), pa.list_(pa.string()))}
    dataset = DiskBackedDataset.from_examples(fields_null, examples, data_types=datatypes)

    for ex, d in zip(dataset, data_null):
        assert int(ex["number"][0]) == d[0]
        assert ex["tokens"][0] == d[1]

    dataset.delete_cache()