def tabular_dataset_fields(
    fixed_length=None, disable_numericalize_caching=False, include_lengths=False
):
    text = Field(
        "text",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=False,
        include_lengths=include_lengths,
        disable_numericalize_caching=disable_numericalize_caching,
    )
    text_missing = Field(
        "text_with_missing_data",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=True,
    )
    rating = LabelField("rating", numericalizer=float)

    fields = {
        "text": text,
        "text_with_missing_data": text_missing,
        "rating": rating,
    }

    return fields
def get_default_fields(): """ Method returns the three main SNLI fields in the following order: gold_label, sentence1, sentence2. Returns ------- fields : dict(str, Field) Dictionary mapping field names to respective Fields. """ gold_label = LabelField( name=SNLISimple.GOLD_LABEL_FIELD_NAME, numericalizer=Vocab(specials=()) ) sentence_vocab = Vocab() sentence1 = Field( name=SNLISimple.SENTENCE1_FIELD_NAME, numericalizer=sentence_vocab, tokenizer="split", keep_raw=False, ) sentence2 = Field( name=SNLISimple.SENTENCE2_FIELD_NAME, numericalizer=sentence_vocab, tokenizer="split", keep_raw=False, ) fields = { SNLISimple.GOLD_LABEL_FIELD_NAME: gold_label, SNLISimple.SENTENCE1_FIELD_NAME: sentence1, SNLISimple.SENTENCE2_FIELD_NAME: sentence2, } return fields
def test_field_applies_specials():
    bos, eos = BOS(), EOS()

    vocab = Vocab(specials=(bos, eos))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])

    assert received == expected

    # Test with empty specials
    vocab = Vocab(specials=())
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected

    # Test core specials are a no-op
    vocab = Vocab(specials=(PAD(), UNK()))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected
def fields():
    num_field = Field("number", tokenizer=None)
    name_field = Field("name", numericalizer=Vocab(), is_target=True)
    name_chars_field = Field(
        "name_chars", tokenizer=list, numericalizer=Vocab(), is_target=True
    )
    return [num_field, (name_field, name_chars_field)]
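# A hedged sketch of what the `(name_field, name_chars_field)` tuple buys:
# both fields consume the same input column, one at the word level and one at
# the character level. The example row is made up for illustration, and the
# sketch assumes `ExampleFactory` accepts field tuples the way it accepts
# single fields elsewhere in this suite.
def sketch_field_tuple_shares_one_column():
    example_factory = ExampleFactory(fields())
    example = example_factory.from_list([1, "Mark Dark"])
    # `name_chars` tokenizes with `list`, so its tokenized data (index 1 of
    # the (raw, tokenized) pair) is the character sequence of the same column.
    assert example["name_chars"][1] == list("Mark Dark")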
def get_default_fields(): """ Method returns a dict of default CoNLL-U fields. Returns ------- fields : Dict[str, Field] Dict containing all default CoNLL-U fields. """ id = Field(name="id", tokenizer=None, numericalizer=None) form = Field(name="form", tokenizer=None, numericalizer=Vocab(specials=())) lemma = Field(name="lemma", tokenizer=None, numericalizer=Vocab(specials=())) upos = Field( name="upos", tokenizer=None, numericalizer=Vocab(specials=()), ) xpos = Field( name="xpos", tokenizer=None, numericalizer=Vocab(specials=()), ) feats = Field(name="feats", tokenizer=None, numericalizer=None) head = Field( name="head", tokenizer=None, numericalizer=int, ) deprel = Field(name="deprel", tokenizer=None) deps = Field(name="deps", tokenizer=None, numericalizer=None) misc = Field(name="misc", tokenizer=None, numericalizer=None) return { "id": id, "form": form, "lemma": lemma, "upos": upos, "xpos": xpos, "feats": feats, "head": head, "deprel": deprel, "deps": deps, "misc": misc, }
def test_count_matrix_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer._init_special_indexes()

    assert len(count_vectorizer._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in count_vectorizer._special_indexes
def test_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    assert len(tfidf._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in tfidf._special_indexes
def create_dataset():
    fields = (
        Field("text", numericalizer=Vocab()),
        Field("source", numericalizer=Vocab(), tokenizer=list),
    )
    example_factory = ExampleFactory(fields)

    examples = [
        example_factory.from_list(data) for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
    ]

    dataset = Dataset(examples, fields)
    return dataset
def test_build_count_matrix_custom_specials_vocab_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab_words = ["this", "is", "the", "first", "document"]
    vocab += vocab_words
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[0, 1, 1, 1], [1, 1, 1, 2], [3, 1, 1, 0], [0, 1, 1, 1]])
    assert np.all(count_matrix == expected)
def test_field_custom_numericalization_vocab_non_string():
    vocab = Vocab(specials=())
    tfield = Field("bla", numericalizer=vocab, tokenizer=None)

    _, data1 = tfield.preprocess([1, 2, 3])[0]
    _, data2 = tfield.preprocess([3, 2, 1])[0]
    _, data3 = tfield.preprocess([3, 4, 5, 6])[0]
    _, data4 = tfield.preprocess([2, 3, 6])[0]

    tfield.finalize()

    assert np.all(tfield.numericalize(data1) == vocab.numericalize([1, 2, 3]))
    assert np.all(tfield.numericalize(data2) == vocab.numericalize([3, 2, 1]))
    assert np.all(tfield.numericalize(data3) == vocab.numericalize([3, 4, 5, 6]))
    assert np.all(tfield.numericalize(data4) == vocab.numericalize([2, 3, 6]))
def test_build_count_matrix_custom_specials_vocab_without_specials():
    vocab = Vocab(specials=())
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(
        vocab=vocab, specials=["the", "first", "second", "one", "third", "and"]
    )
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[1, 1, 1], [1, 1, 2], [1, 1, 0], [1, 1, 1]])
    assert np.all(count_matrix == expected)
def test_label_field():
    vocab = Vocab(specials=())
    labels = ["label_1", "label_2", "label_3"]
    vocab += labels
    vocab.finalize()

    label_field = LabelField("test_label_field", numericalizer=vocab)
    preprocessed_data = [label_field.preprocess(label) for label in labels]

    for preprocessed in preprocessed_data:
        _, data = preprocessed[0]
        _, tokenized = data
        assert label_field.numericalize(data) == vocab.stoi[tokenized]
def test_multilabel_field_specials_in_vocab_fail():
    with pytest.raises(ValueError):
        MultilabelField(
            name="bla",
            # Note the trailing comma: `(UNK())` would be a bare UNK, not a tuple.
            numericalizer=Vocab(specials=(UNK(),)),
            num_of_classes=10,
        )
def get_dataset():
    data = [
        {"Name": "Mark Dark", "Score": 5},
        {"Name": "Stephen Smith", "Score": 10},
        {"Name": "Ann Mann", "Score": 15},
    ]

    name_field = Field("Name", numericalizer=Vocab(), keep_raw=True, tokenizer="split")
    score_field = Field(
        "Score", numericalizer=int, keep_raw=True, tokenizer=None, is_target=True
    )
    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
def get_default_fields(): """ Method returns default Cornell Movie Dialogs fields: sentence and reply. Fields share same vocabulary. Returns ------- fields : dict(str, Field) Dictionary mapping field name to field. """ vocabulary = Vocab() statement = Field( name="statement", numericalizer=vocabulary, tokenizer="split", keep_raw=False, is_target=False, ) reply = Field( name="reply", numericalizer=vocabulary, tokenizer="split", keep_raw=False, is_target=True, ) fields = {"statement": statement, "reply": reply} return fields
def test_count_vectorizer_transform_tokens_tensor():
    vocab = Vocab(specials=())
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer.fit(dataset=None, field=None)

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    bow = count_vectorizer.transform(numericalized_data).todense()
    expected = np.array(
        [
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
            [1, 1, 1, 2, 0, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
        ]
    )
    assert np.allclose(a=bow, b=expected, rtol=0, atol=1.0e-6)
def get_default_fields(): """ Method returns default Imdb fields: text and label. Returns ------- fields : dict(str, Field) Dictionary mapping field name to field. """ text = Field( name=IMDB.TEXT_FIELD_NAME, numericalizer=Vocab(), tokenizer="spacy", ) label = LabelField(name=IMDB.LABEL_FIELD_NAME, numericalizer=Vocab(specials=())) return {IMDB.TEXT_FIELD_NAME: text, IMDB.LABEL_FIELD_NAME: label}
def test_build_count_matrix_from_tensor_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array(
        [
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
            [1, 1, 1, 2, 0, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
        ]
    )
    assert np.all(count_matrix == expected)
def test_multilabel_too_many_classes_in_data_exception():
    vocab = Vocab(specials=(), eager=True)
    field = MultilabelField(name="test_field", num_of_classes=3, numericalizer=vocab)

    for data in ("cls1", "cls2", "cls3", "cls4"):
        field.preprocess(data)

    with pytest.raises(ValueError):
        field.finalize()
def dataset_with_upper_field(fields):
    upper_name_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field(
        "name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView(
        [dataset, other_dataset], field_overrides={"name": new_field}
    )

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    assert set(concat_vocab.itos) == set(dataset_vocab.itos) | set(other_vocab.itos)
def test_hierarchical_dataset_finalize_fields(hierarchical_dataset_parser):
    name_vocab = Vocab()
    number_vocab = Vocab()
    name_field = Field("name", keep_raw=True, tokenizer=None, numericalizer=name_vocab)
    number_field = Field(
        "number", keep_raw=True, tokenizer=None, numericalizer=number_vocab
    )
    fields = {"name": name_field, "number": number_field}

    dataset = HierarchicalDataset.from_json(
        dataset=HIERARCHIAL_DATASET_JSON_EXAMPLE,
        fields=fields,
        parser=hierarchical_dataset_parser,
    )
    dataset.finalize_fields()

    assert name_vocab.is_finalized
    assert number_vocab.is_finalized
def test_missing_datatype_exception(data, fields, tmpdir):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    with pytest.raises(RuntimeError):
        DiskBackedDataset.from_examples(fields_null, examples, cache_path=tmpdir)
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
def fields():
    number_field = Field(
        "number", keep_raw=True, numericalizer=int, tokenizer=None, is_target=True
    )
    token_field = Field(
        "tokens",
        keep_raw=True,
        numericalizer=Vocab(keep_freqs=True),
        tokenizer=partial(str.split, sep=" "),
    )
    return [number_field, token_field]
def test_multilabel_field_vocab_numericalization(tokens):
    vocab = Vocab(specials=())
    vocab += tokens

    field = MultilabelField("test field", num_of_classes=5, numericalizer=vocab)
    ((_, preprocessed),) = field.preprocess(tokens)
    field.finalize()

    # `np.bool` was removed from NumPy; the builtin `bool` is the equivalent dtype.
    multilabel_from_vocab = np.zeros(5, dtype=bool)
    for token in tokens:
        multilabel_from_vocab[vocab.stoi[token]] = 1

    multilabel_from_field = field.numericalize(preprocessed)
    assert np.all(multilabel_from_field == multilabel_from_vocab)
def test_missing_symbol_index_vocab():
    vocab = Vocab()
    fld = Field(
        name="test_field",
        tokenizer="split",
        keep_raw=False,
        numericalizer=vocab,
        allow_missing_data=True,
    )

    fld.preprocess("a b c d")
    ((_, data),) = fld.preprocess(None)
    assert data == (None, None)

    fld.finalize()
    assert fld.numericalize((None, None)) is None
    assert fld.get_default_value() == -1
def test_multilabel_field_class_count():
    vocab = Vocab(specials=(), eager=True)
    field = MultilabelField(name="test field", num_of_classes=None, numericalizer=vocab)

    example_1 = ["class1", "class2", "class3", "class4"]
    example_2 = ["class1", "class2", "class3"]

    ((_, data_1),) = field.preprocess(example_1)
    ((_, data_2),) = field.preprocess(example_2)

    field.finalize()
    assert field._num_of_classes == 4

    numericalized = field.numericalize(data_1)
    assert len(numericalized) == 4

    numericalized = field.numericalize(data_2)
    assert len(numericalized) == 4
def test_iterator_missing_data_in_batch(json_file_path):
    missing_data_default_value = -99
    fields = tabular_dataset_fields()
    missing_value_field = Field(
        "missing_value_field",
        tokenizer="split",
        numericalizer=Vocab(),
        allow_missing_data=True,
        keep_raw=True,
        missing_data_token=missing_data_default_value,
    )
    fields["text_with_missing_data"] = missing_value_field

    ds = create_tabular_dataset_from_json(fields, json_file_path)

    for batch in Iterator(ds, batch_size=len(ds), shuffle=False):
        # Test if the value we know is missing is correctly filled out
        missing_value_row = batch.missing_value_field[3]
        assert np.all(missing_value_row == missing_data_default_value)
def test_datatype_definition(data, fields):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    datatypes = {"null_field": (pa.string(), pa.list_(pa.string()))}
    dataset = DiskBackedDataset.from_examples(
        fields_null, examples, data_types=datatypes
    )

    for ex, d in zip(dataset, data_null):
        assert int(ex["number"][0]) == d[0]
        assert ex["tokens"][0] == d[1]

    dataset.delete_cache()