def get_default_fields(): """ Method returns default Cornell Movie Dialogs fields: sentence and reply. Fields share same vocabulary. Returns ------- fields : dict(str, Field) Dictionary mapping field name to field. """ vocabulary = Vocab() statement = Field( name="statement", numericalizer=vocabulary, tokenizer="split", keep_raw=False, is_target=False, ) reply = Field( name="reply", numericalizer=vocabulary, tokenizer="split", keep_raw=False, is_target=True, ) fields = {"statement": statement, "reply": reply} return fields
def test_field_pad_to_length_custom_pad(row, length, expected_row):
    f = Field(name="F", numericalizer=None)

    row_arr = np.array(row)
    received_row = f._pad_to_length(row_arr, length, custom_pad_symbol=CUSTOM_PAD)

    assert received_row.tolist() == expected_row
def tabular_dataset_fields(
    fixed_length=None, disable_numericalize_caching=False, include_lengths=False
):
    text = Field(
        "text",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=False,
        include_lengths=include_lengths,
        disable_numericalize_caching=disable_numericalize_caching,
    )
    text_missing = Field(
        "text_with_missing_data",
        numericalizer=Vocab(eager=True),
        fixed_length=fixed_length,
        allow_missing_data=True,
    )
    rating = LabelField("rating", numericalizer=float)

    fields = {
        "text": text,
        "text_with_missing_data": text_missing,
        "rating": rating,
    }

    return fields
def get_default_fields(): """ Method returns the three main SNLI fields in the following order: gold_label, sentence1, sentence2. Returns ------- fields : dict(str, Field) Dictionary mapping field names to respective Fields. """ gold_label = LabelField( name=SNLISimple.GOLD_LABEL_FIELD_NAME, numericalizer=Vocab(specials=()) ) sentence_vocab = Vocab() sentence1 = Field( name=SNLISimple.SENTENCE1_FIELD_NAME, numericalizer=sentence_vocab, tokenizer="split", keep_raw=False, ) sentence2 = Field( name=SNLISimple.SENTENCE2_FIELD_NAME, numericalizer=sentence_vocab, tokenizer="split", keep_raw=False, ) fields = { SNLISimple.GOLD_LABEL_FIELD_NAME: gold_label, SNLISimple.SENTENCE1_FIELD_NAME: sentence1, SNLISimple.SENTENCE2_FIELD_NAME: sentence2, } return fields
def get_dataset():
    data = [
        {"Name": "Mark Dark", "Score": 5},
        {"Name": "Stephen Smith", "Score": 10},
        {"Name": "Ann Mann", "Score": 15},
    ]

    name_field = Field("Name", numericalizer=Vocab(), keep_raw=True, tokenizer="split")
    score_field = Field(
        "Score", numericalizer=int, keep_raw=True, tokenizer=None, is_target=True
    )
    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
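# Hedged inspection helper (added for illustration): iterates the toy dataset built by
# get_dataset() and unpacks the (raw, tokenized) pairs that the tests in this file also
# rely on. The helper name and printed format are assumptions.
def _example_inspect_dataset():
    ds = get_dataset()
    for example in ds:
        name_raw, name_tokens = example["Name"]
        score_raw, _ = example["Score"]
        print(f"{name_raw!r} -> tokens={name_tokens}, score={score_raw}")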
def test_field_preprocess_eager():
    vocab = MockVocab(eager=True)
    f = Field(name="F", numericalizer=vocab)
    f.preprocess("some text")

    # vocab was updated
    assert len(vocab.values) > 0
def test_text_clean_up(kwargs, data, expected_output):
    pytest.importorskip("cleantext")

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(TextCleanUp(**kwargs))
    example = ExampleFactory([field]).from_list([data])

    assert expected_output == example["data"][1]
def test_field_is_target():
    f1 = Field(name="text", is_target=False)
    f2 = Field(name="label", is_target=True)
    f3 = Field(name="bla")

    assert not f1.is_target
    assert f2.is_target
    assert not f3.is_target
def fields(): num_field = Field("number", tokenizer=None) name_field = Field("name", numericalizer=Vocab(), is_target=True) name_chars_field = Field("name_chars", tokenizer=list, numericalizer=Vocab(), is_target=True) return [num_field, (name_field, name_chars_field)]
def test_remove_stopwords():
    data = "I'll tell you a joke"
    field = Field(name="data")
    field.add_posttokenize_hook(remove_stopwords("en"))
    example = ExampleFactory([field]).from_list([data])

    assert "you" not in example["data"][1]
    assert "a" not in example["data"][1]
def get_default_fields(): """ Method returns a dict of default CoNLL-U fields. Returns ------- fields : Dict[str, Field] Dict containing all default CoNLL-U fields. """ id = Field(name="id", tokenizer=None, numericalizer=None) form = Field(name="form", tokenizer=None, numericalizer=Vocab(specials=())) lemma = Field(name="lemma", tokenizer=None, numericalizer=Vocab(specials=())) upos = Field( name="upos", tokenizer=None, numericalizer=Vocab(specials=()), ) xpos = Field( name="xpos", tokenizer=None, numericalizer=Vocab(specials=()), ) feats = Field(name="feats", tokenizer=None, numericalizer=None) head = Field( name="head", tokenizer=None, numericalizer=int, ) deprel = Field(name="deprel", tokenizer=None) deps = Field(name="deps", tokenizer=None, numericalizer=None) misc = Field(name="misc", tokenizer=None, numericalizer=None) return { "id": id, "form": form, "lemma": lemma, "upos": upos, "xpos": xpos, "feats": feats, "head": head, "deprel": deprel, "deps": deps, "misc": misc, }
def test_field_pad_to_length(row, length, expected_row, pad_left, truncate_left):
    vocab = MockVocab()
    f = Field(name="F", numericalizer=vocab)

    received_row = f._pad_to_length(
        np.array(row), length, pad_left=pad_left, truncate_left=truncate_left
    )

    assert received_row.tolist() == expected_row
def test_field_get_tokenizer_spacy_ok():
    mp = patch.dict("sys.modules", spacy=MockSpacy())
    mp.start()

    f = Field(name="F", numericalizer=MockVocab(), tokenizer="spacy")
    _, data = f.preprocess("bla blu")[0]
    assert data == (None, ["bla", "blu"])

    mp.stop()
def test_truecase(): pytest.importorskip("truecase") data = "hey how are you" field = Field(name="data", tokenizer=None, keep_raw=True) field.add_pretokenize_hook(truecase()) example = ExampleFactory([field]).from_list([data]) assert "Hey how are you" == example["data"][0]
def test_regex_replace():
    data = "This item costs 100$."
    field = Field(name="data", tokenizer=None, keep_raw=True)
    regex_replace = RegexReplace([(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")])
    field.add_pretokenize_hook(regex_replace)
    example = ExampleFactory([field]).from_list([data])

    expected_raw = "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."
    assert expected_raw == example["data"][1]
def test_keyword_extractor(alg, alg_pkg_name):
    pytest.importorskip(alg_pkg_name)

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    text_ = TEXT.lower()
    assert all(kw in text_ for kws in example["data"][1] for kw in kws.lower().split())
def test_moses_normalizer():
    pytest.importorskip("sacremoses")

    # repeated whitespace in the input should be collapsed by the normalizer
    data = "What's    up!"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    normalizer = MosesNormalizer()
    field.add_pretokenize_hook(normalizer)
    example = ExampleFactory([field]).from_list([data])

    assert "What's up!" == example["data"][1]
def test_field_preprocess_raw_sequential(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value
):
    tokenizer = "split" if tokenize else None
    f = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    ((_, (received_raw_value, received_tokenized_value)),) = f.preprocess(value)

    assert received_raw_value == expected_raw_value
    assert received_tokenized_value == expected_tokenized_value
def test_field_get_tokenizer_callable():
    vocab = MockVocab()

    def my_tokenizer(string):
        return [string[0], string[1:]]

    f = Field(name="F", numericalizer=vocab, tokenizer=my_tokenizer)
    _, data = f.preprocess("asd dsa")[0]
    assert data == (None, ["a", "sd dsa"])
def test_field_applies_specials():
    bos, eos = BOS(), EOS()
    vocab = Vocab(specials=(bos, eos))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])
    assert received == expected

    # Test with empty specials
    vocab = Vocab(specials=())
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])
    assert received == expected

    # Test core specials are a no-op
    vocab = Vocab(specials=(PAD(), UNK()))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])
    assert received == expected
def test_field_repeated_hooks():
    def replace_tag_hook(raw, tokenized):
        replaced_tags = map(lambda s: s.replace("<tag>", "ABC"), tokenized)
        return raw, replaced_tags

    def to_lower_hook(raw, tokenized):
        # keep track of the function call count
        to_lower_hook.call_count += 1
        tokenized = map(str.lower, tokenized)
        return raw, tokenized

    to_lower_hook.call_count = 0

    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    # TAG -> tag
    f.add_posttokenize_hook(to_lower_hook)
    # <tag> -> ABC
    f.add_posttokenize_hook(replace_tag_hook)
    # ABC -> abc
    f.add_posttokenize_hook(to_lower_hook)

    _, received = f.preprocess("BLA <TAG> bla")[0]
    expected = ("BLA <TAG> bla", ["bla", "abc", "bla"])
    assert received == expected

    # check that the hook that was added twice was also called twice
    assert to_lower_hook.call_count == 2
def test_field_posttokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)
        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)
        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    # detaching the hooks
    f.remove_posttokenize_hooks()

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])
    assert received == expected
def test_missing_symbol_index_custom_numericalize():
    fld = Field(
        name="test_field",
        keep_raw=True,
        tokenizer=None,
        numericalizer=int,
        allow_missing_data=True,
    )

    fld.finalize()
    assert fld.get_default_value() == -1
def test_from_pandas_field_list(data):
    df = pd.DataFrame(data)
    fields = [
        Field("text", keep_raw=True, tokenizer="split"),
        Field("number", tokenizer=None),
    ]

    ds = Dataset.from_pandas(df, fields)

    for original, (raw, _) in zip(data, ds.text):
        assert original[0] == raw
def test_from_pandas_field_dict(data):
    df = pd.DataFrame(data, columns=["text", "number"])
    fields = {
        "text": Field("text_field", keep_raw=True, tokenizer="split"),
        "number": Field("number_field", tokenizer=None),
    }

    ds = Dataset.from_pandas(df, fields)

    assert set(ds.field_dict) == set(["text_field", "number_field"])

    for original, (raw, _) in zip(data, ds.text_field):
        assert original[0] == raw
def test_from_pandas_index(data):
    df = pd.DataFrame([[x[0]] for x in data], index=[x[1] for x in data])
    fields = [Field("text", keep_raw=True, tokenizer="split")]
    ds = Dataset.from_pandas(
        df, fields, index_field=Field("numbers", tokenizer=None, keep_raw=True)
    )

    for original, (raw, _) in zip(data, ds.numbers):
        assert original[1] == raw
def test_from_pandas_field_dict(data):
    import pandas as pd

    df = pd.DataFrame(data, columns=["number", "text"])
    fields = {
        "number": Field("number", tokenizer=None),
        "text": Field("text", keep_raw=True, tokenizer="split"),
    }

    ds = DiskBackedDataset.from_pandas(df, fields)

    for original, (raw, _) in zip(data, ds.text):
        assert original[1] == raw
def fields():
    number_field = Field(
        "number", keep_raw=True, numericalizer=int, tokenizer=None, is_target=True
    )

    token_field = Field(
        "tokens",
        keep_raw=True,
        numericalizer=Vocab(keep_freqs=True),
        tokenizer=partial(str.split, sep=" "),
    )

    return [number_field, token_field]
def test_field_pad_custom_numericalize():
    custom_padding_token = -999
    f = Field(
        "test_field",
        numericalizer=int,
        padding_token=custom_padding_token,
        tokenizer="split",
    )
    mock_numericalization = np.array([1, 2, 3, 4])
    expected_numericalization = np.array([1, 2, 3, 4] + [custom_padding_token] * 6)

    padded = f._pad_to_length(mock_numericalization, 10, pad_left=False)

    assert np.all(padded == expected_numericalization)
def test_from_pandas_index(data):
    import pandas as pd

    df = pd.DataFrame([[x[1]] for x in data], index=[x[0] for x in data])
    fields = [Field("text_field", keep_raw=True, tokenizer="split")]
    ds = DiskBackedDataset.from_pandas(
        df, fields, index_field=Field("number_field", tokenizer=None, keep_raw=True)
    )

    assert set(ds.field_dict) == set(["text_field", "number_field"])

    for original, (raw, _) in zip(data, ds.number_field):
        assert original[0] == raw