def test_missing_datatype_exception(data, fields, tmpdir):
    """Expect a RuntimeError when a field holds only ``None`` and no data types are given.

    Every row of ``data`` is extended with a trailing ``None`` that feeds an
    extra ``null_field``. No ``data_types`` argument is passed to
    ``DiskBackedDataset.from_examples``, and the call is expected to raise
    ``RuntimeError`` (presumably because the column type cannot be inferred
    from all-missing values -- confirm against the dataset implementation).
    """
    all_missing_field = Field(
        "null_field",
        keep_raw=True,
        allow_missing_data=True,
        numericalizer=Vocab(),
    )
    extended_fields = list(fields) + [all_missing_field]

    # Append a missing value for the extra field to every raw row.
    rows_with_none = [tuple(row) + (None,) for row in data]

    factory = ExampleFactory(extended_fields)
    lazy_examples = (factory.from_list(row) for row in rows_with_none)

    with pytest.raises(RuntimeError):
        DiskBackedDataset.from_examples(
            extended_fields, lazy_examples, cache_path=tmpdir
        )
def test_delete_cache(data, fields):
    """Check that ``delete_cache()`` removes the dataset's cache directory.

    A fresh temporary directory is used as ``cache_path``; it must exist
    after the dataset is built and must be gone after ``delete_cache()``.
    """
    cache_dir = tempfile.mkdtemp()
    factory = ExampleFactory(fields)
    dataset = DiskBackedDataset.from_examples(
        fields,
        map(factory.from_list, data),
        cache_path=cache_dir,
    )

    exists_before_delete = os.path.exists(cache_dir)
    assert exists_before_delete

    dataset.delete_cache()

    exists_after_delete = os.path.exists(cache_dir)
    assert not exists_after_delete
def test_from_examples(data, fields):
    """Check that a dataset built from examples returns the original values.

    Builds a ``DiskBackedDataset`` from an eagerly created list of examples
    and compares the ``number`` and ``tokens`` columns against the source
    rows in ``data``. Each column entry is unpacked as a ``(raw, tokenized)``
    pair.
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(ex) for ex in data]
    ad = DiskBackedDataset.from_examples(fields, examples)

    # Clean up the cache even when an assertion fails, so a failing run
    # does not leave cache files behind.
    try:
        for (raw, tokenized), (num, _) in zip(ad.number, data):
            assert raw == num
            # Compare by value, not identity: a value read back from the
            # dataset is not guaranteed to be the very same object as the
            # fixture value, and `is` would only pass by accident of
            # interpreter object caching.
            assert tokenized == num

        for (raw, tokenized), (_, tok) in zip(ad.tokens, data):
            assert raw == tok
            # The tokens column is expected to be the space-split raw text.
            assert tokenized == tok.split(" ")
    finally:
        ad.delete_cache()
def test_datatype_definition(data, fields):
    """Check that explicit ``data_types`` allow a field holding only ``None``.

    Every row of ``data`` is extended with a trailing ``None`` for an extra
    ``null_field``. The type of that field is supplied through the
    ``data_types`` argument, after which the dataset must build and yield
    the original ``number`` and ``tokens`` values.
    """
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field",
        keep_raw=True,
        allow_missing_data=True,
        numericalizer=Vocab(),
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    # Pair of pyarrow types for the all-None field (presumably the raw and
    # tokenized column types, in that order -- confirm against from_examples).
    datatypes = {"null_field": (pa.string(), pa.list_(pa.string()))}
    dataset = DiskBackedDataset.from_examples(
        fields_null, examples, data_types=datatypes
    )

    # Clean up the cache even when an assertion fails, so a failing run
    # does not leave cache files behind.
    try:
        for ex, d in zip(dataset, data_null):
            assert int(ex["number"][0]) == d[0]
            assert ex["tokens"][0] == d[1]
    finally:
        dataset.delete_cache()
def pyarrow_dataset(data, fields):
    """Return a ``DiskBackedDataset`` built from the rows in ``data``.

    Each row is converted to an example lazily via
    ``ExampleFactory.from_list``; no explicit cache path is passed.
    """
    factory = ExampleFactory(fields)
    lazy_examples = (factory.from_list(row) for row in data)
    return DiskBackedDataset.from_examples(fields, lazy_examples)