def test_from_dataset(data, fields):
    """Check that ``DiskBackedDataset.from_dataset`` preserves example contents.

    Builds a ``Dataset`` from the raw examples in ``data`` (each converted
    with ``ExampleFactory.from_list``), converts it with
    ``DiskBackedDataset.from_dataset`` and compares the ``number`` and
    ``tokens`` attributes of every pair of examples.

    Args:
        data: Iterable of raw examples, each accepted by
            ``ExampleFactory.from_list``.
        fields: Field definitions passed to both ``ExampleFactory`` and
            ``Dataset``.
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)

    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)
    try:
        # zip() stops at the shorter input, so without this check an empty
        # or truncated converted dataset would pass the loop below.
        assert len(pyarrow_dataset) == len(dataset)

        for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
            assert ds_ex.number == arrow_ex.number
            assert ds_ex.tokens == arrow_ex.tokens
    finally:
        # Run the cleanup even when an assertion above fails, so a failing
        # test does not leave its cache behind.
        pyarrow_dataset.delete_cache()
def test_slice_view_to_dataset(dataset, tmp_path):
    """Check that a sliced view can be cast to both dataset types.

    A ``DatasetSlicedView`` over ``dataset`` (slice ``3:8:2``) is converted
    first with ``Dataset.from_dataset`` and then with
    ``DiskBackedDataset.from_dataset`` (using ``tmp_path`` as
    ``cache_path``). Each result must be an instance of the target class,
    have the same length as the view, and match the view example by
    example for every field in ``dataset.fields``.
    """
    view = DatasetSlicedView(dataset, s=slice(3, 8, 2))

    def assert_mirrors_view(converted, expected_cls):
        # Type and size first, then a field-by-field comparison per example.
        assert isinstance(converted, expected_cls)
        assert len(converted) == len(view)
        for view_example, converted_example in zip(view, converted):
            for field in dataset.fields:
                name = field.name
                assert view_example[name] == converted_example[name]

    # cast to Dataset
    assert_mirrors_view(Dataset.from_dataset(view), Dataset)

    # cast to DiskBackedDataset
    assert_mirrors_view(
        DiskBackedDataset.from_dataset(view, cache_path=tmp_path),
        DiskBackedDataset,
    )