def test_iterable_dataset_cast(generate_examples_fn):
    """Check that ``IterableDataset.cast`` yields examples encoded with the new features.

    The dataset starts with an int64 ``label`` column and is cast to a schema
    where ``label`` is a bool; the cast output is compared with the raw
    examples encoded through the target schema.
    """
    source_examples = ExamplesIterable(generate_examples_fn, {"label": 10})
    initial_schema = Features({"id": Value("int64"), "label": Value("int64")})
    initial_info = DatasetInfo(features=initial_schema)
    dataset = IterableDataset(source_examples, info=initial_info)

    target_schema = Features({"id": Value("int64"), "label": Value("bool")})
    actual = list(dataset.cast(target_schema))

    # Reference output: every raw example encoded with the target schema.
    expected = []
    for _, example in source_examples:
        expected.append(target_schema.encode_example(example))

    assert actual == expected
def test_map_on_task_template(self):
    """Check what ``Dataset.map`` does to ``info.task_templates``.

    An identity function must leave the templates equal to the source
    dataset's; a function whose output adds a ``foobar`` entry under
    ``answers`` must leave an empty template list. Each mapping is run twice,
    the second run being the "reload from cache" case.
    """

    def keep_task(x):
        return x

    def dont_keep_task(x):
        altered = deepcopy(SAMPLE_QUESTION_ANSWERING_EXTRACTIVE)
        altered["answers"]["foobar"] = 0
        return altered

    # Wrap every sample value in a one-element list to form a single-row dataset.
    columns = {}
    for key, value in SAMPLE_QUESTION_ANSWERING_EXTRACTIVE.items():
        columns[key] = [value]
    info = DatasetInfo(task_templates=QuestionAnsweringExtractive())
    dataset = Dataset.from_dict(columns, info=info)

    assert isinstance(dataset.info.task_templates, list)
    assert len(dataset.info.task_templates) == 1

    # First pass computes the mapping, second pass reloads from cache.
    for _ in range(2):
        mapped_dataset = dataset.map(keep_task)
        assert mapped_dataset.info.task_templates == dataset.info.task_templates

    # First pass computes the mapping, second pass reloads from cache.
    for _ in range(2):
        mapped_dataset = dataset.map(dont_keep_task)
        assert mapped_dataset.info.task_templates == []
def test_read_files(self):
    """Read two file descriptions via ``ReaderTest.read_files`` and check the dataset.

    The first entry names only a file; the second also carries ``skip`` and
    ``take`` values of 10. The resulting dataset must have 110 rows, one
    column, and ``_data_files`` equal to the list that was passed in.
    """
    split_dict = SplitDict()
    for split_name in ("train", "test"):
        split_dict.add(SplitInfo(name=split_name, num_examples=100))
    info = DatasetInfo(splits=split_dict)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        whole_train = {"filename": os.path.join(tmp_dir, "train")}
        partial_test = {"filename": os.path.join(tmp_dir, "test"), "skip": 10, "take": 10}
        files = [whole_train, partial_test]

        read_kwargs = reader.read_files(files, original_instructions="")
        dset = Dataset(**read_kwargs)

        self.assertEqual(dset.num_rows, 110)
        self.assertEqual(dset.num_columns, 1)
        self.assertEqual(dset._data_files, files)
        # NOTE(review): presumably dropped here so the dataset is released
        # before the temporary directory is removed -- confirm.
        del dset
def test_read(self):
    """Read splits through ``ReaderTest.read`` with string instructions.

    First a single ``"test[:33%]"`` instruction is read, then ``"train"`` and
    ``"test[:33%]"`` one after the other; each resulting dataset is checked
    for its first ``filename`` value, its row count and its column count.
    """
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    for split_info in split_infos:
        split_dict.add(split_info)
    info = DatasetInfo(splits=split_dict)

    def check(ds, split_name, expected_rows):
        # Shared assertions: first filename, number of rows, single column.
        self.assertEqual(ds["filename"][0], f"{name}-{split_name}")
        self.assertEqual(ds.num_rows, expected_rows)
        self.assertEqual(ds.num_columns, 1)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        # Single instruction.
        dset = Dataset(**reader.read(name, "test[:33%]", split_infos))
        check(dset, "test", 33)

        # Several instructions: perform every read first, then build the datasets.
        datasets_kwargs = []
        for instruction in ["train", "test[:33%]"]:
            datasets_kwargs.append(reader.read(name, instruction, split_infos))
        train_dset, test_dset = (Dataset(**kwargs) for kwargs in datasets_kwargs)

        check(train_dset, "train", 100)
        check(test_dset, "test", 33)
        del train_dset, test_dset
def test_iterable_dataset_features(generate_examples_fn, features):
    """Check that iterating the dataset applies ``features`` when they are set.

    With truthy ``features`` every example is expected in its encoded form;
    otherwise the raw examples are expected unchanged.
    """
    source_examples = ExamplesIterable(generate_examples_fn, {"label": 0})
    dataset = IterableDataset(source_examples, info=DatasetInfo(features=features))

    if features:
        encode = features.encode_example
    else:

        def encode(example):
            return example

    expected = [encode(example) for _, example in source_examples]
    assert list(dataset) == expected
def test_iterable_dataset_info(generate_examples_fn):
    """Check that an ``IterableDataset`` exposes the ``DatasetInfo`` it was built with.

    Both the ``info`` object itself and the ``description``, ``citation`` and
    ``size_in_bytes`` attributes read from the dataset must match.
    """
    info = DatasetInfo(description="desc", citation="@article{}", size_in_bytes=42)
    dataset = IterableDataset(ExamplesIterable(generate_examples_fn, {}), info=info)

    assert dataset.info == info
    # These fields must be readable directly on the dataset.
    for field_name in ("description", "citation", "size_in_bytes"):
        assert getattr(dataset, field_name) == getattr(info, field_name)
def test_remove_and_map_on_task_template(self):
    """Check that task templates are empty after dropping ``label`` and mapping.

    The dataset is created with a ``TextClassification`` template that refers
    to the ``text`` and ``label`` columns; ``label`` is then removed and an
    identity ``map`` is applied.
    """

    def process(example):
        return example

    schema = Features({"text": Value("string"), "label": ClassLabel(names=("pos", "neg"))})
    template = TextClassification(text_column="text", label_column="label")
    dataset = Dataset.from_dict(
        {"text": ["A sentence."], "label": ["pos"]},
        info=DatasetInfo(features=schema, task_templates=template),
    )

    without_label = dataset.remove_columns("label")
    mapped = without_label.map(process)

    assert mapped.info.task_templates == []
def dataset_with_several_columns(generate_examples_fn):
    """Build a ``train`` split ``IterableDataset`` whose generator receives extra kwargs.

    The kwargs handed to ``ExamplesIterable`` are a ``filepath`` list of three
    file names and a ``metadata`` dict holding a ``sources`` list.
    """
    generator_kwargs = {
        "filepath": ["data0.txt", "data1.txt", "data2.txt"],
        "metadata": {"sources": ["https://foo.bar"]},
    }
    examples = ExamplesIterable(generate_examples_fn, generator_kwargs)
    info = DatasetInfo(description="dummy")
    return IterableDataset(examples, info=info, split="train")
def test_interleave_datasets_with_features(dataset: IterableDataset, generate_examples_fn):
    """Interleave an untyped dataset with a typed one and check the typed source.

    With probabilities ``[0, 1]`` the merged dataset must wrap a
    ``CyclingMultiSourcesExamplesIterable`` whose second source is a
    ``TypedExamplesIterable`` carrying the declared features, and its first
    example must equal the first example of the typed dataset.
    """
    features = Features(
        {
            "id": Value("int64"),
            "label": ClassLabel(names=["negative", "positive"]),
        }
    )
    typed_info = DatasetInfo(features=features)
    dataset_with_features = IterableDataset(
        ExamplesIterable(generate_examples_fn, {"label": 0}),
        info=typed_info,
    )

    merged_dataset = interleave_datasets([dataset, dataset_with_features], probabilities=[0, 1])

    assert isinstance(merged_dataset._ex_iterable, CyclingMultiSourcesExamplesIterable)
    # Index 1 is the source built from ``dataset_with_features``.
    second_source = merged_dataset._ex_iterable.ex_iterables[1]
    assert isinstance(second_source, TypedExamplesIterable)
    assert merged_dataset._ex_iterable.ex_iterables[1].features == features

    first_merged = next(iter(merged_dataset))
    first_typed = next(iter(dataset_with_features))
    assert first_merged == first_typed
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn):
    """Check that ``map`` after ``cast_column`` still yields examples in the cast encoding.

    Regression test for https://github.com/huggingface/datasets/issues/3505
    """
    # https://github.com/huggingface/datasets/issues/3505
    source_examples = ExamplesIterable(generate_examples_fn, {"label": "positive"})
    features = Features(
        {
            "id": Value("int64"),
            "label": Value("string"),
        }
    )
    # The ``dataset`` argument is replaced by a freshly built dataset here.
    dataset = IterableDataset(source_examples, info=DatasetInfo(features=features))
    dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"]))
    dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x})
    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)

    # Update the local schema so it describes the cast column before building
    # the expected examples.
    features["label"] = ClassLabel(names=["negative", "positive"])

    # Drop the column added by ``map`` so only the original columns are compared.
    observed = []
    for example in dataset:
        observed.append({key: value for key, value in example.items() if key != "id+1"})
    expected = [features.encode_example(example) for _, example in source_examples]

    assert observed == expected
def test_read(self):
    """Read splits through ``ReaderTest.read`` with several instruction types.

    A single ``"test[:33%]"`` string is read first. Then ``train`` and the
    first 33% of ``test`` are read twice: once from plain strings and once
    from ``Split.TRAIN`` plus a ``ReadInstruction``. Each dataset is checked
    for its first ``filename`` value, row count and column count; in the
    paired reads the ``split`` attribute is checked as well.
    """
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    for split_info in split_infos:
        split_dict.add(split_info)
    info = DatasetInfo(splits=split_dict)

    def check(ds, split_name, expected_rows, expected_split=None):
        # Shared assertions; the split checks only run when a split is given.
        self.assertEqual(ds["filename"][0], f"{name}-{split_name}")
        self.assertEqual(ds.num_rows, expected_rows)
        self.assertEqual(ds.num_columns, 1)
        if expected_split is not None:
            self.assertIsInstance(ds.split, NamedSplit)
            self.assertEqual(str(ds.split), expected_split)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        # Single string instruction.
        dset = Dataset(**reader.read(name, "test[:33%]", split_infos))
        check(dset, "test", 33)

        instructions1 = ["train", "test[:33%]"]
        instructions2 = [Split.TRAIN, ReadInstruction.from_spec("test[:33%]")]
        for instructions in (instructions1, instructions2):
            # Perform every read first, then build the datasets.
            datasets_kwargs = []
            for instruction in instructions:
                datasets_kwargs.append(reader.read(name, instruction, split_infos))
            train_dset, test_dset = [Dataset(**kwargs) for kwargs in datasets_kwargs]

            check(train_dset, "train", 100, expected_split="train")
            check(test_dset, "test", 33, expected_split="test[:33%]")
            del train_dset, test_dset
def test_feature_named_type(self):
    """Check that a column named ``_type`` survives an ``asdict``/``from_dict`` round trip.

    Reference: issue #1110
    """
    features = Features({"_type": Value("string")})
    info_as_dict = asdict(DatasetInfo(features=features))
    reloaded_features = Features.from_dict(info_as_dict["features"])
    assert features == reloaded_features
def _info(self) -> DatasetInfo:
    """Return a ``DatasetInfo`` declaring a single string column named ``text``."""
    schema = Features({"text": Value("string")})
    return DatasetInfo(features=schema)
def dataset(generate_examples_fn):
    """Build a ``train`` split ``IterableDataset`` from the generator with no extra kwargs."""
    examples = ExamplesIterable(generate_examples_fn, {})
    info = DatasetInfo(description="dummy")
    return IterableDataset(examples, info=info, split="train")
def _info(self):
    """Return a ``DatasetInfo`` declaring a single ``int8`` column named ``id``."""
    schema = Features({"id": Value("int8")})
    return DatasetInfo(features=schema)