def test_examples_iterable_shuffle_data_sources(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {"filepaths": ["0.txt", "1.txt"]}) ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(40)) expected = list(generate_examples_fn( filepaths=["1.txt", "0.txt"])) # shuffle the filepaths assert list(ex_iterable) == expected
def test_randomly_cycling_multi_sources_examples_iterable(generate_examples_fn, probabilities): seed = 42 generator = np.random.default_rng(seed) ex_iterable1 = ExamplesIterable(generate_examples_fn, {"text": "foo"}) ex_iterable2 = ExamplesIterable(generate_examples_fn, {"text": "bar"}) ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable( [ex_iterable1, ex_iterable2], generator=generator, probabilities=probabilities ) # The source used randomly changes at each example. It stops when one of the iterators is empty. rng = deepcopy(generator) iterators = (generate_examples_fn(text="foo"), generate_examples_fn(text="bar")) indices_iterator = RandomlyCyclingMultiSourcesExamplesIterable._iter_random_indices( rng, len(iterators), p=probabilities ) expected = [] for i in indices_iterator: for key, example in iterators[i]: expected.append((key, example)) break else: break assert next(iter(ex_iterable)) == expected[0] assert list(ex_iterable) == expected
def test_cycling_multi_sources_examples_iterable(generate_examples_fn): ex_iterable1 = ExamplesIterable(generate_examples_fn, {"text": "foo"}) ex_iterable2 = ExamplesIterable(generate_examples_fn, {"text": "bar"}) ex_iterable = CyclingMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2]) expected = list(chain(*zip(generate_examples_fn(text="foo"), generate_examples_fn(text="bar")))) assert next(iter(ex_iterable)) == expected[0] assert list(ex_iterable) == expected assert all((x["id"], x["text"]) == (i // 2, "bar" if i % 2 else "foo") for i, (_, x) in enumerate(ex_iterable))
def test_iterable_dataset_shuffle(dataset: IterableDataset, generate_examples_fn, seed, epoch): buffer_size = 3 dataset = deepcopy(dataset) dataset._ex_iterable.kwargs["filepaths"] = ["0.txt", "1.txt"] dataset = dataset.shuffle(seed, buffer_size=buffer_size) assert isinstance(dataset._shuffling, ShufflingConfig) assert isinstance(dataset._shuffling.generator, np.random.Generator) assert is_rng_equal(dataset._shuffling.generator, np.random.default_rng(seed)) # Effective seed is sum of seed and epoch if epoch is None or epoch == 0: effective_seed = seed else: dataset.set_epoch(epoch) effective_seed = np.random.default_rng(seed).integers(0, 1 << 63) - epoch # Shuffling adds a shuffle buffer expected_first_example_index = next( iter(BufferShuffledExamplesIterable._iter_random_indices(np.random.default_rng(effective_seed), buffer_size)) ) assert isinstance(dataset._ex_iterable, BufferShuffledExamplesIterable) # It also shuffles the underlying examples iterable expected_ex_iterable = ExamplesIterable( generate_examples_fn, {"filepaths": ["0.txt", "1.txt"]} ).shuffle_data_sources(np.random.default_rng(effective_seed)) assert isinstance(dataset._ex_iterable.ex_iterable, ExamplesIterable) assert next(iter(dataset)) == list(islice(expected_ex_iterable, expected_first_example_index + 1))[-1][1]
def test_mapped_examples_iterable(generate_examples_fn, n, func, batch_size): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size) all_examples = list(generate_examples_fn(n=n)) if batch_size is None: expected = [(key, func(x)) for key, x in all_examples] else: # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function expected_examples_per_batch = [ list(_batch_to_examples(func(_examples_to_batch([x for _, x in all_examples[i : i + batch_size]])))) for i in range(0, len(all_examples), batch_size) ] # The new key is the concatenation of the keys of each example in the batch expected_keys_per_batch = [ ["_".join(key for key, _ in all_examples[i : i + batch_size])] * len(examples) for i, examples in zip(range(0, len(all_examples), batch_size), expected_examples_per_batch) ] # Combine keys and examples expected = [ (key, example) for expected_keys, expected_examples in zip(expected_keys_per_batch, expected_examples_per_batch) for key, example in zip(expected_keys, expected_examples) ] assert next(iter(ex_iterable)) == expected[0] assert list(ex_iterable) == expected
def test_take_examples_iterable(generate_examples_fn): total, count = 10, 2 base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": total}) take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count) expected = list(generate_examples_fn(n=total))[:count] assert list(take_ex_iterable) == expected assert take_ex_iterable.shuffle_data_sources(42) is take_ex_iterable, "skip examples makes the shards order fixed"
def test_mapped_examples_iterable_with_indices(generate_examples_fn, n, func, batch_size): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, with_indices=True) all_examples = [x for _, x in generate_examples_fn(n=n)] if batch_size is None: expected = [{ **x, **func(x, idx) } for idx, x in enumerate(all_examples)] else: # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function all_transformed_examples = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset:batch_offset + batch_size] batch = _examples_to_batch(examples) indices = list(range(batch_offset, batch_offset + len(examples))) transformed_batch = func(batch, indices) all_transformed_examples.extend( _batch_to_examples(transformed_batch)) expected = _examples_to_batch(all_examples) expected.update(_examples_to_batch(all_transformed_examples)) expected = list(_batch_to_examples(expected)) assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected
def test_iterable_dataset_cast(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10}) features = Features({"id": Value("int64"), "label": Value("int64")}) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) new_features = Features({"id": Value("int64"), "label": Value("bool")}) casted_dataset = dataset.cast(new_features) assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
def test_filtered_examples_iterable_input_columns(generate_examples_fn, n, func, batch_size, input_columns): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = FilteredExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, input_columns=input_columns) all_examples = [x for _, x in generate_examples_fn(n=n)] columns_to_input = input_columns if isinstance(input_columns, list) else [input_columns] if batch_size is None: expected = [ x for x in all_examples if func(*[x[col] for col in columns_to_input]) ] else: # For batched filter we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function expected = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset:batch_offset + batch_size] batch = _examples_to_batch(examples) mask = func(*[batch[col] for col in columns_to_input]) expected.extend( [x for x, to_keep in zip(examples, mask) if to_keep]) assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected
def test_mapped_examples_iterable_input_columns(generate_examples_fn, n, func, batch_size, input_columns): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, input_columns=input_columns) all_examples = [x for _, x in generate_examples_fn(n=n)] columns_to_input = input_columns if isinstance(input_columns, list) else [input_columns] if batch_size is None: expected = [{ **x, **func(*[x[col] for col in columns_to_input]) } for x in all_examples] else: # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function all_transformed_examples = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset:batch_offset + batch_size] batch = _examples_to_batch(examples) transformed_batch = func(*[batch[col] for col in columns_to_input]) all_transformed_examples.extend( _batch_to_examples(transformed_batch)) expected = _examples_to_batch(all_examples) expected.update(_examples_to_batch(all_transformed_examples)) expected = list(_batch_to_examples(expected)) assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected
def test_buffer_shuffled_examples_iterable(generate_examples_fn, seed): n, buffer_size = 100, 30 generator = np.random.default_rng(seed) base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = BufferShuffledExamplesIterable(base_ex_iterable, buffer_size=buffer_size, generator=generator) rng = deepcopy(generator) expected_indices_used_for_shuffling = list( islice(BufferShuffledExamplesIterable._iter_random_indices(rng, buffer_size=buffer_size), n - buffer_size) ) # indices to pick in the shuffle buffer should all be in the right range assert all(0 <= index_to_pick < buffer_size for index_to_pick in expected_indices_used_for_shuffling) # it should be random indices assert expected_indices_used_for_shuffling != list(range(buffer_size)) # The final order of examples is the result of a shuffle buffer. all_examples = list(generate_examples_fn(n=n)) # We create a buffer and we pick random examples from it. buffer, rest = all_examples[:buffer_size], all_examples[buffer_size:] expected = [] for i, index_to_pick in enumerate(expected_indices_used_for_shuffling): expected.append(buffer[index_to_pick]) # The picked examples are directly replaced by the next examples from the iterable. buffer[index_to_pick] = rest.pop(0) # Once we have reached the end of the iterable, we shuffle the buffer and return the remaining examples. rng.shuffle(buffer) expected += buffer assert next(iter(ex_iterable)) == expected[0] assert list(ex_iterable) == expected assert sorted(list(ex_iterable)) == sorted(all_examples)
def test_skip_examples_iterable(generate_examples_fn): total, count = 10, 2 base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": total}) skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count) expected = list(generate_examples_fn(n=total))[count:] assert list(skip_ex_iterable) == expected assert (skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable), "skip examples makes the shards order fixed"
def test_examples_iterable_shuffle_shards_and_metadata(): def gen(filepaths, all_metadata): for i, (filepath, metadata) in enumerate(zip(filepaths, all_metadata)): yield i, {"filepath": filepath, "metadata": metadata} ex_iterable = ExamplesIterable( gen, { "filepaths": [f"{i}.txt" for i in range(100)], "all_metadata": [{"id": str(i)} for i in range(100)], }, ) ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(42)) out = list(ex_iterable) filepaths_ids = [x["filepath"].split(".")[0] for _, x in out] metadata_ids = [x["metadata"]["id"] for _, x in out] assert filepaths_ids == metadata_ids, "entangled lists of shards/metadata should be shuffled the same way"
def test_iterable_dataset_info(generate_examples_fn): info = DatasetInfo(description="desc", citation="@article{}", size_in_bytes=42) ex_iterable = ExamplesIterable(generate_examples_fn, {}) dataset = IterableDataset(ex_iterable, info=info) assert dataset.info == info assert dataset.description == info.description assert dataset.citation == info.citation assert dataset.size_in_bytes == info.size_in_bytes
def test_iterable_dataset_features(generate_examples_fn, features): ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0}) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) if features: expected = [features.encode_example(x) for _, x in ex_iterable] else: expected = [x for _, x in ex_iterable] assert list(dataset) == expected
def test_iterable_dataset_factory_torch_integration(generate_examples_fn): import torch ex_iterable = ExamplesIterable(generate_examples_fn, {}) dataset = iterable_dataset(ex_iterable, format_type="torch") assert isinstance(dataset, IterableDataset) assert isinstance(dataset, torch.utils.data.IterableDataset) assert dataset._format_type == "torch" assert dataset._ex_iterable is ex_iterable
def test_iterable_dataset_shuffle_after_skip_or_take(generate_examples_fn, method): seed = 42 n, n_shards = 3, 10 count = 7 ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n, "filepaths": [f"{i}.txt" for i in range(n_shards)]}) dataset = IterableDataset(ex_iterable) dataset = dataset.skip(n) if method == "skip" else dataset.take(count) shuffled_dataset = dataset.shuffle(seed, buffer_size=DEFAULT_N_EXAMPLES) # shuffling a skip/take dataset should keep the same examples and don't shuffle the shards key = lambda x: f"{x['filepath']}_{x['id']}" # noqa: E731 assert sorted(dataset, key=key) == sorted(shuffled_dataset, key=key)
def test_examples_iterable_with_kwargs(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, { "filepaths": ["0.txt", "1.txt"], "split": "train" }) expected = list( generate_examples_fn(filepaths=["0.txt", "1.txt"], split="train")) assert list(ex_iterable) == expected assert all("split" in ex for _, ex in ex_iterable) assert sorted({ex["filepath"] for _, ex in ex_iterable}) == ["0.txt", "1.txt"]
def dataset_with_several_columns(generate_examples_fn): ex_iterable = ExamplesIterable( generate_examples_fn, { "filepath": ["data0.txt", "data1.txt", "data2.txt"], "metadata": { "sources": ["https://foo.bar"] } }, ) return IterableDataset(ex_iterable, info=DatasetInfo(description="dummy"), split="train")
def test_interleave_datasets_with_features(dataset: IterableDataset, generate_examples_fn): features = Features( { "id": Value("int64"), "label": ClassLabel(names=["negative", "positive"]), } ) ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0}) dataset_with_features = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) merged_dataset = interleave_datasets([dataset, dataset_with_features], probabilities=[0, 1]) assert isinstance(merged_dataset._ex_iterable, CyclingMultiSourcesExamplesIterable) assert isinstance(merged_dataset._ex_iterable.ex_iterables[1], TypedExamplesIterable) assert merged_dataset._ex_iterable.ex_iterables[1].features == features assert next(iter(merged_dataset)) == next(iter(dataset_with_features))
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn): # https://github.com/huggingface/datasets/issues/3505 ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"}) features = Features( { "id": Value("int64"), "label": Value("string"), } ) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"])) dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x}) assert isinstance(dataset._ex_iterable, MappedExamplesIterable) features["label"] = ClassLabel(names=["negative", "positive"]) assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [ features.encode_example(ex) for _, ex in ex_iterable ]
def test_filtered_examples_iterable_with_indices(generate_examples_fn, n, func, batch_size): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = FilteredExamplesIterable( base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, with_indices=True ) all_examples = [x for _, x in generate_examples_fn(n=n)] if batch_size is None: expected = [x for idx, x in enumerate(all_examples) if func(x, idx)] else: # For batched filter we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function expected = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset : batch_offset + batch_size] batch = _examples_to_batch(examples) indices = list(range(batch_offset, batch_offset + len(examples))) mask = func(batch, indices) expected.extend([x for x, to_keep in zip(examples, mask) if to_keep]) assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected
def test_mapped_examples_iterable_drop_last_batch(generate_examples_fn, n, func, batch_size): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n}) ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, drop_last_batch=True) all_examples = [x for _, x in generate_examples_fn(n=n)] is_empty = False if batch_size is None: # `drop_last_batch` has no effect here expected = [{**x, **func(x)} for x in all_examples] else: # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function all_transformed_examples = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset:batch_offset + batch_size] if len(examples) < batch_size: # ignore last batch break batch = _examples_to_batch(examples) transformed_batch = func(batch) all_transformed_examples.extend( _batch_to_examples(transformed_batch)) all_examples = all_examples if n % batch_size == 0 else all_examples[:n // batch_size * batch_size] if all_examples: expected = _examples_to_batch(all_examples) expected.update(_examples_to_batch(all_transformed_examples)) expected = list(_batch_to_examples(expected)) else: is_empty = True if not is_empty: assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected else: with pytest.raises(StopIteration): next(iter(ex_iterable))
def test_mapped_examples_iterable_remove_columns(generate_examples_fn, n, func, batch_size, remove_columns): base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n, "extra_column": "foo"}) ex_iterable = MappedExamplesIterable( base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, remove_columns=remove_columns ) all_examples = [x for _, x in generate_examples_fn(n=n)] columns_to_remove = remove_columns if isinstance(remove_columns, list) else [remove_columns] if batch_size is None: expected = [{**{k: v for k, v in x.items() if k not in columns_to_remove}, **func(x)} for x in all_examples] else: # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function all_transformed_examples = [] for batch_offset in range(0, len(all_examples), batch_size): examples = all_examples[batch_offset : batch_offset + batch_size] batch = _examples_to_batch(examples) transformed_batch = func(batch) all_transformed_examples.extend(_batch_to_examples(transformed_batch)) expected = {k: v for k, v in _examples_to_batch(all_examples).items() if k not in columns_to_remove} expected.update(_examples_to_batch(all_transformed_examples)) expected = list(_batch_to_examples(expected)) assert next(iter(ex_iterable))[1] == expected[0] assert list(x for _, x in ex_iterable) == expected
def test_examples_iterable(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {}) expected = list(generate_examples_fn()) assert next(iter(ex_iterable)) == expected[0] assert list(ex_iterable) == expected
def test_iterable_dataset(generate_examples_fn): dataset = IterableDataset(ExamplesIterable(generate_examples_fn, {})) expected = [x for _, x in generate_examples_fn()] assert next(iter(dataset)) == expected[0] assert list(dataset) == expected
def test_iterable_dataset_factory(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {}) dataset = iterable_dataset(ex_iterable) assert isinstance(dataset, IterableDataset) assert dataset._ex_iterable is ex_iterable
def dataset(generate_examples_fn): ex_iterable = ExamplesIterable(generate_examples_fn, {}) return IterableDataset(ex_iterable, info=DatasetInfo(description="dummy"), split="train")