def _prepare_split(self, split_generator, **kwargs): fname = "{}-{}.arrow".format(self.name, split_generator.name) writer = ArrowWriter(features=self.info.features, path=os.path.join(self._cache_dir, fname)) writer.write_batch({"text": ["foo"] * 100}) num_examples, num_bytes = writer.finalize() split_generator.split_info.num_examples = num_examples split_generator.split_info.num_bytes = num_bytes
def write(my_features, dummy_data, tmp_dir): writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) for key, record in dummy_data: example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize()
def test_array2d_nonspecific_shape(self): with tempfile.TemporaryDirectory() as tmp_dir: my_features = DEFAULT_FEATURES.copy() writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) for key, record in generate_examples( features=my_features, num_examples=1, ): example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) dataset.set_format("numpy") row = dataset[0] first_shape = row["image"].shape second_shape = row["text"].shape self.assertTrue( first_shape is not None and second_shape is not None, "need atleast 2 different shapes") self.assertEqual(len(first_shape), len(second_shape), "both shapes are supposed to be equal length") self.assertNotEqual(first_shape, second_shape, "shapes must not be the same") del dataset
def test_write_batch_schema(self): fields = {"col_1": pa.string(), "col_2": pa.int64()} output = pa.BufferOutputStream() writer = ArrowWriter(stream=output, schema=pa.schema(fields)) writer.write_batch({"col_1": ["foo", "bar"], "col_2": [1, 2]}) num_examples, num_bytes = writer.finalize() self.assertEqual(num_examples, 2) self.assertGreater(num_bytes, 0) self.assertEqual(writer._schema, pa.schema(fields, metadata=writer._schema.metadata)) self._check_output(output.getvalue())
def test_write_no_schema(self): output = pa.BufferOutputStream() writer = ArrowWriter(stream=output) writer.write({"col_1": "foo", "col_2": 1}) writer.write({"col_1": "bar", "col_2": 2}) num_examples, num_bytes = writer.finalize() self.assertEqual(num_examples, 2) self.assertGreater(num_bytes, 0) fields = {"col_1": pa.string(), "col_2": pa.int64()} self.assertEqual(writer._schema, pa.schema(fields, metadata=writer._schema.metadata)) self._check_output(output.getvalue())
def test_write_file(self): with tempfile.TemporaryDirectory() as tmp_dir: fields = {"col_1": pa.string(), "col_2": pa.int64()} output = os.path.join(tmp_dir, "test.arrow") writer = ArrowWriter(path=output, schema=pa.schema(fields)) writer.write_batch({"col_1": ["foo", "bar"], "col_2": [1, 2]}) num_examples, num_bytes = writer.finalize() self.assertEqual(num_examples, 2) self.assertGreater(num_bytes, 0) self.assertEqual( writer._schema, pa.schema(fields, metadata=writer._schema.metadata)) self._check_output(output)
def test_compatability_with_string_values(self): with tempfile.TemporaryDirectory() as tmp_dir: my_features = DEFAULT_FEATURES.copy() my_features["image_id"] = datasets.Value("string") writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) for key, record in generate_examples(features=my_features, num_examples=1): example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) self.assertTrue(isinstance(dataset[0]["image_id"], str), "image id must be of type string")
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path): cache_dir = str(tmp_path) dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy") os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for info_split in dummy_builder.info.splits: with ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"), features=Features({"text": Value("string")}), ) as writer: writer.write_batch({"text": ["foo"] * 10}) writer.finalize() with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase(): dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory) assert isinstance(dataset, expected_dataset_class) if isinstance(dataset, DatasetDict): assert list(dataset.keys()) == ["train", "test"] datasets = dataset.values() expected_splits = ["train", "test"] elif isinstance(dataset, Dataset): datasets = [dataset] expected_splits = [split] for dataset, expected_split in zip(datasets, expected_splits): assert dataset.split == expected_split assert len(dataset) == expected_dataset_length assert dataset.features == Features({"text": Value("string")}) dataset.column_names == ["text"]
def test_multiple_extensions_same_row(self): with tempfile.TemporaryDirectory() as tmp_dir: my_features = DEFAULT_FEATURES.copy() with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) as writer: for key, record in generate_examples(features=my_features, num_examples=1): example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) dataset.set_format("numpy") row = dataset[0] first_len = len(row["image"].shape) second_len = len(row["text"].shape) third_len = len(row["dynamic"].shape) self.assertEqual(first_len, 2, "use a sequence type if dim is < 2") self.assertEqual(second_len, 2, "use a sequence type if dim is < 2") self.assertEqual(third_len, 2, "use a sequence type if dim is < 2") del dataset
def test_write_batch(self, array_feature, shape_1, shape_2): with tempfile.TemporaryDirectory() as tmp_dir: my_features = self.get_features(array_feature, shape_1, shape_2) writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) dict_examples = self.get_dict_examples(shape_1, shape_2) dict_examples = my_features.encode_batch(dict_examples) writer.write_batch(dict_examples) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) self._check_getitem_output_type(dataset, shape_1, shape_2, dict_examples["matrix"][0])
def test_extension_indexing(self): with tempfile.TemporaryDirectory() as tmp_dir: my_features = DEFAULT_FEATURES.copy() my_features["explicit_ext"] = Array2D((3, 3), dtype="float32") writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) for key, record in generate_examples(features=my_features, num_examples=1): example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) dataset.set_format("numpy") data = dataset[0]["explicit_ext"] self.assertIsInstance( data, np.ndarray, "indexed extension must return numpy.ndarray")
def test_write(self, array_feature, shape_1, shape_2): with tempfile.TemporaryDirectory() as tmp_dir: my_features = self.get_features(array_feature, shape_1, shape_2) writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) my_examples = [ (0, self.get_dict_example_0(shape_1, shape_2)), (1, self.get_dict_example_1(shape_1, shape_2)), ] for key, record in my_examples: example = my_features.encode_example(record) writer.write(example) num_examples, num_bytes = writer.finalize() dataset = datasets.Dataset.from_file( os.path.join(tmp_dir, "beta.arrow")) self._check_getitem_output_type(dataset, shape_1, shape_2, my_examples[0][1]["matrix"])
def test_key_datatype(writer_batch_size): output = pa.BufferOutputStream() with ArrowWriter( stream=output, writer_batch_size=writer_batch_size, hash_salt="split_name", check_duplicates=True, ) as writer: with pytest.raises(InvalidKeyError): writer.write({"col_1": "foo", "col_2": 1}, key=[1, 2]) num_examples, num_bytes = writer.finalize()
def test_write_file(): with tempfile.TemporaryDirectory() as tmp_dir: fields = {"col_1": pa.string(), "col_2": pa.int64()} output = os.path.join(tmp_dir, "test.arrow") with ArrowWriter(path=output, schema=pa.schema(fields)) as writer: writer.write_batch({"col_1": ["foo", "bar"], "col_2": [1, 2]}) num_examples, num_bytes = writer.finalize() assert num_examples == 2 assert num_bytes > 0 assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata) _check_output(output, 1)
def test_arrow_writer_closes_stream(raise_exception, tmp_path): path = str(tmp_path / "dataset-train.arrow") try: with ArrowWriter(path=path) as writer: if raise_exception: raise pa.lib.ArrowInvalid() else: writer.stream.close() except pa.lib.ArrowInvalid: pass finally: assert writer.stream.closed
def test_write_table(fields, writer_batch_size): output = pa.BufferOutputStream() schema = pa.schema(fields) if fields else None with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer: writer.write_table(pa.Table.from_pydict({"col_1": ["foo", "bar"], "col_2": [1, 2]})) num_examples, num_bytes = writer.finalize() assert num_examples == 2 assert num_bytes > 0 if not fields: fields = {"col_1": pa.string(), "col_2": pa.int64()} assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata) _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)
def test_duplicate_keys(writer_batch_size): output = pa.BufferOutputStream() with ArrowWriter( stream=output, writer_batch_size=writer_batch_size, hash_salt="split_name", check_duplicates=True, ) as writer: with pytest.raises(DuplicatedKeysError): writer.write({"col_1": "foo", "col_2": 1}, key=10) writer.write({"col_1": "bar", "col_2": 2}, key=10) num_examples, num_bytes = writer.finalize()
def save(self, path: str) -> None: # Make all the directories to the path os.makedirs(path, exist_ok=True) # Taken from Huggingface datasets.Dataset # Prepare output buffer and batched writer in memory or on file if we update # the table writer = ArrowWriter( features=self.features, path=os.path.join(path, "data.arrow"), writer_batch_size=1000, ) # Loop over single examples or batches and write to buffer/file if examples # are to be updated for i, example in tqdm(enumerate(self)): writer.write(example) writer.finalize() # Write DatasetInfo self.info.write_to_directory(path) # Write split to file with open(os.path.join(path, "split.p"), "wb") as f: pickle.dump(self.split, f)
def test_write_with_keys(writer_batch_size): output = pa.BufferOutputStream() with ArrowWriter( stream=output, writer_batch_size=writer_batch_size, hash_salt="split_name", check_duplicates=True, ) as writer: writer.write({"col_1": "foo", "col_2": 1}, key=1) writer.write({"col_1": "bar", "col_2": 2}, key=2) num_examples, num_bytes = writer.finalize() assert num_examples == 2 assert num_bytes > 0 _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)
def test_write_with_features(): output = pa.BufferOutputStream() features = Features({"labels": ClassLabel(names=["neg", "pos"])}) with ArrowWriter(stream=output, features=features) as writer: writer.write({"labels": 0}) writer.write({"labels": 1}) num_examples, num_bytes = writer.finalize() assert num_examples == 2 assert num_bytes > 0 assert writer._schema == features.arrow_schema assert writer._schema.metadata == features.arrow_schema.metadata stream = pa.BufferReader(output.getvalue()) f = pa.ipc.open_stream(stream) pa_table: pa.Table = f.read_all() schema = pa_table.schema assert pa_table.num_rows == 2 assert schema == features.arrow_schema assert schema.metadata == features.arrow_schema.metadata assert features == Features.from_arrow_schema(schema)
def test_as_dataset(self): with tempfile.TemporaryDirectory() as tmp_dir: dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy") os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for split in dummy_builder.info.splits: writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 10}) writer.finalize() dsets = dummy_builder.as_dataset() self.assertIsInstance(dsets, DatasetDict) self.assertListEqual(list(dsets.keys()), ["train", "test"]) self.assertEqual(len(dsets["train"]), 10) self.assertEqual(len(dsets["test"]), 10) self.assertDictEqual(dsets["train"].features, Features({"text": Value("string")})) self.assertDictEqual(dsets["test"].features, Features({"text": Value("string")})) self.assertListEqual(dsets["train"].column_names, ["text"]) self.assertListEqual(dsets["test"].column_names, ["text"]) del dsets dset = dummy_builder.as_dataset("train") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 10) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset dset = dummy_builder.as_dataset("train+test[:30%]") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 13) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset
def generate_example_dataset(dataset_path, features, num_examples=100, seq_shapes=None): dummy_data = generate_examples(features, num_examples=num_examples, seq_shapes=seq_shapes) with ArrowWriter(features=features, path=dataset_path) as writer: for key, record in dummy_data: example = features.encode_example(record) writer.write(example) num_final_examples, num_bytes = writer.finalize() if not num_final_examples == num_examples: raise ValueError( f"Error writing the dataset, wrote {num_final_examples} examples but should have written {num_examples}." ) dataset = datasets.Dataset.from_file( filename=dataset_path, info=datasets.DatasetInfo(features=features)) return dataset
def test_as_dataset_with_post_process_with_index(self): def _post_process(self, dataset, resources_paths): if os.path.exists(resources_paths["index"]): dataset.load_faiss_index("my_index", resources_paths["index"]) return dataset else: dataset.add_faiss_index_from_external_arrays( external_arrays=np.ones((len(dataset), 8)), string_factory="Flat", index_name="my_index") dataset.save_faiss_index("my_index", resources_paths["index"]) return dataset def _post_processing_resources(self, split): return {"index": "Flat-{split}.faiss".format(split=split)} with tempfile.TemporaryDirectory() as tmp_dir: dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy") dummy_builder._post_process = types.MethodType( _post_process, dummy_builder) dummy_builder._post_processing_resources = types.MethodType( _post_processing_resources, dummy_builder) os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for split in dummy_builder.info.splits: writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 10}) writer.finalize() writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"small_dataset-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 2}) writer.finalize() dsets = dummy_builder.as_dataset() self.assertIsInstance(dsets, DatasetDict) self.assertListEqual(list(dsets.keys()), ["train", "test"]) self.assertEqual(len(dsets["train"]), 10) self.assertEqual(len(dsets["test"]), 10) self.assertDictEqual(dsets["train"].features, Features({"text": Value("string")})) self.assertDictEqual(dsets["test"].features, Features({"text": Value("string")})) self.assertListEqual(dsets["train"].column_names, ["text"]) self.assertListEqual(dsets["test"].column_names, ["text"]) self.assertListEqual(dsets["train"].list_indexes(), ["my_index"]) self.assertListEqual(dsets["test"].list_indexes(), ["my_index"]) self.assertGreater(dummy_builder.info.post_processing_size, 0) self.assertGreater( dummy_builder.info.post_processed.resources_checksums["train"] ["index"]["num_bytes"], 0) del dsets dset = dummy_builder.as_dataset("train") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 10) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) self.assertListEqual(dset.list_indexes(), ["my_index"]) del dset dset = dummy_builder.as_dataset("train+test[:30%]") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 13) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) self.assertListEqual(dset.list_indexes(), ["my_index"]) del dset
def test_as_dataset_with_post_process(self): def _post_process(self, dataset, resources_paths): def char_tokenize(example): return {"tokens": list(example["text"])} return dataset.map( char_tokenize, cache_file_name=resources_paths["tokenized_dataset"]) def _post_processing_resources(self, split): return { "tokenized_dataset": "tokenized_dataset-{split}.arrow".format(split=split) } with tempfile.TemporaryDirectory() as tmp_dir: dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy") dummy_builder.info.post_processed = PostProcessedInfo( features=Features({ "text": Value("string"), "tokens": [Value("string")] })) dummy_builder._post_process = types.MethodType( _post_process, dummy_builder) dummy_builder._post_processing_resources = types.MethodType( _post_processing_resources, dummy_builder) os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for split in dummy_builder.info.splits: writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 10}) writer.finalize() writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"tokenized_dataset-{split}.arrow"), features=Features({ "text": Value("string"), "tokens": [Value("string")] }), ) writer.write_batch({ "text": ["foo"] * 10, "tokens": [list("foo")] * 10 }) writer.finalize() dsets = dummy_builder.as_dataset() self.assertIsInstance(dsets, DatasetDict) self.assertListEqual(list(dsets.keys()), ["train", "test"]) self.assertEqual(len(dsets["train"]), 10) self.assertEqual(len(dsets["test"]), 10) self.assertDictEqual( dsets["train"].features, Features({ "text": Value("string"), "tokens": [Value("string")] })) self.assertDictEqual( dsets["test"].features, Features({ "text": Value("string"), "tokens": [Value("string")] })) self.assertListEqual(dsets["train"].column_names, ["text", "tokens"]) self.assertListEqual(dsets["test"].column_names, ["text", "tokens"]) del dsets dset = dummy_builder.as_dataset("train") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 10) self.assertDictEqual( dset.features, Features({ "text": Value("string"), "tokens": [Value("string")] })) self.assertListEqual(dset.column_names, ["text", "tokens"]) self.assertGreater(dummy_builder.info.post_processing_size, 0) self.assertGreater( dummy_builder.info.post_processed.resources_checksums["train"] ["tokenized_dataset"]["num_bytes"], 0) del dset dset = dummy_builder.as_dataset("train+test[:30%]") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 13) self.assertDictEqual( dset.features, Features({ "text": Value("string"), "tokens": [Value("string")] })) self.assertListEqual(dset.column_names, ["text", "tokens"]) del dset def _post_process(self, dataset, resources_paths): return dataset.select([0, 1], keep_in_memory=True) with tempfile.TemporaryDirectory() as tmp_dir: dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy") dummy_builder._post_process = types.MethodType( _post_process, dummy_builder) os.makedirs(dummy_builder.cache_dir) dummy_builder.info.splits = SplitDict() dummy_builder.info.splits.add(SplitInfo("train", num_examples=10)) dummy_builder.info.splits.add(SplitInfo("test", num_examples=10)) for split in dummy_builder.info.splits: writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 10}) writer.finalize() writer = ArrowWriter( path=os.path.join(dummy_builder.cache_dir, f"small_dataset-{split}.arrow"), features=Features({"text": Value("string")}), ) writer.write_batch({"text": ["foo"] * 2}) writer.finalize() dsets = dummy_builder.as_dataset() self.assertIsInstance(dsets, DatasetDict) self.assertListEqual(list(dsets.keys()), ["train", "test"]) self.assertEqual(len(dsets["train"]), 2) self.assertEqual(len(dsets["test"]), 2) self.assertDictEqual(dsets["train"].features, Features({"text": Value("string")})) self.assertDictEqual(dsets["test"].features, Features({"text": Value("string")})) self.assertListEqual(dsets["train"].column_names, ["text"]) self.assertListEqual(dsets["test"].column_names, ["text"]) del dsets dset = dummy_builder.as_dataset("train") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 2) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset dset = dummy_builder.as_dataset("train+test[:30%]") self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 2) self.assertDictEqual(dset.features, Features({"text": Value("string")})) self.assertListEqual(dset.column_names, ["text"]) del dset