def test_array_xd_with_none():
    """Check NumPy extraction of an ``Array2D`` column containing a ``None`` row.

    Two configurations are exercised with the same three-row input
    ``[dummy_array, None, dummy_array]``:

    * Fixed shape ``(2, 2)``: the extracted column is expected to be a
      ``float64`` array of shape ``(3, 2, 2)`` whose middle row is all NaN.
    * Dynamic shape ``(None, 2)``: the extracted column is expected to be an
      ``object`` array of shape ``(3,)`` whose middle element is a single NaN.
    """
    # Fixed shape
    features = datasets.Features(
        {"foo": datasets.Array2D(dtype="int32", shape=(2, 2))}
    )
    dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
    dataset = datasets.Dataset.from_dict(
        {"foo": [dummy_array, None, dummy_array]}, features=features
    )
    arr = NumpyArrowExtractor().extract_column(dataset._data)
    # Separate asserts so a failure pinpoints which property is wrong.
    assert isinstance(arr, np.ndarray)
    assert arr.dtype == np.float64
    assert arr.shape == (3, 2, 2)
    assert np.allclose(arr[0], dummy_array)
    assert np.allclose(arr[2], dummy_array)
    assert np.all(np.isnan(arr[1]))  # broadcasted np.nan - use np.all

    # Dynamic shape
    features = datasets.Features(
        {"foo": datasets.Array2D(dtype="int32", shape=(None, 2))}
    )
    dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
    dataset = datasets.Dataset.from_dict(
        {"foo": [dummy_array, None, dummy_array]}, features=features
    )
    arr = NumpyArrowExtractor().extract_column(dataset._data)
    assert isinstance(arr, np.ndarray)
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` denotes the same dtype on every NumPy version.
    assert arr.dtype == object
    assert arr.shape == (3,)
    np.testing.assert_equal(arr[0], dummy_array)
    np.testing.assert_equal(arr[2], dummy_array)
    assert np.isnan(arr[1])  # a single np.nan value - np.all not needed
def test_numpy_extractor(self):
    """Compare row, column and batch extraction against the dummy table data.

    The table comes from ``self._create_dummy_table()``; expected values are
    built from the module-level ``_COL_A``, ``_COL_B`` and ``_COL_C``.
    """
    table = self._create_dummy_table()
    numpy_extractor = NumpyArrowExtractor()

    # Single row: compared with the first entry of each column constant.
    extracted_row = numpy_extractor.extract_row(table)
    expected_row = {
        "a": _COL_A[0],
        "b": _COL_B[0],
        "c": np.array(_COL_C[0]),
    }
    np.testing.assert_equal(extracted_row, expected_row)

    # Single column: compared with `_COL_A` as an array.
    extracted_col = numpy_extractor.extract_column(table)
    expected_col = np.array(_COL_A)
    np.testing.assert_equal(extracted_col, expected_col)

    # Whole batch: every column constant converted to an array.
    extracted_batch = numpy_extractor.extract_batch(table)
    expected_batch = {
        "a": np.array(_COL_A),
        "b": np.array(_COL_B),
        "c": np.array(_COL_C),
    }
    np.testing.assert_equal(extracted_batch, expected_batch)
def test_numpy_extractor_np_array_kwargs(self):
    """Check that ``dtype=np.float16`` given to the extractor shows up in results.

    Column ``"b"`` is dropped from the dummy table before extraction; the
    row, column and batch outputs are then checked for a float16 dtype.
    """
    table = self._create_dummy_table().drop(["b"])
    half_extractor = NumpyArrowExtractor(dtype=np.float16)
    half = np.dtype(np.float16)

    extracted_row = half_extractor.extract_row(table)
    self.assertEqual(extracted_row["c"].dtype, half)

    extracted_col = half_extractor.extract_column(table)
    self.assertEqual(extracted_col.dtype, np.float16)

    extracted_batch = half_extractor.extract_batch(table)
    for key in ("a", "c"):
        self.assertEqual(extracted_batch[key].dtype, half)
def test_array_xd_numpy_arrow_extractor(dtype, dummy_value):
    """Extract a one-row ``Array2D`` column and compare it with a NumPy array.

    Args:
        dtype: dtype name used both for the ``Array2D`` feature and for the
            expected NumPy array.
        dummy_value: value repeated to fill the single 2x2 entry.
    """
    schema = datasets.Features(
        {"foo": datasets.Array2D(dtype=dtype, shape=(2, 2))}
    )
    column_data = [[[dummy_value] * 2] * 2]
    ds = datasets.Dataset.from_dict({"foo": column_data}, features=schema)

    extracted = NumpyArrowExtractor().extract_column(ds._data)

    assert isinstance(extracted, np.ndarray)
    expected = np.array([[[dummy_value] * 2] * 2], dtype=np.dtype(dtype))
    np.testing.assert_equal(extracted, expected)