def test_dataset_to_zarr(self):
    """Round-trips a dask-chunked dataset through DatasetToZarr; unchunked input raises."""
    source = xarray.Dataset(
        {'foo': ('x', np.arange(0, 60, 10))},
        coords={'x': np.arange(6)},
        attrs={'meta': 'data'},
    )
    chunked = source.chunk({'x': 3})

    # Happy path: write the chunked dataset and read it back identically.
    output_path = self.create_tempdir().full_path
    _ = test_util.EagerPipeline() | xarray_beam.DatasetToZarr(chunked, output_path)
    round_tripped = xarray.open_zarr(output_path, consolidated=True)
    xarray.testing.assert_identical(round_tripped, source)

    # Error path: a dataset with no dask-chunked variables is rejected.
    other_path = self.create_tempdir().full_path
    with self.assertRaisesRegex(
        ValueError,
        'template does not have any variables chunked with Dask',
    ):
        _ = test_util.EagerPipeline() | xarray_beam.DatasetToZarr(source, other_path)
def test_dataset_to_chunks_whole(self):
    """A -1 chunk size and an empty chunks dict both yield one whole-dataset chunk."""
    ds = xarray.Dataset({'foo': ('x', np.arange(6))})
    expected = [(xbeam.Key({'x': 0}), ds)]
    # Both chunk specs mean "do not split along x".
    for chunk_spec in ({'x': -1}, {}):
        result = (
            test_util.EagerPipeline()
            | xbeam.DatasetToChunks(ds, chunks=chunk_spec)
        )
        self.assertIdenticalChunks(result, expected)
def test_validate_chunks_compose_in_pipeline(self):
    """ValidateEachChunk composes after DatasetToChunks and passes chunks through."""
    ds = xarray.Dataset({'foo': ('x', np.arange(6))})
    result = (
        test_util.EagerPipeline()
        | xbeam.DatasetToChunks(ds, chunks={'x': -1})
        | xbeam.ValidateEachChunk()
    )
    self.assertIdenticalChunks(result, [(xbeam.Key({'x': 0}), ds)])
def test_multiple_datasets_with_subchunks_returns_multiple_datasets(
    self, time_step: int, longitude_step: int, chunks: Dict[str, int],
):
    """Each file in a multi-file pattern is additionally split per `chunks`."""
    expected = []
    time_starts = range(0, 360 * 4, time_step)
    longitude_starts = range(0, 144, longitude_step)
    for t_start, lon_start in itertools.product(time_starts, longitude_starts):
        key = core.Key({"latitude": 0, "longitude": lon_start, "time": t_start})
        piece = self.test_data.isel(
            time=slice(t_start, t_start + time_step),
            longitude=slice(lon_start, lon_start + longitude_step),
        )
        expected.extend(split_chunks(key, piece, chunks))
    with self.multifile_pattern(time_step, longitude_step) as pattern:
        actual = test_util.EagerPipeline() | FilePatternToChunks(
            pattern, chunks=chunks)
        self.assertAllCloseChunks(actual, expected)
def test_dataset_to_chunks_multiple(self):
    """Splitting along x works via dask chunks, with num_threads, and via explicit chunks."""
    ds = xarray.Dataset({'foo': ('x', np.arange(6))})
    expected = [
        (xbeam.Key({'x': 0}), ds.head(x=3)),
        (xbeam.Key({'x': 3}), ds.tail(x=3)),
    ]
    # Three equivalent ways of asking for size-3 chunks along x.
    for transform in (
        xbeam.DatasetToChunks(ds.chunk({'x': 3})),
        xbeam.DatasetToChunks(ds.chunk({'x': 3}), num_threads=2),
        xbeam.DatasetToChunks(ds, chunks={'x': 3}),
    ):
        actual = test_util.EagerPipeline() | transform
        self.assertIdenticalChunks(actual, expected)
def test_returns_single_dataset(self):
    """A single-file pattern produces exactly one chunk keyed at the origin."""
    origin_key = core.Key({"time": 0, "latitude": 0, "longitude": 0})
    expected = [(origin_key, self.test_data)]
    with self.pattern_from_testdata() as pattern:
        actual = test_util.EagerPipeline() | FilePatternToChunks(pattern)
        self.assertAllCloseChunks(actual, expected)
def test_single_subchunks_returns_multiple_datasets(self):
    """Sub-chunking one dimension yields one chunk per longitude slab."""
    with self.pattern_from_testdata() as pattern:
        actual = (
            test_util.EagerPipeline()
            | FilePatternToChunks(pattern, chunks={"longitude": 48})
        )
        expected = []
        for lon_start in range(0, 144, 48):
            key = core.Key({"time": 0, "latitude": 0, "longitude": lon_start})
            slab = self.test_data.isel(longitude=slice(lon_start, lon_start + 48))
            expected.append((key, slab))
        self.assertAllCloseChunks(actual, expected)
def test_dataset_to_chunks_vars(self):
    """With split_vars=True, each variable in each chunk gets its own keyed entry."""
    ds = xarray.Dataset({
        'foo': ('x', np.arange(6)),
        'bar': ('x', -np.arange(6)),
    })
    # For every spatial chunk, emit one (key, single-variable dataset) pair
    # per variable, in variable order foo then bar.
    expected = []
    for offset, part in ((0, ds.head(x=3)), (3, ds.tail(x=3))):
        for var in ('foo', 'bar'):
            expected.append((xbeam.Key({'x': offset}, {var}), part[[var]]))
    actual = (
        test_util.EagerPipeline()
        | xbeam.DatasetToChunks(ds, chunks={'x': 3}, split_vars=True)
    )
    self.assertIdenticalChunks(actual, expected)
def test_multiple_datasets_returns_multiple_datasets(
    self, time_step: int, longitude_step: int):
    """Each file in a multi-file pattern becomes its own chunk, keyed by offsets."""
    expected = []
    for t_start, lon_start in itertools.product(
        range(0, 360 * 4, time_step), range(0, 144, longitude_step)):
        key = core.Key({"time": t_start, "latitude": 0, "longitude": lon_start})
        piece = self.test_data.isel(
            time=slice(t_start, t_start + time_step),
            longitude=slice(lon_start, lon_start + longitude_step))
        expected.append((key, piece))
    with self.multifile_pattern(time_step, longitude_step) as pattern:
        actual = test_util.EagerPipeline() | FilePatternToChunks(pattern)
        self.assertAllCloseChunks(actual, expected)
def test_multiple_subchunks_returns_multiple_datasets(self):
    """Chunking two dimensions yields the cartesian product of longitude/latitude slabs."""
    with self.pattern_from_testdata() as pattern:
        actual = (
            test_util.EagerPipeline()
            | FilePatternToChunks(
                pattern, chunks={"longitude": 48, "latitude": 24})
        )
        expected = []
        for lon_start, lat_start in itertools.product(
            range(0, 144, 48), range(0, 73, 24)):
            key = core.Key({
                "time": 0,
                "longitude": lon_start,
                "latitude": lat_start,
            })
            slab = self.test_data.isel(
                longitude=slice(lon_start, lon_start + 48),
                latitude=slice(lat_start, lat_start + 24))
            expected.append((key, slab))
        self.assertAllCloseChunks(actual, expected)