def main(argv):
    """Run the Beam pipeline that writes per-key means of the input to Zarr.

    Opens the Zarr store at ``INPUT_PATH.value``, splits it into chunks of a
    single time step, re-keys every chunk with ``rekey_chunk_on_month_hour``,
    averages the chunks that share a key and writes the result to
    ``OUTPUT_PATH.value``.

    Args:
        argv: Command-line arguments forwarded to ``beam.Pipeline``.
    """
    # With chunks=None Xarray uses its own lazy loading instead of Dask, which
    # results in much less data being passed from the launch script to workers.
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value,
        chunks=None,
        consolidated=True,
    )

    # Lazy "template" of the output, so the Zarr store can be set up before the
    # pipeline runs. The outputs are small, so a template is not really needed
    # (the template argument of ChunksToZarr is optional), but supplying one
    # makes the pipeline slightly more efficient.
    max_month = source_dataset.time.dt.month.max().item()  # normally 12
    months = np.arange(1, max_month + 1)
    hours = np.arange(24)
    lazy_zeros = xarray.zeros_like(source_dataset.chunk())
    template = lazy_zeros.isel(time=0, drop=True).expand_dims(
        month=months,
        hour=hours,
    )
    output_chunks = {'hour': 1, 'month': 1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        # Read 31 time steps at a time, then break them into single steps so
        # that each step can be keyed on its own.
        coarse_chunks = root | xbeam.DatasetToChunks(
            source_dataset, {'time': 31}
        )
        step_chunks = coarse_chunks | xbeam.SplitChunks({'time': 1})
        keyed_chunks = step_chunks | beam.MapTuple(rekey_chunk_on_month_hour)
        means = keyed_chunks | xbeam.Mean.PerKey()
        means | xbeam.ChunksToZarr(OUTPUT_PATH.value, template, output_chunks)
def test_chunks_to_zarr(self):
    """Check ChunksToZarr round-trips a dataset and rejects bad templates."""
    dataset = xarray.Dataset(
        {'foo': ('x', np.arange(0, 60, 10))},
        coords={'x': np.arange(6)},
    )
    chunked = dataset.chunk()
    inputs = [(xarray_beam.ChunkKey({'x': 0}), dataset)]

    def write_and_reopen(*args, **kwargs):
        # Write `inputs` to a fresh directory and open what was written.
        target = self.create_tempdir().full_path
        inputs | xarray_beam.ChunksToZarr(target, *args, **kwargs)
        return xarray.open_zarr(target, consolidated=True)

    # --- Successful round trips -------------------------------------------
    with self.subTest('no template'):
        reopened = write_and_reopen()
        xarray.testing.assert_identical(dataset, reopened)

    with self.subTest('with template'):
        reopened = write_and_reopen(chunked)
        xarray.testing.assert_identical(dataset, reopened)

    with self.subTest('with zarr_chunks and with template'):
        reopened = write_and_reopen(chunked, {'x': 3})
        xarray.testing.assert_identical(dataset, reopened)
        self.assertEqual(reopened.chunks, {'x': (3, 3)})

    with self.subTest('with zarr_chunks and no template'):
        reopened = write_and_reopen(zarr_chunks={'x': 3})
        xarray.testing.assert_identical(dataset, reopened)
        self.assertEqual(reopened.chunks, {'x': (3, 3)})

    # --- Error cases ------------------------------------------------------
    # A template that was never chunked is rejected when the transform is
    # constructed; nothing is piped into it here.
    target = self.create_tempdir().full_path
    with self.assertRaisesRegex(
        ValueError,
        'template does not have any variables chunked with Dask',
    ):
        xarray_beam.ChunksToZarr(target, dataset)

    # A template whose 'x' coordinate differs from the one in the chunk.
    target = self.create_tempdir().full_path
    bad_template = chunked.assign_coords(x=np.zeros(6))
    with self.assertRaisesRegex(
        ValueError,
        'template and chunk indexes do not match',
    ):
        inputs | xarray_beam.ChunksToZarr(target, bad_template)

    # A chunk carrying an extra 'z' dimension, written against the same
    # template as the previous check.
    inputs_with_new_dim = [
        (xarray_beam.ChunkKey({'x': 0}), dataset.expand_dims(z=[1, 2])),
    ]
    target = self.create_tempdir().full_path
    with self.assertRaisesRegex(
        ValueError,
        'unexpected new indexes found in chunk',
    ):
        inputs_with_new_dim | xarray_beam.ChunksToZarr(target, bad_template)
def test_multiple_vars_chunks_to_zarr(self):
    """Check that chunks holding one variable each are written as one dataset."""
    dataset = xarray.Dataset(
        {
            'foo': ('x', np.arange(0, 60, 10)),
            'bar': ('x', -np.arange(6)),
        },
        coords={'x': np.arange(6)},
    )
    chunked = dataset.chunk()

    # One (key, chunk) pair per variable; each key names the single variable
    # its chunk contains.
    inputs = [
        (xbeam.Key({'x': 0}, {name}), dataset[[name]])
        for name in ('foo', 'bar')
    ]

    # (subtest label, extra positional arguments for ChunksToZarr)
    cases = [
        ('no template', ()),
        ('with template', (chunked,)),
    ]
    for label, extra_args in cases:
        with self.subTest(label):
            target = self.create_tempdir().full_path
            inputs | xbeam.ChunksToZarr(target, *extra_args)
            reopened = xarray.open_zarr(target, consolidated=True)
            xarray.testing.assert_identical(dataset, reopened)
def test_2d_chunks_to_zarr(self, coords):
    """Check that a 2D dataset round-trips through ChunksToZarr.

    Args:
        coords: Coordinates passed to the ``xarray.Dataset`` under test.
    """
    values = np.arange(0, 60, 10).reshape(3, 2)
    dataset = xarray.Dataset({'foo': (('x', 'y'), values)}, coords=coords)

    # (subtest label, key offsets, optional SplitChunks spec applied first)
    cases = [
        ('partial key', {'x': 0}, None),
        ('split along partial key', {'x': 0}, {'x': 1}),
        ('full key', {'x': 0, 'y': 0}, None),
    ]
    for label, offsets, split_spec in cases:
        with self.subTest(label):
            chunks = [(xbeam.Key(offsets), dataset)]
            target = self.create_tempdir().full_path
            if split_spec is not None:
                chunks = chunks | xbeam.SplitChunks(split_spec)
            chunks | xbeam.ChunksToZarr(target)
            reopened = xarray.open_zarr(target, consolidated=True)
            xarray.testing.assert_identical(dataset, reopened)
def test_rechunk_zarr_to_zarr(self, template_method, split_vars):
    """Check that rechunking one Zarr store into another preserves the data.

    A dataset is written to a source store with ``source_chunks``, pushed
    through ``DatasetToChunks | Rechunk | ChunksToZarr`` and the destination
    store is compared with the original dataset.

    Args:
        template_method: How the template handed to ``ChunksToZarr`` is
            built. ``'eager'`` passes the chunked dataset directly, ``'lazy'``
            passes it as a Beam singleton side input, and ``'infer'`` passes
            ``None``.
        split_vars: Forwarded to ``xbeam.DatasetToChunks``.

    Raises:
        ValueError: If ``template_method`` is not one of the values above.
    """
    src_dir = self.create_tempdir('source').full_path
    dest_dir = self.create_tempdir('destination').full_path

    source_chunks = {'t': 1, 'x': 100, 'y': 120}
    target_chunks = {'t': -1, 'x': 20, 'y': 20}

    rs = np.random.RandomState(0)
    raw_data = rs.randint(2**30, size=(60, 100, 120))  # 5.76 MB
    dataset = xarray.Dataset({
        'foo': (('t', 'x', 'y'), raw_data),
        'bar': (('t', 'x', 'y'), raw_data - 1),
    })
    dataset.chunk(source_chunks).to_zarr(src_dir, consolidated=True)

    on_disk = xarray.open_zarr(src_dir, consolidated=True)
    on_disk_chunked = on_disk.chunk(target_chunks)

    with beam.Pipeline('DirectRunner') as pipeline:
        # Make the template. The lazy variant needs the pipeline object, so
        # this has to happen inside the `with` block.
        if template_method == 'eager':
            target_template = on_disk_chunked
        elif template_method == 'lazy':
            target_template = beam.pvalue.AsSingleton(
                pipeline | beam.Create([on_disk_chunked])
            )
        elif template_method == 'infer':
            target_template = None
        else:
            # Without this branch an unknown value would leave
            # `target_template` unbound and fail later with a confusing
            # UnboundLocalError.
            raise ValueError(
                f'unknown template_method: {template_method!r}'
            )

        # Run the pipeline.
        (
            pipeline
            | xbeam.DatasetToChunks(on_disk, split_vars=split_vars)
            | xbeam.Rechunk(
                on_disk.sizes,
                source_chunks,
                target_chunks,
                itemsize=8,
                max_mem=10_000_000,  # require two stages
            )
            | xbeam.ChunksToZarr(dest_dir, target_template)
        )

    roundtripped = xarray.open_zarr(dest_dir, consolidated=True, chunks=False)
    xarray.testing.assert_identical(roundtripped, dataset)
def main(argv):
    """Run the Beam pipeline that rechunks the input Zarr store.

    Opens the store at ``INPUT_PATH.value``, rechunks it from
    ``source_chunks`` to ``target_chunks`` and writes the result to
    ``OUTPUT_PATH.value``.

    Args:
        argv: Command-line arguments forwarded to ``beam.Pipeline``.
    """
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value,
        chunks=None,
        consolidated=True,
    )
    # Lazy all-zeros copy of the source, used as the template for the output.
    template = source_dataset.chunk().pipe(xarray.zeros_like)

    # NOTE(review): -1 presumably means "the whole dimension", as in
    # xarray/dask chunk specifications -- confirm against xarray-beam.
    source_chunks = dict(latitude=-1, longitude=-1, time=31)
    target_chunks = dict(latitude=5, longitude=5, time=-1)

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        # Splitting across the 19 variables in this dataset is a critical
        # optimization step here, because it allows rechunking to make use of
        # much larger intermediate chunks.
        per_variable_chunks = root | xbeam.DatasetToChunks(
            source_dataset, source_chunks, split_vars=True
        )
        rechunked = per_variable_chunks | xbeam.Rechunk(
            source_dataset.sizes,
            source_chunks,
            target_chunks,
            itemsize=4,
        )
        rechunked | xbeam.ChunksToZarr(
            OUTPUT_PATH.value, template, target_chunks
        )