def test_dataset_to_chunks_whole(self):
    """Check that an unsplit dataset is emitted as exactly one chunk.

    Both ``chunks={'x': -1}`` and an empty ``chunks`` mapping must produce the
    single pair ``(Key({'x': 0}), dataset)``.
    """
    dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
    expected = [(xbeam.Key({'x': 0}), dataset)]

    # The two chunk specifications are checked in turn against the same
    # expected output.
    for chunk_spec in ({'x': -1}, {}):
        pipeline = test_util.EagerPipeline()
        actual = pipeline | xbeam.DatasetToChunks(dataset, chunks=chunk_spec)
        self.assertIdenticalChunks(actual, expected)
def test_validate_chunks_compose_in_pipeline(self):
    """Check that ``ValidateEachChunk`` composes after ``DatasetToChunks``.

    The validated output must be identical to the single whole-dataset chunk
    produced by ``DatasetToChunks`` with ``chunks={'x': -1}``.
    """
    dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
    expected = [(xbeam.Key({'x': 0}), dataset)]

    pipeline = test_util.EagerPipeline()
    chunked = pipeline | xbeam.DatasetToChunks(dataset, chunks={'x': -1})
    actual = chunked | xbeam.ValidateEachChunk()

    self.assertIdenticalChunks(actual, expected)
def main(argv):
    """Build and run the Beam pipeline defined by this script.

    The Zarr store at ``INPUT_PATH`` is split into single-time-step chunks,
    re-keyed with ``rekey_chunk_on_month_hour``, averaged per key, and written
    to the Zarr store at ``OUTPUT_PATH`` using a template that has ``month``
    and ``hour`` dimensions.

    Args:
        argv: command-line arguments forwarded to ``beam.Pipeline``.
    """
    # chunks=None selects Xarray's own lazy loading instead of Dask, which
    # results in much less data being passed from the launch script to workers.
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value,
        chunks=None,
        consolidated=True,
    )

    # A lazy "template" lets the Zarr outputs be set up before the pipeline
    # runs. It is not strictly needed here because the outputs are small (the
    # template argument of ChunksToZarr is optional), but supplying it makes
    # the pipeline slightly more efficient.
    max_month = source_dataset.time.dt.month.max().item()  # normally 12
    months = np.arange(1, max_month + 1)
    hours = np.arange(24)
    lazy_zeros = xarray.zeros_like(source_dataset.chunk())
    template = lazy_zeros.isel(time=0, drop=True).expand_dims(
        month=months, hour=hours
    )
    output_chunks = {'hour': 1, 'month': 1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        monthly = root | xbeam.DatasetToChunks(source_dataset, {'time': 31})
        per_step = monthly | xbeam.SplitChunks({'time': 1})
        rekeyed = per_step | beam.MapTuple(rekey_chunk_on_month_hour)
        averaged = rekeyed | xbeam.Mean.PerKey()
        _ = averaged | xbeam.ChunksToZarr(
            OUTPUT_PATH.value, template, output_chunks
        )
def test_dataset_to_chunks_multiple(self):
    """Check that a dataset split along ``x`` yields two keyed chunks.

    The same expected output is required whether the chunking comes from a
    pre-chunked dataset, a pre-chunked dataset read with ``num_threads=2``, or
    an explicit ``chunks`` argument.
    """
    dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
    expected = [
        (xbeam.Key({'x': 0}), dataset.head(x=3)),
        (xbeam.Key({'x': 3}), dataset.tail(x=3)),
    ]

    # Each case is (positional args, keyword args) for DatasetToChunks.
    cases = (
        ((dataset.chunk({'x': 3}),), {}),
        ((dataset.chunk({'x': 3}),), {'num_threads': 2}),
        ((dataset,), {'chunks': {'x': 3}}),
    )
    for args, kwargs in cases:
        pipeline = test_util.EagerPipeline()
        actual = pipeline | xbeam.DatasetToChunks(*args, **kwargs)
        self.assertIdenticalChunks(actual, expected)
def test_dataset_to_chunks_vars(self):
    """Check that ``split_vars=True`` emits one chunk per variable per offset.

    With two variables and two chunks along ``x``, four keyed chunks are
    expected, ordered by offset and then by variable (``foo`` before ``bar``).
    """
    dataset = xarray.Dataset({
        'foo': ('x', np.arange(6)),
        'bar': ('x', -np.arange(6)),
    })

    # (offset along x, the slice of the dataset starting at that offset)
    pieces = (
        (0, dataset.head(x=3)),
        (3, dataset.tail(x=3)),
    )
    expected = [
        (xbeam.Key({'x': offset}, {name}), piece[[name]])
        for offset, piece in pieces
        for name in ('foo', 'bar')
    ]

    pipeline = test_util.EagerPipeline()
    actual = pipeline | xbeam.DatasetToChunks(
        dataset, chunks={'x': 3}, split_vars=True
    )
    self.assertIdenticalChunks(actual, expected)
def test_rechunk_zarr_to_zarr(self, template_method, split_vars):
    """Round-trip a Zarr store through ``xbeam.Rechunk`` and verify the data.

    A two-variable dataset is written to a temporary Zarr store with
    ``source_chunks``, rechunked to ``target_chunks`` in a Beam pipeline,
    written to a second store, and compared with the original dataset.

    Args:
        template_method: how the template for ``ChunksToZarr`` is supplied.
            ``'eager'`` passes the chunked dataset directly, ``'lazy'`` passes
            it as a Beam singleton side input, and ``'infer'`` passes ``None``.
        split_vars: forwarded to ``xbeam.DatasetToChunks``.

    Raises:
        ValueError: if ``template_method`` is not one of the values above.
    """
    src_dir = self.create_tempdir('source').full_path
    dest_dir = self.create_tempdir('destination').full_path
    source_chunks = {'t': 1, 'x': 100, 'y': 120}
    target_chunks = {'t': -1, 'x': 20, 'y': 20}

    rs = np.random.RandomState(0)
    raw_data = rs.randint(2**30, size=(60, 100, 120))  # 5.76 MB
    dataset = xarray.Dataset({
        'foo': (('t', 'x', 'y'), raw_data),
        'bar': (('t', 'x', 'y'), raw_data - 1),
    })
    dataset.chunk(source_chunks).to_zarr(src_dir, consolidated=True)

    on_disk = xarray.open_zarr(src_dir, consolidated=True)
    on_disk_chunked = on_disk.chunk(target_chunks)

    with beam.Pipeline('DirectRunner') as pipeline:
        # Make the template. The 'lazy' variant needs the pipeline object, so
        # this selection has to happen inside the ``with`` block.
        if template_method == 'eager':
            target_template = on_disk_chunked
        elif template_method == 'lazy':
            target_template = beam.pvalue.AsSingleton(
                pipeline | beam.Create([on_disk_chunked])
            )
        elif template_method == 'infer':
            target_template = None
        else:
            # Without this branch an unknown value would leave
            # ``target_template`` unbound and fail later with a confusing
            # UnboundLocalError.
            raise ValueError(
                f'unexpected template_method: {template_method!r}'
            )

        # Run the pipeline.
        (
            pipeline
            | xbeam.DatasetToChunks(on_disk, split_vars=split_vars)
            | xbeam.Rechunk(
                on_disk.sizes,
                source_chunks,
                target_chunks,
                itemsize=8,
                max_mem=10_000_000,  # require two stages
            )
            | xbeam.ChunksToZarr(dest_dir, target_template)
        )

    roundtripped = xarray.open_zarr(dest_dir, consolidated=True, chunks=False)
    xarray.testing.assert_identical(roundtripped, dataset)
def main(argv):
    """Build and run the Beam pipeline that rechunks a Zarr store.

    The store at ``INPUT_PATH`` is read in ``source_chunks``, rechunked to
    ``target_chunks``, and written to the store at ``OUTPUT_PATH``.

    Args:
        argv: command-line arguments forwarded to ``beam.Pipeline``.
    """
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value, chunks=None, consolidated=True
    )
    template = xarray.zeros_like(source_dataset.chunk())

    source_chunks = {'latitude': -1, 'longitude': -1, 'time': 31}
    target_chunks = {'latitude': 5, 'longitude': 5, 'time': -1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        # Note: splitting across the 19 variables in this dataset is a
        # critical optimization step here, because it allows rechunking to
        # make use of much larger intermediate chunks.
        chunked = root | xbeam.DatasetToChunks(
            source_dataset, source_chunks, split_vars=True
        )
        rechunked = chunked | xbeam.Rechunk(
            source_dataset.sizes,
            source_chunks,
            target_chunks,
            itemsize=4,
        )
        _ = rechunked | xbeam.ChunksToZarr(
            OUTPUT_PATH.value, template, target_chunks
        )