Exemplo n.º 1
0
def main(argv):
    """Run a Beam pipeline that averages a Zarr dataset per (month, hour).

    The dataset at ``INPUT_PATH`` is split into single time steps, rekeyed by
    ``rekey_chunk_on_month_hour`` (defined elsewhere in this file), averaged
    per key, and written as Zarr to ``OUTPUT_PATH``.

    Args:
        argv: Command-line arguments forwarded to ``beam.Pipeline``.
    """
    # chunks=None selects Xarray's own lazy loading instead of Dask, which
    # results in much less data being passed from the launch script to the
    # workers.
    source = xarray.open_zarr(INPUT_PATH.value, chunks=None, consolidated=True)

    # Build a lazy "template" so the Zarr outputs can be set up before the
    # pipeline runs. The outputs are small, so the template is not strictly
    # required (the template argument of ChunksToZarr is optional), but
    # supplying it makes the pipeline slightly more efficient.
    max_month = source.time.dt.month.max().item()  # normally 12
    months = np.arange(1, max_month + 1)
    hours = np.arange(24)
    lazy_zeros = xarray.zeros_like(source.chunk())
    template = lazy_zeros.isel(time=0, drop=True).expand_dims(
        month=months, hour=hours
    )
    output_chunks = {'hour': 1, 'month': 1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        time_blocks = root | xbeam.DatasetToChunks(source, {'time': 31})
        single_steps = time_blocks | xbeam.SplitChunks({'time': 1})
        keyed_steps = single_steps | beam.MapTuple(rekey_chunk_on_month_hour)
        per_key_means = keyed_steps | xbeam.Mean.PerKey()
        _ = per_key_means | xbeam.ChunksToZarr(
            OUTPUT_PATH.value, template, output_chunks
        )
Exemplo n.º 2
0
  def test_chunks_to_zarr(self):
    """Round-trips a single chunk through ChunksToZarr and checks its errors.

    Writes with and without a template and with explicit ``zarr_chunks``, then
    checks the ValueErrors raised for a template without Dask chunks, a
    template whose index differs from the chunk's, and a chunk that carries an
    unexpected new index.
    """
    expected = xarray.Dataset(
        {'foo': ('x', np.arange(0, 60, 10))},
        coords={'x': np.arange(6)},
    )
    lazy_template = expected.chunk()
    inputs = [(xarray_beam.ChunkKey({'x': 0}), expected)]

    def write_then_open(*args, **kwargs):
      """Writes `inputs` to a fresh directory and reopens the Zarr store."""
      out_dir = self.create_tempdir().full_path
      inputs | xarray_beam.ChunksToZarr(out_dir, *args, **kwargs)
      return xarray.open_zarr(out_dir, consolidated=True)

    with self.subTest('no template'):
      result = write_then_open()
      xarray.testing.assert_identical(expected, result)

    with self.subTest('with template'):
      result = write_then_open(lazy_template)
      xarray.testing.assert_identical(expected, result)

    with self.subTest('with zarr_chunks and with template'):
      result = write_then_open(lazy_template, {'x': 3})
      xarray.testing.assert_identical(expected, result)
      self.assertEqual(result.chunks, {'x': (3, 3)})

    with self.subTest('with zarr_chunks and no template'):
      result = write_then_open(zarr_chunks={'x': 3})
      xarray.testing.assert_identical(expected, result)
      self.assertEqual(result.chunks, {'x': (3, 3)})

    # A template that is not chunked with Dask is rejected at construction.
    out_dir = self.create_tempdir().full_path
    no_dask_error = 'template does not have any variables chunked with Dask'
    with self.assertRaisesRegex(ValueError, no_dask_error):
      xarray_beam.ChunksToZarr(out_dir, expected)

    # A template whose 'x' coordinate differs from the chunk's is rejected.
    out_dir = self.create_tempdir().full_path
    template = lazy_template.assign_coords(x=np.zeros(6))
    index_mismatch_error = 'template and chunk indexes do not match'
    with self.assertRaisesRegex(ValueError, index_mismatch_error):
      inputs | xarray_beam.ChunksToZarr(out_dir, template)

    # A chunk with an extra indexed dimension 'z' is rejected.
    extra_dim_inputs = [
        (xarray_beam.ChunkKey({'x': 0}), expected.expand_dims(z=[1, 2])),
    ]
    out_dir = self.create_tempdir().full_path
    new_index_error = 'unexpected new indexes found in chunk'
    with self.assertRaisesRegex(ValueError, new_index_error):
      extra_dim_inputs | xarray_beam.ChunksToZarr(out_dir, template)
Exemplo n.º 3
0
 def test_multiple_vars_chunks_to_zarr(self):
     """Writes two single-variable chunks and expects the merged dataset back.

     Each variable of the dataset is supplied as its own keyed chunk; the
     store is written once without a template and once with a Dask-chunked
     template, and both results must be identical to the full dataset.
     """
     expected = xarray.Dataset(
         {
             'foo': ('x', np.arange(0, 60, 10)),
             'bar': ('x', -np.arange(6)),
         },
         coords={'x': np.arange(6)},
     )
     lazy_template = expected.chunk()
     inputs = [
         (xbeam.Key({'x': 0}, {name}), expected[[name]])
         for name in ('foo', 'bar')
     ]
     # Extra positional arguments to ChunksToZarr for each labelled subtest.
     template_args_by_label = {
         'no template': (),
         'with template': (lazy_template,),
     }
     for label, template_args in template_args_by_label.items():
         with self.subTest(label):
             out_dir = self.create_tempdir().full_path
             inputs | xbeam.ChunksToZarr(out_dir, *template_args)
             result = xarray.open_zarr(out_dir, consolidated=True)
             xarray.testing.assert_identical(expected, result)
Exemplo n.º 4
0
 def test_2d_chunks_to_zarr(self, coords):
     """Writes a 2D dataset under different keys and expects it back intact.

     Args:
         coords: Coordinates passed to the ``xarray.Dataset`` constructor.
     """
     expected = xarray.Dataset(
         {'foo': (('x', 'y'), np.arange(0, 60, 10).reshape(3, 2))},
         coords=coords,
     )

     def reopen(path):
         """Opens the consolidated Zarr store written at `path`."""
         return xarray.open_zarr(path, consolidated=True)

     with self.subTest('partial key'):
         keyed = [(xbeam.Key({'x': 0}), expected)]
         out_dir = self.create_tempdir().full_path
         keyed | xbeam.ChunksToZarr(out_dir)
         xarray.testing.assert_identical(expected, reopen(out_dir))

     with self.subTest('split along partial key'):
         keyed = [(xbeam.Key({'x': 0}), expected)]
         out_dir = self.create_tempdir().full_path
         # Split into chunks of size 1 along 'x' before writing.
         split = keyed | xbeam.SplitChunks({'x': 1})
         split | xbeam.ChunksToZarr(out_dir)
         xarray.testing.assert_identical(expected, reopen(out_dir))

     with self.subTest('full key'):
         keyed = [(xbeam.Key({'x': 0, 'y': 0}), expected)]
         out_dir = self.create_tempdir().full_path
         keyed | xbeam.ChunksToZarr(out_dir)
         xarray.testing.assert_identical(expected, reopen(out_dir))
Exemplo n.º 5
0
    def test_rechunk_zarr_to_zarr(self, template_method, split_vars):
        """Rechunk one Zarr store into another and check the round trip.

        A random two-variable dataset is written with ``source_chunks``, read
        back, rechunked to ``target_chunks`` in a Beam pipeline, written to a
        second store, and compared with the original dataset.

        Args:
            template_method: How the ChunksToZarr template is supplied:
                'eager' passes the rechunked dataset directly, 'lazy' wraps it
                in a Beam singleton side input, and 'infer' passes None.
            split_vars: Forwarded to DatasetToChunks as ``split_vars``.

        Raises:
            ValueError: If ``template_method`` is not one of the values above.
        """
        src_dir = self.create_tempdir('source').full_path
        dest_dir = self.create_tempdir('destination').full_path

        source_chunks = {'t': 1, 'x': 100, 'y': 120}
        target_chunks = {'t': -1, 'x': 20, 'y': 20}

        # Fixed seed keeps the generated data reproducible between runs.
        rs = np.random.RandomState(0)
        raw_data = rs.randint(2**30, size=(60, 100, 120))  # 5.76 MB
        dataset = xarray.Dataset({
            'foo': (('t', 'x', 'y'), raw_data),
            'bar': (('t', 'x', 'y'), raw_data - 1),
        })
        dataset.chunk(source_chunks).to_zarr(src_dir, consolidated=True)

        on_disk = xarray.open_zarr(src_dir, consolidated=True)
        on_disk_chunked = on_disk.chunk(target_chunks)
        with beam.Pipeline('DirectRunner') as pipeline:
            # make template
            if template_method == 'eager':
                target_template = on_disk_chunked
            elif template_method == 'lazy':
                target_template = beam.pvalue.AsSingleton(
                    pipeline | beam.Create([on_disk_chunked]))
            elif template_method == 'infer':
                target_template = None
            else:
                # Fail with a clear message instead of leaving
                # target_template unbound for the pipeline below.
                raise ValueError(
                    f'unsupported template_method: {template_method!r}')
            # run pipeline
            (pipeline
             | xbeam.DatasetToChunks(on_disk, split_vars=split_vars)
             | xbeam.Rechunk(
                 on_disk.sizes,
                 source_chunks,
                 target_chunks,
                 itemsize=8,
                 max_mem=10_000_000,  # require two stages
             )
             | xbeam.ChunksToZarr(dest_dir, target_template))
        roundtripped = xarray.open_zarr(dest_dir,
                                        consolidated=True,
                                        chunks=False)

        xarray.testing.assert_identical(roundtripped, dataset)
Exemplo n.º 6
0
def main(argv):
  """Rechunk the Zarr dataset at INPUT_PATH and write it to OUTPUT_PATH.

  The source is read in chunks of 31 time steps spanning all of latitude and
  longitude, and written in 5x5 latitude/longitude chunks spanning all of
  time.

  Args:
    argv: Command-line arguments forwarded to ``beam.Pipeline``.
  """
  source = xarray.open_zarr(
      INPUT_PATH.value,
      chunks=None,
      consolidated=True,
  )
  source_chunks = {'latitude': -1, 'longitude': -1, 'time': 31}
  target_chunks = {'latitude': 5, 'longitude': 5, 'time': -1}
  # Lazy all-zeros copy of the source, used as the ChunksToZarr template.
  template = xarray.zeros_like(source.chunk())

  with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
    # Note: splitting across the 19 variables in this dataset is a critical
    # optimization step here, because it allows rechunking to make use of
    # much larger intermediate chunks.
    per_variable_chunks = root | xbeam.DatasetToChunks(
        source, source_chunks, split_vars=True
    )
    rechunked = per_variable_chunks | xbeam.Rechunk(
        source.sizes, source_chunks, target_chunks, itemsize=4
    )
    _ = rechunked | xbeam.ChunksToZarr(
        OUTPUT_PATH.value, template, target_chunks
    )