Example No. 1
  def test_dataset_to_zarr(self):
    dataset = xarray.Dataset(
        {'foo': ('x', np.arange(0, 60, 10))},
        coords={'x': np.arange(6)},
        attrs={'meta': 'data'},
    )
    chunked = dataset.chunk({'x': 3})

    temp_dir = self.create_tempdir().full_path
    (
        test_util.EagerPipeline()
        | xarray_beam.DatasetToZarr(chunked, temp_dir)
    )
    actual = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(actual, dataset)

    # DatasetToZarr requires a Dask-chunked template, so writing the
    # unchunked dataset is rejected.
    temp_dir = self.create_tempdir().full_path
    with self.assertRaisesRegex(
        ValueError,
        'template does not have any variables chunked with Dask',
    ):
      (
          test_util.EagerPipeline()
          | xarray_beam.DatasetToZarr(dataset, temp_dir)
      )
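
These snippets are test methods lifted out of their test classes, so the imports and helpers they rely on are not shown. A preamble roughly like the following is assumed across the examples; the exact module paths for the private helpers (test_util, core, FilePatternToChunks, split_chunks) are an assumption rather than something the snippets confirm:

import itertools
from typing import Dict

import numpy as np
import xarray
import xarray_beam
import xarray_beam as xbeam

# Assumed locations of the private test helpers used in these examples:
from xarray_beam._src import core       # provides core.Key
from xarray_beam._src import test_util  # provides test_util.EagerPipeline
from xarray_beam._src.pangeo_forge import FilePatternToChunks
from xarray_beam._src.rechunk import split_chunks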
Example No. 2
    def test_dataset_to_chunks_whole(self):
        dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
        expected = [(xbeam.Key({'x': 0}), dataset)]
        # chunks={'x': -1} puts the entire dimension into a single chunk.
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset, chunks={'x': -1}))
        self.assertIdenticalChunks(actual, expected)

        # An empty chunks dict likewise yields one whole-dataset chunk here.
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset, chunks={}))
        self.assertIdenticalChunks(actual, expected)
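
Outside the EagerPipeline test harness, DatasetToChunks composes into an ordinary Beam pipeline in the same way; a minimal sketch (the print step is only there to show the emitted keys):

import apache_beam as beam
import numpy as np
import xarray
import xarray_beam as xbeam

dataset = xarray.Dataset({'foo': ('x', np.arange(6))})

with beam.Pipeline() as p:
    (
        p
        | xbeam.DatasetToChunks(dataset, chunks={'x': 3})
        | beam.MapTuple(lambda key, chunk: print(key, dict(chunk.sizes)))
    )

Each element is a (Key, xarray.Dataset) pair, which is exactly what the expected lists in these tests are built from.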
Example No. 3
    def test_validate_chunks_compose_in_pipeline(self):
        dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
        expected = [(xbeam.Key({'x': 0}), dataset)]
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset, chunks={'x': -1})
                  | xbeam.ValidateEachChunk())
        self.assertIdenticalChunks(actual, expected)
Example No. 4
    def test_multiple_datasets_with_subchunks_returns_multiple_datasets(
        self,
        time_step: int,
        longitude_step: int,
        chunks: Dict[str, int],
    ):

        expected = []
        # One entry per source file (time x longitude), each further split
        # into the requested chunks.
        for t, o in itertools.product(range(0, 360 * 4, time_step),
                                      range(0, 144, longitude_step)):
            expected.extend(
                split_chunks(
                    core.Key({
                        "latitude": 0,
                        "longitude": o,
                        "time": t
                    }),
                    self.test_data.isel(time=slice(t, t + time_step),
                                        longitude=slice(o,
                                                        o + longitude_step)),
                    chunks))
        with self.multifile_pattern(time_step, longitude_step) as pattern:
            actual = test_util.EagerPipeline() | FilePatternToChunks(
                pattern, chunks=chunks)

            self.assertAllCloseChunks(actual, expected)
Example No. 5
    def test_dataset_to_chunks_multiple(self):
        dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
        expected = [
            (xbeam.Key({'x': 0}), dataset.head(x=3)),
            (xbeam.Key({'x': 3}), dataset.tail(x=3)),
        ]
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset.chunk({'x': 3})))
        self.assertIdenticalChunks(actual, expected)

        # num_threads only controls how chunks are loaded, not the output.
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset.chunk({'x': 3}),
                                          num_threads=2))
        self.assertIdenticalChunks(actual, expected)

        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(dataset, chunks={'x': 3}))
        self.assertIdenticalChunks(actual, expected)
Example No. 6
    def test_returns_single_dataset(self):
        expected = [(core.Key({
            "time": 0,
            "latitude": 0,
            "longitude": 0
        }), self.test_data)]
        with self.pattern_from_testdata() as pattern:
            actual = test_util.EagerPipeline() | FilePatternToChunks(pattern)

        self.assertAllCloseChunks(actual, expected)
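
The pattern_from_testdata and multifile_pattern helpers are not shown. A FilePattern for FilePatternToChunks can be built roughly as follows; this uses the classic pangeo-forge-recipes pattern API, and the file layout and keys are invented purely for illustration:

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern

def make_path(time: int) -> str:
    # Hypothetical layout: one netCDF file per block of time steps.
    return f'/tmp/test-data/era5-{time:04d}.nc'

pattern = FilePattern(make_path, ConcatDim('time', keys=list(range(0, 360 * 4, 24))))
chunks = test_util.EagerPipeline() | FilePatternToChunks(pattern)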
Example No. 7
    def test_single_subchunks_returns_multiple_datasets(self):
        with self.pattern_from_testdata() as pattern:
            result = (test_util.EagerPipeline()
                      | FilePatternToChunks(pattern, chunks={"longitude": 48}))

        expected = [(core.Key({
            "time": 0,
            "latitude": 0,
            "longitude": i
        }), self.test_data.isel(longitude=slice(i, i + 48)))
                    for i in range(0, 144, 48)]
        self.assertAllCloseChunks(result, expected)
Example No. 8
    def test_dataset_to_chunks_vars(self):
        dataset = xarray.Dataset({
            'foo': ('x', np.arange(6)),
            'bar': ('x', -np.arange(6)),
        })
        expected = [
            (xbeam.Key({'x': 0}, {'foo'}), dataset.head(x=3)[['foo']]),
            (xbeam.Key({'x': 0}, {'bar'}), dataset.head(x=3)[['bar']]),
            (xbeam.Key({'x': 3}, {'foo'}), dataset.tail(x=3)[['foo']]),
            (xbeam.Key({'x': 3}, {'bar'}), dataset.tail(x=3)[['bar']]),
        ]
        # split_vars=True emits one (key, chunk) pair per data variable.
        actual = (test_util.EagerPipeline()
                  | xbeam.DatasetToChunks(
                      dataset, chunks={'x': 3}, split_vars=True))
        self.assertIdenticalChunks(actual, expected)
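
For reference, the Key objects asserted throughout these examples pair integer offsets (the start index of each chunk along a dimension) with an optional set of variable names; a small sketch of constructing one directly:

import xarray_beam as xbeam

key = xbeam.Key({'x': 3}, vars={'foo'})
print(key.offsets)  # start index along each dimension, here x=3
print(key.vars)     # data variables carried by this chunk, here {'foo'}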
Example No. 9
    def test_multiple_datasets_returns_multiple_datasets(
            self, time_step: int, longitude_step: int):
        expected = [
            (core.Key({
                "time": t,
                "latitude": 0,
                "longitude": o
            }),
             self.test_data.isel(time=slice(t, t + time_step),
                                 longitude=slice(o, o + longitude_step)))
            for t, o in itertools.product(range(0, 360 * 4, time_step),
                                          range(0, 144, longitude_step))
        ]
        with self.multifile_pattern(time_step, longitude_step) as pattern:
            actual = test_util.EagerPipeline() | FilePatternToChunks(pattern)

        self.assertAllCloseChunks(actual, expected)
Example No. 10
    def test_multiple_subchunks_returns_multiple_datasets(self):
        with self.pattern_from_testdata() as pattern:
            result = (test_util.EagerPipeline()
                      | FilePatternToChunks(pattern,
                                            chunks={
                                                "longitude": 48,
                                                "latitude": 24
                                            }))

        expected = [
            (core.Key({
                "time": 0,
                "longitude": o,
                "latitude": a
            }),
             self.test_data.isel(longitude=slice(o, o + 48),
                                 latitude=slice(a, a + 24)))
            for o, a in itertools.product(range(0, 144, 48), range(0, 73, 24))
        ]

        self.assertAllCloseChunks(result, expected)