def test_in_memory_rechunk_success(self):
  """in_memory_rechunk transposes two 1x3 chunks into three 2x1 chunks."""
  row_one = xarray.Dataset({'foo': (('x', 'y'), np.array([[1, 2, 3]]))})
  row_two = xarray.Dataset({'foo': (('x', 'y'), np.array([[4, 5, 6]]))})
  source = [
      (xbeam.Key({'x': 100, 'y': 300}), row_one),
      (xbeam.Key({'x': 101, 'y': 300}), row_two),
  ]
  want = [
      (xbeam.Key({'x': 100, 'y': 300}),
       xarray.Dataset({'foo': (('x', 'y'), np.array([[1], [4]]))})),
      (xbeam.Key({'x': 100, 'y': 301}),
       xarray.Dataset({'foo': (('x', 'y'), np.array([[2], [5]]))})),
      (xbeam.Key({'x': 100, 'y': 302}),
       xarray.Dataset({'foo': (('x', 'y'), np.array([[3], [6]]))})),
  ]
  got = list(rechunk.in_memory_rechunk(source, {'x': 2, 'y': 1}))
  self.assertIdenticalChunks(got, want)
def test_offsets_as_beam_key(self):
  """Keys with equal offsets group together regardless of dict insertion order."""
  pairs = [
      (xbeam.Key({'x': 0, 'y': 1}), 1),
      (xbeam.Key({'x': 0, 'y': 2}), 2),
      (xbeam.Key({'y': 1, 'x': 0}), 3),
  ]
  grouped = pairs | beam.GroupByKey()
  self.assertEqual(
      grouped,
      [
          (xbeam.Key({'x': 0, 'y': 1}), [1, 3]),
          (xbeam.Key({'x': 0, 'y': 2}), [2]),
      ],
  )
def test_consolidate_variables_merge_fails(self):
  # Two chunks at the same offset but with different lengths along 'x' cannot
  # be merged into one dataset; the error should embed both dataset reprs.
  # NOTE(review): the exact line breaks / column alignment inside the dedent
  # string below were reconstructed from a whitespace-mangled source —
  # confirm against xarray's Dataset repr output.
  inputs = [
      (
          xbeam.Key({'x': 0}, vars={'foo'}),
          xarray.Dataset({'foo': ('x', [1, 2])}),
      ),
      (
          xbeam.Key({'x': 0}, vars={'bar'}),
          xarray.Dataset({'bar': ('x', [3, 4, 5])}),
      ),
  ]
  with self.assertRaisesRegex(
      ValueError,
      re.escape(
          textwrap.dedent("""
              merging dataset chunks with variables [{'foo'}, {'bar'}] failed.
              <xarray.Dataset>
              Dimensions:  (x: 2)
              Dimensions without coordinates: x
              Data variables:
                  foo      (x) int64 1 2
              <xarray.Dataset>
              Dimensions:  (x: 3)
              Dimensions without coordinates: x
              Data variables:
                  bar      (x) int64 3 4 5
              """).strip())):
    # Applying the transform should raise while merging, not during setup.
    inputs | xbeam.ConsolidateVariables()
def test_consolidate_and_split_only_some_dims(self):
  """Consolidating/splitting along 'y' leaves the 'x' chunking untouched."""
  block = np.arange(0, 10).reshape(2, 5)
  pieces = [
      (xbeam.Key({'x': 0, 'y': 0}),
       xarray.Dataset({'foo': (('x', 'y'), block)})),
      (xbeam.Key({'x': 0, 'y': 5}),
       xarray.Dataset({'foo': (('x', 'y'), block + 10)})),
  ]
  combined = [
      (xbeam.Key({'x': 0, 'y': 0}),
       xarray.Dataset(
           {'foo': (('x', 'y'),
                    np.concatenate([block, block + 10], axis=1))})),
  ]
  with self.subTest('ConsolidateChunks'):
    result = pieces | xbeam.ConsolidateChunks({'y': 10})
    self.assertIdenticalChunks(result, combined)
  with self.subTest('SplitChunks'):
    result = combined | xbeam.SplitChunks({'y': 5})
    self.assertIdenticalChunks(result, pieces)
def test_repr(self):
  """repr() renders offsets and vars in a stable, readable form."""
  self.assertEqual(
      repr(xbeam.Key({'x': 0, 'y': 10})),
      "Key(offsets={'x': 0, 'y': 10}, vars=None)",
  )
  self.assertEqual(
      repr(xbeam.Key(vars={'foo'})),
      "Key(offsets={}, vars={'foo'})",
  )
def test_chunks_to_zarr(self):
  # End-to-end checks for ChunksToZarr: writing with and without a template,
  # with explicit zarr_chunks, plus the error paths for invalid templates.
  dataset = xarray.Dataset(
      {'foo': ('x', np.arange(0, 60, 10))},
      coords={'x': np.arange(6)},
  )
  chunked = dataset.chunk()  # dask-backed copy, usable as a template
  inputs = [
      (xbeam.Key({'x': 0}), dataset),
  ]
  with self.subTest('no template'):
    temp_dir = self.create_tempdir().full_path
    inputs | xbeam.ChunksToZarr(temp_dir)
    result = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(dataset, result)
  with self.subTest('with template'):
    temp_dir = self.create_tempdir().full_path
    inputs | xbeam.ChunksToZarr(temp_dir, chunked)
    result = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(dataset, result)
  with self.subTest('with zarr_chunks and with template'):
    temp_dir = self.create_tempdir().full_path
    zarr_chunks = {'x': 3}
    inputs | xbeam.ChunksToZarr(temp_dir, chunked, zarr_chunks)
    result = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(dataset, result)
    # Explicit zarr_chunks should control the on-disk chunking.
    self.assertEqual(result.chunks, {'x': (3, 3)})
  with self.subTest('with zarr_chunks and no template'):
    temp_dir = self.create_tempdir().full_path
    zarr_chunks = {'x': 3}
    inputs | xbeam.ChunksToZarr(temp_dir, zarr_chunks=zarr_chunks)
    result = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(dataset, result)
    self.assertEqual(result.chunks, {'x': (3, 3)})
  # A template must have at least one dask-chunked variable.
  temp_dir = self.create_tempdir().full_path
  with self.assertRaisesRegex(
      ValueError,
      'template does not have any variables chunked with Dask',
  ):
    xbeam.ChunksToZarr(temp_dir, dataset)
  # Template coordinates must match the chunks being written.
  temp_dir = self.create_tempdir().full_path
  template = chunked.assign_coords(x=np.zeros(6))
  with self.assertRaisesRegex(
      ValueError,
      'template and chunk indexes do not match',
  ):
    inputs | xbeam.ChunksToZarr(temp_dir, template)
  # Chunks may not introduce indexes absent from the template.
  inputs2 = [
      (xbeam.Key({'x': 0}), dataset.expand_dims(z=[1, 2])),
  ]
  temp_dir = self.create_tempdir().full_path
  with self.assertRaisesRegex(
      ValueError,
      'unexpected new indexes found in chunk',
  ):
    inputs2 | xbeam.ChunksToZarr(temp_dir, template)
def test_rechunk_stage(self):
  """RechunkStage regroups (1, 3) source chunks into (2, 1) target chunks."""

  def make_chunk(x, y, values):
    # Build a (key, dataset) pair for a single 'foo' chunk.
    return (xbeam.Key({'x': x, 'y': y}),
            xarray.Dataset({'foo': (('x', 'y'), np.array(values))}))

  source = [
      make_chunk(100, 300, [[1, 2, 3]]),
      make_chunk(101, 300, [[4, 5, 6]]),
      make_chunk(100, 303, [[10, 20, 30]]),
      make_chunk(101, 303, [[40, 50, 60]]),
  ]
  want = [
      make_chunk(100, 300, [[1], [4]]),
      make_chunk(100, 301, [[2], [5]]),
      make_chunk(100, 302, [[3], [6]]),
      make_chunk(100, 303, [[10], [40]]),
      make_chunk(100, 304, [[20], [50]]),
      make_chunk(100, 305, [[30], [60]]),
  ]
  got = source | rechunk.RechunkStage(
      source_chunks={'x': 1, 'y': 3},
      target_chunks={'x': 2, 'y': 1},
  )
  self.assertIdenticalChunks(got, want)
def test_in_memory_rechunk_not_unique(self):
  """Duplicate chunk keys are rejected with a clear error."""
  chunk = xarray.Dataset({'foo': ('x', [0])})
  duplicated = [(xbeam.Key({'x': 0}), chunk)] * 2
  with self.assertRaisesRegex(ValueError, 'chunk keys are not unique'):
    list(rechunk.in_memory_rechunk(duplicated, {'x': 2}))
def test_equality(self):
  """Keys compare equal only when both offsets and vars match."""
  plain = xbeam.Key({'x': 0, 'y': 10})
  with_vars = xbeam.Key({'x': 0, 'y': 10}, {'bar'})
  self.assertEqual(plain, plain)
  self.assertNotEqual(plain, None)
  self.assertEqual(with_vars, with_vars)
  # Inequality holds in both comparison directions.
  self.assertNotEqual(plain, with_vars)
  self.assertNotEqual(with_vars, plain)
def test_consolidate_with_unchunked_vars(self):
  # Chunks may carry unchunked (scalar) variables like 'bar'; consolidation
  # succeeds when they agree across chunks and fails loudly when they differ.
  inputs = [
      (xbeam.Key({'x': 0}),
       xarray.Dataset({'foo': ('x', np.arange(0, 10)), 'bar': 1})),
      (xbeam.Key({'x': 10}),
       xarray.Dataset({'foo': ('x', np.arange(10, 20)), 'bar': 1})),
  ]
  expected = [
      (xbeam.Key({'x': 0}),
       xarray.Dataset({'foo': ('x', np.arange(20)), 'bar': 1})),
  ]
  actual = inputs | xbeam.ConsolidateChunks({'x': -1})
  self.assertIdenticalChunks(actual, expected)
  # 'bar' disagrees (1 vs 2), so xarray cannot combine the chunks.
  inconsistent_inputs = [
      (xbeam.Key({'x': 0}),
       xarray.Dataset({'foo': ('x', np.arange(0, 10)), 'bar': 1})),
      (xbeam.Key({'x': 10}),
       xarray.Dataset({'foo': ('x', np.arange(10, 20)), 'bar': 2})),
  ]
  # NOTE(review): the exact line breaks / column alignment inside the dedent
  # string below were reconstructed from a whitespace-mangled source —
  # confirm against xarray's Dataset repr output.
  with self.assertRaisesRegex(
      ValueError,
      re.escape(
          textwrap.dedent("""
              combining nested dataset chunks for vars=None with offsets={'x': [0, 10]} failed.
              Leading datasets along dimension 'x':
              <xarray.Dataset>
              Dimensions:  (x: 10)
              Dimensions without coordinates: x
              Data variables:
                  foo      (x) int64 0 1 2 3 4 5 6 7 8 9
                  bar      int64 1
              <xarray.Dataset>
              Dimensions:  (x: 10)
              Dimensions without coordinates: x
              Data variables:
                  foo      (x) int64 10 11 12 13 14 15 16 17 18 19
                  bar      int64 2
              """).strip())):
    inconsistent_inputs | xbeam.ConsolidateChunks({'x': -1})
def test_vars_as_beam_key(self):
  """Keys with equal vars group together under beam.GroupByKey."""
  pairs = [
      (xbeam.Key(vars={'foo'}), 1),
      (xbeam.Key(vars={'bar'}), 2),
      (xbeam.Key(vars={'foo'}), 3),
  ]
  grouped = pairs | beam.GroupByKey()
  self.assertEqual(
      grouped,
      [
          (xbeam.Key(vars={'foo'}), [1, 3]),
          (xbeam.Key(vars={'bar'}), [2]),
      ],
  )
def test_consolidate_fully_missing_chunks(self):
  """consolidate_fully raises when offsets do not line up across variables."""
  chunks = [
      (xbeam.Key({'x': 5}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
      (xbeam.Key({'x': 0}, {'bar', 'baz'}),
       xarray.Dataset({
           'bar': ('x', np.arange(0, 5)),
           'baz': ('x', np.arange(0, 5)),
       })),
  ]
  with self.assertRaisesRegex(ValueError, 'some expected chunks are missing'):
    xbeam.consolidate_fully(chunks)
def test_consolidate_chunks_missing_variables(self):
  """A gap in one variable's chunk sequence is reported by variable name."""
  chunks = [
      (xbeam.Key({'x': 0}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
      (xbeam.Key({'x': 5}, {'bar'}),
       xarray.Dataset({'bar': ('x', np.arange(15, 20))})),
  ]
  message = re.escape(
      "some expected chunks are missing for vars=frozenset({'foo'}")
  with self.assertRaisesRegex(ValueError, message):
    list(xbeam.consolidate_chunks(chunks))
def test_constructor(self):
  """Key normalizes offsets to immutabledict and vars to frozenset."""
  with_offsets = xbeam.Key({'x': 0, 'y': 10})
  self.assertIsInstance(with_offsets.offsets, immutabledict.immutabledict)
  self.assertEqual(dict(with_offsets.offsets), {'x': 0, 'y': 10})
  self.assertEqual(with_offsets.vars, None)
  with_vars = xbeam.Key(vars={'foo'})
  self.assertEqual(dict(with_vars.offsets), {})
  self.assertIsInstance(with_vars.vars, frozenset)
  self.assertEqual(set(with_vars.vars), {'foo'})
  # A bare string would silently become a set of characters; reject it.
  with self.assertRaisesRegex(TypeError, 'vars must be a set or None'):
    xbeam.Key(vars='foo')
def test_consolidate_with_minus_one_chunks(self):
  """A chunk size of -1 consolidates the whole dimension into one chunk."""
  pieces = [
      (xbeam.Key({'x': 0}),
       xarray.Dataset({'foo': ('x', np.arange(0, 10))})),
      (xbeam.Key({'x': 10}),
       xarray.Dataset({'foo': ('x', np.arange(10, 20))})),
  ]
  merged = [
      (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(20))})),
  ]
  got = pieces | xbeam.ConsolidateChunks({'x': -1})
  self.assertIdenticalChunks(got, merged)
def test_dataset_to_chunks_vars(self):
  """split_vars=True yields one chunk per (offset, variable) pair."""
  dataset = xarray.Dataset({
      'foo': ('x', np.arange(6)),
      'bar': ('x', -np.arange(6)),
  })
  want = [
      (xbeam.Key({'x': 0}, {'foo'}), dataset.head(x=3)[['foo']]),
      (xbeam.Key({'x': 0}, {'bar'}), dataset.head(x=3)[['bar']]),
      (xbeam.Key({'x': 3}, {'foo'}), dataset.tail(x=3)[['foo']]),
      (xbeam.Key({'x': 3}, {'bar'}), dataset.tail(x=3)[['bar']]),
  ]
  got = (
      test_util.EagerPipeline()
      | xbeam.DatasetToChunks(dataset, chunks={'x': 3}, split_vars=True))
  self.assertIdenticalChunks(got, want)
def test_consolidate_overlapping_variables(self):
  """Chunks that both provide 'foo' cannot be consolidated."""
  chunks = [
      (xbeam.Key({'x': 0}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
      (xbeam.Key({'x': 0}, {'foo', 'bar'}),
       xarray.Dataset({
           'foo': ('x', np.arange(5, 10)),
           'bar': ('x', np.arange(0, 5)),
       })),
  ]
  with self.assertRaisesRegex(
      ValueError,
      "merging dataset chunks with variables .*'foo'.* failed",
  ):
    xbeam.consolidate_fully(chunks)
def test_consolidate_variables_overlapping_variables(self):
  """ConsolidateVariables rejects chunks whose variable sets overlap."""
  chunks = [
      (xbeam.Key({'x': 0}, vars={'foo'}),
       xarray.Dataset({'foo': ('x', [1, 2])})),
      (xbeam.Key({'x': 0}, vars={'foo', 'bar'}),
       xarray.Dataset({
           'foo': ('x', [3, 4]),
           'bar': ('x', [5, 6]),
       })),
  ]
  with self.assertRaisesRegex(
      ValueError, 'cannot merge chunks with overlapping variables: '):
    chunks | xbeam.ConsolidateVariables()
def test_rechunk_not_all_dimensions(self):
  """Rechunk works when only a subset of dims appear in the chunk specs."""
  data = np.random.RandomState(0).randint(2**30, size=(10, 20, 30))
  ds = xarray.Dataset({'foo': (('time', 'x', 'y'), data)})
  base_key = xbeam.Key({'x': 0, 'y': 0})
  # The rechunked output should match splitting along 'y' directly, with an
  # explicit time offset added to the keys.
  want = ([(base_key.with_offsets(time=0), ds)]
          | xbeam.SplitChunks({'y': 3}))
  source = [(base_key, ds)] | xbeam.SplitChunks({'x': 2})
  got = source | xbeam.Rechunk(
      dim_sizes=ds.sizes,
      source_chunks={'x': 2, 'y': -1},
      target_chunks={'x': -1, 'y': 3},
      itemsize=8,
      max_mem=10_000,
  )
  self.assertIdenticalChunks(got, want)
  # Mismatched source/target chunk keys are rejected up front.
  with self.assertRaisesRegex(
      ValueError,
      'source_chunks and target_chunks have different keys',
  ):
    xbeam.Rechunk(
        dim_sizes=ds.sizes,
        source_chunks={'x': 2},
        target_chunks={'y': 3},
        itemsize=8,
        max_mem=10_000,
    )
def test_validate_chunks_compose_in_pipeline(self):
  """ValidateEachChunk passes valid chunks through unchanged."""
  dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
  got = (
      test_util.EagerPipeline()
      | xbeam.DatasetToChunks(dataset, chunks={'x': -1})
      | xbeam.ValidateEachChunk())
  self.assertIdenticalChunks(got, [(xbeam.Key({'x': 0}), dataset)])
def test_consolidate_chunks_not_fully_shared_dims(self):
  """Variables living on disjoint dimensions consolidate independently."""
  chunks = [
      (xbeam.Key({'x': 0}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
      (xbeam.Key({'x': 5}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
      (xbeam.Key({'y': 0}, {'bar'}),
       xarray.Dataset({'bar': ('y', np.arange(0, 5))})),
  ]
  want = [
      (xbeam.Key({'x': 0}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(0, 10))})),
      (xbeam.Key({'y': 0}, {'bar'}),
       xarray.Dataset({'bar': ('y', np.arange(0, 5))})),
  ]
  got = xbeam.consolidate_chunks(chunks)
  self.assertIdenticalChunks(got, want)
def test_unmatched_variables_raises_error(self):
  """A key var absent from the dataset's data variables fails validation."""
  dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
  with self.assertRaises(ValueError) as e:
    [(xbeam.Key({'x': 0}, {'bar'}), dataset)] | xbeam.ValidateEachChunk()
  self.assertIn(
      "Key var(s) 'bar' in Key(offsets={'x': 0}, vars={'bar'}) not found in "
      "Dataset data variables",
      e.exception.args[0],
  )
def test_replace(self):
  """Key.replace swaps offsets and/or vars; unspecified parts carry over."""
  key = xbeam.Key({'x': 0}, {'foo'})
  cases = [
      (key.replace({'x': 1}), xbeam.Key({'x': 1}, {'foo'})),
      (key.replace({'y': 1}), xbeam.Key({'y': 1}, {'foo'})),
      (key.replace(vars=None), xbeam.Key({'x': 0})),
      (key.replace(vars={'bar'}), xbeam.Key({'x': 0}, {'bar'})),
      (key.replace({'y': 1}, {'foo'}), xbeam.Key({'y': 1}, {'foo'})),
      (key.replace({'y': 1}, {'bar'}), xbeam.Key({'y': 1}, {'bar'})),
  ]
  for got, want in cases:
    self.assertEqual(want, got)
def test_in_memory_rechunk_missing_keys(self):
  """Rechunking fails if the input chunks do not tile the target region."""
  chunk = xarray.Dataset({'foo': (('x', 'y'), [[0]])})
  # Only the (0, 0) and (1, 1) corners are present; (0, 1) and (1, 0) are not.
  inputs = [
      (xbeam.Key({'x': 0, 'y': 0}), chunk),
      (xbeam.Key({'x': 1, 'y': 1}), chunk),
  ]
  with self.assertRaisesRegex(
      ValueError,
      'some expected chunks are missing for vars=None',
  ):
    list(rechunk.in_memory_rechunk(inputs, {'x': 2, 'y': 2}))
def test_dataset_to_chunks_multiple(self):
  """Dask chunks, explicit chunks, and multi-threading all split identically."""
  dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
  want = [
      (xbeam.Key({'x': 0}), dataset.head(x=3)),
      (xbeam.Key({'x': 3}), dataset.tail(x=3)),
  ]
  for transform in [
      xbeam.DatasetToChunks(dataset.chunk({'x': 3})),
      xbeam.DatasetToChunks(dataset.chunk({'x': 3}), num_threads=2),
      xbeam.DatasetToChunks(dataset, chunks={'x': 3}),
  ]:
    got = test_util.EagerPipeline() | transform
    self.assertIdenticalChunks(got, want)
def test_consolidate_and_split_uneven_chunks(self):
  """Round-trips between one chunk and uneven splits of sizes 4, 4 and 2."""
  whole = [
      (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(10))})),
  ]
  pieces = [
      (xbeam.Key({'x': 0}),
       xarray.Dataset({'foo': ('x', np.arange(0, 4))})),
      (xbeam.Key({'x': 4}),
       xarray.Dataset({'foo': ('x', np.arange(4, 8))})),
      (xbeam.Key({'x': 8}),
       xarray.Dataset({'foo': ('x', np.arange(8, 10))})),
  ]
  with self.subTest('ConsolidateChunks'):
    result = pieces | xbeam.ConsolidateChunks({'x': 10})
    self.assertIdenticalChunks(result, whole)
  with self.subTest('SplitChunks'):
    result = whole | xbeam.SplitChunks({'x': 4})
    self.assertIdenticalChunks(result, pieces)
def test_dataset_to_chunks_whole(self):
  """chunks={'x': -1} and chunks={} both emit the dataset as one chunk."""
  dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
  want = [(xbeam.Key({'x': 0}), dataset)]
  for chunks in [{'x': -1}, {}]:
    got = (test_util.EagerPipeline()
           | xbeam.DatasetToChunks(dataset, chunks=chunks))
    self.assertIdenticalChunks(got, want)
def test_consolidate_fully(self):
  """consolidate_fully merges across both offsets and variables."""
  chunks = [
      (xbeam.Key({'x': 0}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
      (xbeam.Key({'x': 0}, {'bar'}),
       xarray.Dataset({'bar': ('x', np.arange(10, 15))})),
      (xbeam.Key({'x': 5}, {'foo'}),
       xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
      (xbeam.Key({'x': 5}, {'bar'}),
       xarray.Dataset({'bar': ('x', np.arange(15, 20))})),
  ]
  want = (
      xbeam.Key({'x': 0}, vars={'foo', 'bar'}),
      xarray.Dataset({
          'foo': ('x', np.arange(0, 10)),
          'bar': ('x', np.arange(10, 20)),
      }),
  )
  got = xbeam.consolidate_fully(chunks)
  self.assertIdenticalChunks([got], [want])
def test_consolidate_variables(self):
  """Only chunks sharing the same offsets get their variables merged."""
  chunks = [
      (xbeam.Key({'x': 0}, vars={'foo'}),
       xarray.Dataset({'foo': ('x', [1, 2])})),
      (xbeam.Key({'x': 2}, vars={'foo'}),
       xarray.Dataset({'foo': ('x', [1, 2])})),
      (xbeam.Key({'x': 0}, vars={'bar'}),
       xarray.Dataset({'bar': ('x', [5, 6])})),
  ]
  want = [
      (xbeam.Key({'x': 0}, {'foo', 'bar'}),
       xarray.Dataset({
           'foo': ('x', [1, 2]),
           'bar': ('x', [5, 6]),
       })),
      (xbeam.Key({'x': 2}, vars={'foo'}),
       xarray.Dataset({'foo': ('x', [1, 2])})),
  ]
  got = xbeam.consolidate_variables(chunks)
  self.assertIdenticalChunks(got, want)
def test_unmatched_dimension_raises_error(self):
  """A key offset naming a missing dimension fails validation."""
  dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
  with self.assertRaises(ValueError) as e:
    [(xbeam.Key({'x': 0, 'y': 0}), dataset)] | xbeam.ValidateEachChunk()
  self.assertIn(
      "Key offset(s) 'y' in Key(offsets={'x': 0, 'y': 0}, vars=None) not "
      "found in Dataset dimensions",
      e.exception.args[0],
  )