def loopread(tcoutput, size_record, ncol, n_alt, size_head, size_data_record, tReq):
    """Read every time-step record of a binary output file into labelled arrays.

    Parameters
    ----------
    tcoutput : str or pathlib.Path
        Path of the binary file; ``~`` is expanded.
    size_record, ncol, n_alt, size_head, size_data_record
        Record-layout values forwarded unchanged to ``data_tra``.
        ``size_record`` is also used (together with the module-level
        ``d_bytes``) to derive the number of time steps from the file size,
        and ``n_alt`` sizes the altitude axis of the outputs.
    tReq : datetime-like or None
        If not None, the outputs are narrowed to the time step chosen by
        ``picktime``; if None, all time steps are returned.

    Returns
    -------
    iono : DataArray
        Dimensions ``(time, alt_km, param)``, or ``(alt_km, param)`` when a
        single time step was selected.
    chi : numpy.ndarray
        One value per time step in the file. Not narrowed by ``tReq``.
    plasmaparam : DataArray
        Dimensions ``(time, alt_km, isrparam)``, or ``(alt_km, isrparam)``
        when a single time step was selected.

    Raises
    ------
    ValueError
        If the file is too small to contain one complete record.
    """
    tcoutput = Path(tcoutput).expanduser()
    n_t = tcoutput.stat().st_size // size_record // d_bytes
    if n_t < 1:
        # Without at least one record the read loop never runs, so `alt`
        # would be unbound when the coordinates are assigned below.
        raise ValueError(
            "{} does not contain a complete record".format(tcoutput)
        )

    chi = empty(n_t, float)
    t = empty(n_t, datetime)
    # The trailing axis lengths (4 and 22) presumably match len(ISRPARAM)
    # and len(PARAM) -- confirm against those module-level constants.
    plasmaparam = DataArray(
        data=empty((n_t, n_alt, 4)), dims=["time", "alt_km", "isrparam"]
    )
    iono = DataArray(data=empty((n_t, n_alt, 22)), dims=["time", "alt_km", "param"])

    # A freshly opened handle starts at the beginning of the file; each
    # data_tra call is handed the same handle, once per time step.
    with tcoutput.open("rb") as f:
        for i in range(n_t):
            iono[i, ...], chi[i], t[i], alt, plasmaparam[i, ...] = data_tra(
                f, size_record, ncol, n_alt, size_head, size_data_record
            )

    # FIXME isn't there a way to inherit coordinates like Pandas?
    # `alt` is the altitude grid returned for the last record read.
    iono = iono.assign_coords(time=t, param=PARAM, alt_km=alt)
    plasmaparam = plasmaparam.assign_coords(time=t, isrparam=ISRPARAM, alt_km=alt)

    # %% handle time request: one time step if tReq is given, else all times
    if tReq is not None:
        # Only call picktime for an explicit request, since its default
        # falls back to the last time.
        tUsedInd = picktime(iono.time, tReq, None)[0]
        if tUsedInd is not None:  # explicit None test because index 0 is valid
            iono = iono[tUsedInd, ...]
            plasmaparam = plasmaparam[tUsedInd, ...]

    return iono, chi, plasmaparam
def staggered_to_right(f: xr.DataArray, block_size, dim, boundary='wrap'):
    """Sample a staggered variable on the right interface of each block.

    Parameters
    ----------
    f : xr.DataArray
        Staggered field to sample.
    block_size : int
        Size of the coarse-graining block; used as both the first index and
        the stride of the selection along ``dim``.
    dim : str
        Name of the dimension of ``f`` to select along.
    boundary : str, optional
        Boundary condition, forwarded unchanged to ``isel_bc``.

    Returns
    -------
    interface : xr.DataArray
        The value of ``f`` along the right interfaces of the coarse-grain
        blocks, with ``dim`` relabelled by the result of
        ``get_center_coords``.
    """
    axis = f.get_axis_num(dim)
    length = f.shape[axis]

    # The slice deliberately stops one block past the end of the axis;
    # presumably isel_bc resolves the overshoot via `boundary` -- confirm.
    right_edges = slice(block_size, length + block_size, block_size)
    center_coords = get_center_coords(f[dim].values, block_size)

    sampled = isel_bc(f, right_edges, dim, boundary=boundary)
    return sampled.assign_coords(**{dim: center_coords})
class TestDataArrayAndDataset(DaskTestCase):
    """Exercise DataArray/Dataset operations on dask-backed data.

    Most tests run the same operation on an eager array and on a dask-backed
    copy of it, then compare the two through ``assertLazyAnd``.
    """

    def assertLazyAndIdentical(self, expected, actual):
        # assertLazyAnd with assert_identical as the comparison.
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        # assertLazyAnd with assert_allclose as the comparison.
        self.assertLazyAnd(expected, actual, assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        # assertLazyAnd with assert_equal as the comparison.
        self.assertLazyAnd(expected, actual, assert_equal)

    def setUp(self):
        # One 4x6 random payload, wrapped once as numpy and once as dask.
        def labelled(payload):
            return DataArray(payload, coords={'x': range(4)},
                             dims=('x', 'y'), name='foo')

        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = labelled(self.values)
        self.lazy_array = labelled(self.data)

    def test_rechunk(self):
        # Chunking one dimension at a time must give the same result as the
        # array that was built from 2x2 dask chunks directly.
        rechunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        assert rechunked.chunks == ((2, 2), (2, 2, 2))
        self.assertLazyAndIdentical(self.lazy_array, rechunked)

    def test_new_chunk(self):
        # chunk() with no arguments names the dask array with this prefix.
        graph_name = self.eager_array.chunk().data.name
        assert graph_name.startswith('xarray-<this-array>')

    def test_lazy_dataset(self):
        # A Dataset built from a dask array keeps the dask backend.
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        backend = lazy_ds.foo.variable.data
        assert isinstance(backend, da.Array)

    def test_lazy_array(self):
        eager, lazy = self.eager_array, self.lazy_array
        pairs = [
            (eager, lazy),
            (-eager, -lazy),
            (eager.T, lazy.T),
            (eager.mean(), lazy.mean()),
            (1 + eager, 1 + lazy),
        ]
        for expected, actual in pairs:
            self.assertLazyAndAllClose(expected, actual)

        # Splitting and re-concatenating must reproduce the eager values.
        rejoined = xr.concat([lazy[:2], lazy[2:]], 'x')
        self.assertLazyAndAllClose(eager, rejoined)

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_compute(self):
        eager, lazy = self.eager_array, self.lazy_array

        assert dask.is_dask_collection(lazy)
        (computed,) = dask.compute(lazy + 1)
        assert not dask.is_dask_collection(computed)

        assert ((eager + 1).data == computed.data).all()

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_persist(self):
        eager = self.eager_array
        lazy = self.lazy_array + 1

        (persisted,) = dask.persist(lazy)
        assert lazy is not persisted
        # The persisted graph is smaller but exposes the same keys.
        assert len(persisted.__dask_graph__()) < len(lazy.__dask_graph__())
        assert persisted.__dask_keys__() == lazy.__dask_keys__()
        assert dask.is_dask_collection(lazy)
        assert dask.is_dask_collection(persisted)

        self.assertLazyAndAllClose(eager + 1, lazy)
        self.assertLazyAndAllClose(eager + 1, persisted)

    def test_concat_loads_variables(self):
        # concat() must compute not-in-memory variables at most once and
        # load them in the output, while leaving the inputs unaltered.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})
        inputs = [ds1, ds2, ds3]

        def assert_backend(result, array_type):
            # Both the data variable and the non-index coord are checked.
            for key in ('d', 'c'):
                assert isinstance(result[key].data, array_type)

        assert kernel_call_count == 0
        out = xr.concat(inputs, dim='n', data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert_backend(out, np.ndarray)

        out = xr.concat(inputs, dim='n', data_vars='all', coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert_backend(out, dask.array.Array)

        out = xr.concat(inputs, dim='n', data_vars=['d'], coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert_backend(out, dask.array.Array)

        out = xr.concat(inputs, dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're
        # identical
        assert kernel_call_count == 12
        assert_backend(out, np.ndarray)

        out = xr.concat(inputs, dim='n', data_vars='different',
                        coords='different', compat='identical')
        # compat=identical doesn't do any more kernel calls than
        # compat=equals
        assert kernel_call_count == 18
        assert_backend(out, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])},
                      coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3], dim='n',
                        data_vars='different', coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3
        # were not
        assert kernel_call_count == 22
        assert_backend(out, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        for ds, d, c in [(ds1, d1, c1), (ds2, d2, c2), (ds3, d3, c3)]:
            assert ds['d'].data is d
            assert ds['c'].data is c

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        expected = self.eager_array.groupby('x').mean()
        actual = self.lazy_array.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        eager, lazy = self.eager_array, self.lazy_array

        # Attach the same grouping coordinate to both arrays.
        for coords in (eager.coords, lazy.coords):
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])

        # first() with its default arguments is expected to refuse dask data.
        with raises_regex(NotImplementedError, 'dask'):
            lazy.groupby('ab').first()

        expected = eager.groupby('ab').first()
        actual = lazy.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        eager = self.eager_array.assign_coords(y=range(6))
        lazy = self.lazy_array.assign_coords(y=range(6))

        indexer_sets = [
            {'x': [2, 3, 4]},
            {'x': [1, 100, 2, 101, 3]},
            {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]},
        ]
        for indexers in indexer_sets:
            expected = eager.reindex(**indexers)
            actual = lazy.reindex(**indexers)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        eager, lazy = self.eager_array, self.lazy_array

        expected = eager.assign_coords(x=eager['x'])
        roundtripped = lazy.to_dataset('x').to_array('x')
        self.assertLazyAndEqual(expected, roundtripped)

    def test_merge(self):
        def duplicate_and_merge(array):
            renamed = array.rename('bar')
            return xr.merge([array, renamed]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        expected = np.sin(self.eager_array)
        actual = xu.sin(self.lazy_array)
        self.assertLazyAndAllClose(expected, actual)

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)

        expected = DataArray(a).where(b)
        # Every mix of numpy/dask operands must give the same answer.
        for values, cond in [(a, y), (x, b), (x, y)]:
            self.assertLazyAndEqual(expected, DataArray(values).where(cond))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        # A one-element list lets the nested scheduler mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        # Both variables must be loaded by a single scheduler call.
        assert count[0] == 1

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        stacked = DataArray(data, dims=('w', 'x', 'y')).stack(z=('x', 'y'))

        levels = [np.arange(3), np.arange(4)]
        z = pd.MultiIndex.from_product(levels, names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])

        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager_result = self.eager_array.dot(self.eager_array[0])
        lazy_result = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager_result, lazy_result)

    def test_dataarray_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        # NOTE(review): column spacing inside this literal must match
        # xarray's repr output exactly -- confirm against the formatter.
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        assert expected == repr(a)
        assert kernel_call_count == 0

    def test_dataset_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        # NOTE(review): column spacing inside this literal must match
        # xarray's repr output exactly -- confirm against the formatter.
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        assert expected == repr(ds)
        assert kernel_call_count == 0

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})

        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        assert not a1._in_memory
        assert not a1.coords['y']._in_memory
        assert kernel_call_count == 2

        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        assert_identical(a1, a2)
        for arr in (a1, a2):
            assert not arr._in_memory
        for arr in (a1, a2):
            assert not arr.coords['y']._in_memory

    def test_dataset_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})

        # compute() returns a new object; ds1 itself must stay lazy.
        ds1.compute()
        assert not ds1['a']._in_memory
        assert not ds1['y']._in_memory
        assert kernel_call_count == 2

        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        assert_identical(ds1, ds2)
        for key in ('a', 'y'):
            assert not ds1[key]._in_memory
            assert not ds2[key]._in_memory

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying
        # to represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Test that a failed getattr() computes neither the data variables
        # nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        assert not a._in_memory
        assert a.values.tolist() == [1, 2]
        assert not a._in_memory

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        rebuilt = DataArray(self.lazy_array.variable,
                            coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, rebuilt)
class TestDataArrayAndDataset(DaskTestCase):
    """Exercise DataArray/Dataset operations on dask-backed data.

    Most tests run the same operation on an eager array and on a dask-backed
    copy of it, then compare the two through ``assertLazyAnd``.
    """

    def assertLazyAndIdentical(self, expected, actual):
        # assertLazyAnd with assert_identical as the comparison.
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        # assertLazyAnd with assert_allclose as the comparison.
        self.assertLazyAnd(expected, actual, assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        # assertLazyAnd with assert_equal as the comparison.
        self.assertLazyAnd(expected, actual, assert_equal)

    @pytest.fixture(autouse=True)
    def setUp(self):
        # One 4x6 random payload, wrapped once as numpy and once as dask.
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(
            self.values, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )
        self.lazy_array = DataArray(
            self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )

    def test_rechunk(self):
        # Chunking one dimension at a time must give the same result as the
        # array that was built from 2x2 dask chunks directly.
        chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
        assert chunked.chunks == ((2,) * 2, (2,) * 3)
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        # chunk() with no arguments names the dask array with this prefix.
        chunked = self.eager_array.chunk()
        assert chunked.data.name.startswith("xarray-<this-array>")

    def test_lazy_dataset(self):
        # A Dataset built from a dask array keeps the dask backend.
        lazy_ds = Dataset({"foo": (("x", "y"), self.data)})
        assert isinstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        # Splitting and re-concatenating must reproduce the eager values.
        actual = xr.concat([v[:2], v[2:]], "x")
        self.assertLazyAndAllClose(u, actual)

    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        (v2,) = dask.persist(v)
        assert v is not v2
        # The persisted graph is smaller but exposes the same keys.
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array("d1")
        c1 = build_dask_array("c1")
        d2 = build_dask_array("d2")
        c2 = build_dask_array("c2")
        d3 = build_dask_array("d3")
        c3 = build_dask_array("c3")
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={"d": ("x", d1)}, coords={"c": ("x", c1)})
        ds2 = Dataset(data_vars={"d": ("x", d2)}, coords={"c": ("x", c2)})
        ds3 = Dataset(data_vars={"d": ("x", d3)}, coords={"c": ("x", c3)})

        assert kernel_call_count == 0
        out = xr.concat(
            [ds1, ds2, ds3], dim="n", data_vars="different", coords="different"
        )
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars="all", coords="all")
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=["d"], coords=["c"])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat(
            [ds1, ds2, ds3],
            dim="n",
            data_vars="different",
            coords="different",
            compat="identical",
        )
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={"d": ("x", [2.0])}, coords={"c": ("x", [2.0])})
        out = xr.concat(
            [ds1, ds2, ds4, ds3], dim="n", data_vars="different", coords="different"
        )
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby("x").mean(...)
        actual = v.groupby("x").mean(...)
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        # Attach the same grouping coordinate to both arrays.
        for coords in [u.coords, v.coords]:
            coords["ab"] = ("x", ["a", "a", "b", "b"])
        # first() with its default arguments is expected to refuse dask data.
        with raises_regex(NotImplementedError, "dask"):
            v.groupby("ab").first()
        expected = u.groupby("ab").first()
        actual = v.groupby("ab").first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [
            {"x": [2, 3, 4]},
            {"x": [1, 100, 2, 101, 3]},
            {"x": [2.5, 3, 3.5], "y": [2, 2.5, 3]},
        ]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u["x"])
        self.assertLazyAndEqual(expected, v.to_dataset("x").to_array("x"))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename("bar")]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        # Every mix of numpy/dask operands must give the same answer.
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

        # A one-element list lets the nested scheduler mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        ds.load(scheduler=counting_get)

        # Both variables must be loaded by a single scheduler call.
        assert count[0] == 1

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=("w", "x", "y"))
        stacked = arr.stack(z=("x", "y"))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=["x", "y"]
        )
        expected = DataArray(data.reshape(2, -1), {"z": z}, dims=["w", "z"])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    @pytest.mark.skipif(LooseVersion(dask.__version__) >= "2.0", reason="no meta")
    def test_dataarray_repr_legacy(self):
        # Runs only on dask < 2.0, where the inline coordinate repr has no
        # ``meta`` and reports ``shape`` instead, so the expectation must
        # differ from test_dataarray_repr.
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        # NOTE(review): column spacing inside this literal must match
        # xarray's repr output exactly -- confirm against the formatter.
        expected = dedent(
            """\
            <xarray.DataArray 'data' (x: 1)>
            {!r}
            Coordinates:
                y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
            Dimensions without coordinates: x""".format(
                data
            )
        )
        assert expected == repr(a)
        assert kernel_call_count == 0  # should not evaluate dask array

    @pytest.mark.skipif(LooseVersion(dask.__version__) < "2.0", reason="needs meta")
    def test_dataarray_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        # NOTE(review): column spacing inside this literal must match
        # xarray's repr output exactly -- confirm against the formatter.
        expected = dedent(
            """\
            <xarray.DataArray 'data' (x: 1)>
            {!r}
            Coordinates:
                y        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x""".format(
                data
            )
        )
        assert expected == repr(a)
        assert kernel_call_count == 0  # should not evaluate dask array

    @pytest.mark.skipif(LooseVersion(dask.__version__) < "2.0", reason="needs meta")
    def test_dataset_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(
            data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}
        )
        # NOTE(review): column spacing inside this literal must match
        # xarray's repr output exactly -- confirm against the formatter.
        expected = dedent(
            """\
            <xarray.Dataset>
            Dimensions:  (x: 1)
            Coordinates:
                y        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x
            Data variables:
                a        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>"""
        )
        assert expected == repr(ds)
        assert kernel_call_count == 0  # should not evaluate dask array

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variable nor the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a1 = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        assert not a1._in_memory
        assert not a1.coords["y"]._in_memory
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        assert_identical(a1, a2)
        assert not a1._in_memory
        assert not a2._in_memory
        assert not a1.coords["y"]._in_memory
        assert not a2.coords["y"]._in_memory

    def test_dataset_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variables nor the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds1 = Dataset(
            data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}
        )
        # compute() returns a new object; ds1 itself must stay lazy.
        ds1.compute()
        assert not ds1["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        assert_identical(ds1, ds2)
        assert not ds1["a"]._in_memory
        assert not ds2["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert not ds2["y"]._in_memory

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, "NOTEXIST")
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Test that a failed getattr() computes neither the data variables
        # nor the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(
            data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}
        )
        with suppress(AttributeError):
            getattr(ds, "NOTEXIST")
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        assert not a._in_memory
        assert a.values.tolist() == [1, 2]
        assert not a._in_memory

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable, coords={"x": range(4)}, name="foo")
        self.assertLazyAndIdentical(self.lazy_array, a)
class TestDataArrayAndDataset(DaskTestCase):
    """Exercise DataArray/Dataset operations on dask-backed data.

    Most tests run the same operation on an eager array and on a dask-backed
    copy of it, then compare the two through ``assertLazyAnd``.
    """

    def assertLazyAndIdentical(self, expected, actual):
        # assertLazyAnd with assertDataArrayIdentical as the comparison.
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        # assertLazyAnd with assertDataArrayAllClose as the comparison.
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        # assertLazyAnd with assertDataArrayEqual as the comparison.
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        # One 4x6 random payload, wrapped once as numpy and once as dask.
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        # Chunking one dimension at a time must give the same result as the
        # array that was built from 2x2 dask chunks directly.
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        # chunk() with no arguments names the dask array with this prefix.
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        # A Dataset built from a dask array keeps the dask backend.
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        # Splitting and re-concatenating must reproduce the eager values.
        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        # Attach the same grouping coordinate to both arrays.
        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        # NOTE(review): assertRaisesRegexp is a deprecated alias of
        # assertRaisesRegex (removed in Python 3.12); kept because the new
        # name does not exist on Python 2 -- switch once py2 is dropped.
        with self.assertRaisesRegexp(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        # Every mix of numpy/dask operands must give the same answer.
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        # A one-element list lets the nested scheduler mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        # Both variables must be loaded by a single scheduler call.
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_variable_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = Variable(['x'], build_dask_array())
        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 1)
        self.assertVariableIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = DataArray(build_dask_array())
        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 1)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        ds1 = Dataset({'a': DataArray(build_dask_array())})
        # compute() returns a new object; ds1 itself must stay lazy.
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertEqual(kernel_call_count, 1)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEqual(kernel_call_count, 1)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
def get_test_data(input_shape=(100, 50), output_shape=(200, 100), output_proj=None,
                  input_dims=('y', 'x')):
    """Get common data objects used in testing.

    The last two entries of ``input_shape`` are used as the ``(y, x)`` sizes,
    both for the ``y``/``x`` coordinates and for the source area definition,
    so shapes with leading dimensions (e.g. ``('bands', 'y', 'x')``) get an
    area that matches the data.

    Args:
        input_shape: Shape of the zero-filled dask array backing the input
            data.
        output_shape: ``(height, width)`` of the target area definition.
        output_proj: Optional PROJ.4 string for the target area. When falsy,
            a Lambert conformal conic projection string is used.
        input_dims: Dimension names given to the input DataArray. Coordinates
            are only assigned for the names ``'y'``, ``'x'`` and ``'bands'``.

    Returns: tuple with the following elements
        input_data_on_area: DataArray with dimensions as if it is a gridded
            dataset.
        input_area_def: AreaDefinition of the above DataArray
        input_data_on_swath: DataArray with dimensions as if it is a swath.
        input_swath: SwathDefinition of the above DataArray
        target_area_def: AreaDefinition to be used as a target for resampling

    """
    from xarray import DataArray
    import dask.array as da
    from pyresample.geometry import AreaDefinition, SwathDefinition
    from pyresample.utils import proj4_str_to_dict

    ds1 = DataArray(da.zeros(input_shape, chunks=85),
                    dims=input_dims,
                    attrs={'name': 'test_data_name', 'test': 'test'})
    if input_dims and 'y' in input_dims:
        ds1 = ds1.assign_coords(y=da.arange(input_shape[-2], chunks=85))
    if input_dims and 'x' in input_dims:
        ds1 = ds1.assign_coords(x=da.arange(input_shape[-1], chunks=85))
    if input_dims and 'bands' in input_dims:
        ds1 = ds1.assign_coords(bands=list('RGBA'[:ds1.sizes['bands']]))

    input_proj_str = ('+proj=geos +lon_0=-95.0 +h=35786023.0 +a=6378137.0 '
                      '+b=6356752.31414 +sweep=x +units=m +no_defs')
    # Use the trailing axes so the area agrees with the y/x coordinates above
    # even when input_shape has leading (e.g. 'bands') dimensions.
    source = AreaDefinition(
        'test_target',
        'test_target',
        'test_target',
        proj4_str_to_dict(input_proj_str),
        input_shape[-1],  # width
        input_shape[-2],  # height
        (-1000., -1500., 1000., 1500.))
    ds1.attrs['area'] = source
    # CRS is a module-level name; it is None when no CRS class is available.
    if CRS is not None:
        crs = CRS.from_string(input_proj_str)
        ds1 = ds1.assign_coords(crs=crs)

    # The swath variant shares data and coordinates with the gridded one but
    # carries its own 'area' attribute (and 'crs' coordinate).
    ds2 = ds1.copy()
    input_area_shape = tuple(ds1.sizes[dim] for dim in ds1.dims
                             if dim in ['y', 'x'])
    geo_dims = ('y', 'x') if input_dims else None
    lons = da.random.random(input_area_shape, chunks=50)
    lats = da.random.random(input_area_shape, chunks=50)
    swath_def = SwathDefinition(
        DataArray(lons, dims=geo_dims),
        DataArray(lats, dims=geo_dims))
    ds2.attrs['area'] = swath_def
    if CRS is not None:
        crs = CRS.from_string('+proj=latlong +datum=WGS84 +ellps=WGS84')
        ds2 = ds2.assign_coords(crs=crs)

    # set up target definition
    output_proj_str = ('+proj=lcc +datum=WGS84 +ellps=WGS84 '
                       '+lon_0=-95. +lat_0=25 +lat_1=25 +units=m +no_defs')
    output_proj_str = output_proj or output_proj_str
    target = AreaDefinition(
        'test_target',
        'test_target',
        'test_target',
        proj4_str_to_dict(output_proj_str),
        output_shape[1],  # width
        output_shape[0],  # height
        (-1000., -1500., 1000., 1500.),
    )
    return ds1, source, ds2, swath_def, target
def read_plink1_bin(bed, bim=None, fam=None, verbose=True):
    """
    Read PLINK 1 binary files [1]_ into a data array.

    A PLINK 1 binary file set consists of three files:

    - BED: containing the genotype.
    - BIM: containing variant information.
    - FAM: containing sample information.

    The user might provide a single file path to a BED file, from which this function
    will try to infer the file path of the other two files.
    This function also allows the user to provide file path to multiple BED and
    BIM files, as it is common to have a data set split into multiple files, one per
    chromosome.

    This function returns a samples-by-variants matrix. This is a special kind of matrix
    with rows and columns having multiple coordinates each. Those coordinates have the
    metainformation contained in the BIM and FAM files.

    Examples
    --------
    The following example reads two BED files and two BIM files corresponding to
    chromosomes 11 and 12, and reads a single FAM file whose filename is inferred from
    the BED filenames.

    .. doctest::

        >>> from os.path import join
        >>> from pandas_plink import read_plink1_bin
        >>> from pandas_plink import get_data_folder
        >>> G = read_plink1_bin(join(get_data_folder(), "chr*.bed"), verbose=False)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 1252)>
        dask.array<concatenate, shape=(14, 1252), dtype=float64, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates:
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) object '11_316849996' '11_316874359' ... '12_373081507'
            fid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            iid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            father   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            gender   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            trait    (sample) float64 -9.0 -9.0 -9.0 -9.0 -9.0 ... -9.0 -9.0 -9.0 -9.0
            chrom    (variant) <U2 '11' '11' '11' '11' '11' ... '12' '12' '12' '12' '12'
            snp      (variant) <U9 '316849996' '316874359' ... '372918788' '373081507'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int64 157439 181802 248969 ... 27163741 27205125 27367844
            a0       (variant) <U1 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'A' 'G' 'A' 'T' 'G'
            a1       (variant) <U1 'T' 'C' 'C' 'T' 'T' 'A' ... 'T' 'G' 'A' 'T' 'C' 'A'
        >>> print(G.shape)
        (14, 1252)

    Suppose we want the genotypes of the chromosome 11 only:

    .. doctest::

        >>> G = G.where(G.chrom == "11", drop=True)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 779)>
        dask.array<where, shape=(14, 779), dtype=float64, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates:
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) object '11_316849996' '11_316874359' ... '11_345698259'
            fid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            iid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            father   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            gender   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            trait    (sample) float64 -9.0 -9.0 -9.0 -9.0 -9.0 ... -9.0 -9.0 -9.0 -9.0
            chrom    (variant) <U2 '11' '11' '11' '11' '11' ... '11' '11' '11' '11' '11'
            snp      (variant) <U9 '316849996' '316874359' ... '345653648' '345698259'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int64 157439 181802 248969 ... 28937375 28961091 29005702
            a0       (variant) <U1 'C' 'G' 'G' 'C' 'C' 'T' ... 'T' 'A' 'C' 'A' 'A' 'T'
            a1       (variant) <U1 'T' 'C' 'C' 'T' 'T' 'A' ... 'C' 'G' 'T' 'G' 'C' 'C'
        >>> print(G.shape)
        (14, 779)

    Let us now print the genotype value of the sample `B003` for variant
    `11_316874359`:

    .. doctest::

        >>> print(G.sel(sample="B003", variant="11_316874359").values)
        0.0

    The special matrix we return is of type :class:`xarray.DataArray`. More information
    about it can be found at the `xarray documentation <http://xarray.pydata.org>`_.

    Parameters
    ----------
    bed : str
        Path to a BED file. It can contain shell-style wildcards to indicate multiple
        BED files.
    bim : str, optional
        Path to a BIM file. It can contain shell-style wildcards to indicate multiple
        BIM files. It defaults to ``None``, in which case it will try to be inferred.
    fam : str, optional
        Path to a FAM file. It defaults to ``None``, in which case it will try to be
        inferred. If the pattern matches more than one file, only the first one (in
        sorted order) is used.
    verbose : bool
        ``True`` for progress information; ``False`` otherwise.

    Returns
    -------
    G : :class:`xarray.DataArray`
        Genotype with metadata.

    Raises
    ------
    ValueError
        If no BED, BIM or FAM file is found, or if the numbers of BED and BIM files
        differ.

    References
    ----------
    .. [1] PLINK 1 binary. https://www.cog-genomics.org/plink/2.0/input#bed
    """
    bed_files = sorted(glob(bed))
    if not bed_files:
        raise ValueError("No BED file has been found.")

    if bim is None:
        bim_files = [last_replace(f, ".bed", ".bim") for f in bed_files]
    else:
        bim_files = sorted(glob(bim))
    if not bim_files:
        raise ValueError("No BIM file has been found.")

    if fam is None:
        fam_files = [last_replace(f, ".bed", ".fam") for f in bed_files]
    else:
        fam_files = sorted(glob(fam))
    if not fam_files:
        raise ValueError("No FAM file has been found.")

    if len(bed_files) != len(bim_files):
        raise ValueError("The numbers of BED and BIM files must match.")

    if len(fam_files) > 1:
        msg = "More than one FAM file has been specified. Only the first one will be "
        msg += "considered."
        if verbose:
            warnings.warn(msg, UserWarning)
        fam_files = fam_files[:1]

    # Imported lazily, and only once the inputs are known to be valid, so that
    # bad paths are reported without loading the heavy dependencies.
    from numpy import int64, float64
    from tqdm import tqdm
    from xarray import DataArray
    import pandas as pd
    import dask.array as da

    # One progress tick per BED file, per BIM file, and one for the FAM file.
    nfiles = len(bed_files) + len(bim_files) + 1
    pbar = tqdm(desc="Mapping files", total=nfiles, disable=not verbose)
    try:
        bims = _read_file(bim_files, _read_bim, pbar)
        # BED and BIM lists are paired by position: the i-th BIM gives the
        # number of variants stored in the i-th BED.
        nmarkers = {bed_files[i]: b.shape[0] for i, b in enumerate(bims)}
        bim = pd.concat(bims, axis=0, ignore_index=True)
        del bim["i"]
        fam = _read_file(fam_files, _read_fam, pbar)[0]
        del fam["i"]

        nsamples = fam.shape[0]
        sample_ids = fam["iid"]
        variant_ids = bim["chrom"].astype(str) + "_" + bim["snp"].astype(str)

        # Each BED is transposed before the files are concatenated along the
        # variant axis, giving a samples-by-variants array.
        G = _read_file(bed_files, lambda f: _read_bed(f, nsamples, nmarkers[f]).T, pbar)
        G = da.concatenate(G, axis=1)

        G = DataArray(G, dims=["sample", "variant"], coords=[sample_ids, variant_ids])
        sample = {c: ("sample", fam[c].tolist()) for c in fam.columns}
        variant = {c: ("variant", bim[c].tolist()) for c in bim.columns}
        G = G.assign_coords(**sample)
        G = G.assign_coords(**variant)
        G.name = "genotype"
        G["pos"] = G["pos"].astype(int64)
        G["cm"] = G["cm"].astype(float64)
        G["trait"] = G["trait"].astype(float64)
    finally:
        # Close the progress bar even when one of the readers fails.
        pbar.close()

    return G
class TestDataArrayAndDataset(DaskTestCase):
    """Compare dask-backed DataArray/Dataset objects with their eager twins.

    ``setUp`` wraps the same random 4x6 values twice: once as a numpy-backed
    ``eager_array`` and once as a dask-backed ``lazy_array`` with (2, 2)
    chunks.  Most tests apply the same operation to both and compare the
    results through ``self.assertLazyAnd`` (not defined in this class).

    Several tests read the module-level ``kernel_call_count`` to check how
    many times the arrays returned by ``build_dask_array`` were computed.
    """

    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='all',
                        coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=['d'],
                        coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're
        # identical
        assert kernel_call_count == 12
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different', compat='identical')
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])},
                      coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3], dim='n', data_vars='different',
                        coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3 weren't
        assert kernel_call_count == 22
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that originals are unaltered
        assert ds1['d'].data is d1
        assert ds1['c'].data is c1
        assert ds2['d'].data is d2
        assert ds2['c'].data is c2
        assert ds3['d'].data is d3
        assert ds3['c'].data is c3

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        # NOTE(review): assertRaisesRegexp is a deprecated alias (removed in
        # Python 3.12).  Its replacement, assertRaisesRegex, does not exist on
        # Python 2.7 -- switch once the supported versions are confirmed.
        with self.assertRaisesRegexp(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        # Every numpy/dask combination of data and condition must agree.
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        # A one-element list so the nested function can mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        # Both variables must be loaded by a single scheduler call.
        self.assertEqual(count[0], 1)

    def test_persist_Dataset(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()
        ds = ds + 1
        n = len(ds.foo.data.dask)

        ds2 = ds.persist()

        assert len(ds2.foo.data.dask) == 1
        assert len(ds.foo.data.dask) == n  # doesn't mutate in place

    def test_persist_DataArray(self):
        x = da.arange(10, chunks=(5,))
        y = DataArray(x)
        z = y + 1
        n = len(z.data.dask)

        zz = z.persist()

        assert len(z.data.dask) == n
        assert len(zz.data.dask) == zz.data.npartitions

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        self.assertEqual(kernel_call_count, 0)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        a2 = pickle.loads(pickle.dumps(a1))
        # the pickle round trip must not trigger any further computation
        self.assertEqual(kernel_call_count, 2)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        ds2 = pickle.loads(pickle.dumps(ds1))
        # the pickle round trip must not trigger any further computation
        self.assertEqual(kernel_call_count, 2)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'],
                      coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_getattr(self):
        # Test that looking up a missing attribute computes neither the
        # data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)

    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = da.from_array(np.random.randn(10), chunks=4)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(OrderedDict([('a', ('t', x)),
                                  ('b', ('t', y)),
                                  ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x,
                                    'b': y},
                                   index=pd.Index(t, name='t'))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        self.assertIsInstance(actual, dd.DataFrame)

        # compare the computed (pandas) frames for equivalence
        assert_frame_equal(expected.compute(), actual.compute())

        # test if no index is given
        expected = dd.from_pandas(expected_pd.reset_index(drop=False),
                                  chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    def test_to_dask_dataframe_2D(self):
        # Test if 2-D dataset is supplied
        w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
        ds = Dataset({'w': (('x', 'y'), w)})
        ds['x'] = ('x', np.array([0, 1], np.int64))
        ds['y'] = ('y', list('abc'))

        # dask dataframes do not (yet) support multiindex,
        # but when it does, this would be the expected index:
        exp_index = pd.MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], ['a', 'b', 'c', 'a', 'b', 'c']],
            names=['x', 'y'])
        expected = pd.DataFrame({'w': w.reshape(-1)},
                                index=exp_index)
        # so for now, reset the index
        expected = expected.reset_index(drop=False)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_coordinates(self):
        # Test if coordinate is also a dask array
        x = da.from_array(np.random.randn(10), chunks=4)
        t = da.from_array(np.arange(10) * 2, chunks=4)

        ds = Dataset(OrderedDict([('a', ('t', x)),
                                  ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x},
                                   index=pd.Index(t, name='t'))
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    def test_to_dask_dataframe_not_daskarray(self):
        # Test if DataArray is not a dask array
        x = np.random.randn(10)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(OrderedDict([('a', ('t', x)),
                                  ('b', ('t', y)),
                                  ('t', ('t', t))]))

        expected = pd.DataFrame({'a': x, 'b': y},
                                index=pd.Index(t, name='t'))

        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_no_coordinate(self):
        # Test if Dataset has a dimension without coordinates
        x = da.from_array(np.random.randn(10), chunks=4)
        ds = Dataset({'x': ('dim_0', x)})
        expected = pd.DataFrame({'x': x.compute()})
        actual = ds.to_dask_dataframe(set_index=True)
        assert_frame_equal(expected, actual.compute())
class TestDataArrayAndDataset(DaskTestCase):
    """Compare dask-backed DataArray/Dataset objects with their eager twins.

    ``setUp`` wraps the same random 4x6 values twice: once as a numpy-backed
    ``eager_array`` and once as a dask-backed ``lazy_array`` with (2, 2)
    chunks.  Most tests apply the same operation to both and compare the
    results through ``self.assertLazyAnd`` (not defined in this class).

    The pickle tests read the module-level ``kernel_call_count`` to check how
    many times the array returned by ``build_dask_array`` was computed.
    """

    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        # NOTE(review): assertRaisesRegexp is a deprecated alias (removed in
        # Python 3.12).  Its replacement, assertRaisesRegex, does not exist on
        # Python 2.7 -- switch once the supported versions are confirmed.
        with self.assertRaisesRegexp(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        # Every numpy/dask combination of data and condition must agree.
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        # A one-element list so the nested function can mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        # Both variables must be loaded by a single scheduler call.
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_variable_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = Variable(['x'], build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        # the pickle round trip must not trigger any further computation
        self.assertEqual(kernel_call_count, 1)
        self.assertVariableIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = DataArray(build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        # the pickle round trip must not trigger any further computation
        self.assertEqual(kernel_call_count, 1)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        ds1 = Dataset({'a': DataArray(build_dask_array())})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertEqual(kernel_call_count, 1)
        ds2 = pickle.loads(pickle.dumps(ds1))
        # the pickle round trip must not trigger any further computation
        self.assertEqual(kernel_call_count, 1)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
class TestDataArrayAndDataset(DaskTestCase):
    """Compare dask-backed xarray objects against numpy-backed equivalents.

    ``setUp`` builds one eager DataArray (numpy values) and one lazy
    DataArray (the same values wrapped with ``da.from_array``); most tests
    apply the same operation to both and compare through ``assertLazyAnd``.
    Several tests also inspect ``kernel_call_count``, a module-level name
    defined outside this class (presumably incremented whenever an array made
    by ``build_dask_array`` is computed -- confirm against its definition).
    """

    # The three helpers below delegate to ``self.assertLazyAnd`` (inherited),
    # differing only in the comparison callable that is passed along.
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        # Same random 4x6 values, once as a numpy array and once as a dask
        # array split into 2x2 chunks.
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        # Chunking dimension by dimension must end up with the same chunks
        # as the array built directly from ``self.data``.
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        # The dask graph name of a freshly chunked array carries this prefix.
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        # A Dataset built from a dask array keeps it as a dask array.
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        # Splitting and re-concatenating along 'x' gives back the same data.
        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        # dask.compute() must accept a DataArray and return a non-lazy one.
        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        # dask.persist() returns a new, still-lazy object with the same keys
        # but a smaller graph.
        (v2,) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        # NOTE: the expected kernel_call_count values below are cumulative,
        # so the order of the concat() calls must not be changed.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='all',
                        coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=['d'],
                        coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're
        # identical
        assert kernel_call_count == 12
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different', compat='identical')
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])},
                      coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3], dim='n', data_vars='different',
                        coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that originals are unaltered
        assert ds1['d'].data is d1
        assert ds1['c'].data is c1
        assert ds2['d'].data is d2
        assert ds2['c'].data is c2
        assert ds3['d'].data is d3
        assert ds3['c'].data is c3

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        # Attach a non-index grouping coordinate 'ab' to both arrays.
        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        # first() with default arguments is expected to be rejected for the
        # dask-backed array; with skipna=False it must match the eager result.
        with raises_regex(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        # Cases: labels partly outside the index, interleaved missing labels,
        # and non-integer labels on both dimensions.
        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            # Merge the array with a renamed copy of itself and stack the
            # two variables back into a single DataArray.
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        # where() must give the same answer for every numpy/dask combination
        # of data and condition.
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        # One-element list so the nested function can mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        # Loading a two-variable Dataset must go through the scheduler once.
        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        assert kernel_call_count == 0

    def test_dataset_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        assert kernel_call_count == 0

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        # compute() returns a new object; ds1 itself must stay lazy.
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls to when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Test that a failed getattr() lookup computes neither the data
        # variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        assert a.values.tolist() == [1, 2]
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
def add_name(ds: xr.DataArray, name: str):
    """Attach ``name`` to ``ds`` as a coordinate called ``id``.

    Args:
        ds: object to label; only its ``assign_coords`` method is used.
        name: value stored under the ``id`` coordinate.

    Returns:
        Whatever ``ds.assign_coords(id=name)`` returns.
    """
    labelled = ds.assign_coords(id=name)
    return labelled
def set_spatial_precision(array: xr.DataArray, precision: int) -> xr.DataArray:
    """Round the coordinates of the last two dimensions of ``array``.

    Args:
        array: input array; its two trailing dimensions are the ones whose
            coordinates get rounded.
        precision: argument forwarded to each coordinate's ``round`` method.
            When it is ``None`` the array is returned untouched.

    Returns:
        ``array`` itself if ``precision`` is ``None``, otherwise the result of
        ``array.assign_coords`` with the rounded coordinates.
    """
    if precision is not None:
        rounded = {}
        # Walk the second-to-last, then the last dimension.
        for axis in (-2, -1):
            dim_name = array.dims[axis]
            rounded[dim_name] = array.coords[dim_name].round(precision)
        return array.assign_coords(rounded)
    return array
def normalize_spatial_dimensions(
        arr: xr.DataArray,
        origin: str = 'center',
        dims: Iterable[Hashable] = ('x', 'y')) -> xr.DataArray:
    """Normalize spatial coordinates

    For a dataarray with arbitrary spatial positioning, introduce new spatial
    indexing that puts (0, 0) either in the center of the scan or in the
    bottom left corner. This is a necessary preprocessing if multiple scans of
    the same size should be aligned.

    Args:
        arr: input dataarray
        origin: {'center' (default), 'min'}
            - 'center': new spatial coordinates will have (0, 0) at the center
              of the image
            - 'min': new spatial coordinates will have (0, 0) at the bottom
              left corner of the image
        dims: dimensions that should be normalized, defaults to ('x', 'y')

    Returns:
        dataarray with normalized spatial coordinates, the original
        coordinates are retained as ``<original_name>_old``.

    Raises:
        ValueError: if ``origin`` is neither 'center' nor 'min'.

    Examples:
        a :
        <xarray.DataArray (pixel: 4)>
        array([0., 0., 0., 0.])
        Coordinates:
            x        (pixel) int64 6 6 9 9
            y        (pixel) int64 1 3 1 3
        Dimensions without coordinates: pixel

        >>> normalize_spatial_dimensions(a, 'center')
        <xarray.DataArray (pixel: 4)>
        array([0., 0., 0., 0.])
        Coordinates:
            x_old    (pixel) int64 6 6 9 9
            y_old    (pixel) int64 1 3 1 3
            y        (pixel) float64 -1.0 1.0 -1.0 1.0
            x        (pixel) float64 -1.5 -1.5 1.5 1.5
        Dimensions without coordinates: pixel

        >>> normalize_spatial_dimensions(a, 'min')
        <xarray.DataArray (pixel: 4)>
        array([0., 0., 0., 0.])
        Coordinates:
            x_old    (pixel) int64 6 6 9 9
            y_old    (pixel) int64 1 3 1 3
            y        (pixel) int64 0 2 0 2
            x        (pixel) int64 0 0 3 3
        Dimensions without coordinates: pixel
    """
    orig_dims = set(arr.dims)
    dims = set(dims)

    # Move the existing coordinates out of the way under '<name>_old'.
    arr = arr.rename({k: f'{k}_old' for k in dims})

    new_coords = {}
    for dim in dims:
        coord = arr.coords[f'{dim}_old']
        if origin == 'center':
            c0 = (coord.max() + coord.min()) / 2
        elif origin == 'min':
            c0 = coord.min()
        else:
            # f-string so the offending value actually shows up in the message
            raise ValueError(
                f'Coordinate origin `{origin}` is not supported')
        new_coords[dim] = coord - c0

    arr = arr.assign_coords(new_coords)
    # Only names that were real dimensions (not just coordinates) of the
    # input need to be swapped back in as dimensions.
    arr = arr.swap_dims({f'{k}_old': k for k in dims & orig_dims})
    return arr
def gard_postprocess(
    model_output: xr.Dataset,
    scrf: xr.DataArray,
    label: str,
    model_params: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> xr.Dataset:
    """
    Add perturbation to the mean prediction of GARD to more accurately
    represent extreme events. The perturbation is generated with the
    prediction error during model fit scaled with a spatio-temporally
    correlated random field.

    Parameters
    ----------
    model_output : xr.Dataset
        GARD model prediction output. Should contain three variables: pred
        (predicted mean), prediction_error (prediction error in fit), and
        exceedance_prob (probability of exceedance for threshold)
    scrf : xr.DataArray
        Spatio-temporally correlated random fields (SCRF). Must have the same
        number of ``time``, ``lat`` and ``lon`` entries as ``model_output``;
        its coordinates are overwritten with those of ``model_output``.
    label : str
        Name given to the single variable of the returned dataset.
    model_params : Dict
        Model parameter dictionary. Only the ``'thresh'`` entry is read, and
        only to decide whether the threshold branch is taken (its value is
        not used otherwise).
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    downscaled : xr.Dataset
        Final downscaled output
    """
    thresh = model_params.get('thresh') if model_params is not None else None

    ## CURRENTLY needs calendar to be gregorian
    ## TODO: merge in the calendar conversion for GCMs and this should work great!
    assert len(scrf.time) == len(model_output.time), \
        'scrf and model_output differ in length along time'
    assert len(scrf.lat) == len(model_output.lat), \
        'scrf and model_output differ in length along lat'
    assert len(scrf.lon) == len(model_output.lon), \
        'scrf and model_output differ in length along lon'

    # Force identical coordinate labels so the arithmetic below lines the two
    # objects up position by position.
    scrf = scrf.assign_coords({
        'lat': model_output.lat,
        'lon': model_output.lon,
        'time': model_output.time
    })

    if thresh is not None:
        exceedance_prob = model_output['exceedance_prob']

        # convert scrf from a normal distribution to a uniform distribution
        scrf_uniform = xr.apply_ufunc(norm.cdf,
                                      scrf,
                                      dask='parallelized',
                                      output_dtypes=[scrf.dtype])

        # find where exceedance prob is exceeded
        mask = scrf_uniform > (1 - exceedance_prob)

        # Rescale the uniform distribution
        new_uniform = (scrf_uniform - (1 - exceedance_prob)) / exceedance_prob

        # Get the normal distribution equivalent of new_uniform
        r_normal = xr.apply_ufunc(norm.ppf,
                                  new_uniform,
                                  dask='parallelized',
                                  output_dtypes=[new_uniform.dtype])

        downscaled = (model_output['pred']
                      + r_normal * model_output['prediction_error'])

        # what do we do for thresholds like heat wave?
        # `|` on boolean DataArrays is the element-wise logical OR; it is
        # used instead of xr.ufuncs.logical_or because the xr.ufuncs
        # namespace is not available in every xarray release.
        valids = mask | (downscaled >= 0)
        downscaled = downscaled.where(valids, 0)
    else:
        downscaled = (model_output['pred']
                      + scrf * model_output['prediction_error'])

    downscaled = downscaled.chunk({'time': 365, 'lat': 150, 'lon': 150})
    return downscaled.to_dataset(name=label)
class TestDataArrayAndDataset(DaskTestCase):
    """Compare dask-backed xarray objects against numpy-backed equivalents.

    ``setUp`` builds one eager DataArray (numpy values) and one lazy
    DataArray (the same values wrapped with ``da.from_array``); most tests
    apply the same operation to both and compare through ``assertLazyAnd``.
    Several tests also inspect ``kernel_call_count``, a module-level name
    defined outside this class (presumably incremented whenever an array made
    by ``build_dask_array`` is computed -- confirm against its definition).

    The deprecated unittest aliases ``assertEquals`` and
    ``assertRaisesRegexp`` (removed in Python 3.12) are intentionally not
    used here.
    """

    # The three helpers below delegate to ``self.assertLazyAnd`` (inherited),
    # differing only in the comparison callable that is passed along.
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        # Same random 4x6 values, once as a numpy array and once as a dask
        # array split into 2x2 chunks.
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values,
                                     coords={'x': range(4)},
                                     dims=('x', 'y'),
                                     name='foo')
        self.lazy_array = DataArray(self.data,
                                    coords={'x': range(4)},
                                    dims=('x', 'y'),
                                    name='foo')

    def test_rechunk(self):
        # Chunking dimension by dimension must end up with the same chunks
        # as the array built directly from ``self.data``.
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2, ) * 2, (2, ) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        # The dask graph name of a freshly chunked array carries this prefix.
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        # A Dataset built from a dask array keeps it as a dask array.
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        # Splitting and re-concatenating along 'x' gives back the same data.
        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        # Attach a non-index grouping coordinate 'ab' to both arrays.
        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        # first() with default arguments is expected to be rejected for the
        # dask-backed array. pytest.raises(match=...) does the same
        # re.search check as the removed assertRaisesRegexp alias.
        with pytest.raises(NotImplementedError, match='dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        # Cases: labels partly outside the index, interleaved missing labels,
        # and non-integer labels on both dimensions.
        for kwargs in [{
                'x': [2, 3, 4]
        }, {
                'x': [1, 100, 2, 101, 3]
        }, {
                'x': [2.5, 3, 3.5],
                'y': [2, 2.5, 3]
        }]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):
        def duplicate_and_merge(array):
            # Merge the array with a renamed copy of itself and stack the
            # two variables back into a single DataArray.
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        # where() must give the same answer for every numpy/dask combination
        # of data and condition.
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({
            'foo': ('x', range(5)),
            'bar': ('x', range(5))
        }).chunk()

        # One-element list so the nested function can mutate the counter.
        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        # Loading a two-variable Dataset must go through the scheduler once.
        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_persist_Dataset(self):
        ds = Dataset({
            'foo': ('x', range(5)),
            'bar': ('x', range(5))
        }).chunk()
        ds = ds + 1
        n = len(ds.foo.data.dask)

        ds2 = ds.persist()

        assert len(ds2.foo.data.dask) == 1
        assert len(ds.foo.data.dask) == n  # doesn't mutate in place

    def test_persist_DataArray(self):
        x = da.arange(10, chunks=(5, ))
        y = DataArray(x)
        z = y + 1
        n = len(z.data.dask)

        zz = z.persist()

        # The original graph is untouched; the persisted one has exactly one
        # task per partition.
        assert len(z.data.dask) == n
        assert len(zz.data.dask) == zz.data.npartitions

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_repr(self):
        # Test that __repr__ converts the dask backend to numpy
        # in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        self.assertEqual(kernel_call_count, 0)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variable nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        # compute() returns a new object; a1 itself must stay lazy.
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling converts the dask backend
        # to numpy in neither the data variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        # compute() returns a new object; ds1 itself must stay lazy.
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls to when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_getattr(self):
        # Test that a failed getattr() lookup computes neither the data
        # variables nor the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)},
                      name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
def get_adf15(
    self,
    element: str,
    charge: str,
    filetype: str,
    year="",
) -> DataArray:
    """Read data from the specified ADF15 ADAS file.

    Implementation is capable of reading files with compact and expanded
    formatting e.g. pec96][ne_pju][ne9.dat and pec40][ar_cl][ar16.dat
    respectively

    Parameters
    ----------
    element
        The atomic symbol for the element which will be retrieved.
    charge
        Charge state of the ion (e.g. 16 for Ar 16+), can also include
        other string for more complicated path (transport_llu][ar15ic.dat
        setting charge to "15ic")
    filetype
        The type of data to retrieve. Options: ic, cl, ca, ls, llu, ...
    year
        The two-digit year label for the data. = "transport" if special
        transport path

    Returns
    -------
    :
        The photon emissivity coefficients in the specified file, scaled by
        10**-6, with dimensions ``index`` (1-based transition number),
        ``electron_temperature`` and ``electron_density`` (file densities
        scaled by 10**6). ``wavelength``, ``transition`` and ``type`` are
        attached as extra coordinates along ``index``.

    Raises
    ------
    ValueError
        If the header identifier is unknown, the charge state in the file
        does not match ``charge``, the transition table has an unknown
        format, or the file ends before an expected section is found.
    """

    def build_file_component(year, element):
        # "transport" files live in their own directory; everything else is
        # under pec<year>][<element>.
        file_component = "transport"
        if year != "transport":
            file_component = f"pec{year}][{element.lower()}"
        return file_component

    def file_type(identifier):
        # Map the single header character onto the layout name.
        identifier_dict = {
            "+": "compact",
            ":": "expanded",
        }
        layout = identifier_dict.get(identifier)
        if layout is None:
            raise ValueError(
                f"Unknown file header identified ({identifier}).")
        return layout

    def transition_match(transition_line):
        # Try the "orbitals" pattern first, then fall back to "n_levels";
        # return the name of the pattern that matched and the pattern itself.
        transition_type = "orbitals"
        match = (
            r"c\s+(\d+.)"  # isel
            r"\s+(\d+.\d+)"  # wavelength
            r"\s+(\d+)(\(\d\)\d\(.+\d?.\d\))-"  # transition upper level
            r".+(\d+)(\(\d\)\d\(.+\d?.\d\))"  # transition lower level
        )
        if not re.search(match, transition_line):
            transition_type = "n_levels"
            match = r"c\s+(\d+.)\s+(\d+.\d+)\s+([n]\=.\d+.-.[n]\=.\d+)"
            if not re.search(match, transition_line):
                # Report the line that failed to parse (the original message
                # interpolated the unrelated file-layout identifier).
                raise ValueError(
                    f"Unknown transition formatting ({transition_line}).")
        return transition_type, match

    def read_line_or_fail(handle, looking_for):
        # readline() returns an empty string forever once EOF is reached;
        # raise instead of letting a search loop spin indefinitely.
        raw = handle.readline()
        if not raw:
            raise ValueError(
                f"Reached end of ADF15 file while looking for {looking_for}.")
        return raw.strip().lower()

    now = datetime.datetime.now()
    file_component = build_file_component(year, element)
    filename = Path(pathname2url(file_component)) / pathname2url(
        f"{file_component}_{filetype.lower()}]"
        f"[{element.lower()}{charge.lower()}.dat")

    header_match = {
        "compact": r"(\d+).+/(\S+).*\+(.*)photon",
        "expanded": r"(\d+).+/(\S+).*\:(.*)photon",
    }
    section_header_match = {
        "compact": r"(\d+.\d+).+\s+(\d+)\s+(\d+).+type\s?"
        r"=\s?(\S+).+isel.+\s+(\d+)",
        "expanded": r"(\d+.\d+)\s+(\d+)\s+(\d+).+type\s?="
        r"\s?(\S+).+isel\s+?=\s+?(\d+)",
    }
    with self._get_file("adf15", filename) as f:
        header = f.readline().strip().lower()
        # The third character after the first "/" distinguishes the layouts.
        identifier = file_type(header.split("/")[1][2])

        match = header_match[identifier]
        m = re.search(match, header, re.I)
        assert isinstance(m, re.Match)
        ntrans = int(m.group(1))
        element_name = m.group(2).strip().lower()
        charge_state = int(m.group(3))
        assert element_name == element.lower()
        # `charge` may carry a suffix (e.g. "15ic"); compare the digits only.
        m = re.search(r"(\d+)(\S*)", charge)
        assert isinstance(m, re.Match)
        extracted_charge = m.group(1)
        if charge_state != int(extracted_charge):
            raise ValueError(
                f"Charge state in ADF15 file ({charge_state}) does not "
                f"match argument ({charge}).")

        # Read first section header to build arrays outside of reading loop
        match = section_header_match[identifier]
        header_re = re.compile(match)
        m = None
        while not m:
            line = read_line_or_fail(f, "the first section header")
            m = header_re.search(line)
        assert isinstance(m, re.Match)
        nd = int(m.group(2))
        nt = int(m.group(3))
        ttype: List[str] = []
        tindex = np.empty(ntrans)
        wavelength = np.empty(ntrans)

        # Read Photon Emissivity Coefficient rates
        data = np.empty((ntrans, nd, nt))
        for i in range(ntrans):
            m = header_re.search(line)
            assert isinstance(m, re.Match)
            assert int(m.group(5)) - 1 == i
            tindex[i] = i + 1
            ttype.append(m.group(4))
            wavelength[i] = float(m.group(1))  # (Angstroms)
            densities = np.fromfile(f, float, nd, " ")
            temperatures = np.fromfile(f, float, nt, " ")
            data_tmp = np.fromfile(f, float, nd * nt, " ")
            # Values are stored density-major: element id * nt + it belongs
            # to (id, it), which is exactly a row-major reshape.
            data[i, :, :] = data_tmp.reshape(nd, nt)
            line = f.readline().strip().lower()

        # Reorder to (transition, temperature, density).
        data = np.transpose(data, (0, 2, 1))

        # Read Transition information from end of file
        # NOTE(review): the bracketed groups are character classes, not the
        # literal words "isel"/"transition"/"type"; left unchanged because
        # tightening the pattern could alter which line is matched.
        file_end_re = re.compile(r"c\s+[isel].+\s+[transition].+\s+[type]")
        while not file_end_re.search(line):
            line = read_line_or_fail(f, "the transition table header")
        _ = f.readline()
        if identifier == "expanded":
            _ = f.readline()
        line = f.readline().strip().lower()
        transition_type, match = transition_match(line)
        transition_re = re.compile(match)

        format_transition = {
            "orbitals": lambda m: f"{m.group(4)}-{m.group(6)}".replace(" ", ""),
            "n_levels": lambda m: m.group(3).replace(" ", ""),
        }
        transition = []
        for i in tindex:
            m = transition_re.search(line)
            assert isinstance(m, re.Match)
            # group(1) is the index followed by one extra character.
            assert int(m.group(1)[:-1]) == i
            transition_tmp = format_transition[transition_type](m)
            transition.append(transition_tmp)
            line = f.readline().strip().lower()

    gen_type = ADF15_GENERAL_DATATYPES[filetype]
    spec_type = element
    name = f"{spec_type}_{gen_type}"
    attrs = {
        "datatype": (gen_type, spec_type),
        "provenance": self.create_provenance(filename, now),
    }

    coords = [
        ("index", tindex),
        ("electron_temperature", temperatures),  # eV
        ("electron_density", densities * 10**6),  # m**-3
    ]

    pecs = DataArray(
        data * 10**-6,
        coords=coords,
        name=name,
        attrs=attrs,
    )

    # Add extra dimensions attached to index
    pecs = pecs.assign_coords(wavelength=("index", wavelength))  # (A)
    pecs = pecs.assign_coords(
        transition=("index", transition)
    )  # (2S+1)L(w-1/2)-(2S+1)L(w-1/2) of upper-lower levels, no blank spaces
    pecs = pecs.assign_coords(type=("index", ttype))  # (excit, recomb, cx)

    return pecs