Example #1
from datetime import datetime
from pathlib import Path

from numpy import empty
from xarray import DataArray

# data_tra, picktime, d_bytes (bytes per data value) and the PARAM/ISRPARAM
# name lists are assumed to come from the surrounding module.


def loopread(tcoutput, size_record, ncol, n_alt, size_head, size_data_record, tReq):
    tcoutput = Path(tcoutput).expanduser()
    n_t = tcoutput.stat().st_size // size_record // d_bytes

    chi = empty(n_t, float)
    t = empty(n_t, datetime)

    plasmaparam = DataArray(data=empty((n_t, n_alt, 4)), dims=["time", "alt_km", "isrparam"])
    iono = DataArray(data=empty((n_t, n_alt, 22)), dims=["time", "alt_km", "param"])

    with tcoutput.open("rb") as f:  # reset to beginning
        for i in range(n_t):
            iono[i, ...], chi[i], t[i], alt, plasmaparam[i, ...] = data_tra(
                f, size_record, ncol, n_alt, size_head, size_data_record
            )
        # FIXME isn't there a way to inherit coordinates like Pandas?
        iono = iono.assign_coords(time=t, param=PARAM, alt_km=alt)
        plasmaparam = plasmaparam.assign_coords(time=t, isrparam=ISRPARAM, alt_km=alt)
    # %% handle time request -- return a single time slice if tReq is given, else all times
    if tReq is not None:  # must check explicitly: picktime's default falls back to the last time
        tUsedInd = picktime(iono.time, tReq, None)[0]
        if tUsedInd is not None:  # in case ind is 0
            iono = iono[tUsedInd, ...]
            plasmaparam = plasmaparam[tUsedInd, ...]

    return iono, chi, plasmaparam
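
The picktime helper above is not shown; here is a minimal sketch of the nearest-time lookup that loopread assumes (per the comment, the real helper falls back to the last time when tReq is None, whereas this sketch simply returns None):

import numpy as np

def picktime(times, tReq, tCal=None):
    """Hypothetical sketch: return (index, time) of the entry nearest tReq."""
    if tReq is None:
        return None, None  # the real helper reportedly falls back to the last time
    t = np.asarray(times, dtype="datetime64[us]")
    i = int(np.argmin(np.abs(t - np.datetime64(tReq))))
    return i, t[i]
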
Example #2
File: regrid.py Project: nbren12/gnl
import xarray as xr

# get_center_coords and isel_bc are helpers defined elsewhere in this module.


def staggered_to_right(f: xr.DataArray, block_size, dim, boundary='wrap'):
    """Move staggered variable to the right interface

    Parameters
    ----------
    f : xr.DataArray
    block_size : int
        Size of the coarse-graining block
    dim : str
    boundary : str, optional
        A boundary condition which is passed to `isel_bc`

    Returns
    -------
    interface : xr.DataArray
        The value of f along the right interfaces of the coarse-grain blocks
    """
    n = f.shape[f.get_axis_num(dim)]
    new_coord = get_center_coords(f[dim].values, block_size)

    idx = slice(block_size, n+block_size, block_size)
    f = isel_bc(f, idx, dim, boundary=boundary)
    return f.assign_coords(**{dim: new_coord})
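
A hypothetical usage sketch, assuming the module's isel_bc and get_center_coords helpers are available alongside staggered_to_right:

import numpy as np
import xarray as xr

# A 1-D staggered field of 16 points, coarse-grained in blocks of 4.
f = xr.DataArray(np.arange(16.0), coords={"x": np.arange(16.0)}, dims=["x"])
right = staggered_to_right(f, block_size=4, dim="x", boundary="wrap")
# `right` holds the values at indices 4, 8, 12 and 16 (wrapped to 0),
# relabelled with the coarse-block center coordinates.
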
Example #3
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_equal)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        assert chunked.chunks == ((2,) * 2, (2,) * 3)
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        assert chunked.data.name.startswith('xarray-<this-array>')

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        assert isinstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        (v2,) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat(
            [ds1, ds2, ds3], dim='n', data_vars='all', coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat(
            [ds1, ds2, ds3], dim='n', data_vars=['d'], coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different', compat='identical')
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        # When the 'different' comparison turns out true halfway through,
        # stop computing variables, as doing so would bring no benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])},
                      coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3], dim='n', data_vars='different',
                        coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1['d'].data is d1
        assert ds1['c'].data is c1
        assert ds2['d'].data is d2
        assert ds2['c'].data is c2
        assert ds3['d'].data is d3
        assert ds3['c'].data is c3

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with raises_regex(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        assert count[0] == 1

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy,
        # either in the data variable or in the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        assert expected == repr(a)
        assert kernel_call_count == 0

    def test_dataset_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy,
        # either in the data variables or in the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        assert expected == repr(ds)
        assert kernel_call_count == 0

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy, either in the data variable or in the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        a1.compute()
        assert not a1._in_memory
        assert not a1.coords['y']._in_memory
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        assert_identical(a1, a2)
        assert not a1._in_memory
        assert not a2._in_memory
        assert not a1.coords['y']._in_memory
        assert not a2.coords['y']._in_memory

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy, either in the data variables or in the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        ds1.compute()
        assert not ds1['a']._in_memory
        assert not ds1['y']._in_memory
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        assert_identical(ds1, ds2)
        assert not ds1['a']._in_memory
        assert not ds2['a']._in_memory
        assert not ds1['y']._in_memory
        assert not ds2['y']._in_memory

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'],
                      coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Same as above, but for a Dataset: getattr() must not accidentally
        # compute the dask-backed data variables or non-index coords.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        assert not a._in_memory
        assert a.values.tolist() == [1, 2]
        assert not a._in_memory

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
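
Examples #3, #4 and #8 rely on build_dask_array and a global kernel_call_count that are not shown. A sketch of that counting machinery, adapted from xarray's test_dask.py (treat it as illustrative rather than the exact upstream code):

import dask.array
import numpy as np

kernel_call_count = 0


def kernel(name):
    """Dask kernel to test pickling/unpickling and __repr__.
    Must be global to make it pickleable."""
    global kernel_call_count
    kernel_call_count += 1
    return np.ones(1, dtype=np.int64)


def build_dask_array(name):
    # Reset the counter so each test starts from zero, then build a
    # one-chunk dask array whose only task is the counting kernel above.
    global kernel_call_count
    kernel_call_count = 0
    return dask.array.Array(
        dask={(name, 0): (kernel, name)}, name=name, chunks=((1,),), dtype=np.int64)
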
Example #4
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_equal)

    @pytest.fixture(autouse=True)
    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values,
                                     coords={"x": range(4)},
                                     dims=("x", "y"),
                                     name="foo")
        self.lazy_array = DataArray(self.data,
                                    coords={"x": range(4)},
                                    dims=("x", "y"),
                                    name="foo")

    def test_rechunk(self):
        chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
        assert chunked.chunks == ((2, ) * 2, (2, ) * 3)
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        assert chunked.data.name.startswith("xarray-<this-array>")

    def test_lazy_dataset(self):
        lazy_ds = Dataset({"foo": (("x", "y"), self.data)})
        assert isinstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], "x")
        self.assertLazyAndAllClose(u, actual)

    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        assert dask.is_dask_collection(v)
        (v2, ) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        (v2, ) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array("d1")
        c1 = build_dask_array("c1")
        d2 = build_dask_array("d2")
        c2 = build_dask_array("c2")
        d3 = build_dask_array("d3")
        c3 = build_dask_array("c3")
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={"d": ("x", d1)}, coords={"c": ("x", c1)})
        ds2 = Dataset(data_vars={"d": ("x", d2)}, coords={"c": ("x", c2)})
        ds3 = Dataset(data_vars={"d": ("x", d3)}, coords={"c": ("x", c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3],
                        dim="n",
                        data_vars="different",
                        coords="different")
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3],
                        dim="n",
                        data_vars="all",
                        coords="all")
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3],
                        dim="n",
                        data_vars=["d"],
                        coords=["c"])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat(
            [ds1, ds2, ds3],
            dim="n",
            data_vars="different",
            coords="different",
            compat="identical",
        )
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        # When the 'different' comparison turns out true halfway through,
        # stop computing variables, as doing so would bring no benefit
        ds4 = Dataset(data_vars={"d": ("x", [2.0])},
                      coords={"c": ("x", [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3],
                        dim="n",
                        data_vars="different",
                        coords="different")
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby("x").mean(...)
        actual = v.groupby("x").mean(...)
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords["ab"] = ("x", ["a", "a", "b", "b"])
        with raises_regex(NotImplementedError, "dask"):
            v.groupby("ab").first()
        expected = u.groupby("ab").first()
        actual = v.groupby("ab").first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [
            {"x": [2, 3, 4]},
            {"x": [1, 100, 2, 101, 3]},
            {"x": [2.5, 3, 3.5], "y": [2, 2.5, 3]},
        ]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u["x"])
        self.assertLazyAndEqual(expected, v.to_dataset("x").to_array("x"))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename("bar")]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        ds.load(scheduler=counting_get)

        assert count[0] == 1

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=("w", "x", "y"))
        stacked = arr.stack(z=("x", "y"))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=["x", "y"])
        expected = DataArray(data.reshape(2, -1), {"z": z}, dims=["w", "z"])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    @pytest.mark.skipif(LooseVersion(dask.__version__) >= "2.0",
                        reason="no meta")
    def test_dataarray_repr_legacy(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        expected = dedent("""\
            <xarray.DataArray 'data' (x: 1)>
            {!r}
            Coordinates:
                y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
            Dimensions without coordinates: x""".format(data))
        assert expected == repr(a)
        assert kernel_call_count == 0  # should not evaluate dask array

    @pytest.mark.skipif(LooseVersion(dask.__version__) < "2.0",
                        reason="needs meta")
    def test_dataarray_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        expected = dedent("""\
            <xarray.DataArray 'data' (x: 1)>
            {!r}
            Coordinates:
                y        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x""".format(data))
        assert expected == repr(a)
        assert kernel_call_count == 0  # should not evaluate dask array

    @pytest.mark.skipif(LooseVersion(dask.__version__) < "2.0",
                        reason="needs meta")
    def test_dataset_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(data_vars={"a": ("x", data)},
                     coords={"y": ("x", nonindex_coord)})
        expected = dedent("""\
            <xarray.Dataset>
            Dimensions:  (x: 1)
            Coordinates:
                y        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x
            Data variables:
                a        (x) int64 dask.array<chunksize=(1,), meta=np.ndarray>"""
                          )
        assert expected == repr(ds)
        assert kernel_call_count == 0  # should not evaluate dask array

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy, either in the data variable or in the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a1 = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        a1.compute()
        assert not a1._in_memory
        assert not a1.coords["y"]._in_memory
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        assert_identical(a1, a2)
        assert not a1._in_memory
        assert not a2._in_memory
        assert not a1.coords["y"]._in_memory
        assert not a2.coords["y"]._in_memory

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy, either in the data variables or in the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds1 = Dataset(data_vars={"a": ("x", data)},
                      coords={"y": ("x", nonindex_coord)})
        ds1.compute()
        assert not ds1["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        assert_identical(ds1, ds2)
        assert not ds1["a"]._in_memory
        assert not ds2["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert not ds2["y"]._in_memory

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, "NOTEXIST")
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Same as above, but for a Dataset: getattr() must not accidentally
        # compute the dask-backed data variables or non-index coords.
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(data_vars={"a": ("x", data)},
                     coords={"y": ("x", nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, "NOTEXIST")
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        assert not a._in_memory
        assert a.values.tolist() == [1, 2]
        assert not a._in_memory

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={"x": range(4)},
                      name="foo")
        self.assertLazyAndIdentical(self.lazy_array, a)
Example #5
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values,
                                     coords={'x': range(4)},
                                     dims=('x', 'y'),
                                     name='foo')
        self.lazy_array = DataArray(self.data,
                                    coords={'x': range(4)},
                                    dims=('x', 'y'),
                                    name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2, ) * 2, (2, ) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with self.assertRaisesRegexp(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5],
                        'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_variable_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = Variable(['x'], build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEquals(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEquals(kernel_call_count, 1)
        self.assertVariableIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = DataArray(build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEquals(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEquals(kernel_call_count, 1)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataset_pickle(self):
        ds1 = Dataset({'a': DataArray(build_dask_array())})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertEquals(kernel_call_count, 1)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEquals(kernel_call_count, 1)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEquals(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)},
                      name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
Example #6
def get_test_data(input_shape=(100, 50),
                  output_shape=(200, 100),
                  output_proj=None,
                  input_dims=('y', 'x')):
    """Get common data objects used in testing.

    Returns: tuple with the following elements
        input_data_on_area: DataArray with dimensions as if it is a gridded
            dataset.
        input_area_def: AreaDefinition of the above DataArray
        input_data_on_swath: DataArray with dimensions as if it is a swath.
        input_swath: SwathDefinition of the above DataArray
        target_area_def: AreaDefinition to be used as a target for resampling

    """
    from xarray import DataArray
    import dask.array as da
    from pyresample.geometry import AreaDefinition, SwathDefinition
    from pyresample.utils import proj4_str_to_dict
    try:
        from pyproj import CRS  # optional; used for the crs coordinates below
    except ImportError:
        CRS = None
    ds1 = DataArray(da.zeros(input_shape, chunks=85),
                    dims=input_dims,
                    attrs={
                        'name': 'test_data_name',
                        'test': 'test'
                    })
    if input_dims and 'y' in input_dims:
        ds1 = ds1.assign_coords(y=da.arange(input_shape[-2], chunks=85))
    if input_dims and 'x' in input_dims:
        ds1 = ds1.assign_coords(x=da.arange(input_shape[-1], chunks=85))
    if input_dims and 'bands' in input_dims:
        ds1 = ds1.assign_coords(bands=list('RGBA'[:ds1.sizes['bands']]))

    input_proj_str = ('+proj=geos +lon_0=-95.0 +h=35786023.0 +a=6378137.0 '
                      '+b=6356752.31414 +sweep=x +units=m +no_defs')
    source = AreaDefinition(
        'test_target',
        'test_target',
        'test_target',
        proj4_str_to_dict(input_proj_str),
        input_shape[1],  # width
        input_shape[0],  # height
        (-1000., -1500., 1000., 1500.))
    ds1.attrs['area'] = source
    if CRS is not None:
        crs = CRS.from_string(input_proj_str)
        ds1 = ds1.assign_coords(crs=crs)

    ds2 = ds1.copy()
    input_area_shape = tuple(ds1.sizes[dim] for dim in ds1.dims
                             if dim in ['y', 'x'])
    geo_dims = ('y', 'x') if input_dims else None
    lons = da.random.random(input_area_shape, chunks=50)
    lats = da.random.random(input_area_shape, chunks=50)
    swath_def = SwathDefinition(DataArray(lons, dims=geo_dims),
                                DataArray(lats, dims=geo_dims))
    ds2.attrs['area'] = swath_def
    if CRS is not None:
        crs = CRS.from_string('+proj=latlong +datum=WGS84 +ellps=WGS84')
        ds2 = ds2.assign_coords(crs=crs)

    # set up target definition
    output_proj_str = ('+proj=lcc +datum=WGS84 +ellps=WGS84 '
                       '+lon_0=-95. +lat_0=25 +lat_1=25 +units=m +no_defs')
    output_proj_str = output_proj or output_proj_str
    target = AreaDefinition(
        'test_target',
        'test_target',
        'test_target',
        proj4_str_to_dict(output_proj_str),
        output_shape[1],  # width
        output_shape[0],  # height
        (-1000., -1500., 1000., 1500.),
    )
    return ds1, source, ds2, swath_def, target
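
A hypothetical usage sketch of the fixture above (the unpacked names are illustrative):

# Unpack the fixtures: gridded input and its AreaDefinition, swath input
# and its SwathDefinition, plus the target AreaDefinition for resampling.
data_on_area, area_def, data_on_swath, swath_def, target_def = get_test_data()
assert data_on_area.shape == (100, 50)
assert target_def.shape == (200, 100)  # AreaDefinition.shape is (height, width)
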
Example #7
def read_plink1_bin(bed, bim=None, fam=None, verbose=True):
    """
    Read PLINK 1 binary files [1]_ into a data array.

    A PLINK 1 binary file set consists of three files:

    - BED: containing the genotype.
    - BIM: containing variant information.
    - FAM: containing sample information.

    The user might provide a single file path to a BED file, from which this function
    will try to infer the file path of the other two files.
    This function also allows the user to provide file paths to multiple BED and
    BIM files, as it is common to have a data set split into multiple files, one per
    chromosome.

    This function returns a samples-by-variants matrix. This is a special kind of matrix
    with rows and columns having multiple coordinates each. Those coordinates have the
    metainformation contained in the BIM and FAM files.

    Examples
    --------
    The following example reads two BED files and two BIM files corresponding to
    chromosomes 11 and 12, and reads a single FAM file whose filename is inferred
    from the BED filenames.

    .. doctest::

        >>> from os.path import join
        >>> from pandas_plink import read_plink1_bin
        >>> from pandas_plink import get_data_folder
        >>> G = read_plink1_bin(join(get_data_folder(), "chr*.bed"), verbose=False)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 1252)>
        dask.array<concatenate, shape=(14, 1252), dtype=float64, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates:
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) object '11_316849996' '11_316874359' ... '12_373081507'
            fid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            iid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            father   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            gender   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            trait    (sample) float64 -9.0 -9.0 -9.0 -9.0 -9.0 ... -9.0 -9.0 -9.0 -9.0
            chrom    (variant) <U2 '11' '11' '11' '11' '11' ... '12' '12' '12' '12' '12'
            snp      (variant) <U9 '316849996' '316874359' ... '372918788' '373081507'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int64 157439 181802 248969 ... 27163741 27205125 27367844
            a0       (variant) <U1 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'A' 'G' 'A' 'T' 'G'
            a1       (variant) <U1 'T' 'C' 'C' 'T' 'T' 'A' ... 'T' 'G' 'A' 'T' 'C' 'A'
        >>> print(G.shape)
        (14, 1252)

    Suppose we want the genotypes of the chromosome 11 only:

    .. doctest::

        >>> G = G.where(G.chrom == "11", drop=True)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 779)>
        dask.array<where, shape=(14, 779), dtype=float64, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates:
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) object '11_316849996' '11_316874359' ... '11_345698259'
            fid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            iid      (sample) <U4 'B001' 'B002' 'B003' 'B004' ... 'B012' 'B013' 'B014'
            father   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            gender   (sample) <U1 '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            trait    (sample) float64 -9.0 -9.0 -9.0 -9.0 -9.0 ... -9.0 -9.0 -9.0 -9.0
            chrom    (variant) <U2 '11' '11' '11' '11' '11' ... '11' '11' '11' '11' '11'
            snp      (variant) <U9 '316849996' '316874359' ... '345653648' '345698259'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int64 157439 181802 248969 ... 28937375 28961091 29005702
            a0       (variant) <U1 'C' 'G' 'G' 'C' 'C' 'T' ... 'T' 'A' 'C' 'A' 'A' 'T'
            a1       (variant) <U1 'T' 'C' 'C' 'T' 'T' 'A' ... 'C' 'G' 'T' 'G' 'C' 'C'
        >>> print(G.shape)
        (14, 779)

    Let's now print the genotype value of sample `B003` for variant `11_316874359`:

    .. doctest::

        >>> print(G.sel(sample="B003", variant="11_316874359").values)
        0.0

    The special matrix we return is of type :class:`xarray.DataArray`. More information
    about it can be found at the `xarray documentation <http://xarray.pydata.org>`_.


    Parameters
    ----------
    bed : str
        Path to a BED file. It can contain shell-style wildcards to indicate multiple
        BED files.
    bim : str, optional
        Path to a BIM file. It can contain shell-style wildcards to indicate multiple
        BIM files. It defaults to ``None``, in which case this function will try to
        infer it.
    fam : str, optional
        Path to a FAM file. It defaults to ``None``, in which case this function will
        try to infer it.
    verbose : bool
        ``True`` for progress information; ``False`` otherwise.

    Returns
    -------
    G : :class:`xarray.DataArray`
        Genotype with metadata.

    References
    ----------
    .. [1] PLINK 1 binary. https://www.cog-genomics.org/plink/2.0/input#bed
    """
    import warnings
    from glob import glob

    from numpy import int64, float64
    from tqdm import tqdm
    from xarray import DataArray
    import pandas as pd
    import dask.array as da

    # last_replace, _read_file, _read_bim, _read_fam and _read_bed are
    # private helpers from this module.

    bed_files = sorted(glob(bed))
    if len(bed_files) == 0:
        raise ValueError("No BED file has been found.")

    if bim is None:
        bim_files = [last_replace(f, ".bed", ".bim") for f in bed_files]
    else:
        bim_files = sorted(glob(bim))
    if len(bim_files) == 0:
        raise ValueError("No BIM file has been found.")

    if fam is None:
        fam_files = [last_replace(f, ".bed", ".fam") for f in bed_files]
    else:
        fam_files = sorted(glob(fam))
    if len(fam_files) == 0:
        raise ValueError("No FAM file has been found.")

    if len(bed_files) != len(bim_files):
        raise ValueError("The numbers of BED and BIM files must match.")

    if len(fam_files) > 1:
        msg = "More than one FAM file has been specified. Only the first one will be "
        msg += "considered."
        if verbose:
            warnings.warn(msg, UserWarning)
        fam_files = fam_files[:1]

    nfiles = len(bed_files) + len(bim_files) + 1
    pbar = tqdm(desc="Mapping files", total=nfiles, disable=not verbose)

    bims = _read_file(bim_files, lambda f: _read_bim(f), pbar)
    nmarkers = {bed_files[i]: b.shape[0] for i, b in enumerate(bims)}
    bim = pd.concat(bims, axis=0, ignore_index=True)
    del bim["i"]
    fam = _read_file(fam_files, lambda f: _read_fam(f), pbar)[0]
    del fam["i"]

    nsamples = fam.shape[0]
    sample_ids = fam["iid"]
    variant_ids = bim["chrom"].astype(str) + "_" + bim["snp"].astype(str)

    G = _read_file(bed_files, lambda f: _read_bed(f, nsamples, nmarkers[f]).T, pbar)
    G = da.concatenate(G, axis=1)

    G = DataArray(G, dims=["sample", "variant"], coords=[sample_ids, variant_ids])
    sample = {c: ("sample", fam[c].tolist()) for c in fam.columns}
    variant = {c: ("variant", bim[c].tolist()) for c in bim.columns}
    G = G.assign_coords(**sample)
    G = G.assign_coords(**variant)
    G.name = "genotype"
    G["pos"] = G["pos"].astype(int64)
    G["cm"] = G["cm"].astype(float64)
    G["trait"] = G["trait"].astype(float64)

    pbar.close()

    return G
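
The last_replace helper used above is not shown; a minimal sketch consistent with how it is used (swap only the trailing ".bed" suffix, leaving earlier occurrences intact):

def last_replace(s, old, new):
    """Hypothetical sketch: replace the last occurrence of `old` in `s`."""
    i = s.rfind(old)
    if i < 0:
        return s
    return s[:i] + new + s[i + len(old):]
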
Example #8
File: test_dask.py Project: tfurf/xarray
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values,
                                     coords={'x': range(4)},
                                     dims=('x', 'y'),
                                     name='foo')
        self.lazy_array = DataArray(self.data,
                                    coords={'x': range(4)},
                                    dims=('x', 'y'),
                                    name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2, ) * 2, (2, ) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3],
                        dim='n',
                        data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3],
                        dim='n',
                        data_vars='all',
                        coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3],
                        dim='n',
                        data_vars=['d'],
                        coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3],
                        dim='n',
                        data_vars='different',
                        coords='different',
                        compat='identical')
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        # When the 'different' comparison turns out true halfway through,
        # stop computing variables, as doing so would bring no benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])},
                      coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3],
                        dim='n',
                        data_vars='different',
                        coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1['d'].data is d1
        assert ds1['c'].data is c1
        assert ds2['d'].data is d2
        assert ds2['c'].data is c2
        assert ds3['d'].data is d3
        assert ds3['c'].data is c3

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with self.assertRaisesRegexp(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5],
                        'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

        count = [0]

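        # Swap in a counting scheduler: loading both dask-backed variables
        # through a single .load() should trigger exactly one graph execution.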
        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_persist_Dataset(self):
        ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()
        ds = ds + 1
        n = len(ds.foo.data.dask)

        ds2 = ds.persist()

        assert len(ds2.foo.data.dask) == 1
        assert len(ds.foo.data.dask) == n  # doesn't mutate in place

    def test_persist_DataArray(self):
        x = da.arange(10, chunks=(5, ))
        y = DataArray(x)
        z = y + 1
        n = len(z.data.dask)

        zz = z.persist()
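        # persist() returns a new, still-lazy collection whose graph collapses
        # to one key per materialized chunk; z itself is left untouched.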

        assert len(z.data.dask) == n
        assert len(zz.data.dask) == zz.data.npartitions

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
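        # stack() is lazy on dask data: the reshape is added to the graph
        # without computing anything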
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        self.assertEqual(kernel_call_count, 0)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter makes a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_getattr(self):
        # Test that accessing a missing attribute does not compute the dask
        # backend for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)},
                      name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)

    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = da.from_array(np.random.randn(10), chunks=4)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(
            OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x, 'b': y},
                                   index=pd.Index(t, name='t'))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        self.assertIsInstance(actual, dd.DataFrame)

        # use the .equals from pandas to check dataframes are equivalent
        assert_frame_equal(expected.compute(), actual.compute())

        # test if no index is given
        expected = dd.from_pandas(expected_pd.reset_index(drop=False),
                                  chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    def test_to_dask_dataframe_2D(self):
        # Test if 2-D dataset is supplied
        w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
        ds = Dataset({'w': (('x', 'y'), w)})
        ds['x'] = ('x', np.array([0, 1], np.int64))
        ds['y'] = ('y', list('abc'))

        # dask dataframes do not (yet) support multiindex,
        # but when it does, this would be the expected index:
        exp_index = pd.MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], ['a', 'b', 'c', 'a', 'b', 'c']],
            names=['x', 'y'])
        expected = pd.DataFrame({'w': w.reshape(-1)}, index=exp_index)
        # so for now, reset the index
        expected = expected.reset_index(drop=False)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_coordinates(self):
        # Test if coordinate is also a dask array
        x = da.from_array(np.random.randn(10), chunks=4)
        t = da.from_array(np.arange(10) * 2, chunks=4)

        ds = Dataset(OrderedDict([('a', ('t', x)), ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x}, index=pd.Index(t, name='t'))
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    def test_to_dask_dataframe_not_daskarray(self):
        # Test if DataArray is not a dask array
        x = np.random.randn(10)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(
            OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

        expected = pd.DataFrame({'a': x, 'b': y}, index=pd.Index(t, name='t'))

        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_no_coordinate(self):
        # Test if Dataset has a dimension without coordinates
        x = da.from_array(np.random.randn(10), chunks=4)
        ds = Dataset({'x': ('dim_0', x)})
        expected = pd.DataFrame({'x': x.compute()})
        actual = ds.to_dask_dataframe(set_index=True)
        assert_frame_equal(expected, actual.compute())
Example #9
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with self.assertRaisesRegex(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_variable_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = Variable(['x'], build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 1)
        self.assertVariableIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = DataArray(build_dask_array())
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertEqual(kernel_call_count, 1)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 1)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)

    def test_dataset_pickle(self):
        ds1 = Dataset({'a': DataArray(build_dask_array())})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertEqual(kernel_call_count, 1)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEqual(kernel_call_count, 1)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
Example #10
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values, coords={'x': range(4)},
                                     dims=('x', 'y'), name='foo')
        self.lazy_array = DataArray(self.data, coords={'x': range(4)},
                                    dims=('x', 'y'), name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    @pytest.mark.skipif(LooseVersion(dask.__version__) <= '0.15.4',
                        reason='Need dask 0.16 for new interface')
    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        (v2,) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array('d1')
        c1 = build_dask_array('c1')
        d2 = build_dask_array('d2')
        c2 = build_dask_array('c2')
        d3 = build_dask_array('d3')
        c3 = build_dask_array('c3')
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
        ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
        ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

        assert kernel_call_count == 0
        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different')
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='all', coords='all')
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=['d'], coords=['c'])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                        coords='different', compat='identical')
        # compat='identical' makes no more kernel calls than compat='equals'
        assert kernel_call_count == 18
        assert isinstance(out['d'].data, np.ndarray)
        assert isinstance(out['c'].data, np.ndarray)

        # When the 'different' check turns out True partway through,
        # stop computing the remaining variables, since doing so brings no benefit
        ds4 = Dataset(data_vars={'d': ('x', [2.0])}, coords={'c': ('x', [2.0])})
        out = xr.concat([ds1, ds2, ds4, ds3], dim='n', data_vars='different',
                        coords='different')
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out['d'].data, dask.array.Array)
        assert isinstance(out['c'].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1['d'].data is d1
        assert ds1['c'].data is c1
        assert ds2['d'].data is d2
        assert ds2['c'].data is c2
        assert ds3['d'].data is d3
        assert ds3['c'].data is c3

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with raises_regex(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):

        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)],
                                       names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        assert kernel_call_count == 0

    def test_dataset_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        assert kernel_call_count == 0

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter makes a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'],
                      coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # Test that accessing a missing attribute does not compute the dask
        # backend for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        assert a.values.tolist() == [1, 2]
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)}, name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
Example #11
def add_name(ds: xr.DataArray, name: str):
    return ds.assign_coords(id=name)
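A minimal usage sketch for context (the array and name below are hypothetical):

import xarray as xr

da = xr.DataArray([1.0, 2.0], dims='t')
named = add_name(da, 'scan_01')  # attaches a scalar 'id' coordinate
assert named.coords['id'].item() == 'scan_01'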
Example #12
def set_spatial_precision(array: xr.DataArray, precision: int) -> xr.DataArray:
    if precision is None:
        return array
    sdims = [array.dims[-2], array.dims[-1]]
    rounded_coords = {dim: array.coords[dim].round(precision) for dim in sdims}
    return array.assign_coords(rounded_coords)
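A quick usage sketch, with hypothetical coordinates: rounding the spatial coordinates
makes repeated scans of the same region comparable despite floating-point jitter.

import numpy as np
import xarray as xr

arr = xr.DataArray(np.zeros((2, 2)), dims=('y', 'x'),
                   coords={'y': [0.10001, 0.19999], 'x': [0.3000002, 0.6999998]})
rounded = set_spatial_precision(arr, 3)
print(rounded.x.values)  # [0.3 0.7]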
Example #13
File: alignment.py Project: falckt/raman
def normalize_spatial_dimensions(
    arr: xr.DataArray,
    origin: str = 'center',
    dims: Iterable[Hashable] = ('x', 'y')) -> xr.DataArray:
    """Normalize spatial coordinates

    For a dataarray with arbitrary spatial positioning, introduce new spatial indexing that
    puts (0, 0) either in the center of the scan or in the bottom left corner. This is a
    necessary preprocessing if multiple scans of the same size should be aligned.

    Args:
        arr: input dataarray
        origin: {'center' (default), 'min'}
            - 'center': new spatial coordinates will have (0, 0) at the center of the image
            - 'min': new spatial coordinates will have (0, 0) at the bottom left corner
                of the image
        dims: dimensions that should be normalized, defaults to ('x', 'y')

    Returns:
        dataarray with normalized spatial coordinates; the original coordinates are retained
        as '<original_name>_old'.

    Examples:
        a : <xarray.DataArray (pixel: 4)>
            array([0., 0., 0., 0.])
            Coordinates:
                x        (pixel) int64 6 6 9 9
                y        (pixel) int64 1 3 1 3
            Dimensions without coordinates: pixel

        >>> normalize_spatial_dimensions(a, 'center')
        <xarray.DataArray (pixel: 4)>
        array([0., 0., 0., 0.])
        Coordinates:
            x_old    (pixel) int64 6 6 9 9
            y_old    (pixel) int64 1 3 1 3
            y        (pixel) float64 -1.0 1.0 -1.0 1.0
            x        (pixel) float64 -1.5 -1.5 1.5 1.5
        Dimensions without coordinates: pixel

        >>> normalize_spatial_dimensions(a, 'min')
        <xarray.DataArray (pixel: 4)>
        array([0., 0., 0., 0.])
        Coordinates:
            x_old    (pixel) int64 6 6 9 9
            y_old    (pixel) int64 1 3 1 3
            y        (pixel) int64 0 2 0 2
            x        (pixel) int64 0 0 3 3
        Dimensions without coordinates: pixel
    """

    orig_dims = set(arr.dims)
    dims = set(dims)

    arr = arr.rename({k: f'{k}_old' for k in dims})

    new_coords = {}
    for dim in dims:
        coord = arr.coords[f'{dim}_old']

        if origin == 'center':
            c0 = (coord.max() + coord.min()) / 2
        elif origin == 'min':
            c0 = coord.min()
        else:
            raise ValueError(f"Coordinate origin '{origin}' is not supported")

        new_coords[dim] = coord - c0

    arr = arr.assign_coords(new_coords)
    arr = arr.swap_dims({f'{k}_old': k for k in dims & orig_dims})

    return arr
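A note on the approach: renaming each dimension to '<dim>_old' keeps the raw positions
available as coordinates, the shifted values are assigned under the original names, and
swap_dims restores dimension status only for names that were dimension coordinates to
begin with (dims & orig_dims); purely positional coordinates, like the pixel-indexed
x/y in the docstring example, remain non-dimension coordinates.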
Example #14
def gard_postprocess(
    model_output: xr.Dataset,
    scrf: xr.DataArray,
    label: str,
    model_params: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> xr.Dataset:
    """
    Add a perturbation to the mean prediction of GARD to more accurately represent extreme events. The perturbation
    is generated by scaling the prediction error from the model fit with a spatio-temporally correlated random field.

    Parameters
    ----------
    model_output : xr.Dataset
        GARD model prediction output. Should contain three variables: pred (predicted mean), prediction_error
        (prediction error in fit), and exceedance_prob (probability of exceedance for threshold)
    scrf : xr.DataArray
        Spatio-temporally correlated random fields (SCRF)
    label : str
        Name given to the downscaled variable in the returned dataset
    model_params : Dict
        Model parameter dictionary

    Returns
    -------
    downscaled : xr.Dataset
        Final downscaled output
    """
    if model_params is not None:
        thresh = model_params.get('thresh')
    else:
        thresh = None

    ## CURRENTLY needs calendar to be gregorian
    ## TODO: merge in the calendar conversion for GCMs and this should work great!
    assert len(scrf.time) == len(model_output.time)
    assert len(scrf.lat) == len(model_output.lat)
    assert len(scrf.lon) == len(model_output.lon)

    scrf = scrf.assign_coords({
        'lat': model_output.lat,
        'lon': model_output.lon,
        'time': model_output.time
    })

    if thresh is not None:
        # convert scrf from a normal distribution to a uniform distribution
        scrf_uniform = xr.apply_ufunc(norm.cdf,
                                      scrf,
                                      dask='parallelized',
                                      output_dtypes=[scrf.dtype])

        # find where exceedance prob is exceeded
        mask = scrf_uniform > (1 - model_output['exceedance_prob'])

        # Rescale the uniform distribution
        new_uniform = (scrf_uniform - (1 - model_output['exceedance_prob'])
                       ) / model_output['exceedance_prob']

        # Get the normal distribution equivalent of new_uniform
        r_normal = xr.apply_ufunc(norm.ppf,
                                  new_uniform,
                                  dask='parallelized',
                                  output_dtypes=[new_uniform.dtype])

        downscaled = model_output[
            'pred'] + r_normal * model_output['prediction_error']

        # what do we do for thresholds like heat wave?
        valids = xr.ufuncs.logical_or(mask, downscaled >= 0)
        downscaled = downscaled.where(valids, 0)
    else:
        downscaled = model_output[
            'pred'] + scrf * model_output['prediction_error']
    downscaled = downscaled.chunk({'time': 365, 'lat': 150, 'lon': 150})
    return downscaled.to_dataset(name=label)
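To make the threshold branch concrete, here is a minimal numpy/scipy sketch of the same
rescaling logic outside of xarray (the field values and exceedance probabilities below
are made up):

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
scrf = rng.standard_normal(5)        # stand-in for the correlated random field
exceedance_prob = np.full(5, 0.3)    # hypothetical P(exceeding the threshold)

u = norm.cdf(scrf)                   # N(0, 1) -> Uniform(0, 1)
mask = u > (1 - exceedance_prob)     # cells where the threshold is exceeded
u_rescaled = (u - (1 - exceedance_prob)) / exceedance_prob  # stretch tail onto (0, 1)
r = norm.ppf(u_rescaled)             # back to normal quantiles; NaN where mask is False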
Example #15
class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayIdentical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, self.assertDataArrayEqual)

    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(self.values,
                                     coords={'x': range(4)},
                                     dims=('x', 'y'),
                                     name='foo')
        self.lazy_array = DataArray(self.data,
                                    coords={'x': range(4)},
                                    dims=('x', 'y'),
                                    name='foo')

    def test_rechunk(self):
        chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2})
        self.assertEqual(chunked.chunks, ((2, ) * 2, (2, ) * 3))
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        self.assertTrue(chunked.data.name.startswith('xarray-<this-array>'))

    def test_lazy_dataset(self):
        lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
        self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], 'x')
        self.assertLazyAndAllClose(u, actual)

    def test_groupby(self):
        if LooseVersion(dask.__version__) == LooseVersion('0.15.3'):
            pytest.xfail('upstream bug in dask: '
                         'https://github.com/dask/dask/issues/2718')

        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby('x').mean()
        actual = v.groupby('x').mean()
        self.assertLazyAndAllClose(expected, actual)

    def test_groupby_first(self):
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords['ab'] = ('x', ['a', 'a', 'b', 'b'])
        with self.assertRaisesRegex(NotImplementedError, 'dask'):
            v.groupby('ab').first()
        expected = u.groupby('ab').first()
        actual = v.groupby('ab').first(skipna=False)
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [{'x': [2, 3, 4]},
                       {'x': [1, 100, 2, 101, 3]},
                       {'x': [2.5, 3, 3.5], 'y': [2, 2.5, 3]}]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u['x'])
        self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x'))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename('bar')]).to_array()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)

    def test_persist_Dataset(self):
        ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()
        ds = ds + 1
        n = len(ds.foo.data.dask)

        ds2 = ds.persist()

        assert len(ds2.foo.data.dask) == 1
        assert len(ds.foo.data.dask) == n  # doesn't mutate in place

    def test_persist_DataArray(self):
        x = da.arange(10, chunks=(5, ))
        y = DataArray(x)
        z = y + 1
        n = len(z.data.dask)

        zz = z.persist()

        assert len(z.data.dask) == n
        assert len(zz.data.dask) == zz.data.npartitions

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=('w', 'x', 'y'))
        stacked = arr.stack(z=('x', 'y'))
        z = pd.MultiIndex.from_product(
            [np.arange(3), np.arange(4)], names=['x', 'y'])
        expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z'])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.DataArray 'data' (x: 1)>
        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x""")
        self.assertEqual(expected, repr(a))
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_repr(self):
        # Test that __repr__ does not convert the dask backend to numpy
        # for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        expected = dedent("""\
        <xarray.Dataset>
        Dimensions:  (x: 1)
        Coordinates:
            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
        Dimensions without coordinates: x
        Data variables:
            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
        self.assertEqual(expected, repr(ds))
        self.assertEqual(kernel_call_count, 0)

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variable or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        a1.compute()
        self.assertFalse(a1._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        a2 = pickle.loads(pickle.dumps(a1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDataArrayIdentical(a1, a2)
        self.assertFalse(a1._in_memory)
        self.assertFalse(a2._in_memory)
        self.assertFalse(a1.coords['y']._in_memory)
        self.assertFalse(a2.coords['y']._in_memory)

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds1 = Dataset(data_vars={'a': ('x', data)},
                      coords={'y': ('x', nonindex_coord)})
        ds1.compute()
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertEqual(kernel_call_count, 2)
        ds2 = pickle.loads(pickle.dumps(ds1))
        self.assertEqual(kernel_call_count, 2)
        self.assertDatasetIdentical(ds1, ds2)
        self.assertFalse(ds1['a']._in_memory)
        self.assertFalse(ds2['a']._in_memory)
        self.assertFalse(ds1['y']._in_memory)
        self.assertFalse(ds2['y']._in_memory)

    def test_dataarray_getattr(self):
        # ipython/jupyter makes a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(a, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_dataset_getattr(self):
        # Test that accessing a missing attribute does not compute the dask
        # backend for either the data variables or the non-index coords
        data = build_dask_array('data')
        nonindex_coord = build_dask_array('coord')
        ds = Dataset(data_vars={'a': ('x', data)},
                     coords={'y': ('x', nonindex_coord)})
        with suppress(AttributeError):
            getattr(ds, 'NOTEXIST')
        self.assertEqual(kernel_call_count, 0)

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        self.assertFalse(a._in_memory)
        self.assertEqual(a.values.tolist(), [1, 2])
        self.assertFalse(a._in_memory)

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable,
                      coords={'x': range(4)},
                      name='foo')
        self.assertLazyAndIdentical(self.lazy_array, a)
Example #16
File: adas.py Project: ukaea/Indica
    def get_adf15(
        self,
        element: str,
        charge: str,
        filetype: str,
        year="",
    ) -> DataArray:
        """Read data from the specified ADF15 ADAS file.

        The implementation can read files with both compact and expanded
        formatting, e.g. pec96][ne_pju][ne9.dat and pec40][ar_cl][ar16.dat
        respectively.

        Parameters
        ----------
        element
            The atomic symbol for the element which will be retrieved.
        charge
            Charge state of the ion (e.g. 16 for Ar 16+); can also include
            another string for a more complicated path (e.g.
            transport_llu][ar15ic.dat with charge set to "15ic")
        filetype
            The type of data to retrieve. Options: ic, cl, ca, ls, llu, ...
        year
            The two-digit year label for the data; set to "transport" for the
            special transport path


        Returns
        -------
        :
            The data in the specified file. Dimensions are density and
            temperature. Each member of the dataset corresponds to a
            different charge state.

        """
        def explicit_reshape(data_to_reshape, nd, nt):
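            # Row-major unflattening; equivalent to
            # np.asarray(data_to_reshape).reshape(nd, nt)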
            data = np.empty((nd, nt))
            for id in range(nd):
                for it in range(nt):
                    data[id, it] = data_to_reshape[id * nt + it]

            return data

        def build_file_component(year, element):
            file_component = "transport"
            if year != "transport":
                file_component = f"pec{year}][{element.lower()}"

            return file_component

        def file_type(identifier):
            identifier_dict = {
                "+": "compact",
                ":": "expanded",
            }
            file_type = identifier_dict.get(identifier)
            if file_type is None:
                raise ValueError(
                    f"Unknown file header identified ({identifier}).")

            return file_type

        def transition_match(transition_line):
            transition_type = "orbitals"
            match = (
                r"c\s+(\d+.)"  # isel
                r"\s+(\d+.\d+)"  # wavelength
                r"\s+(\d+)(\(\d\)\d\(.+\d?.\d\))-"  # transition upper level
                r".+(\d+)(\(\d\)\d\(.+\d?.\d\))"  # transition lower level
            )
            header_re = re.compile(match)
            m = header_re.search(transition_line)
            if not m:
                transition_type = "n_levels"
                match = r"c\s+(\d+.)\s+(\d+.\d+)\s+([n]\=.\d+.-.[n]\=.\d+)"
                header_re = re.compile(match)
                m = header_re.search(transition_line)
                if not m:
                    raise ValueError(
                        f"Unknown transition formatting ({identifier}).")

            return transition_type, match

        now = datetime.datetime.now()
        file_component = build_file_component(year, element)
        filename = Path(pathname2url(file_component)) / pathname2url(
            f"{file_component}_{filetype.lower()}]"
            f"[{element.lower()}{charge.lower()}.dat")

        header_match = {
            "compact": r"(\d+).+/(\S+).*\+(.*)photon",
            "expanded": r"(\d+).+/(\S+).*\:(.*)photon",
        }
        section_header_match = {
            "compact":
            r"(\d+.\d+).+\s+(\d+)\s+(\d+).+type\s?"
            r"=\s?(\S+).+isel.+\s+(\d+)",
            "expanded":
            r"(\d+.\d+)\s+(\d+)\s+(\d+).+type\s?="
            r"\s?(\S+).+isel\s+?=\s+?(\d+)",
        }
        with self._get_file("adf15", filename) as f:
            header = f.readline().strip().lower()
            identifier = file_type(header.split("/")[1][2])

            match = header_match[identifier]
            m = re.search(match, header, re.I)
            assert isinstance(m, re.Match)
            ntrans = int(m.group(1))
            element_name = m.group(2).strip().lower()
            charge_state = int(m.group(3))
            assert element_name == element.lower()
            m = re.search(r"(\d+)(\S*)", charge)
            assert isinstance(m, re.Match)
            extracted_charge = m.group(1)
            if charge_state != int(extracted_charge):
                raise ValueError(
                    f"Charge state in ADF15 file ({charge_state}) does not "
                    f"match argument ({charge}).")

            # Read first section header to build arrays outside of reading loop
            match = section_header_match[identifier]
            header_re = re.compile(match)
            m = None
            while not m:
                line = f.readline().strip().lower()
                m = header_re.search(line)
            assert isinstance(m, re.Match)
            nd = int(m.group(2))
            nt = int(m.group(3))
            ttype: List[str] = []
            tindex = np.empty(ntrans)
            wavelength = np.empty(ntrans)

            # Read Photon Emissivity Coefficient rates
            data = np.empty((ntrans, nd, nt))
            for i in range(ntrans):
                m = header_re.search(line)
                assert isinstance(m, re.Match)
                assert int(m.group(5)) - 1 == i
                tindex[i] = i + 1
                ttype.append(m.group(4))
                wavelength[i] = float(m.group(1))  # (Angstroms)

                densities = np.fromfile(f, float, nd, " ")
                temperatures = np.fromfile(f, float, nt, " ")
                data_tmp = np.fromfile(f, float, nd * nt, " ")
                data[i, :, :] = explicit_reshape(data_tmp, nd, nt)
                line = f.readline().strip().lower()

            data = np.transpose(np.array(data), (0, 2, 1))

            # Read Transition information from end of file
            file_end_re = re.compile(r"c\s+[isel].+\s+[transition].+\s+[type]")
            while not file_end_re.search(line):
                line = f.readline().strip().lower()
            _ = f.readline()
            if identifier == "expanded":
                _ = f.readline()
            line = f.readline().strip().lower()
            transition_type, match = transition_match(line)
            transition_re = re.compile(match)

            format_transition = {
                "orbitals":
                lambda m: f"{m.group(4)}-{m.group(6)}".replace(" ", ""),
                "n_levels": lambda m: m.group(3).replace(" ", ""),
            }
            transition = []
            for i in tindex:
                m = transition_re.search(line)
                assert isinstance(m, re.Match)
                assert int(m.group(1)[:-1]) == i
                transition_tmp = format_transition[transition_type](m)
                transition.append(transition_tmp)
                line = f.readline().strip().lower()

        gen_type = ADF15_GENERAL_DATATYPES[filetype]
        spec_type = element
        name = f"{spec_type}_{gen_type}"
        attrs = {
            "datatype": (gen_type, spec_type),
            "provenance": self.create_provenance(filename, now),
        }

        coords = [
            ("index", tindex),
            ("electron_temperature", temperatures),  # eV
            ("electron_density", densities * 10**6),  # m**-3
        ]

        pecs = DataArray(
            data * 10**-6,
            coords=coords,
            name=name,
            attrs=attrs,
        )

        # Add extra dimensions attached to index
        pecs = pecs.assign_coords(wavelength=("index", wavelength))  # (A)
        pecs = pecs.assign_coords(
            transition=("index", transition)
        )  # (2S+1)L(w-1/2)-(2S+1)L(w-1/2) of upper-lower levels, no blank spaces
        pecs = pecs.assign_coords(type=("index", ttype))  # (excit, recomb, cx)

        return pecs