Exemplo n.º 1
0
def finalize (var, time, space, eof, eig, pc, variance, weight, out):
  """Package raw EOF analysis arrays as named Var objects.

  The ``eof`` and ``pc`` arrays are reshaped to ``(num,)+space.shape`` and
  ``(num,)+time.shape`` (``num`` being the leading dimension of ``eof``),
  each mode is sign-flipped so the first element of its EOF is non-negative,
  and the results are wrapped as Vars named after ``var``.

  Returns a list holding the outputs selected by ``out`` (as interpreted by
  ``whichout``), drawn from the keys 'eof', 'eig', 'eig2', 'pc', 'var' and
  'frac'.  ``variance`` is passed through unchanged under the key 'var'.
  """
  from pygeode.var import Var
  import numpy as np

  # Base name for all outputs, and the number of modes
  basename = var.name
  nmodes = eof.shape[0]

  # Restore the full pattern / timeseries shapes
  eof = eof.reshape((nmodes,) + space.shape)
  pc = pc.reshape((nmodes,) + time.shape)

  # Sign convention: the first element of every EOF is non-negative.
  # All signs are determined before any array is modified.
  flips = []
  for k in range(nmodes):
    first = eof.reshape(nmodes, -1)[k, 0]
    flips.append(-1 if first < 0 else 1)
  for k, flip in enumerate(flips):
    eof[k, ...] *= flip
    pc[k, ...] *= flip

  # Take fresh copies, so that if the inputs are views on much larger
  # arrays the unused remainder can be garbage collected.
  eof, pc, eig = np.array(eof), np.array(pc), np.array(eig)

  # Wrap everything as Vars sharing a common "order" (mode number) axis
  orderaxis = order(nmodes, name="order")
  eof = Var((orderaxis,) + space.axes, values=eof)
  eig = Var([orderaxis], values=eig)
  pc = Var((orderaxis,) + time.axes, values=pc)

  # Take the area weighting back out of the patterns
  eof = remove_weights(eof, weight=weight).load()

  eof.name = basename + "_EOF"
  pc.name = basename + "_timeseries"
  eig.name = basename + "_eigenvalues"

  # Derived quantities: fraction of total variance, and the squared
  # eigenvalues (eigenvalues of the covariance matrix).
  # NOTE(review): eig and eig2 end up with the same name - confirm intended.
  frac = ((eig**2) / variance).load()
  frac.name = basename + "_fraction_of_variance"
  eig2 = (eig**2).load()
  eig2.name = basename + "_eigenvalues"

  # Everything that can be requested, keyed by output code
  available = dict(eof=eof, eig=eig, eig2=eig2, pc=pc, var=variance, frac=frac)
  return [available[key] for key in whichout(out)]
Exemplo n.º 2
0
def test_underscores_in_axes():
  """Select along custom ZAxis subclasses by keyword argument.

  Exercises keyword selection both for a class name without underscores and
  for one with underscores (the case the original test flagged as failing).
  """
  from pygeode.axis import ZAxis
  from pygeode.var import Var

  # Class name without underscores
  class HeightwrtGround(ZAxis): pass
  plain_axis = HeightwrtGround(values=[1.5])
  plain_var = Var(axes=[plain_axis], name='blah', values=[123.45])
  plain_var(heightwrtground=0.0)

  # Class name with underscores
  class Height_wrt_Ground(ZAxis): pass
  underscored_axis = Height_wrt_Ground(values=[1.5])
  underscored_var = Var(axes=[underscored_axis], name='blah', values=[123.45])
  underscored_var(height_wrt_ground=0.0)
Exemplo n.º 3
0
def test_issue061():
  """Attributes that differ between operands are dropped by ufunc operators."""
  from pygeode.tutorial import t1
  from pygeode.var import Var

  def make(standard_name):
    # Same units everywhere; only the standard_name differs between vars.
    atts = dict(standard_name=standard_name, units='deg C')
    return Var(t1.axes, atts=atts, dtype=float)

  x, y, z = make('x'), make('y'), make('z')

  # Combine the three variables through a ufunc operator
  product = x*y*z

  # Only the attribute common to all operands may survive
  assert product.atts == {'units': 'deg C'}
Exemplo n.º 4
0
def decode_string_var (var):
  """Collapse a character-array variable into a variable of strings.

  Each item obtained by iterating ``var.get()`` is joined into a single byte
  string and decoded.  The result is defined on all axes of ``var`` except
  the last one, and a trailing '_name' is stripped from the variable name.
  """
  from pygeode.var import Var

  suffix = '_name'
  name = var.name
  if name.endswith(suffix):
    name = name[:-len(suffix)]

  # Join the individual characters, then decode the resulting byte strings
  empty = ''.encode('ascii')
  joined = [empty.join(chars) for chars in var.get()]
  texts = [str(raw.decode()) for raw in joined]
  return Var(axes=var.axes[:-1], values=texts, name=name)
Exemplo n.º 5
0
 def setUp(self):
     """Create ``self.var``: a (time, lat, lon) field filled with ones."""
     ntime, nlat, nlon = 365, 180, 360
     # (random data, np.random.rand(365,180,360), was used here previously)
     field = np.ones((ntime, nlat, nlon))
     time_axis = ModelTime365(startdate=dict(year=2000, month=1),
                              values=np.arange(ntime),
                              units='days')
     lon_axis = Lon(values=np.arange(nlon))
     lat_axis = Lat(values=np.arange(nlat) - 89.5)
     self.var = Var(axes=[time_axis, lat_axis, lon_axis], values=field)
Exemplo n.º 6
0
def test_issue005():
    """Round-trip time axes through netcdf, with and without a 'year' field.

    Writes and re-reads "issue005_test.nc" in the current directory.
    """
    from pygeode.timeaxis import ModelTime365
    from pygeode.axis import TAxis
    import numpy as np
    from pygeode.var import Var
    from pygeode.formats import netcdf as nc
    from pygeode import timeutils

    path = "issue005_test.nc"

    # Time axis starting at year 0
    taxis = ModelTime365(values=10200,
                         startdate=dict(year=0, month=1, day=1),
                         units='days')

    # Dummy data along that axis (seeded from the axis length)
    npoints = len(taxis)
    np.random.seed(npoints)
    var = Var(axes=[taxis], values=np.random.randn(npoints), name='x')

    # Write it out and read it back
    nc.save(path, var)
    f = nc.open(path)

    # The axis must come back as a regular time axis, not a climatology
    assert f.time.__class__ == ModelTime365
    assert hasattr(f.time, 'year')

    # Re-open, this time overriding the incoming axis through dimtypes
    f = nc.open(path, dimtypes=dict(time=TAxis(taxis.values)))
    assert f.x.axes[0].__class__ == TAxis

    # Climatologies (no 'year' field) must still be produced when asked for
    taxis = timeutils.modify(taxis, exclude='year', uniquify=True)
    var = Var(axes=[taxis], values=np.random.randn(len(taxis)), name='c')
    nc.save(path, var)
    f = nc.open(path)

    assert not hasattr(f.time, 'year')
Exemplo n.º 7
0
def test_auxarray_var_arg():
    """Hybrid accepts its A/B auxiliary arrays as plain arrays or as Vars."""
    from pygeode.axis import ZAxis, Hybrid
    from pygeode.var import Var
    import numpy as np

    def levels():
        # A fresh list of level values for every constructor call
        return list(range(10))

    def check(a, b):
        eta = Hybrid(values=levels(), A=a, B=b)
        assert 'A' in eta.auxarrays and 'B' in eta.auxarrays

    zaxis = ZAxis(levels())
    a_coef = [0., 10., 20., 30., 40., 50., 50., 50., 50., 40.]
    b_coef = np.linspace(0, 1, 10)

    # First with plain arrays ...
    check(a_coef, b_coef)

    # ... then with the same data wrapped as Var objects
    check(Var(axes=[zaxis], values=a_coef), Var(axes=[zaxis], values=b_coef))
Exemplo n.º 8
0
def test_issue010():
  """Run dims2axes on a dataset whose 'bnds' dimension has no variable."""
  from pygeode.var import Var
  from pygeode.axis import Axis
  from pygeode.dataset import Dataset
  from pygeode.formats import netcdf as nc

  # Generic axes standing in for raw netcdf dimensions
  time_axis = Axis(values=[0], name='time')
  bnds_axis = Axis(values=[0,1], name='bnds')

  # There is a 'time' variable, but deliberately no 'bnds' variable to go
  # with the 'bnds' dimension.
  members = [
    Var(axes=[time_axis], values=[1], name='time'),
    Var(axes=[time_axis,bnds_axis], values=[[3,4]], name='time_bnds'),
  ]

  # Manually apply dims2axes to detect the axes
  dataset = nc.dims2axes(Dataset(members))
Exemplo n.º 9
0
def test_issue036():
    """Plotting a variable must leave the values of its Height axis intact."""
    import numpy as np
    from pygeode.axis import Height
    from pygeode.var import Var
    from pygeode.plot import plotvar

    npoints = 10
    height = Height(np.arange(npoints))
    plotvar(Var([height], values=np.zeros(npoints)))

    # The axis values must be what they were before plotting
    unchanged = (height.values == np.arange(npoints))
    assert np.all(unchanged)
Exemplo n.º 10
0
def do_concat (shape1, shape2, iaxis, count=[0]):
  """Build one concatenation test case and register it as a module global.

  Two random arrays of shapes ``shape1`` and ``shape2`` are wrapped as Vars
  that share every axis except number ``iaxis``, concatenated, and compared
  (through ``varTest``) against ``np.concatenate`` along ``iaxis``.

  ``count`` is a deliberately shared mutable default: it is the running test
  counter used for both the test name and the random seed.
  """
  from pygeode.axis import XAxis, YAxis, ZAxis, TAxis
  from pygeode.var import Var
  from pygeode.concat import concat
  from var_test import varTest
  import numpy as np

  # Advance the counter; it provides a unique test name and the seed
  count[0] += 1
  serial = count[0]
  testname = "concattest%05d"%(serial)

  np.random.seed(serial)
  first = np.random.randn(*shape1)
  second = np.random.randn(*shape2)

  # One distinct axis class per dimension
  assert first.ndim == second.ndim
  axis_classes = (XAxis, YAxis, ZAxis, TAxis)[:first.ndim]

  # First var: axes sized after the first array
  axes = []
  for axis_cls, length in zip(axis_classes, first.shape):
    axes.append(axis_cls(length))
  var1 = Var(axes = axes, values = first, name = "myvar", atts={'a':1, 'b':2, 'c':3})

  # Second var: identical axes, except that the concatenation axis carries
  # on where the first one stopped.
  start = first.shape[iaxis]
  stop = start + second.shape[iaxis]
  axes[iaxis] = axis_classes[iaxis](np.arange(start, stop))
  var2 = Var(axes = axes, values = second, name = "myvar", atts={'a':1, 'b':3, 'd':4})

  combined = concat(var1,var2)
  expected = np.concatenate((first, second), iaxis)

  # Publish the test under its generated name
  globals()[testname] = varTest(testname=testname, var=combined, values=expected)
Exemplo n.º 11
0
def test_issue028():
    """Scaling a Var by a complex number must give a complex128 result."""
    from pygeode.var import Var
    from pygeode.axis import Lat

    # Single-point mock variable
    var = Var(axes=[Lat([-45.])], values=[123.])

    scaled = var * 1j
    assert scaled.dtype.name == 'complex128'
Exemplo n.º 12
0
def test_slice():
    """Evaluate a Var sliced with lists of (partly negative) integer indices."""
    from pygeode.axis import XAxis, YAxis
    from pygeode.var import Var
    import numpy as np

    size = 10
    field = np.zeros([size, size])
    var = Var(axes=[XAxis(np.arange(size)), YAxis(np.arange(size))],
              values=field)

    # (an earlier variant used the x selection for both dimensions)
    x_sel = [7, -2, -4, 9, 4, 0]
    y_sel = [-1, 0, 4]
    var.slice[x_sel, y_sel].get()
Exemplo n.º 13
0
    def getview(self, view, pbar):
        """Compute the linear-fit coefficients of ``self.var`` against time.

        Means of t, f, t*f and t**2 are accumulated over the time axis
        (``self.ti``), with t taken from ``taxis.reltime(units='seconds')``.
        Coefficient index 0 receives ``(<t^2><f> - <t><tf>) / (<t^2> - <t>^2)``
        and index 1 receives ``(<tf> - <t><f>) / (<t^2> - <t>^2)``.

        Returns an array of shape ``view.shape``.
        """
        from pygeode.tools import partial_sum
        import numpy as np
        from pygeode.axis import Coef
        from pygeode.var import Var

        ti = self.ti
        taxis = self.var.axes[ti]
        # Time in seconds, wrapped as a Var so that it can be looped over
        # together with the data.
        secs = Var([taxis], values=taxis.reltime(units='seconds'))

        # The requested view, disregarding the coefficient axis
        cview = view.remove(Coef)
        shape = cview.shape

        def accumulator():
            # (running total, number of contributions)
            return np.zeros(shape, self.dtype), np.zeros(shape, int)

        mean_t, n_t = accumulator()
        mean_f, n_f = accumulator()
        mean_tf, n_tf = accumulator()
        mean_tt, n_tt = accumulator()

        for slices, (data, t), bins in loopover([self.var, secs], cview, pbar):
            partial_sum(data, slices, mean_f, n_f, ti, bins)
            partial_sum(t, slices, mean_t, n_t, ti, bins)
            partial_sum(data * t, slices, mean_tf, n_tf, ti, bins)
            partial_sum(t**2, slices, mean_tt, n_tt, ti, bins)

        # Turn the running totals into means
        mean_f /= n_f
        mean_t /= n_t
        mean_tf /= n_tf
        mean_tt /= n_tt

        # Un-normalized coefficients; the common denominator is applied last
        first_order = mean_tf - mean_t * mean_f
        zeroth_order = mean_tt * mean_f - mean_t * mean_tf

        # Which coefficients were actually requested
        requested = view.integer_indices[view.index(Coef)]

        out = np.empty(view.shape, self.dtype)

        # Put both coefficients into one array.
        # NOTE(review): the indexing assumes the coefficient axis is the
        # last axis of the output - confirm.
        out[..., np.where(requested == 0)[0]] = zeroth_order[..., None]
        out[..., np.where(requested == 1)[0]] = first_order[..., None]

        out /= (mean_tt - mean_t**2)[..., None]

        return out
Exemplo n.º 14
0
def make_var():
    """Return a test variable named "dummy" on (time, station) axes.

    The values are consecutive integers starting at 0, laid out row by row.
    """
    from pygeode.var import Var
    from pygeode.timeaxis import StandardTime
    import numpy as np

    time = StandardTime(startdate=dict(year=2009, month=1, day=1),
                        values=list(range(10)),
                        units='days')
    station = make_station_axis()

    ntime, nstation = len(time), len(station)
    values = np.arange(ntime * nstation).reshape(ntime, nstation)
    return Var([time, station], values=values, name="dummy")
Exemplo n.º 15
0
  def setUp(self):
    """Create the source grid, sample fields and target coordinates."""
    # Source grid with linear coordinates
    xaxis = XAxis(np.array([0,1,2]))
    yaxis = YAxis(np.array([0,1,2,3]))
    self.x = xaxis
    self.y = yaxis

    # Simple data with a hole (NaN) in it, and the data used together with
    # the non-monotonic coordinate field below
    self.data    = np.array([[1,2,3],[4,float('nan'),6],[7,8,9],[10,11,12]])
    self.data_nm = np.array([[1,2,3],[4,5,6],[7,11,9],[10,8,12]])
    self.var = Var(axes=[yaxis,xaxis], values=self.data)

    # Interpolation targets.
    # Mid-points (including values just outside the range)
    self.x2 = XAxis(values=[-0.5,0.5,1.5,2.5])
    self.y2 = YAxis(values=[-0.5,0.5,1.5,2.5,3.5])
    # The original values, reversed
    self.x3 = XAxis(values=[2,1,0])
    self.y3 = YAxis(values=[3,2,1,0])
    # The mid-points, reversed
    self.x4 = XAxis(values=[2.5,1.5,0.5,-0.5])
    self.y4 = YAxis(values=[3.5,2.5,1.5,0.5,-0.5])
    # A single point that is not a mid-point
    self.x5 = XAxis(values=[1.4])
    self.y5 = YAxis(values=[2.2])

    # Non-monotonic coordinate field
    self.y_nm = Var(axes=[yaxis,xaxis], values=[[0,0,0], [1,1,2], [2,2,1], [3,3,3]])

    # Coordinate fields for 2D interpolation
    xcoords = np.array([[-1,0,1],[0,1,2],[1,2,3],[0,1,2]])
    ycoords = np.array([[-1,0,1,2],[0,1,2,3],[1,2,3,4]]).transpose()
    self.xfield = Var(axes=[yaxis,xaxis], values=xcoords)
    self.yfield = Var(axes=[yaxis,xaxis], values=ycoords)
    self.x6 = XAxis(values=[-1,0,1,2,3])
Exemplo n.º 16
0
def encode_string_var (var):
  """Expand a variable of strings into a single-character ('|S1') array.

  The result has one extra trailing axis (a DummyAxis named
  ``<name>_strlen``, as long as the longest string) and is named
  ``<name>_name``.
  """
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import DummyAxis

  strings = var.values

  # Width of the character dimension: the longest string
  width = max(map(len, strings))
  #TODO: make this a simple dimension (no coordinate values needed!)
  strlen_axis = DummyAxis (width, name=var.name+"_strlen")

  # Character array with the extra trailing dimension
  chars = np.zeros(list(var.shape)+[width], dtype='|S1')

  # Viewing the trailing dimension as one fixed-width string per element
  # lets the whole array be populated straight from the strings.
  chars.view('|S'+str(width))[...,0] = strings

  return Var(list(var.axes)+[strlen_axis], values=chars, name=var.name+"_name")
Exemplo n.º 17
0
def test_issue025():
  """A variable name starting with a digit must be filtered when saved.

  Writes and re-reads "issue025_test.nc" in the current directory.
  """
  path = "issue025_test.nc"
  var = Var(axes=[Lat([80,70,60])], values=[1,2,3], name='2B')

  # Saving may crash in some versions of the netcdf library; even when it
  # does not, legal netcdf names should be enforced.
  nc.save (path, var)

  stored = nc.open(path).vars

  assert len(stored) == 1
  # The stored name must not start with a digit
  first_char = stored[0].name[0]
  assert not first_char.isdigit()
Exemplo n.º 18
0
    def auxasvar(self, name):
        # {{{
        ''' Wraps one auxiliary array in a new :class:`Var` object.

        Parameters
        ==========
        name : string
            Key of the auxiliary array (in ``self.auxarrays``) to wrap

        Returns
        =======
        var : :class:`Var`
            Variable called ``name``, defined along this object as its only
            axis and holding the values of the requested auxiliary array

        See Also
        ========
        auxarrays
    '''
        from pygeode.var import Var
        aux_values = self.auxarrays[name]
        return Var([self], values=aux_values, name=name)
Exemplo n.º 19
0
def override_values(dataset, value_override):
    # {{{
    """Return ``dataset`` with the values of selected variables replaced.

    ``value_override`` maps variable names to new values.  Names missing
    from the dataset are skipped with a warning; new values must have the
    same shape as the variable they replace.  Replacement variables keep the
    original axes, and metadata is carried over with ``copy_meta``.
    """
    from warnings import warn
    import numpy as np
    from pygeode.var import Var, copy_meta

    replacements = {}
    for name, values in value_override.items():
        if name in dataset:
            new_values = np.asarray(values)
            original = dataset[name]
            assert new_values.shape == original.shape, "bad shape for '%s'.  Expected %s, got %s" % (
                name, original.shape, new_values.shape)
            replacement = Var(original.axes, values=new_values)
            copy_meta(original, replacement)
            replacements[name] = replacement
        else:
            warn("var '%s' not found - values not overridden" % name,
                 stacklevel=3)
    return dataset.replace_vars(replacements)
Exemplo n.º 20
0
    def __init__(self, taxis, coef=None, A=None, B=None):
        """Set up a variable defined on ``taxis`` from fit coefficients.

        Either ``coef`` or both ``A`` and ``B`` must be given (never a mix);
        ``A`` and ``B`` are combined with ``merge_coefs(B, A)``.  The new
        variable has the axes of the coefficients, with the time axis
        replaced by ``taxis`` and the 'coef' axis removed, and dtype
        'float64'.
        """
        from pygeode.tools import merge_coefs
        from pygeode.var import Var
        from pygeode.timeaxis import Time
        from pygeode.timeutils import modify

        # Obtain the coefficients
        if coef is not None:
            assert A is None and B is None
        else:
            assert A is not None and B is not None
            coef = merge_coefs(B, A)

        # Drop a 'year' field that only ever takes one value.  This is an
        # artifact of reading climatological data from a file which
        # can't/doesn't specify that it is a climatology.
        coeft = coef.getaxis(Time)
        if hasattr(coeft, 'year'):
            import numpy as np
            from warnings import warn
            distinct_years = np.unique(coeft.year.values)
            if len(distinct_years) == 1:
                warn("ignoring degenerate 'year' field", stacklevel=2)
                coef = coef.replace_axes({Time: modify(coeft, exclude='year')})

        self.coef = coef
        self.secs = Var([taxis], values=taxis.reltime(units='seconds'))
        ti = coef.whichaxis(Time)
        ci = coef.whichaxis('coef')
        self.ti = ti
        self.ci = ci
        self.caxis = coef.axes[ci]

        # Output axes: the requested time axis in place of the coefficients'
        # own one, and no coefficient axis.
        out_axes = list(coef.axes)
        assert out_axes[ti].map_to(taxis) is not None, (
            "the given time axis is not compatible with the coefficients.\n" +
            "time axis: %s\n coefficient time axis: %s" %
            (repr(str(taxis)), repr(str(out_axes[ti]))))
        out_axes[ti] = taxis
        out_axes = out_axes[:ci] + out_axes[ci + 1:]

        # float64 (rather than coef.dtype) because secs is float64
        Var.__init__(self, out_axes, dtype='float64')
Exemplo n.º 21
0
def paired_difference(X,
                      Y,
                      axes=None,
                      alpha=0.05,
                      N_fac=None,
                      output='d,p,ci',
                      pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  Raises
  ======
  ValueError
    If ``output`` requests none of the supported quantities.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    _, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # Documented default: reduce over every axis common to X and Y.
        # The shared axes are referred to by name so that they can be looked
        # up on X and Y individually below.
        # NOTE(review): assumes axis names identify the shared axes uniquely.
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within X and Y, and the number of
    # elements each variable has along them
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays (NaN marks "nothing accumulated yet"; np.nansum
    # below treats it as zero)
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y],
                                          oview,
                                          inaxes=srcaxes,
                                          pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    # Effective sample size
    if N_fac is not None: eN = N // N_fac
    else: eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Standard error of the mean difference.
    # NOTE(review): this divides by (eN - 1) rather than eN - confirm
    # against eq. (6.21) of von Storch and Zwiers.
    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    # NOTE(review): where the test cannot be evaluated (~dmsk) p and ci keep
    # their initial value of 0, whereas `difference` reports NaN there -
    # confirm which is intended.
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # Operand order matches the computed quantity (X - Y)
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
Exemplo n.º 22
0
def difference(X,
               Y,
               axes=None,
               alpha=0.05,
               Nx_fac=None,
               Ny_fac=None,
               output='d,p,ci',
               pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional, defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

    Where the test cannot be evaluated, 'df', 'p' and 'ci' are NaN.

  Raises
  ======
  ValueError
    If ``output`` requests none of the supported quantities.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    _, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # Documented default: reduce over every axis common to X and Y.
        # The shared axes are referred to by name so that they can be looked
        # up on X and Y individually below.
        # NOTE(review): assumes axis names identify the shared axes uniquely.
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within X and Y
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    # Number of elements along the reduction axes (kept separate from the
    # per-point sample-count arrays Nx / Ny below)
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (NaN marks "nothing accumulated yet"; np.nansum
    # below treats it as zero)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes
    if Nx_fac is not None: eNx = Nx // Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny // Ny_fac
    else: eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Degrees of freedom: (xx + yy)**2 / den, with den as computed here
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    # den is now reused for the standard error of the difference
    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))

    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    # Flag the points where the test could not be evaluated
    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # Operand order matches the computed quantity (X - Y)
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
Exemplo n.º 23
0
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : sequence of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``
    (integer division).

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma
    (surrounding whitespace is ignored). Defaults to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a :class:`PBar` instance
    is created.

  Returns
  =======
  results : :class:`Dataset`
    The variables in the dataset are specified by the ``output`` argument
    (e.g. if ``ds`` is the returned dataset, the linear coefficients of the
    regression can be obtained by ``ds.beta``).

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'beta' (or its alias 'B'): Linear coefficients :math:`\beta_i` of each
      regressor; stored in the variable named ``beta``
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`);
      stored in the variable named ``R2``
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'beta', 'p', and 'sb' carry an extra trailing 'regressor' axis
    with one entry per regressor; 'covb' carries two such axes.

  Raises
  ======
  ValueError
    If ``output`` contains none of the recognized output names.
  KeyError
    If an axis requested in ``axes`` is not shared by the Xs and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)^{-1}` in von Storch
  and Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now. The documented (and default) name 'B' is
    # accepted as an alias of 'beta'; without this the default request
    # 'B,p' would silently drop the regression coefficients.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    requested = [o.strip() for o in output.split(',')]
    output = [aliases.get(o, o) for o in requested]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple regression. Possible outputs are %s.'
            % str(ovars))

    Xs = list(Xs)
    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    # Output axes first, reduction axes last
    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Positions of the reduction axes within each loaded chunk
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr, )
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')      # sum of Y
    yy = np.zeros(oshape, 'd')     # sum of Y^2
    xy = np.zeros(oshape1, 'd')    # sum of X_i * Y
    xx = np.zeros(oshape2, 'd')    # sum of X_i * X_j

    # Number of samples along the reduction axes
    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; the matrix is symmetric
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx at every output point; np.linalg.inv operates on the
    # trailing two dimensions of a stacked array.
    xxinv = np.linalg.inv(xx)

    # beta_i = sum_j xxinv[i, j] * xy[j]
    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    # Sum of squares explained by the fit
    vare = np.sum(xy * beta, -1)

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    # Standard deviation of each coefficient (one array per regressor)
    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        # Stack along a new trailing axis so the shape is oshape + (Nr,),
        # matching the axes (oaxes + regressor) for any number of output axes.
        p = np.stack(p, axis=-1)
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sbvals = np.stack(sigbeta, axis=-1)
        sb = Var(oaxes + (ra, ), values=sbvals, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(oshape2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
Exemplo n.º 24
0
def encode_cf (dataset):
  """
  Return a copy of ``dataset`` with CF-style metadata attached.

  Variable names are passed through ``fix_name`` (a warning is issued for
  each rename), the global 'Conventions' attribute is set to "CF-1.0", and
  every axis is replaced by a generic axis carrying attributes derived from
  its class (standard_name, units, calendar, axis, ...):

  * Time axes become a NamedAxis holding values relative to the axis
    start date, with a '<units> since <startdate>' units attribute.
  * NonCoordinateAxis axes become a DummyAxis; their auxiliary arrays are
    emitted as separate variables and referenced from the 'coordinates'
    ('lat'/'lon') and 'ancillary_variables' attributes of every variable
    using that axis.
  * Any other axis becomes a NamedAxis; its auxiliary arrays are emitted as
    variables named '<axis>_<aux>'.

  Parameters
  ----------
  dataset : anything accepted by ``asdataset``

  Returns
  -------
  Dataset
    A new dataset containing the re-axed variables (plus any extra
    variables created from auxiliary arrays) and the updated global
    attributes.
  """
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  from warnings import warn
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor

    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Names of the arrays that should be treated as "coordinates".
      coord_names = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coord_names.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coord_names.append('lon')

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_names = []
      for auxname in list(a.auxarrays.keys()):
        # Membership is tested against the list of names (not the joined
        # attribute string), so that e.g. an array called 'la' or 'on' is
        # not mistaken for one of the coordinates handled above.
        if auxname in coord_names: continue
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_names.append(auxname)

      # Space-separated attribute values
      coordinates = " ".join(coord_names)
      ancillary_variables = " ".join(ancillary_names)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  # (axes without an entry in axisdict are kept as they are)
  for i,oldvar in enumerate(list(varlist)):
    #TODO: use Var.replace_axes instead?
    newaxes = [axisdict.get(ax.name,ax) for ax in oldvar.axes]
    varlist[i] = var_newaxes(oldvar, newaxes, atts=oldvar.atts, plotatts=oldvar.plotatts)

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
Exemplo n.º 25
0
# Regression script for Issue 114
# https://github.com/pygeode/pygeode/issues/114
#
# Writes the same dataset under two group names to a netCDF4 file with CF
# metadata enabled, reads the file back, and checks one of the variables.

from pygeode.formats import netcdf4 as nc
from pygeode.axis import Lat
from pygeode.var import Var
from pygeode.dataset import Dataset

# A single variable 'A' defined on a three-point latitude axis.
lat = Lat([80,70,60])
var = Var(axes=[lat], values=[1,2,3], name='A')
dataset = Dataset([var])

# The same dataset is stored under two different group names.
dataset_groups = {'Group 1': dataset, 'Group 2': dataset}

# Save the groups (with CF metadata encoding switched on).
nc.save('issue114_test.nc', dataset_groups, cfmeta=True)

# Read in the file again (with CF metadata decoding switched on).

dataset_groups_read = nc.open('issue114_test.nc',cfmeta=True)

# Check that the variables are the same.
# NOTE(review): this relies on the truthiness of `Var == Var`; confirm that
# the comparison actually checks the values rather than object identity.
assert (dataset_groups['Group 1'].A == dataset_groups_read['Group 1'].A)


Exemplo n.º 26
0
def SVD(var1,
        var2,
        num=1,
        subspace=-1,
        iaxis=Time,
        weight1=True,
        weight2=True,
        matrix='cov'):
    """
  Finds coupled EOFs of two fields.  Note that the mean/trend/etc. is NOT
  removed in this routine.

  Parameters
  ----------
  var1, var2 : :class:`Var`
    The variables to analyse.

  num : integer
    The number of EOFs to compute (default is ``1``).

  subspace : integer, optional
    The number of patterns carried through the iteration.  If zero or
    negative (the default), ``2*num + 8`` is used; values smaller than
    ``num`` are raised to ``num``.  Only the leading ``num`` patterns are
    tested for convergence and returned.

  iaxis : Axis identifier
    The principal component / expansion coefficient axis, i.e., the 'time'
    axis.  It is resolved on each variable with :meth:`Var.whichaxis`, and
    the resulting axes of var1 and var2 must be equal.  Defaults to the
    ``Time`` axis class.

  weight1, weight2 : optional
    Weights to use for defining orthogonality in the var1, var2 domains,
    respectively.  Patterns X and Y in the var1 domain are orthogonal if the
    sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
    orthogonal if the sum over Z*W*weights2 is 0. Default (``True``) is to use
    the internal weights of var1 and var2 respectively, as returned by
    :meth:`Var.getweights()`. If set to ``False`` no weighting is used.
    Weights must not have the ``iaxis`` axis.

  matrix : string, optional ['cov']
    Which matrix we are diagonalizing (default is 'cov').
     * 'cov' (or 'covariance'): covariance matrix of var1 & var2
     * 'cor' (or 'corr', 'correlation'): correlation matrix of var1 & var2
    Any other value triggers a warning and falls back to 'cov'.

  Returns
  -------
  (eof1, pc1, eof2, pc2): tuple
    * eof1: The coupled eof patterns for var1.
    * pc1: The principal component / expansion coefficients for var1.
    * eof2: The coupled eof patterns for var2.
    * pc2: The principal component / expansion coefficients for var2.

  Raises
  ------
  AssertionError
    If the weights have the iteration axis, the iteration axes of the two
    variables differ, a field is too large, or the iteration does not
    converge within the maximum number of iterations.

  Notes
  -----
    Multiple orders of EOFs are concatenated along an 'order' axis.
  """
    import numpy as np
    from pygeode.timeaxis import Time
    from pygeode.var import Var
    from pygeode.view import View
    from pygeode import MAX_ARRAY_SIZE
    from warnings import warn
    from pygeode import svdcore as lib

    if matrix in ('cov', 'covariance'): matrix = 'cov'
    elif matrix in ('cor', 'corr', 'correlation'): matrix = 'cor'
    else:
        warn("invalid matrix type '%s'.  Defaulting to covariance." % matrix,
             stacklevel=2)
        matrix = 'cov'

    MAX_ITER = 1000

    # Iterate over more EOFs than we need
    # (this helps with convergence)
    # TODO: a more rigorous formula for the optimum number of EOFs to use
    if subspace <= 0: subspace = 2 * num + 8
    if subspace < num: subspace = num  # Just in case

    # Remember the names
    prefix1 = var1.name + '_' if var1.name != '' else ''
    prefix2 = var2.name + '_' if var2.name != '' else ''

    # Apply weights?
    if weight1 is True: weight1 = var1.getweights()
    if weight1 is not False:
        assert not weight1.hasaxis(
            iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight1.sum() / weight1.size
        weight1 /= W
        # Apply the weights
        var1 *= weight1.sqrt()
    if weight2 is True: weight2 = var2.getweights()
    if weight2 is not False:
        assert not weight2.hasaxis(
            iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight2.sum() / weight2.size
        weight2 /= W
        # Apply the weights
        var2 *= weight2.sqrt()

    #TODO: allow multiple iteration axes (i.e., time and ensemble)
    iaxis1 = var1.whichaxis(iaxis)
    iaxis2 = var2.whichaxis(iaxis)

    assert var1.axes[iaxis1] == var2.axes[
        iaxis2], "incompatible iteration axes"
    del iaxis  # so we don't use this by accident

    # Special case: can load entire variable in memory
    # This will save some time, especially if the field is stored on disk, or is heavily derived
    if var1.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var1))
        var1 = var1.load()
    if var2.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var2))
        var2 = var2.load()

    # Use correlation instead of covariance?
    # (normalize by standard deviation)
    if matrix == 'cor':
        print('computing standard deviations')
        std1 = var1.stdev(iaxis1).load()
        std2 = var2.stdev(iaxis2).load()
        # account for grid points with zero standard deviation?
        # (a zero is replaced by one, so the division below leaves it alone)
        std1.values = std1.values + (std1.values == 0)
        std2.values = std2.values + (std2.values == 0)
        var1 /= std1
        var2 /= std2

    eofshape1 = (subspace, ) + var1.shape[:iaxis1] + var1.shape[iaxis1 + 1:]
    eofshape2 = (subspace, ) + var2.shape[:iaxis2] + var2.shape[iaxis2 + 1:]

    pcshape1 = (var1.shape[iaxis1], subspace)
    pcshape2 = (var2.shape[iaxis2], subspace)

    # number of spatial grid points
    NX1 = var1.size // var1.shape[iaxis1]
    assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
    NX2 = var2.size // var2.shape[iaxis2]
    assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'

    # Total number of timesteps
    NT = var1.shape[iaxis1]
    # Number of timesteps we can do in one fetch
    dt = MAX_ARRAY_SIZE // max(NX1, NX2)

    pcs1 = np.empty(pcshape1, dtype='d')
    pcs2 = np.empty(pcshape2, dtype='d')

    X = np.empty(eofshape2, dtype='d')
    U = np.empty(eofshape1, dtype='d')
    # Seed with sinusoids superimposed on random values
    Y = np.random.rand(*eofshape1)
    V = np.random.rand(*eofshape2)
    from math import pi
    for i in range(subspace):
        Y[i, ...].reshape(NX1)[:] += np.cos(
            np.arange(NX1, dtype='d') / NX1 * 2 * pi * (i + 1))
        V[i, ...].reshape(NX2)[:] += np.cos(
            np.arange(NX2, dtype='d') / NX2 * 2 * pi * (i + 1))

    # Workspace for C code
    UtAX = np.empty([subspace, subspace], dtype='d')
    XtAtU = np.empty([subspace, subspace], dtype='d')
    VtV = np.empty([subspace, subspace], dtype='d')
    YtY = np.empty([subspace, subspace], dtype='d')

    # Views over whole variables
    # (rearranged to be compatible with our output eof arrays)
    view1 = View((var1.axes[iaxis1], ) + var1.axes[:iaxis1] +
                 var1.axes[iaxis1 + 1:])
    view2 = View((var2.axes[iaxis2], ) + var2.axes[:iaxis2] +
                 var2.axes[iaxis2 + 1:])

    for iter_num in range(1, MAX_ITER + 1):

        print('iter_num: %d' % iter_num)

        # The previous estimates (Y, V) become the inputs (U, X) of this pass
        assert Y.shape == U.shape
        assert X.shape == V.shape
        U, Y = Y, U
        X, V = V, X

        # Reset the accumulation arrays for the next approximations
        Y[()] = 0
        V[()] = 0

        # Apply the covariance/correlation matrix
        for t in range(0, NT, dt):
            # number of timesteps we actually have
            nt = min(dt, NT - t)

            # Read the data
            chunk1 = view1.modify_slice(0, slice(t, t + nt)).get(var1)
            chunk1 = np.ascontiguousarray(chunk1, dtype='d')
            chunk2 = view2.modify_slice(0, slice(t, t + nt)).get(var2)
            chunk2 = np.ascontiguousarray(chunk2, dtype='d')

            ier = lib.build_svds(subspace, nt, NX1, NX2, chunk1, chunk2, X, Y,
                                 pcs2[t, ...])
            assert ier == 0
            ier = lib.build_svds(subspace, nt, NX2, NX1, chunk2, chunk1, U, V,
                                 pcs1[t, ...])
            assert ier == 0

        # Useful dot products
        lib.dot(subspace, NX1, U, Y, UtAX)
        lib.dot(subspace, NX2, V, V, VtV)
        lib.dot(subspace, NX1, Y, U, XtAtU)
        lib.dot(subspace, NX1, Y, Y, YtY)

        # Compute surrogate matrices (using all available information from this iteration)
        A1, residues, rank, s = np.linalg.lstsq(UtAX, VtV, rcond=1e-30)
        A2, residues, rank, s = np.linalg.lstsq(XtAtU, YtY, rcond=1e-30)

        # Eigendecomposition on surrogate matrices
        Dy, Qy = np.linalg.eig(np.dot(A1, A2))
        Dv, Qv = np.linalg.eig(np.dot(A2, A1))

        # Sort by eigenvalue (largest first)
        S = np.argsort(np.real(Dy))[::-1]
        Dy = Dy[S]
        Qy = np.ascontiguousarray(Qy[:, S], dtype='d')
        S = np.argsort(np.real(Dv))[::-1]
        Dv = Dv[S]
        Qv = np.ascontiguousarray(Qv[:, S], dtype='d')

        # get estimate of true eigenvalues
        D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
        print(D)

        # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
        lib.transform(subspace, NX1, Qy, Y)
        lib.transform(subspace, NX2, Qv, V)

        # Normalize
        lib.normalize(subspace, NX1, Y)
        lib.normalize(subspace, NX2, V)

        # Converged once the leading `num` patterns stop changing
        if not np.allclose(U[:num, ...], Y[:num, ...], atol=0): continue
        if not np.allclose(X[:num, ...], V[:num, ...], atol=0): continue
        print('converged after %d iterations' % iter_num)
        break
    else:
        # Only reached when the loop ran out of iterations without a break,
        # so converging exactly on the last iteration is not an error.
        raise AssertionError("no convergence")

    # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
    lib.fixcov(subspace, NT, NX2, pcs1, pcs2, V)

    # Wrap as pygeode vars, and return
    # Only need some of the eofs for output (the rest might not have even converged yet)
    orderaxis = order(num)

    eof1 = np.array(Y[:num])
    pc1 = np.array(pcs1[..., :num]).transpose()
    eof1 = Var((orderaxis, ) + var1.axes[:iaxis1] + var1.axes[iaxis1 + 1:],
               values=eof1)
    pc1 = Var((orderaxis, var1.axes[iaxis1]), values=pc1)

    eof2 = np.array(V[:num])
    pc2 = np.array(pcs2[..., :num]).transpose()
    eof2 = Var((orderaxis, ) + var2.axes[:iaxis2] + var2.axes[iaxis2 + 1:],
               values=eof2)
    pc2 = Var((orderaxis, var2.axes[iaxis2]), values=pc2)

    # Undo the weighting applied to the input fields
    if weight1 is not False: eof1 /= weight1.sqrt()
    if weight2 is not False: eof2 /= weight2.sqrt()

    # Use correlation instead of covariance?
    # Re-scale the fields by standard deviation
    if matrix == 'cor':
        eof1 *= std1
        eof2 *= std2

    # Give it a name
    eof1.name = prefix1 + "EOF"
    pc1.name = prefix1 + "PC"
    eof2.name = prefix2 + "EOF"
    pc2.name = prefix2 + "PC"

    return eof1, pc1, eof2, pc2
Exemplo n.º 27
0
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):
    # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
  the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Significance level used for the confidence interval; the interval is
    computed from the ``1 - alpha/2`` quantile of the t-distribution.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the dataset
    divided by ``N_fac`` (integer division).

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'm,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a :class:`PBar` instance
    is created.

  Returns
  =======
  results : :class:`Dataset`
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the mean value can be obtained through ``ds.m``).
    The following quantities can be calculated.

    * 'm': The mean value of X
    * 'p': The probability of the computed value if the population mean was zero
    * 'ci': The half-width of the confidence interval of the mean at the
      level specified by alpha

    NaN values in X are ignored; the number of non-NaN samples is counted
    separately at each output point.

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`.
  The p-value and confidence interval are computed for the t-statistic defined in
  eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    # Reduce over every axis unless told otherwise
    if axes is None:
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # Total number of elements along the reduction axes
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.zeros(oview.shape, 'd')    # sum of X
    xx = np.zeros(oview.shape, 'd')   # sum of X^2
    Na = np.zeros(oview.shape, 'd')   # number of non-NaN samples

    x[()] = np.nan
    xx[()] = np.nan
    Na[()] = np.nan

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)

        # Sum of weights (kludge to get masking right)
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    # Output points that received at least one valid sample
    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    # Effective number of samples at each output point
    if N_fac is not None:
        eNa = Na // N_fac
    else:
        eNa = Na

    sdom = np.zeros((oview.shape), 'd')
    p = np.zeros((oview.shape), 'd')
    t = np.zeros((oview.shape), 'd')
    ci = np.zeros((oview.shape), 'd')

    # Standard deviation of the mean
    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    # t stays zero wherever sdom is not positive, to avoid dividing by zero
    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence intervale of the mean value of %s' % (
                name, )
        rvs.append(ci)

    return asdataset(rvs)
Exemplo n.º 28
0
def correlate(X, Y, axes=None, output='r2,p', pbar=None):
    # {{{
    r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}` (note that, despite
      the name, this is the coefficient itself and not its square)
    * 'p':  The p-value; see notes.

    Both outputs are NaN wherever the variance of X or of Y vanishes (which
    includes points with no valid data). The p-value is additionally NaN
    wherever there are fewer than three valid pairs, since the t-statistic
    then has no degrees of freedom left.

  Raises
  ======
  ValueError
    If ``output`` does not contain any recognized output name.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding a correlation
  coefficient of equal or greater magnitude (two-sided) to the given result
  under the hypothesis that the true correlation coefficient between X and Y is
  zero. It is computed from the t-statistic given in eq (8.7), in section
  8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now; unrecognized names are silently dropped
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if not output:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not explicitly requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduced axes within the (output + reduced) ordering
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays; these start as NaN and are accumulated with
    # nansum, so the first chunk simply replaces the NaN
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Tile each input up to the shape of the product so that a point
        # missing in either variable can be masked out of both
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    imsk = (Na > 0)

    # Turn the raw sums of squares and products into sums about the mean
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    # The coefficient is only defined where both variances are non-zero
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / den[dmsk]

    omr2 = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    omr2[omr2 < eps] = eps

    # The t-statistic has Na - 2 degrees of freedom; restrict its evaluation
    # to points where that is positive, so that no square root of a negative
    # number is taken and the t distribution is never given an invalid
    # parameter. The p-value is left as NaN everywhere else.
    tmsk = dmsk & (Na > 2.)

    p = np.full(oview.shape, np.nan, 'd')

    t = np.abs(rho[tmsk]) * np.sqrt((Na[tmsk] - 2.) / omr2[tmsk])
    p[tmsk] = 2. * (1. - tdist.cdf(t, Na[tmsk] - 2))

    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
Exemplo n.º 29
0
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):
    # {{{
    r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

    'm', 'b', 'p', 'sm' and 'se' are NaN wherever the effective number of
    degrees of freedom is not positive or the variance of X vanishes. 'r2' is
    NaN wherever the effective number of degrees of freedom is not positive or
    the variance of either X or Y vanishes.

  Raises
  ======
  ValueError
    If ``output`` does not contain any recognized output name.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.
  AssertionError
    If there are no axes to regress over.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now; unrecognized names are silently dropped
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if not output:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not explicitly requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduced axes within the (output + reduced) ordering
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays; these start as NaN and are accumulated with
    # nansum, so the first chunk simply replaces the NaN
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Tile each input up to the shape of the product so that a point
        # missing in either variable can be masked out of both
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    # Effective degrees of freedom of the residuals (two fitted parameters)
    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    # Turn the raw sums of squares and products into sums about the mean
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    # Every slope-related statistic needs both enough data and a non-zero
    # variance in X. They are all evaluated on this one mask; selecting with
    # one mask while assigning through another either fails with a shape
    # mismatch or puts values at the wrong points.
    msk = nmsk & dmsk

    m = np.full(oview.shape, np.nan, 'd')
    b = np.full(oview.shape, np.nan, 'd')
    r2 = np.full(oview.shape, np.nan, 'd')

    m[msk] = xy[msk] / xx[msk]
    # x and y still hold plain sums here, hence the division by Na
    b[msk] = (y[msk] - m[msk] * x[msk]) / Na[msk]

    r2den = xx * yy
    r2msk = nmsk & (r2den > 0.)

    r2[r2msk] = xy[r2msk]**2 / r2den[r2msk]

    sige = np.full(oview.shape, np.nan, 'd')
    sigm = np.full(oview.shape, np.nan, 'd')
    p = np.full(oview.shape, np.nan, 'd')

    # Variance of the residuals
    resvar = (yy[msk] - m[msk] * xy[msk]) / N_eff[msk]

    sige[msk] = np.sqrt(resvar)
    sigm[msk] = np.sqrt(resvar / xx[msk])

    # Two-sided test of the hypothesis that the slope is zero
    t = np.abs(m[msk]) / sigm[msk]
    p[msk] = 2. * (1. - tdist.cdf(t, N_eff[msk]))

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
Exemplo n.º 30
0
for naxes in (1,2):
  print("Testing %s dimensions"%naxes)
  for shape in product(*([sizes]*naxes)):
    print("  Testing shape %s"%str(shape))

    np.random.seed(shape)
    values = np.random.randn(*shape)
#    print "full values:", values

    axes = [axis_classes[i](sorted(np.random.randn(n))) for i,n in enumerate(shape)]
    for i,axis in enumerate(axes):
      axis.name = 'axis%s'%i
#      print "axis %s values: %s"%(i,axis.values)

    var = Var(axes, values=values)
    var.name = 'myvar'

    slicelists = [slices[size] for size in shape]
    print("    # tests: %d" % len(list(product(*slicelists))))
    for sl in product(*slicelists):
      print("    Testing slices %s"%repr(sl))
      # Trap any known failures here to further diagnose
#      assert (count!=4860), "shape: %s, slices: %s, values: %s, axes: %s"%(shape, str(sl), values, [a.values for a in var.axes])

      # slice the var immediately (before massaging the slices for numpy)
      slicedvar = var.slice[sl]

      # Force integer slices to be integer index arrays
      # (so numpy doesn't remove any dimensions)
      sl = tuple([i] if isinstance(i,int) else i for i in sl)