def finalize (var, time, space, eof, eig, pc, variance, weight, out):
  from pygeode.var import Var
  import numpy as np

  # Keep the name
  name = var.name

  num = eof.shape[0]

  # Conform to the proper shape
  eof = eof.reshape((num,)+space.shape)
  pc = pc.reshape((num,)+time.shape)

  # Use a consistent sign convention
  # (the first element of the eof is non-negative)
  sign = [-1 if eof.reshape(num,-1)[i,0] < 0 else 1 for i in range(num)]
  for i,s in enumerate(sign):
    eof[i,...] *= s
    pc[i,...] *= s

  # Copy the data into fresh arrays
  # (that way, if these are views on much larger arrays, the rest of
  #  the unused data can be garbage collected)
  eof = np.array(eof)
  pc = np.array(pc)
  eig = np.array(eig)

  # EOF axis
  orderaxis = order(num,name="order")
  eof = Var((orderaxis,)+space.axes, values=eof)
  eig = Var([orderaxis], values=eig)
  pc = Var((orderaxis,)+time.axes, values=pc)

  # Undo normalization by area weight?
  eof = remove_weights(eof, weight=weight).load()

  eof.name = name + "_EOF"
  pc.name = name + "_timeseries"
  eig.name = name + "_eigenvalues"

  # Other things
  # Fraction of total variance
  frac = ((eig**2) / variance).load()
  frac.name = name + "_fraction_of_variance"
  # Eigenvalues of the covariance matrix
  eig2 = (eig**2).load()
  eig2.name = name + "_eigenvalues"

  # Gather up all possible outputs
  outdict = dict(eof=eof, eig=eig, eig2=eig2, pc=pc, var=variance, frac=frac)
  out = whichout(out)
  out = [outdict[o] for o in out]

  return out
def test_underscores_in_axes():
  from pygeode.axis import ZAxis
  from pygeode.var import Var

  # Axis names without underscores work
  class HeightwrtGround(ZAxis): pass
  ax = HeightwrtGround(values=[1.5])
  x = Var(axes=[ax], name='blah', values=[123.45])
  y = x(heightwrtground=0.0)

  # Axis names with underscores used to fail
  class Height_wrt_Ground(ZAxis): pass
  ax = Height_wrt_Ground(values=[1.5])
  x = Var(axes=[ax], name='blah', values=[123.45])
  y = x(height_wrt_ground=0.0)
def test_issue061():
  from pygeode.tutorial import t1
  from pygeode.var import Var

  # Construct 3 variables, with some attributes that are identical and some
  # that are different.
  x = Var(t1.axes, atts=dict(standard_name='x', units='deg C'), dtype=float)
  y = Var(t1.axes, atts=dict(standard_name='y', units='deg C'), dtype=float)
  z = Var(t1.axes, atts=dict(standard_name='z', units='deg C'), dtype=float)

  # Apply a ufunc operator on these variables, to get a new variable
  w = x*y*z

  # Make sure the standard_name is removed.
  assert w.atts == dict(units='deg C')
def decode_string_var (var):
  from pygeode.var import Var

  # Strip the "_name" suffix added by encode_string_var
  name = var.name
  if name.endswith('_name'):
    name = name[:-5]
  # Collapse the trailing character axis back into whole strings
  data = [''.encode('ascii').join(s) for s in var.get()]
  data = [str(s.decode()) for s in data]
  return Var(axes=var.axes[:-1], values=data, name=name)
def setUp(self):
  # values = np.random.rand(365,180,360)
  values = np.ones([365, 180, 360])
  time = ModelTime365(startdate=dict(year=2000, month=1), values=np.arange(365), units='days')
  lon = Lon(values=np.arange(360))
  lat = Lat(values=np.arange(180) - 89.5)
  self.var = Var(axes=[time, lat, lon], values=values)
def test_issue005():
  from pygeode.timeaxis import ModelTime365
  from pygeode.axis import TAxis
  import numpy as np
  from pygeode.var import Var
  from pygeode.formats import netcdf as nc
  from pygeode import timeutils

  # Make a time axis starting at year 0
  startdate = dict(year=0, month=1, day=1)
  taxis = ModelTime365(values=10200, startdate=startdate, units='days')

  # Make some dummy variable
  np.random.seed(len(taxis))
  values = np.random.randn(len(taxis))
  var = Var(axes=[taxis], values=values, name='x')

  # Save it
  nc.save("issue005_test.nc", var)

  # Load it
  f = nc.open("issue005_test.nc")

  # Make sure we have a regular time axis
  # (no climatologies!)
  assert f.time.__class__ == ModelTime365
  assert hasattr(f.time, 'year')

  # Okay, now reload it, but override the axis coming in
  f = nc.open("issue005_test.nc", dimtypes=dict(time=TAxis(taxis.values)))

  # Make sure dimtypes is still working properly
  assert f.x.axes[0].__class__ == TAxis

  # For good measure, test that climatologies are still produced
  taxis = timeutils.modify(taxis, exclude='year', uniquify=True)
  values = np.random.randn(len(taxis))
  var = Var(axes=[taxis], values=values, name='c')
  nc.save("issue005_test.nc", var)
  f = nc.open("issue005_test.nc")
  assert not hasattr(f.time, 'year')
def test_auxarray_var_arg():
  from pygeode.axis import ZAxis, Hybrid
  from pygeode.var import Var
  import numpy as np

  zaxis = ZAxis(list(range(10)))
  A = [0., 10., 20., 30., 40., 50., 50., 50., 50., 40.]
  B = np.linspace(0, 1, 10)

  # Try passing A and B as arrays.
  # This should work already.
  eta = Hybrid(values=list(range(10)), A=A, B=B)
  assert 'A' in eta.auxarrays and 'B' in eta.auxarrays

  A = Var(axes=[zaxis], values=A)
  B = Var(axes=[zaxis], values=B)

  # Try again with Var object arguments.
  eta = Hybrid(values=list(range(10)), A=A, B=B)
  assert 'A' in eta.auxarrays and 'B' in eta.auxarrays
def test_issue010():
  from pygeode.var import Var
  from pygeode.axis import Axis
  from pygeode.dataset import Dataset
  from pygeode.formats import netcdf as nc

  # Make some axes
  time_axis = Axis(values=[0], name='time')
  bnds_axis = Axis(values=[0,1], name='bnds')

  # Make some vars (note we don't have a 'bnds' variable corresponding to the 'bnds' dimension)
  time_var = Var(axes=[time_axis], values=[1], name='time')
  time_bnds = Var(axes=[time_axis,bnds_axis], values=[[3,4]], name='time_bnds')

  # Make a dataset to hold the vars
  dataset = Dataset([time_var, time_bnds])

  # Manually apply dims2axes to detect our axes
  dataset = nc.dims2axes(dataset)
def test_issue036():
  import numpy as np
  from pygeode.axis import Height
  from pygeode.var import Var
  from pygeode.plot import plotvar

  height = Height(np.arange(10))
  var = Var([height], values=np.zeros(10))
  plotvar(var)
  assert np.all(height.values == np.arange(10))
def do_concat (shape1, shape2, iaxis, count=[0]):
  from pygeode.axis import XAxis, YAxis, ZAxis, TAxis
  from pygeode.var import Var
  from pygeode.concat import concat
  from var_test import varTest
  import numpy as np

  # Increment test counter (and generate a unique test name)
  count[0] += 1
  testname = "concattest%05d"%(count[0])

  # Create some test data
  np.random.seed(count[0])
  array1 = np.random.randn(*shape1)
  array2 = np.random.randn(*shape2)

  # Get the # of dimensions, and assign a unique axis class for each dim
  assert array1.ndim == array2.ndim
  ndim = array1.ndim
  axis_classes = (XAxis, YAxis, ZAxis, TAxis)[:ndim]

  # Construct the first var
  axes = [cls(n) for cls,n in zip(axis_classes,array1.shape)]
  var1 = Var(axes=axes, values=array1, name="myvar", atts={'a':1, 'b':2, 'c':3})

  # The second var should have the same axes, except for the concatenation one
  n1 = array1.shape[iaxis]
  n2 = array2.shape[iaxis]
  axes[iaxis] = axis_classes[iaxis](np.arange(n1, n1+n2))
  var2 = Var(axes=axes, values=array2, name="myvar", atts={'a':1, 'b':3, 'd':4})

  # Try concatenating
  var = concat(var1,var2)

  # The expected result
  expected = np.concatenate((array1, array2), iaxis)

  # Test this
  test = varTest(testname=testname, var=var, values=expected)

  # Store this test
  globals()[testname] = test
def test_issue028():
  from pygeode.var import Var
  from pygeode.axis import Lat

  # Create a mock variable
  lat = Lat([-45.])
  var = Var(axes=[lat], values=[123.])

  # Try scaling it by a complex number
  var2 = var * 1j

  assert var2.dtype.name == 'complex128'
def test_slice():
  from pygeode.axis import XAxis, YAxis
  from pygeode.var import Var
  import numpy as np

  values = np.zeros([10, 10])
  xaxis = XAxis(np.arange(10))
  yaxis = YAxis(np.arange(10))
  var = Var(axes=[xaxis, yaxis], values=values)
  # slicedvar = var.slice[[ 7, -2, -4, 9, 4, 0], [ 7, -2, -4, 9, 4, 0]]
  slicedvar = var.slice[[7, -2, -4, 9, 4, 0], [-1, 0, 4]]
  slicedvar.get()
def getview(self, view, pbar):
  from pygeode.tools import partial_sum
  import numpy as np
  from pygeode.axis import Coef
  from pygeode.var import Var

  ti = self.ti
  taxis = self.var.axes[ti]
  # Get number of seconds since start of data
  secs = taxis.reltime(units='seconds')
  # Wrap it as a var, so we can use it in the loop below
  secs = Var([taxis], values=secs)

  cview = view.remove(Coef)  # view without regard to a 'coefficient' axis

  X = np.zeros(cview.shape, self.dtype)
  nX = np.zeros(cview.shape, int)
  F = np.zeros(cview.shape, self.dtype)
  nF = np.zeros(cview.shape, int)
  XF = np.zeros(cview.shape, self.dtype)
  nXF = np.zeros(cview.shape, int)
  X2 = np.zeros(cview.shape, self.dtype)
  nX2 = np.zeros(cview.shape, int)

  # Accumulate the partial sums of t, f, t*f, and t**2 over the time axis
  for slices, (data, t), bins in loopover([self.var, secs], cview, pbar):
    partial_sum(data, slices, F, nF, ti, bins)
    partial_sum(t, slices, X, nX, ti, bins)
    partial_sum(data * t, slices, XF, nXF, ti, bins)
    partial_sum(t**2, slices, X2, nX2, ti, bins)

  # Convert the sums to means
  F /= nF
  X /= nX
  XF /= nXF
  X2 /= nX2

  # print '??', X2 - X**2

  # Regression coefficients from the accumulated moments:
  #   A (slope)     = <tf>    - <t><f>
  #   B (intercept) = <t^2><f> - <t><tf>
  # (both divided by the variance <t^2> - <t>^2 below)
  A = XF - X * F
  B = X2 * F - X * XF

  icoef = view.index(Coef)
  coef = view.integer_indices[icoef]

  out = np.empty(view.shape, self.dtype)
  # Stick the two coefficients together into a single array
  out[..., np.where(coef == 0)[0]] = B[..., None]
  out[..., np.where(coef == 1)[0]] = A[..., None]
  out /= (X2 - X**2)[..., None]

  return out
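# Editor's sketch (not part of the original source): a plain-numpy check that
# the moment formulas used in getview above reproduce ordinary least squares.
# The slope is A = <tf> - <t><f> and the intercept is B = <t^2><f> - <t><tf>,
# both divided by the variance <t^2> - <t>^2; np.polyfit computes the same fit.
def _example_trend_moments():
  import numpy as np
  t = np.linspace(0., 10., 50)
  f = 3.0 * t + 2.0 + 0.1 * np.sin(t)
  T, F, TF, T2 = t.mean(), f.mean(), (t * f).mean(), (t ** 2).mean()
  var = T2 - T ** 2
  A = (TF - T * F) / var        # slope
  B = (T2 * F - T * TF) / var   # intercept
  m, b = np.polyfit(t, f, 1)
  assert np.allclose([A, B], [m, b])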
def make_var():
  from pygeode.var import Var
  from pygeode.timeaxis import StandardTime
  import numpy as np

  time = StandardTime(startdate=dict(year=2009, month=1, day=1), values=list(range(10)), units='days')
  station = make_station_axis()
  values = np.arange(len(time) * len(station)).reshape(len(time), len(station))
  return Var([time, station], values=values, name="dummy")
def setUp(self):
  # Use linear coordinates
  x = np.array([0,1,2])
  y = np.array([0,1,2,3])

  # Some simple data with a hole in it
  self.data = np.array([[1,2,3],[4,float('nan'),6],[7,8,9],[10,11,12]])
  self.data_nm = np.array([[1,2,3],[4,5,6],[7,11,9],[10,8,12]])

  # Construct a Var object
  x = XAxis(x)
  y = YAxis(y)
  var = Var(axes=[y,x], values=self.data)
  self.x = x
  self.y = y
  self.var = var

  # Some interpolation coordinates
  # mid-points (and values just outside the range)
  self.x2 = XAxis(values=[-0.5,0.5,1.5,2.5])
  self.y2 = YAxis(values=[-0.5,0.5,1.5,2.5,3.5])

  # Reverse of original values
  self.x3 = XAxis(values=[2,1,0])
  self.y3 = YAxis(values=[3,2,1,0])

  # Reverse of mid-points
  self.x4 = XAxis(values=[2.5,1.5,0.5,-0.5])
  self.y4 = YAxis(values=[3.5,2.5,1.5,0.5,-0.5])

  # Single non-midpoint
  self.x5 = XAxis(values=[1.4])
  self.y5 = YAxis(values=[2.2])

  # Non-monotonic axis
  self.y_nm = Var(axes=[y,x], values=[[0,0,0], [1,1,2], [2,2,1], [3,3,3]])

  # 2D interpolation
  xfield = np.array([[-1,0,1],[0,1,2],[1,2,3],[0,1,2]])
  yfield = np.array([[-1,0,1,2],[0,1,2,3],[1,2,3,4]]).transpose()
  self.xfield = Var(axes=[y,x], values=xfield)
  self.yfield = Var(axes=[y,x], values=yfield)
  self.x6 = XAxis(values=[-1,0,1,2,3])
def encode_string_var (var):
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import DummyAxis

  # Construct a 2D character array to hold strings
  strlen = max(len(string) for string in var.values)
  #TODO: make this a simple dimension (no coordinate values needed!)
  strlen_axis = DummyAxis (strlen, name=var.name+"_strlen")
  dtype = '|S'+str(strlen)

  # For a convenient view on the character array
  # (to help populate it from strings)
  data = np.zeros(list(var.shape)+[strlen], dtype='|S1')
  data.view(dtype)[...,0] = var.values

  var = Var(list(var.axes)+[strlen_axis], values=data, name=var.name+"_name")
  return var
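# Editor's sketch (not part of the original source): the character-array trick
# used in encode_string_var, in isolation. An (N, strlen) array of single bytes
# is viewed as N fixed-width strings so it can be populated directly; each row
# of 'data' then holds one character per element, which is the layout that
# decode_string_var above reverses.
def _example_string_encoding():
  import numpy as np
  names = ['Alert', 'Eureka', 'Resolute']
  strlen = max(len(s) for s in names)
  data = np.zeros((len(names), strlen), dtype='|S1')
  data.view('|S%d' % strlen)[:, 0] = np.array([s.encode('ascii') for s in names])
  # Trailing nulls read back as empty bytes, so joining a row recovers the string
  assert b''.join(data[1]) == b'Eureka'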
def test_issue025():
  from pygeode.axis import Lat
  from pygeode.var import Var
  from pygeode.formats import netcdf as nc

  lat = Lat([80,70,60])
  var = Var(axes=[lat], values=[1,2,3], name='2B')

  # Save the variable
  nc.save ("issue025_test.nc", var)

  # This may crash in some versions of the netcdf library.
  # Even if it doesn't crash, it's a good idea to enforce legal
  # netcdf names.
  f = nc.open("issue025_test.nc")

  assert len(f.vars) == 1
  # Must not start with a digit (should have been filtered)
  assert not f.vars[0].name[0].isdigit()
def auxasvar(self, name): # {{{
  '''Returns auxiliary array as a new :class:`Var` object.

  Parameters
  ==========
  name : string
    Name of auxiliary array to return

  Returns
  =======
  var : :class:`Var`
    Variable with values of requested auxiliary array

  See Also
  ========
  auxarrays
  '''
  from pygeode.var import Var
  return Var([self], values=self.auxarrays[name], name=name)
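# Editor's sketch (not part of the original source): pulling an auxiliary array
# off an axis as a Var. A Hybrid axis carries 'A' and 'B' auxarrays (as in
# test_auxarray_var_arg above), so auxasvar('A') returns them as a variable
# defined on the axis itself.
def _example_auxasvar():
  from pygeode.axis import Hybrid
  eta = Hybrid(values=[0.1, 0.5, 0.9], A=[1000., 500., 100.], B=[0.0, 0.5, 0.9])
  A = eta.auxasvar('A')
  assert A.name == 'A' and A.shape == (3,)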
def override_values(dataset, value_override): # {{{
  from warnings import warn
  import numpy as np
  from pygeode.var import Var, copy_meta
  vardict = {}
  for name, values in value_override.items():
    if name not in dataset:
      warn("var '%s' not found - values not overridden" % name, stacklevel=3)
      continue
    values = np.asarray(values)
    oldvar = dataset[name]
    assert values.shape == oldvar.shape, "bad shape for '%s'. Expected %s, got %s" % (name, oldvar.shape, values.shape)
    var = Var(oldvar.axes, values=values)
    copy_meta(oldvar, var)
    vardict[name] = var
  dataset = dataset.replace_vars(vardict)
  return dataset
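# Editor's sketch (not part of the original source): overriding the values of
# one variable in a dataset while keeping its axes and metadata. The dataset
# construction mirrors test_issue010 above.
def _example_override_values():
  import numpy as np
  from pygeode.axis import Axis
  from pygeode.var import Var
  from pygeode.dataset import Dataset
  time_axis = Axis(values=[0, 1, 2], name='time')
  x = Var(axes=[time_axis], values=[1., 2., 3.], name='x')
  ds = Dataset([x])
  ds = override_values(ds, dict(x=np.zeros(3)))
  assert np.all(ds.x.get() == 0.)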
def __init__(self, taxis, coef=None, A=None, B=None):
  from pygeode.tools import merge_coefs
  from pygeode.var import Var
  from pygeode.timeaxis import Time
  from pygeode.timeutils import modify

  # Get the coefficients
  if coef is None:
    assert A is not None and B is not None
    coef = merge_coefs(B, A)
  else:
    assert A is None and B is None

  # Ignore 'year' field if it's constant?
  # (I.e., if there's a 'year' field that's all zeros, then drop it)
  # - This is an artifact of reading climatological data from a file
  #   which can't/doesn't specify it's a climatology
  coeft = coef.getaxis(Time)
  if hasattr(coeft, 'year'):
    import numpy as np
    from warnings import warn
    if len(np.unique(coeft.year.values)) == 1:
      warn("ignoring degenerate 'year' field", stacklevel=2)
      coeft = modify(coeft, exclude='year')
      coef = coef.replace_axes({Time: coeft})

  self.coef = coef
  self.secs = Var([taxis], values=taxis.reltime(units='seconds'))

  self.ti = ti = coef.whichaxis(Time)
  self.ci = ci = coef.whichaxis('coef')
  self.caxis = coef.axes[ci]

  axes = list(coef.axes)
  assert axes[ti].map_to(taxis) is not None, (
    "the given time axis is not compatible with the coefficients.\n" +
    "time axis: %s\n coefficient time axis: %s" % (repr(str(taxis)), repr(str(axes[ti]))))
  axes[ti] = taxis
  axes = axes[:ci] + axes[ci+1:]

  # Var.__init__(self, axes, dtype=coef.dtype)
  Var.__init__(self, axes, dtype='float64')  # because secs is float64
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None, output='d,p,ci', pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must
  have the same shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by ``ds.d``).
    The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is
  used to test the hypothesis. The number of degrees of freedom is the sample
  size scaled by N_fac, less one. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the appropriate
  number of effective degrees of freedom is not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the
  t-statistic in eq (6.21).'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['d', 'df', 'p', 'ci']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from paired_difference. Possible outputs are %s.' % str(ovars))

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  # Fall back to all shared axes if none were specified explicitly
  if axes is None:
    axes = [srcaxes[i].name for i in riaxes]

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])
  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'
  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays
  d = np.full(oview.shape, np.nan, 'd')
  dd = np.full(oview.shape, np.nan, 'd')
  N = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
    # Count of non-NaN data points
    N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  imsk = (N > 1)
  dd[imsk] -= (d * d)[imsk] / N[imsk]
  dd[imsk] /= (N[imsk] - 1)
  d[imsk] /= N[imsk]

  # Ensure variance is non-negative
  dd[dd <= 0.] = 0.

  if N_fac is not None:
    eN = N // N_fac
  else:
    eN = N

  emsk = (eN > 1)

  den = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
  dmsk = (den > 0.)

  p[dmsk] = np.abs(d[dmsk] / den[dmsk])
  p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
  ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

  # Construct dataset to return
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'd' in output:
    d = Var(oaxes, values=d, name='d')
    d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
    rvs.append(d)

  if 'df' in output:
    df = Var(oaxes, values=eN - 1, name='df')
    df.atts['longname'] = 'Degrees of freedom used for t-test'
    rvs.append(df)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for t-test of paired difference (%s - %s)' % (xn, yn)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (alpha, xn, yn)
    rvs.append(ci)

  ds = asdataset(rvs)
  ds.atts['alpha'] = alpha
  ds.atts['N_fac'] = N_fac
  ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

  return ds
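# Editor's sketch (not part of the original source): a hedged usage example for
# paired_difference as defined above. Two series sharing a time and lat axis
# are compared along the time axis only; the axis name 'taxis' is the default
# name of a plain TAxis.
def _example_paired_difference():
  import numpy as np
  from pygeode.axis import Lat, TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(200))
  lat = Lat([-45., 0., 45.])
  np.random.seed(0)
  base = np.random.randn(200, 3)
  X = Var([time, lat], values=base + 0.5, name='X')
  Y = Var([time, lat], values=base, name='Y')
  ds = paired_difference(X, Y, axes=['taxis'], output='d,p')
  # ds.d should be ~0.5 at every latitude, with a small p-value.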
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None, output='d,p,ci', pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional; defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of X; the effective
    number will be given by the number estimated from the dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of Y; the effective
    number will be given by the number estimated from the dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by ``ds.d``).
    The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of
  effective degrees of freedom is not calculated explicitly by this routine.
  The p-value and confidence interval are computed based on the t-statistic in
  eq (6.19).'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['d', 'df', 'p', 'ci']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from difference. Possible outputs are %s.' % str(ovars))
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  # Fall back to all shared axes if none were specified explicitly
  if axes is None:
    axes = [srcaxes[i].name for i in riaxes]

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  # Construct work arrays
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  Nx = np.full(oview.shape, np.nan, 'd')
  Ny = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
    # Count of non-NaN data points
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
    # Count of non-NaN data points
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  imsk = (Nx > 1) & (Ny > 1)
  xx[imsk] -= (x * x)[imsk] / Nx[imsk]
  xx[imsk] /= (Nx[imsk] - 1)
  x[imsk] /= Nx[imsk]
  yy[imsk] -= (y * y)[imsk] / Ny[imsk]
  yy[imsk] /= (Ny[imsk] - 1)
  y[imsk] /= Ny[imsk]

  # Ensure variances are non-negative
  xx[xx <= 0.] = 0.
  yy[yy <= 0.] = 0.

  if Nx_fac is not None:
    eNx = Nx // Nx_fac
  else:
    eNx = Nx
  if Ny_fac is not None:
    eNy = Ny // Ny_fac
  else:
    eNy = Ny

  emsk = (eNx > 1) & (eNy > 1)

  # Compute difference
  d = x - y

  den = np.zeros(oview.shape, 'd')
  df = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  # Convert to variance of the mean of each sample
  xx[emsk] /= eNx[emsk]
  yy[emsk] /= eNy[emsk]

  den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
  dmsk = (den > 0.)
  df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

  den[emsk] = np.sqrt(xx[emsk] + yy[emsk])
  dmsk &= (den > 0.)

  p[dmsk] = np.abs(d[dmsk] / den[dmsk])
  p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))
  ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]
  df[~dmsk] = np.nan
  p[~dmsk] = np.nan
  ci[~dmsk] = np.nan

  # Construct dataset to return
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'd' in output:
    d = Var(oaxes, values=d, name='d')
    d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
    rvs.append(d)

  if 'df' in output:
    df = Var(oaxes, values=df, name='df')
    df.atts['longname'] = 'Degrees of freedom used for t-test'
    rvs.append(df)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (xn, yn)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (alpha, xn, yn)
    rvs.append(ci)

  ds = asdataset(rvs)
  ds.atts['alpha'] = alpha
  ds.atts['Nx_fac'] = Nx_fac
  ds.atts['Ny_fac'] = Ny_fac
  ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

  return ds
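# Editor's sketch (not part of the original source): the unpaired counterpart of
# the example after paired_difference. difference() only needs the reduction
# axes to exist on both variables; here it reduces over 'taxis' and leaves a
# lat-dependent result.
def _example_difference():
  import numpy as np
  from pygeode.axis import Lat, TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(300))
  lat = Lat([-45., 0., 45.])
  np.random.seed(1)
  X = Var([time, lat], values=np.random.randn(300, 3) + 1.0, name='X')
  Y = Var([time, lat], values=np.random.randn(300, 3), name='Y')
  ds = difference(X, Y, axes=['taxis'], output='d,p,ci')
  # ds.d ~ 1.0; ds.ci gives the half-width of the (1 - alpha) confidence interval.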
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None): # {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. The names of the
    variables match the output request string (i.e. if ``ds`` is the returned
    dataset, the linear coefficients of the regression can be obtained by ``ds.B``).

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y. The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively. The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  # Split output request now
  ovars = ['B', 'r2', 'p', 'sb', 'covb', 'se']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from multiple_regress. Possible outputs are %s.' % str(ovars))

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes:
        ri_new.append(ia)
      else:
        raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = tuple([srcaxes[i] for i in oiaxes])
  inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr, Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl + (i,)] += npsum(xdata[i] * ydata, siaxes)
      for j in range(i + 1):
        xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intelligently?
  # certainly the python loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None:
    N_eff = N
  else:
    N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from .var import Var
  from .dataset import asdataset
  from .axis import NonCoordinateAxis

  ra = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor')
  ra2 = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor2')
  Nd = len(oaxes)

  rvs = []

  if 'B' in output:
    B = Var(oaxes + (ra,), values=beta, name='B')
    B.atts['longname'] = 'regression coefficient'
    rvs.append(B)

  if 'r2' in output:
    vary = (yy - y**2 / N)
    R2 = 1 - (yy - vare) / vary
    R2 = Var(oaxes, values=R2, name='R2')
    R2.atts['longname'] = 'fraction of variance explained'
    rvs.append(R2)

  if 'p' in output:
    p = [2. * (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr)) for i in range(Nr)]
    # Move the regressor dimension to the end, to match oaxes + (ra,)
    p = np.transpose(np.array(p), list(range(1, Nd + 1)) + [0])
    p = Var(oaxes + (ra,), values=p, name='p')
    p.atts['longname'] = 'p-values'
    rvs.append(p)

  if 'sb' in output:
    # Move the regressor dimension to the end, to match oaxes + (ra,)
    sigbeta = np.transpose(np.array(sigbeta), list(range(1, Nd + 1)) + [0])
    sb = Var(oaxes + (ra,), values=sigbeta, name='sb')
    sb.atts['longname'] = 'standard deviation of linear coefficients'
    rvs.append(sb)

  if 'covb' in output:
    sigmat = np.zeros(os2, 'd')
    for i in range(Nr):
      for j in range(Nr):
        #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
        sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
    covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
    covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
    rvs.append(covb)

  if 'se' in output:
    se = np.sqrt((yy - vare) / N_eff)
    se = Var(oaxes, values=se, name='se')
    se.atts['longname'] = 'standard deviation of residual'
    rvs.append(se)

  ds = asdataset(rvs)
  ds.atts['description'] = 'multiple linear regression parameters for %s regressed against %s' % (yn, xns)

  return ds
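# Editor's sketch (not part of the original source): regressing one series onto
# two regressors at once. Since no constant term is included by default, a
# column of ones is passed explicitly as one of the Xs.
def _example_multiple_regress():
  import numpy as np
  from pygeode.axis import TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(500))
  np.random.seed(2)
  x1 = np.random.randn(500)
  y = 2.0 * x1 + 3.0 + 0.1 * np.random.randn(500)
  X1 = Var([time], values=x1, name='x1')
  X0 = Var([time], values=np.ones(500), name='const')
  Yv = Var([time], values=y, name='y')
  ds = multiple_regress([X1, X0], Yv, axes=['taxis'], output='B,p')
  # ds.B holds the coefficients along the 'regressor' axis: ~[2.0, 3.0].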
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist:
    assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy()  # passed on to Axis constructor

    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0))
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')

      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates: continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role="timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"

      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
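# Editor's sketch (not part of the original source): encode_cf is normally
# invoked by the netcdf writers, but it can also be applied directly to a
# dataset. Here the pygeode tutorial dataset t1 is encoded; axes such as lat
# and lon acquire CF 'standard_name' and 'units' attributes.
def _example_encode_cf():
  from pygeode.tutorial import t1
  ds = encode_cf(t1)
  assert ds.atts['Conventions'] == 'CF-1.0'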
# Issue 114
# https://github.com/pygeode/pygeode/issues/114

from pygeode.formats import netcdf4 as nc
from pygeode.axis import Lat
from pygeode.var import Var
from pygeode.dataset import Dataset

lat = Lat([80,70,60])
var = Var(axes=[lat], values=[1,2,3], name='A')
dataset = Dataset([var])
dataset_groups = {'Group 1': dataset, 'Group 2': dataset}

# Save the variable.
nc.save('issue114_test.nc', dataset_groups, cfmeta=True)

# Read in the file again
dataset_groups_read = nc.open('issue114_test.nc', cfmeta=True)

# Check that the variables are the same
assert (dataset_groups['Group 1'].A == dataset_groups_read['Group 1'].A)
def SVD(var1, var2, num=1, subspace=-1, iaxis=Time, weight1=True, weight2=True, matrix='cov'):
  """
  Finds coupled EOFs of two fields. Note that the mean/trend/etc. is NOT
  removed in this routine.

  Parameters
  ----------
  var1, var2 : :class:`Var`
    The variables to analyse.

  num : integer
    The number of EOFs to compute (default is ``1``).

  weight1, weight2 : optional
    Weights to use for defining orthogonality in the var1, var2 domains,
    respectively. Patterns X and Y in the var1 domain are orthogonal if the
    sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
    orthogonal if the sum over Z*W*weights2 is 0. Default is to use internal
    weights defined for var1 accessed by :meth:`Var.getweights()`. If set to
    ``False`` no weighting is used.

  matrix : string, optional ['cov']
    Which matrix we are diagonalizing (default is 'cov').

    * 'cov': covariance matrix of var1 & var2
    * 'cor': correlation matrix of var1 & var2

  iaxis : Axis identifier
    The principal component / expansion coefficient axis, i.e., the 'time'
    axis. Can be an integer (the axis number, leftmost = 0), the axis name
    (string), or a Pygeode axis class. If not specified, will try to use
    pygeode.timeaxis.Time, and if that fails, the leftmost axis.

  Returns
  -------
  (eof1, pc1, eof2, pc2): tuple
    * eof1: The coupled eof patterns for var1.
    * pc1: The principal component / expansion coefficients for var1.
    * eof2: The coupled eof patterns for var2.
    * pc2: The principal component / expansion coefficients for var2.

  Notes
  -----
  Multiple orders of EOFs are concatenated along an 'order' axis.
  """
  import numpy as np
  from pygeode.timeaxis import Time
  from pygeode.var import Var
  from pygeode.view import View
  from pygeode import MAX_ARRAY_SIZE
  from warnings import warn
  from pygeode import svdcore as lib

  if matrix in ('cov', 'covariance'):
    matrix = 'cov'
  elif matrix in ('cor', 'corr', 'correlation'):
    matrix = 'cor'
  else:
    warn("invalid matrix type '%s'. Defaulting to covariance." % matrix, stacklevel=2)
    matrix = 'cov'

  MAX_ITER = 1000

  # Iterate over more EOFs than we need
  # (this helps with convergence)
  # TODO: a more rigorous formula for the optimum number of EOFs to use
  if subspace <= 0:
    subspace = 2 * num + 8
  if subspace < num:
    subspace = num  # Just in case

  # Remember the names
  prefix1 = var1.name + '_' if var1.name != '' else ''
  prefix2 = var2.name + '_' if var2.name != '' else ''

  # Apply weights?
  # if weight1 is not None: var1 *= weight1.sqrt()
  # if weight2 is not None: var2 *= weight2.sqrt()

  if weight1 is True:
    weight1 = var1.getweights()
  if weight1 is not False:
    assert not weight1.hasaxis(iaxis), "Can't handle weights along the record axis"
    # Normalize the weights
    W = weight1.sum() / weight1.size
    weight1 /= W
    # Apply the weights
    var1 *= weight1.sqrt()

  if weight2 is True:
    weight2 = var2.getweights()
  if weight2 is not False:
    assert not weight2.hasaxis(iaxis), "Can't handle weights along the record axis"
    # Normalize the weights
    W = weight2.sum() / weight2.size
    weight2 /= W
    # Apply the weights
    var2 *= weight2.sqrt()

  #TODO: allow multiple iteration axes (i.e., time and ensemble)
  # if iaxis is None:
  #   if var1.hasaxis(Time) and var2.hasaxis(Time):
  #     iaxis1 = var1.whichaxis(Time)
  #     iaxis2 = var2.whichaxis(Time)
  #   else:
  #     iaxis1 = 0
  #     iaxis2 = 0
  # else:
  iaxis1 = var1.whichaxis(iaxis)
  iaxis2 = var2.whichaxis(iaxis)

  assert var1.axes[iaxis1] == var2.axes[iaxis2], "incompatible iteration axes"
  del iaxis  # so we don't use this by accident

  # Special case: can load entire variable in memory
  # This will save some time, especially if the field is stored on disk, or is heavily derived
  if var1.size <= MAX_ARRAY_SIZE:
    print('preloading ' + repr(var1))
    var1 = var1.load()
  if var2.size <= MAX_ARRAY_SIZE:
    print('preloading ' + repr(var2))
    var2 = var2.load()

  # Use correlation instead of covariance?
  # (normalize by standard deviation)
  if matrix == 'cor':
    print('computing standard deviations')
    std1 = var1.stdev(iaxis1).load()
    std2 = var2.stdev(iaxis2).load()
    # account for grid points with zero standard deviation?
    std1.values = std1.values + (std1.values == 0)
    std2.values = std2.values + (std2.values == 0)
    var1 /= std1
    var2 /= std2

  eofshape1 = (subspace,) + var1.shape[:iaxis1] + var1.shape[iaxis1+1:]
  eofshape2 = (subspace,) + var2.shape[:iaxis2] + var2.shape[iaxis2+1:]

  pcshape1 = (var1.shape[iaxis1], subspace)
  pcshape2 = (var2.shape[iaxis2], subspace)

  # number of spatial grid points
  NX1 = var1.size // var1.shape[iaxis1]
  assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
  NX2 = var2.size // var2.shape[iaxis2]
  assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'
  # Total number of timesteps
  NT = var1.shape[iaxis1]
  # Number of timesteps we can do in one fetch
  dt = MAX_ARRAY_SIZE // max(NX1, NX2)

  pcs1 = np.empty(pcshape1, dtype='d')
  pcs2 = np.empty(pcshape2, dtype='d')

  X = np.empty(eofshape2, dtype='d')
  U = np.empty(eofshape1, dtype='d')

  # Seed with sinusoids superimposed on random values
  Y = np.random.rand(*eofshape1)
  V = np.random.rand(*eofshape2)
  from math import pi
  for i in range(subspace):
    Y[i, ...].reshape(NX1)[:] += np.cos(np.arange(NX1, dtype='d') / NX1 * 2 * pi * (i + 1))
    V[i, ...].reshape(NX2)[:] += np.cos(np.arange(NX2, dtype='d') / NX2 * 2 * pi * (i + 1))

  # raise Exception

  # Workspace for C code
  UtAX = np.empty([subspace, subspace], dtype='d')
  XtAtU = np.empty([subspace, subspace], dtype='d')
  VtV = np.empty([subspace, subspace], dtype='d')
  YtY = np.empty([subspace, subspace], dtype='d')

  # Views over whole variables
  # (rearranged to be compatible with our output eof arrays)
  view1 = View((var1.axes[iaxis1],) + var1.axes[:iaxis1] + var1.axes[iaxis1+1:])
  view2 = View((var2.axes[iaxis2],) + var2.axes[:iaxis2] + var2.axes[iaxis2+1:])

  for iter_num in range(1, MAX_ITER + 1):
    print('iter_num: %d' % iter_num)

    assert Y.shape == U.shape
    assert X.shape == V.shape
    U, Y = Y, U
    X, V = V, X

    # Reset the accumulation arrays for the next approximations
    Y[()] = 0
    V[()] = 0

    # Apply the covariance/correlation matrix
    for t in range(0, NT, dt):
      # number of timesteps we actually have
      nt = min(dt, NT - t)

      # Read the data
      chunk1 = view1.modify_slice(0, slice(t, t + nt)).get(var1)
      chunk1 = np.ascontiguousarray(chunk1, dtype='d')
      chunk2 = view2.modify_slice(0, slice(t, t + nt)).get(var2)
      chunk2 = np.ascontiguousarray(chunk2, dtype='d')

      ier = lib.build_svds(subspace, nt, NX1, NX2, chunk1, chunk2, X, Y, pcs2[t, ...])
      assert ier == 0
      ier = lib.build_svds(subspace, nt, NX2, NX1, chunk2, chunk1, U, V, pcs1[t, ...])
      assert ier == 0

    # Useful dot products
    lib.dot(subspace, NX1, U, Y, UtAX)
    lib.dot(subspace, NX2, V, V, VtV)
    lib.dot(subspace, NX1, Y, U, XtAtU)
    lib.dot(subspace, NX1, Y, Y, YtY)

    # Compute surrogate matrices (using all available information from this iteration)
    A1, residues, rank, s = np.linalg.lstsq(UtAX, VtV, rcond=1e-30)
    A2, residues, rank, s = np.linalg.lstsq(XtAtU, YtY, rcond=1e-30)

    # Eigendecomposition on surrogate matrices
    Dy, Qy = np.linalg.eig(np.dot(A1, A2))
    Dv, Qv = np.linalg.eig(np.dot(A2, A1))

    # Sort by eigenvalue (largest first)
    S = np.argsort(np.real(Dy))[::-1]
    Dy = Dy[S]
    Qy = np.ascontiguousarray(Qy[:, S], dtype='d')
    S = np.argsort(np.real(Dv))[::-1]
    Dv = Dv[S]
    Qv = np.ascontiguousarray(Qv[:, S], dtype='d')

    # get estimate of true eigenvalues
    D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
    print(D)

    # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
    lib.transform(subspace, NX1, Qy, Y)
    lib.transform(subspace, NX2, Qv, V)

    # Normalize
    lib.normalize(subspace, NX1, Y)
    lib.normalize(subspace, NX2, V)

    if not np.allclose(U[:num, ...], Y[:num, ...], atol=0): continue
    if not np.allclose(X[:num, ...], V[:num, ...], atol=0): continue
    print('converged after %d iterations' % iter_num)
    break

  assert iter_num != MAX_ITER, "no convergence"

  # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
  lib.fixcov(subspace, NT, NX2, pcs1, pcs2, V)

  # Wrap as pygeode vars, and return
  # Only need some of the eofs for output (the rest might not have even converged yet)
  orderaxis = order(num)

  eof1 = np.array(Y[:num])
  pc1 = np.array(pcs1[..., :num]).transpose()
  eof1 = Var((orderaxis,) + var1.axes[:iaxis1] + var1.axes[iaxis1+1:], values=eof1)
  pc1 = Var((orderaxis, var1.axes[iaxis1]), values=pc1)

  eof2 = np.array(V[:num])
  pc2 = np.array(pcs2[..., :num]).transpose()
  eof2 = Var((orderaxis,) + var2.axes[:iaxis2] + var2.axes[iaxis2+1:], values=eof2)
  pc2 = Var((orderaxis, var2.axes[iaxis2]), values=pc2)

  # Apply weights?
  if weight1 is not False: eof1 /= weight1.sqrt()
  if weight2 is not False: eof2 /= weight2.sqrt()

  # Use correlation instead of covariance?
  # Re-scale the fields by standard deviation
  if matrix == 'cor':
    eof1 *= std1
    eof2 *= std2

  # Give it a name
  eof1.name = prefix1 + "EOF"
  pc1.name = prefix1 + "PC"
  eof2.name = prefix2 + "EOF"
  pc2.name = prefix2 + "PC"

  return eof1, pc1, eof2, pc2
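# Editor's sketch (not part of the original source): coupled EOF analysis of two
# synthetic fields that share a common mode. The record axis is passed by name
# ('taxis') since these variables use a plain TAxis rather than a Time axis;
# the data here are made up for illustration.
def _example_SVD():
  import numpy as np
  from pygeode.axis import Lat, TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(100))
  lat = Lat(np.linspace(-80., 80., 9))
  np.random.seed(3)
  common = np.random.randn(100, 1)
  var1 = Var([time, lat], values=common + 0.1 * np.random.randn(100, 9), name='F1')
  var2 = Var([time, lat], values=common + 0.1 * np.random.randn(100, 9), name='F2')
  eof1, pc1, eof2, pc2 = SVD(var1, var2, num=2, iaxis='taxis')
  # eof1/eof2 are stacked along an 'order' axis; pc1/pc2 share the time axis.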
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None): # {{{
  r'''Computes the mean value of X and statistics relevant for a test against
  the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'm,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the mean value can be obtained through ``ds.m``).
    The following quantities can be calculated.

    * 'm': The mean value of X
    * 'p': The probability of the computed value if the population mean was zero
    * 'ci': The confidence interval of the mean at the level specified by alpha

    If the average is taken over all axes of X resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`.
  The p-value and confidence interval are computed for the t-statistic defined in
  eq (6.61) of von Storch and Zwiers 1999.'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  # Reduce over all axes if none were specified (as documented above)
  if axes is None:
    riaxes = list(range(len(X.axes)))
  else:
    riaxes = [X.whichaxis(n) for n in axes]
  raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
  oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
  oview = View(oaxes)

  N = np.prod([len(X.axes[i]) for i in riaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert N > 1, '%s has only one element along the reduction axes' % X.name

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  Na = np.zeros(oview.shape, 'd')

  x[()] = np.nan
  xx[()] = np.nan
  Na[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

  imsk = (Na > 0.)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx[imsk] -= x[imsk]**2 / Na[imsk]
  xx[imsk] = xx[imsk] / (Na[imsk] - 1)

  x[imsk] /= Na[imsk]

  if N_fac is not None:
    eN = N // N_fac
    eNa = Na // N_fac
  else:
    eN = N
    eNa = Na

  sdom = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  t = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
  dmsk = (sdom > 0.)

  t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
  p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
  ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]
  name = X.name if X.name != '' else 'X'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'm' in output:
    m = Var(oaxes, values=x, name='m')
    m.atts['longname'] = 'Mean value of %s' % (name,)
    rvs.append(m)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value of test %s is 0' % (name,)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence interval of the mean value of %s' % (name,)
    rvs.append(ci)

  return asdataset(rvs)
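# Editor's sketch (not part of the original source): testing whether a mean is
# distinguishable from zero. Reducing over 'taxis' only leaves a lat-dependent
# result; passing axes=None would reduce over every axis of X instead.
def _example_isnonzero():
  import numpy as np
  from pygeode.axis import Lat, TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(400))
  lat = Lat([-45., 0., 45.])
  np.random.seed(4)
  X = Var([time, lat], values=np.random.randn(400, 3) + 0.2, name='X')
  ds = isnonzero(X, axes=['taxis'], alpha=0.05, output='m,p,ci')
  # ds.m ~ 0.2 at each latitude; ds.p is small where the offset is detectable.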
def correlate(X, Y, axes=None, output='r2,p', pbar=None): # {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p': The p-value; see notes.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and
  Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
  correlation coefficient of equal or greater magnitude (two-sided) to the
  given result under the hypothesis that the true correlation coefficient
  between X and Y is zero. It is computed from the t-statistic given in
  eq (8.7), in section 8.2.3, and assumes normally distributed quantities.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['r2', 'p']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from correlation. Possible outputs are %s.' % str(ovars))

  # Put all the axes being reduced over at the end
  # so that we can reshape
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  iview = View(inaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata * ydata

    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)
    # Count of non-NaN data points
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  imsk = (Na > 0)

  xx[imsk] -= (x * x)[imsk] / Na[imsk]
  yy[imsk] -= (y * y)[imsk] / Na[imsk]
  xy[imsk] -= (x * y)[imsk] / Na[imsk]

  # Ensure variances are non-negative
  xx[xx <= 0.] = 0.
  yy[yy <= 0.] = 0.

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx * yy)[imsk])
  dmsk = (den > 0.)

  rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

  den = 1 - rho**2
  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  den[den < eps] = eps

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
  p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))
  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  p[~dmsk] = np.nan
  rho[~dmsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X'  # Note: could write: xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'r2' in output:
    r2 = Var(oaxes, values=rho, name='r2')
    r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (xn, yn)
    rvs.append(r2)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for correlation coefficient between %s and %s' % (xn, yn)
    rvs.append(p)

  ds = asdataset(rvs)
  ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

  return ds
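# Editor's sketch (not part of the original source): correlating two variables
# along their shared record axis; the remaining lat axis survives in the output.
def _example_correlate():
  import numpy as np
  from pygeode.axis import Lat, TAxis
  from pygeode.var import Var
  time = TAxis(np.arange(250))
  lat = Lat([-45., 0., 45.])
  np.random.seed(5)
  x = np.random.randn(250, 3)
  y = 0.7 * x + 0.3 * np.random.randn(250, 3)
  X = Var([time, lat], values=x, name='X')
  Y = Var([time, lat], values=y, name='Y')
  ds = correlate(X, Y, axes=['taxis'], output='r2,p')
  # ds.r2 holds the correlation coefficient (rho itself, despite the name).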
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None): # {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to X and Y.

  N_fac : integer, optional
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see the list of
    possible outputs in the Returns section. The specifications must be
    separated by a comma. Defaults to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the linear coefficient of the regression can be obtained
    by ``ds.m``).

    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'sm' and 'se' (:math:`\hat{\sigma}_E/\sqrt{S_{XX}}` and
  :math:`\hat{\sigma}_E` in von Storch and Zwiers, respectively). The data is
  assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View
  import numpy as np
  from scipy.stats import t as tdist

  # Split output request now
  ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from regression. Possible outputs are %s.' % str(ovars))
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata * ydata

    # Broadcast xdata and ydata to the shape of their product
    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically,
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Count of non-NaN data points
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  if N_fac is None:
    N_eff = Na - 2.
  else:
    N_eff = Na / N_fac - 2.
  nmsk = (N_eff > 0.)

  # Convert the raw sums into centered sums of squares and products
  xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
  yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
  xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

  dmsk = (xx > 0.)

  m = np.zeros(oview.shape, 'd')
  b = np.zeros(oview.shape, 'd')
  r2 = np.zeros(oview.shape, 'd')

  m[dmsk] = xy[dmsk] / xx[dmsk]
  b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

  r2den = xx * yy
  d2msk = (r2den > 0.)
  r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

  sige = np.zeros(oview.shape, 'd')
  sigm = np.zeros(oview.shape, 'd')
  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  # sige holds the residual variance until the sqrt two lines below
  sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
  sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
  sige[nmsk] = np.sqrt(sige[nmsk])

  t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
  p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))
  msk = nmsk & dmsk
  m[~msk] = np.nan
  b[~msk] = np.nan
  sige[~msk] = np.nan
  sigm[~msk] = np.nan
  p[~msk] = np.nan

  msk = nmsk & d2msk
  r2[~msk] = np.nan

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []
  if 'm' in output:
    M = Var(oaxes, values=m, name='m')
    M.atts['longname'] = 'slope'
    rvs.append(M)

  if 'b' in output:
    B = Var(oaxes, values=b, name='b')
    B.atts['longname'] = 'intercept'
    rvs.append(B)

  if 'r2' in output:
    R2 = Var(oaxes, values=r2, name='r2')
    R2.atts['longname'] = 'fraction of variance explained'
    rvs.append(R2)

  if 'p' in output:
    P = Var(oaxes, values=p, name='p')
    P.atts['longname'] = 'p-value'
    rvs.append(P)

  if 'sm' in output:
    SM = Var(oaxes, values=sigm, name='sm')
    SM.atts['longname'] = 'standard deviation of slope parameter'
    rvs.append(SM)

  if 'se' in output:
    SE = Var(oaxes, values=sige, name='se')
    SE.atts['longname'] = 'standard deviation of residuals'
    rvs.append(SE)

  ds = asdataset(rvs)
  ds.atts['description'] = 'linear regression parameters for %s regressed against %s' % (yn, xn)

  return ds
# }}}
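# A minimal sketch exercising regress() on synthetic data with a known
# answer. The axes, values, and tolerances below are made up for
# illustration; any shared axis would do.
def example_regress():
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import Lat, TAxis

  lat = Lat(np.arange(-80., 81., 20.))
  taxis = TAxis(np.arange(50))
  tvals = np.arange(50) / 49.

  np.random.seed(0)
  x = Var([taxis], values=tvals, name='x')
  # y = 3x + 1 along the shared axis at every latitude, plus a little
  # noise (the noise keeps the residual variance nonzero, avoiding
  # division warnings in the t-statistic)
  yvals = np.tile(3. * tvals + 1., (len(lat), 1)) + 1e-3 * np.random.randn(len(lat), 50)
  y = Var([lat, taxis], values=yvals, name='y')

  # Regress over the shared axis; the slope and intercept should be
  # recovered at every latitude
  ds = regress(x, y, axes=[TAxis], output='m,b,p')
  assert np.allclose(ds.m.get(), 3., atol=0.01)
  assert np.allclose(ds.b.get(), 1., atol=0.01)
  return ds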
# Exercise slicing over a range of dimensions, shapes, and slice types
for naxes in (1, 2):
  print("Testing %s dimensions" % naxes)
  for shape in product(*([sizes] * naxes)):
    print(" Testing shape %s" % str(shape))
    np.random.seed(shape)
    values = np.random.randn(*shape)
    # print("full values:", values)
    axes = [axis_classes[i](sorted(np.random.randn(n))) for i, n in enumerate(shape)]
    for i, axis in enumerate(axes):
      axis.name = 'axis%s' % i
      # print("axis %s values: %s" % (i, axis.values))
    var = Var(axes, values=values)
    var.name = 'myvar'
    slicelists = [slices[size] for size in shape]
    print(" # tests: %d" % len(list(product(*slicelists))))
    for sl in product(*slicelists):
      print("  Testing slices %s" % repr(sl))
      # Trap any known failures here to further diagnose
      # assert (count!=4860), "shape: %s, slices: %s, values: %s, axes: %s"%(shape, str(sl), values, [a.values for a in var.axes])
      # Slice the var immediately (before massaging the slices for numpy)
      slicedvar = var.slice[sl]
      # Force integer slices to be integer index arrays
      # (so numpy doesn't remove any dimensions)
      sl = tuple([i] if isinstance(i, int) else i for i in sl)