def getview(self, view, pbar):
    """Return the NaN-ignoring mean of ``self.var`` for the requested view.

    The input is consumed in the chunks yielded by ``loopover``.  For every
    output cell two accumulators are kept: the running sum of the data and
    the running count of contributing samples.  Both start out as NaN and
    are updated through ``np.nansum`` so that a NaN partial result never
    erases a previously accumulated value.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    The accumulated sum divided by the accumulated count
    (shape ``view.shape``).
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    def nan_filled():
        # Zero array turned into NaN in place; cells that are never
        # written therefore stay NaN.
        buf = np.zeros(view.shape, self.dtype)
        buf *= np.nan
        return buf

    total = nan_filled()
    count = nan_filled()

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        # Partial sum of this chunk over the reduced axes
        # (npnansum presumably skips NaNs -- confirm in pygeode.tools).
        chunk_sum = npnansum(chunk, self.indices)
        total[dest] = np.nansum([total[dest], chunk_sum], axis=0)

        # ``1. + chunk * 0.`` is 1 where the data is finite and NaN
        # elsewhere, so its sum is the number of usable samples.
        chunk_cnt = npnansum(1. + chunk * 0., self.indices)
        count[dest] = np.nansum([count[dest], chunk_cnt], axis=0)

    return total / count
def getview(self, view, pbar):  # {{{
    """Return the weighted, NaN-ignoring mean of ``self.var`` for the view.

    Data and weights (``self.mweights``) are read together, chunk by chunk,
    through ``loopover``.  The sum of ``data * weight`` and the sum of the
    weights of the usable samples are accumulated separately and divided at
    the end.  Both accumulators start as NaN and are updated with
    ``np.nansum`` so NaN partial results do not wipe out earlier ones.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    Weighted sum divided by the sum of weights (shape ``view.shape``).
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    def nan_filled():
        # Zero array turned into NaN in place.
        buf = np.zeros(view.shape, self.dtype)
        buf *= np.nan
        return buf

    wsum = nan_filled()
    wtot = nan_filled()

    sources = [self.var, self.mweights]
    for dest, (chunk, wchunk) in loopover(sources, view, self.var.axes, pbar=pbar):
        # Product of data and weights, reduced over the averaging axes
        weighted = npnansum(chunk * wchunk, self.indices)
        wsum[dest] = np.nansum([wsum[dest], weighted], axis=0)

        # Sum of weights; adding ``chunk * 0.`` turns the weight into NaN
        # wherever the data itself is not finite (kludge to get the
        # masking right).
        masked_w = npnansum(wchunk + chunk * 0., self.indices)
        wtot[dest] = np.nansum([wtot[dest], masked_w], axis=0)

    return wsum / wtot
def getview(self, view, pbar):
    """Return the NaN-ignoring sum of ``self.var`` for the requested view.

    The input is consumed in the chunks yielded by ``loopover``; each
    chunk is reduced over ``self.indices`` with ``npnansum`` and folded
    into the result with ``np.nansum`` so that NaN partial sums never erase
    what was accumulated before.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    # Start from NaN so cells that are never visited remain NaN
    total = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        chunk_sum = npnansum(chunk, self.indices)
        # Accumulation must be nan-safe
        total[dest] = np.nansum([total[dest], chunk_sum], axis=0)

    return total
def getview(self, view, pbar):
    """Return the NaN-ignoring mean of ``self.var`` for the requested view.

    The sum of the data and the number of non-NaN samples are accumulated
    chunk by chunk (chunks come from ``loopover``).  The division is only
    carried out where at least one sample was counted; every other cell is
    set to NaN.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    total = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)
    count = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        # Must increment in a nan-safe way
        chunk_sum = npnansum(chunk, self.indices)
        total[dest] = np.nansum([total[dest], chunk_sum], axis=0)

        # Number of non-NaN samples contributed by this chunk
        chunk_cnt = npnansum(~np.isnan(chunk), self.indices)
        count[dest] = np.nansum([count[dest], chunk_cnt], axis=0)

    # Divide only where there is something to average
    has_data = count > 0.
    total[has_data] /= count[has_data]
    total[np.logical_not(has_data)] = np.nan
    return total
def getview(self, view, pbar):
    """Return the NaN-ignoring sum of ``self.var`` for the requested view.

    Each chunk yielded by ``loopover`` is reduced over ``self.indices``
    with ``npnansum`` and folded into the running result with
    ``np.nansum``, so a NaN partial sum never erases earlier contributions.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    # Zero array turned into NaN in place; unvisited cells stay NaN
    total = np.zeros(view.shape, self.dtype)
    total *= np.nan

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        chunk_sum = npnansum(chunk, self.indices)
        # Accumulation must be nan-safe
        total[dest] = np.nansum([total[dest], chunk_sum], axis=0)

    return total
def getview(self, view, pbar):  # {{{
    """Return the weighted, NaN-ignoring mean of ``self.var`` for the view.

    Data and weights (``self.mweights``) are read together through
    ``loopover``.  The sum of ``data * weight`` and the sum of the weights
    of the non-NaN samples are accumulated separately.  The division is
    performed only where the accumulated weight is positive; all other
    cells are set to NaN.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    wsum = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)
    wtot = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)

    sources = [self.var, self.mweights]
    for dest, (chunk, wchunk) in loopover(sources, view, self.var.axes, pbar=pbar):
        # Product of data and weights; must increment in a nan-safe way
        weighted = npnansum(chunk * wchunk, self.indices)
        wsum[dest] = np.nansum([wsum[dest], weighted], axis=0)

        # Sum of weights, with the weight zeroed wherever the data is NaN
        present = np.logical_not(np.isnan(chunk))
        masked_w = npnansum(present * wchunk, self.indices)
        wtot[dest] = np.nansum([wtot[dest], masked_w], axis=0)

    # Normalise only where some weight was accumulated
    has_weight = wtot > 0.
    wsum[has_weight] /= wtot[has_weight]
    wsum[np.logical_not(has_weight)] = np.nan
    return wsum
def getview(self, view, pbar):  # {{{
    """Return the weighted, NaN-ignoring sum of ``self.var`` for the view.

    Data and weights (``self.mweights``) are read together through
    ``loopover``; their product is reduced over ``self.indices`` with
    ``npnansum`` and folded into the result with ``np.nansum``.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    # Zero array turned into NaN in place; unvisited cells stay NaN
    wsum = np.zeros(view.shape, self.dtype)
    wsum *= np.nan

    sources = [self.var, self.mweights]
    for dest, (chunk, wchunk) in loopover(sources, view, self.var.axes, pbar=pbar):
        # Product of data and weights
        weighted = npnansum(chunk * wchunk, self.indices)
        # Accumulation must be nan-safe
        wsum[dest] = np.nansum([wsum[dest], weighted], axis=0)

    return wsum
def getview(self, view, pbar):
    """Return the NaN-ignoring sample variance of ``self.var`` for the view.

    Three accumulators are filled chunk by chunk (chunks come from
    ``loopover``): the sum of the data, the sum of the squared data and the
    number of usable samples.  The variance is then obtained from the
    one-pass formula ``(sum(x^2) - sum(x)^2 / N) / (N - 1)``.  Cells with
    at most one sample have their sums set to NaN before the formula is
    evaluated.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    Array-like of shape ``view.shape`` holding the variance estimate.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    def nan_filled():
        # Zero array turned into NaN in place.
        buf = np.zeros(view.shape, self.dtype)
        buf *= np.nan
        return buf

    sum1 = nan_filled()
    sum2 = nan_filled()
    count = nan_filled()

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        part1 = npnansum(chunk, self.indices)
        sum1[dest] = np.nansum([sum1[dest], part1], axis=0)

        part2 = npnansum(chunk**2, self.indices)
        sum2[dest] = np.nansum([sum2[dest], part2], axis=0)

        # ``1. + chunk * 0.`` is 1 for finite data and NaN otherwise
        part_n = npnansum(1. + chunk * 0., self.indices)
        count[dest] = np.nansum([count[dest], part_n], axis=0)

    # A variance needs at least two samples
    too_few = np.asarray(count <= 1.)
    sum1[too_few] = np.nan
    sum2[too_few] = np.nan

    return (sum2 - sum1**2 / count) / (count - 1.)
def getview(self, view, pbar):
    """Return the NaN-ignoring sample variance of ``self.var`` for the view.

    The sum of the data, the sum of the squared data and the number of
    non-NaN samples are accumulated chunk by chunk (chunks come from
    ``loopover``).  Where more than one sample was counted the variance is
    ``(sum(x^2) - sum(x)^2 / N) / (N - 1)``; every other cell is NaN.

    Parameters
    ----------
    view : object exposing ``shape``
        Requested output region; handed to ``loopover``.
    pbar : progress bar
        Forwarded to ``loopover``.

    Returns
    -------
    numpy.ndarray of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npnansum

    sum1 = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)
    sum2 = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)
    count = np.full(shape=view.shape, fill_value=np.nan, dtype=self.dtype)
    var = np.zeros(view.shape, self.dtype)

    for dest, (chunk,) in loopover(self.var, view, pbar=pbar):
        part1 = npnansum(chunk, self.indices)
        sum1[dest] = np.nansum([sum1[dest], part1], axis=0)

        part2 = npnansum(chunk**2, self.indices)
        sum2[dest] = np.nansum([sum2[dest], part2], axis=0)

        # Number of non-NaN samples in this chunk
        part_n = npnansum(~np.isnan(chunk), self.indices)
        count[dest] = np.nansum([count[dest], part_n], axis=0)

    # A variance needs at least two samples
    enough = count > 1.
    var[enough] = sum2[enough] - sum1[enough]**2 / count[enough]
    var[enough] /= count[enough] - 1.
    var[np.logical_not(enough)] = np.nan
    return var
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None, output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual
    elements of X and Y can be directly paired. In contrast to :func:`difference`,
    X and Y must have the same shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must share all axes over which the means are
      being computed.

    axes : list, optional
      Axes over which to compute means; if nothing is specified, the mean
      is computed over all axes common to X and Y.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom
      of X and Y; the effective number will be given by the number estimated
      from the dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma (surrounding whitespace is ignored). Defaults to 'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the average of the difference can be obtained
      by ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level
        specified by ``alpha``

      'p' and 'ci' are left at 0 wherever fewer than two effective samples
      are available or the estimated variance is zero.

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognized names.
    KeyError
      If one of ``axes`` is not shared by X and Y.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
    is used to test the hypothesis. The number of degrees of freedom is the
    sample size scaled by N_fac, less one. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated
    explicitly by this routine. The p-value and confidence interval are
    computed based on the t-statistic in eq (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    _, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        riaxes = ri_new
    else:
        # Documented default: reduce over every axis shared by X and Y.
        # NOTE(review): assumes axis objects expose their name as ``.name``
        # (as Var objects do) -- confirm against pygeode.axis.
        axes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within X and Y, and the number of
    # samples along them
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays: sum of differences, sum of squared differences
    # and number of valid pairs
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= d[imsk]**2 / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    if N_fac is not None:
        eN = N // N_fac
    else:
        eN = N

    # A t-test needs at least two effective samples
    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))

    # Only evaluate the statistics where the standard error is positive
    dmsk = (den > 0.)
    tstat = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(tstat, eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of paired difference (%s - %s)' % (xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
            alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # The statistic is X - Y, so X is named first (as in the longnames above)
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
def difference(X, Y, axes, alpha=0.05, Nx_fac = None, Ny_fac = None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means.

    alpha : float
      Confidence level for which to compute confidence interval.

    Nx_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom
      of X; the effective number will be given by the number estimated from
      the dataset divided by ``Nx_fac``.

    Ny_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom
      of Y; the effective number will be given by the number estimated from
      the dataset divided by ``Ny_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The difference in the means, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The value ``tdist.cdf(|t|, df)`` multiplied by the sign of the
        difference, where ``t`` is the t-statistic of the difference
      * The confidence interval of the difference at the level specified
        by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20) of
    von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are
    scaled by Nx_fac and Ny_fac, respectively. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the number of effective degrees of freedom are not calculated explicitly
    by this routine. The p-value and confidence interval are computed based on
    the t-statistic in eq (6.19).'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes in each variable and the total number
    # of samples along them (np.prod: np.product no longer exists in NumPy 2)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays: sums, sums of squares and sample counts.
    # They start as NaN and are updated with np.nansum below.
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Sum of weights (kludge to get masking right): 1 where the data is
        # finite, NaN elsewhere
        Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0)

    for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Sum of weights (kludge to get masking right)
        Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx = (xx - x**2/Nx) / (Nx - 1)
    yy = (yy - y**2/Ny) / (Ny - 1)
    x /= Nx
    y /= Ny

    # Effective sample sizes
    eNx = Nx//Nx_fac if Nx_fac is not None else Nx
    eNy = Ny//Ny_fac if Ny_fac is not None else Ny

    d = x - y
    den = np.sqrt(xx/eNx + yy/eNy)
    df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

    p = tdist.cdf(abs(d/den), df)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, df) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn:
        name = xn
    else:
        name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=df, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else:  # Degenerate case
        return d, df, p, ci
def paired_difference(X, Y, axes, alpha=0.05, N_fac = None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual
    elements of X and Y can be directly paired. In contrast to :func:`difference`,
    X and Y must have the same shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The difference in the means, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The value ``tdist.cdf(|t|, df)`` multiplied by the sign of the
        difference, where ``t`` is the t-statistic of the difference
      * The confidence interval of the difference at the level specified
        by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
    is used to test the hypothesis. The number of degrees of freedom is the
    sample size scaled by N_fac, less one. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated
    explicitly by this routine. The p-value and confidence interval are
    computed based on the t-statistic in eq (6.21).'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes in each variable and the total number
    # of samples along them (np.prod: np.product no longer exists in NumPy 2)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays: sum of differences, sum of squared differences
    # and number of usable pairs.  They start as NaN and are updated with
    # np.nansum below.
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
        # Sum of weights (kludge to get masking right): 1 where the
        # difference is finite, NaN elsewhere
        N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    dd = (dd - d**2/N) / (N - 1)
    # Normalise by the number of samples that actually entered the sum; the
    # full axis length Nx would bias the mean whenever data is missing.
    d /= N

    if N_fac is not None:
        eN = N//N_fac
    else:
        eN = N

    den = np.sqrt(dd/(eN - 1))

    p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, eN - 1) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn:
        name = xn
    else:
        name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=eN-1, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else:  # Degenerate case
        return d, eN-1, p, ci
def correlate(X, Y, axes=None, pbar=None):  # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    rho, p : :class:`Var`
      The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.
      ``p`` is stored as ``tdist.cdf(t, N - 2)`` multiplied by the sign of
      ``rho``. Both are NaN where no valid pair of samples was found.

    Raises
    ======
    KeyError
      If one of ``axes`` is not shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that are not reduced over become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # The reduction axes are the trailing axes of each input chunk
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays.  np.full always returns an ndarray (0-d when
    # every axis is reduced), whereas ``np.zeros(shape) * np.nan`` collapses
    # to an immutable NumPy scalar in that case.
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata*ydata

        # Expand both inputs to the broadcast shape of their product
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A sample only counts if it is valid in both X and Y
        missing = np.isnan(xydata)
        xdata[missing] = np.nan
        ydata[missing] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum([Na[outsl], npnansum(~missing, siaxes)], 0)

    eps = 1e-14
    imsk = ~(Na < eps)

    # Remove the means.  Arrays are indexed before any arithmetic so the
    # operands remain ndarrays even in the 0-d case.
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    yy[imsk] -= y[imsk]**2 / Na[imsk]
    xy[imsk] -= x[imsk]*y[imsk] / Na[imsk]

    # Compute correlation coefficient, t-statistic, p-value
    norm = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    norm[imsk] = np.sqrt(xx[imsk]*yy[imsk])
    pos = norm > 0.
    rho[pos] = xy[pos] / norm[pos]

    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    den = np.asarray(np.maximum(1. - rho**2, eps))

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho[imsk]) * np.sqrt((Na[imsk] - 2.)/den[imsk])
    p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])

    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write: xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
    P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
    return Rho, P
def isnonzero(X, axes, alpha=0.05, N_fac = None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X, against the hypothesis that it is 0.

    Parameters
    ==========
    X : :class:`Var`
      Variable to average.

    axes : list
      Axes over which to compute the mean.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Three quantities are computed:

      * The mean value of X
      * The value ``tdist.cdf(|t|, df)`` multiplied by the sign of the mean,
        where ``t`` is the t-statistic of the mean
      * The confidence interval of the mean at the level specified by alpha

      If the average is taken over all axes of X resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed for
    the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    riaxes = [X.whichaxis(n) for n in axes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # Total number of samples along the reduction axes
    # (np.prod: np.product no longer exists in NumPy 2)
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays: sum, sum of squares and sample count.
    # They start as NaN and are updated with np.nansum below.
    x = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
        # Sum of weights (kludge to get masking right): 1 where the data is
        # finite, NaN elsewhere
        Na[outsl] = np.nansum([Na[outsl], npnansum(1. + xdata*0., riaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx = (xx - x**2/Na) / (Na - 1)
    x /= Na

    # Effective sample size
    eNa = Na//N_fac if N_fac is not None else Na

    # Standard deviation of the mean
    sdom = np.sqrt(xx/eNa)

    p = tdist.cdf(abs(x/sdom), eNa - 1)*np.sign(x)
    ci = tdist.ppf(1. - alpha/2, eNa - 1) * sdom

    name = X.name if X.name != '' else 'X'

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        M = Var(oaxes, values=x, name=name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([M, P, CI])
    else:  # Degenerate case
        return x, p, ci
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):  # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
    the hypothesis that it is 0.

    Parameters
    ==========
    X : :class:`Var`
      Variable to average.

    axes : list, optional
      Axes over which to compute the mean; if nothing is specified, the mean is
      computed over all axes.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'm,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if
      ``ds`` is the returned dataset, the mean value can be obtained through
      ``ds.m``). The following quantities can be calculated.

      * 'm': The mean value of X
      * 'p': The probability of the computed value if the population mean
        was zero
      * 'ci': The confidence interval of the mean at the level specified by
        alpha

      'p' and 'ci' are left at 0 where no valid sample was found.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed for
    the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    if axes is None:
        # Documented default: average over every axis of X
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # Total number of samples along the reduction axes
    # (np.prod: np.product no longer exists in NumPy 2)
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays: sum, sum of squares and sample count.
    # They start as NaN and are updated with np.nansum below.
    x = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    # Effective sample size; np.asarray keeps it indexable when the result
    # is 0-dimensional (floor division of a 0-d array yields a scalar)
    if N_fac is not None:
        eNa = np.asarray(Na // N_fac)
    else:
        eNa = Na

    sdom = np.zeros((oview.shape), 'd')
    p = np.zeros((oview.shape), 'd')
    t = np.zeros((oview.shape), 'd')
    ci = np.zeros((oview.shape), 'd')

    # Standard deviation of the mean
    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    # The t-statistic is only defined where that deviation is positive;
    # elsewhere it stays 0
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence intervale of the mean value of %s' % (name, )
        rvs.append(ci)

    return asdataset(rvs)
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):  # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to X and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'm,b,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The names
      of the variables match the output request string (i.e. if ``ds`` is the
      returned dataset, the linear coefficient of the regression can be obtained
      by ``ds.m``).

      A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
      following parameters can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': p-value of regression; see notes.
      * 'sm': Standard deviation of linear coefficient estimate
      * 'se': Standard deviation of residuals

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognized output names.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.3. The p-value 'p' is computed using the t-statistic given in
    section 8.3.8, and confidence intervals for the slope and intercept can be
    computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
    The data is assumed to be normally distributed.

    Points at which the fit cannot be computed (too few valid pairs, or no
    variance in X) are set to NaN in the outputs.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    # Work out which axes are reduced over (riaxes) and which are kept (oiaxes)
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    # Output axes come first, reduction axes are placed at the end
    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (sums, sums of squares/products, and counts)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape so that a point missing in
        # either variable can be masked out of both
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    # Effective degrees of freedom of the residuals
    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    # nmsk: enough data for the fit; dmsk: non-zero variance in X
    nmsk = (N_eff > 0.)

    # Convert raw sums to centred sums of squares / cross products
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # sige temporarily holds the residual variance; sigm needs it in that form
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # Take the square root over the same mask that was assigned above; the
    # two masks are not guaranteed to select the same points
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    # Blank out every point where the fit is not defined
    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    # Wrap the requested results as variables on the output axes
    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None, output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list, optional, defaults to None
      Axes over which to compute means; if nothing is specified, the mean is
      computed over all axes common to X and Y.

    alpha : float, optional; defaults to 0.05
      Confidence level for which to compute confidence interval.

    Nx_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom of
      X; the effective number will be given by the number estimated from the
      dataset divided by ``Nx_fac``.

    Ny_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom of
      Y; the effective number will be given by the number estimated from the
      dataset divided by ``Ny_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The names
      of the variables match the output request string (i.e. if ``ds`` is the
      returned dataset, the average of the difference can be obtained by
      ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified
        by ``alpha``

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognized output names.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20) of
    von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled
    by Nx_fac and Ny_fac, respectively. This provides a means of taking into
    account serial correlation in the data (see sections 6.6.7-9), but the
    number of effective degrees of freedom are not calculated explicitly by this
    routine. The p-value and confidence interval are computed based on the
    t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    # Work out which axes are reduced over (riaxes) and which are kept
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
        raxes = axes
    else:
        # Default: reduce over every axis shared by X and Y.
        # NOTE(review): identifies the shared axes by their ``name`` attribute,
        # assuming whichaxis/hasaxis accept axis names -- confirm.
        raxes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within each input variable
    ixaxes = [X.whichaxis(n) for n in raxes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in raxes if Y.hasaxis(n)]

    # Total number of elements along the reduction axes (sanity check only;
    # the per-point counts of valid data are accumulated below)
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (sums, sums of squares, and counts)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes, optionally reduced to account for serial
    # correlation
    if Nx_fac is not None:
        eNx = Nx // Nx_fac
    else:
        eNx = Nx
    if Ny_fac is not None:
        eNy = Ny // Ny_fac
    else:
        eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Effective degrees of freedom; den first holds the denominator of that
    # expression and is then reused for the standard error of the difference
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    # Two-sided p-value and confidence interval from the t distribution
    tstat = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(tstat, df[dmsk]))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    # Statistics are undefined where the test could not be carried out
    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # The quantity tested is X - Y, matching the variable long names above
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
def correlate(X, Y, axes=None, output='r2,p', pbar=None):  # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'r2,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the correlation coefficient can be obtained
      through ``ds.r2``).

      * 'r2': The correlation coefficient :math:`\rho_{XY}`
      * 'p': The p-value; see notes.

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognized output names.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
    correlation coefficient of equal or greater magnitude (two-sided) to the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.

    Both outputs are NaN where there is no valid data or where either variable
    has zero variance; the p-value is also NaN where fewer than three valid
    pairs are available.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays (sums, sums of squares/products, and counts)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape so that a point missing in
        # either variable can be masked out of both
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    imsk = (Na > 0)

    # Convert raw sums to centred sums of squares / cross products
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / den[dmsk]

    omr2 = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    omr2[omr2 < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # The t-statistic has Na - 2 degrees of freedom, so it is only defined
    # where more than two valid pairs are available; elsewhere p is NaN
    tmsk = (Na > 2.)
    t[tmsk] = np.abs(rho)[tmsk] * np.sqrt((Na[tmsk] - 2.) / omr2[tmsk])
    p[tmsk] = 2. * (1. - tdist.cdf(t[tmsk], Na[tmsk] - 2))

    p[~tmsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write: xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds