def getaxis(self, iaxis):
    '''
    Return the axis of this dataset selected by ``iaxis``.

    The identifier is resolved to a position in ``self.axes`` by
    :func:`pygeode.tools.whichaxis`; whatever that function raises for an
    identifier it cannot match propagates to the caller.

    See Also
    --------
    Var.getaxis
    '''
    from pygeode.tools import whichaxis
    return self.axes[whichaxis(self.axes, iaxis)]
def hasaxis(self, iaxis):
    '''
    Report whether ``iaxis`` identifies an axis of this dataset.

    Returns ``True`` when :func:`pygeode.tools.whichaxis` resolves the
    identifier against ``self.axes`` and ``False`` when that lookup raises
    ``KeyError``.  Any other exception is not intercepted.

    See Also
    --------
    Var.hasaxis
    '''
    from pygeode.tools import whichaxis
    try:
        # Only the success or failure of the lookup matters, not the index.
        whichaxis(self.axes, iaxis)
    except KeyError:
        return False
    return True
def whichaxis(self, iaxis):  # {{{
    """
    Find the position of an :class:`Axis` belonging to this variable.

    Parameters
    ----------
    iaxis : string, :class:`Axis` class, or int
      The search criteria used to select the axis.

    Returns
    -------
    i : The index of the matching Axis within ``self.axes``.  Raises
      ``KeyError`` if there is no match.

    Examples
    --------
    >>> from pygeode.tutorial import t1
    >>> T = t1.Temp
    >>> print(T.axes)
    (<Lat>, <Lon>)
    >>> print(T.whichaxis('lat'))
    0
    >>> print(T.whichaxis('lon'))
    1
    >>> print(T.whichaxis('time'))
    Traceback (most recent call last):
      ...
    KeyError: "axis 'time' not found in (<Lat>, <Lon>)"

    See Also
    --------
    getaxis, hasaxis, Axis
    """
    # Imported under an alias so the helper is not confused with this method.
    from pygeode.tools import whichaxis as _locate
    return _locate(self.axes, iaxis)
def whichaxis(self, iaxis):  # {{{
    """
    Return the index of the :class:`Axis` of this variable matching ``iaxis``.

    Parameters
    ----------
    iaxis : string, :class:`Axis` class, or int
      The criterion used to pick out the axis.

    Returns
    -------
    i : Position of the matching Axis in ``self.axes``.  A ``KeyError`` is
      raised when no axis matches.

    Examples
    --------
    >>> from pygeode.tutorial import t1
    >>> T = t1.Temp
    >>> print(T.axes)
    (<Lat>, <Lon>)
    >>> print(T.whichaxis('lat'))
    0
    >>> print(T.whichaxis('lon'))
    1
    >>> print(T.whichaxis('time'))
    Traceback (most recent call last):
      ...
    KeyError: "axis 'time' not found in (<Lat>, <Lon>)"

    See Also
    --------
    getaxis, hasaxis, Axis
    """
    from pygeode.tools import whichaxis as find_axis
    position = find_axis(self.axes, iaxis)
    return position
def correlate(X, Y, axes=None, output='r2,p', pbar=None):  # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma; whitespace around each specification is ignored. Defaults to
      'r2,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if
      ``ds`` is the returned dataset, the correlation coefficient can be
      obtained through ``ds.r2``).

      * 'r2': The correlation coefficient :math:`\rho_{XY}` (the coefficient
        itself, not its square)
      * 'p':  The p-value; see notes.

      Points with no valid (non-NaN) pairs, or with zero variance in X or Y,
      are set to NaN in both outputs.

    Raises
    ======
    ValueError
      If ``output`` names none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
    correlation coefficient of equal or greater magnitude (two-sided) to the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes.
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays (NaN-initialised; np.nansum treats NaN as zero)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape of their product.
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A pair counts only when both members are valid.
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    imsk = (Na > 0)

    # Convert raw sums of squares/products into centred ones.
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings.
    # The threshold was previously referenced without being defined in this
    # function; 1e-14 matches the value used by the older two-output correlate.
    # NOTE(review): if the module defines its own ``eps``, confirm the value.
    eps = 1e-14
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None, output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list, optional, defaults to None
      Axes over which to compute means; if nothing is specified, the mean is
      computed over all axes common to X and Y.

    alpha : float, optional; defaults to 0.05
      Confidence level for which to compute confidence interval.

    Nx_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom
      of X; the effective number will be given by the number estimated from
      the dataset divided by ``Nx_fac``.

    Ny_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom
      of Y; the effective number will be given by the number estimated from
      the dataset divided by ``Ny_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma; whitespace around each specification is ignored. Defaults to
      'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the average of the difference can be obtained
      by ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified
        by ``alpha``

      'df', 'p' and 'ci' are NaN wherever the statistic cannot be evaluated
      (too few effective samples or zero variance).

    Raises
    ======
    ValueError
      If ``output`` names none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20) of
    von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are
    scaled by Nx_fac and Ny_fac, respectively. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the number of effective degrees of freedom are not calculated explicitly
    by this routine. The p-value and confidence interval are computed based on
    the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
        raxes = axes
    else:
        # Default: reduce over every axis shared by X and Y.  Previously the
        # code iterated over ``axes`` itself here and failed on None.
        # NOTE(review): assumes each axis exposes ``.name`` and that
        # Var.whichaxis/hasaxis match on that name -- confirm.
        raxes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within each variable's own axes.
    ixaxes = [X.whichaxis(n) for n in raxes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in raxes if Y.hasaxis(n)]

    # np.prod rather than np.product: the latter was removed in NumPy 2.0.
    nx_len = np.prod([len(X.axes[i]) for i in ixaxes])
    ny_len = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert nx_len > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_len > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (NaN-initialised; np.nansum treats NaN as zero)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes after optional rescaling
    if Nx_fac is not None:
        eNx = Nx // Nx_fac
    else:
        eNx = Nx
    if Ny_fac is not None:
        eNy = Ny // Ny_fac
    else:
        eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    # From here on ``den`` is the standard error of the difference.
    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    # ``p`` temporarily holds |t| before being replaced by the p-value.
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # The difference is X - Y, so X is named first (was reversed).
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None, output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that
    individual elements of X and Y can be directly paired. In contrast to
    :func:`difference`, X and Y must have the same shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must share all axes over which the means are
      being computed.

    axes : list, optional
      Axes over which to compute means; if nothing is specified, the mean is
      computed over all axes common to X and Y.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom
      of X and Y; the effective number will be given by the number estimated
      from the dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma; whitespace around each specification is ignored. Defaults to
      'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the average of the difference can be obtained
      by ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified
        by ``alpha``

    Raises
    ======
    ValueError
      If ``output`` names none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
    is used to test the hypothesis. The number of degrees of freedom is the
    sample size scaled by N_fac, less one. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated
    explicitly by this routine. The p-value and confidence interval are
    computed based on the t-statistic in eq (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
        raxes = axes
    else:
        # Default: reduce over every axis shared by X and Y.  Previously the
        # code iterated over ``axes`` itself here and failed on None.
        # NOTE(review): assumes each axis exposes ``.name`` and that
        # Var.whichaxis/hasaxis match on that name -- confirm.
        raxes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within each variable's own axes.
    # np.prod rather than np.product: the latter was removed in NumPy 2.0.
    ixaxes = [X.whichaxis(n) for n in raxes if X.hasaxis(n)]
    nx_len = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in raxes if Y.hasaxis(n)]
    ny_len = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and nx_len == ny_len, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert nx_len > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_len > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays (NaN-initialised; np.nansum treats NaN as zero)
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    # NOTE(review): the chunks are requested with inaxes=srcaxes but reduced
    # over indices into X.axes; presumably these coincide because X's axes
    # lead srcaxes in the same order -- confirm against combine_axes.
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    # Effective sample size after optional rescaling
    if N_fac is not None:
        eN = N // N_fac
    else:
        eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    # ``p`` temporarily holds |t| before being replaced by the p-value.
    # NOTE(review): unlike difference(), points outside dmsk keep p = ci = 0
    # rather than NaN; left unchanged here -- confirm whether that is intended.
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # The difference is X - Y, so X is named first (was reversed).
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):  # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, it
      is computed over all axes common to X and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma; whitespace around each specification is ignored. Defaults to
      'm,b,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the linear coefficient of the regression can be
      obtained by ``ds.m``).

      A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
      following parameters can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': p-value of regression; see notes.
      * 'sm': Standard deviation of linear coefficient estimate
      * 'se': Standard deviation of residuals

      Points with too few effective samples or zero variance in X are NaN.

    Raises
    ======
    ValueError
      If ``output`` names none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers
    1999, section 8.3. The p-value 'p' is computed using the t-statistic given
    in section 8.3.8, and confidence intervals for the slope and intercept can
    be computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
    respectively). The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes.
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (NaN-initialised; np.nansum treats NaN as zero)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape of their product.
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A pair counts only when both members are valid.
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data pairs
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    # Two degrees of freedom are consumed by the slope and the intercept.
    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    # Convert raw sums of squares/products into centred ones.
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # ``sige`` holds the residual variance until the square root below.
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # Use the same mask on both sides: the right-hand side used to select
    # with dmsk, which fails or misaligns values whenever nmsk and dmsk differ.
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):  # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

    Parameters
    ==========
    Xs : list of :class:`Var` instances
      Variables to treat as independent regressors. Must have at least one
      axis in common with each other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the
      Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, it
      is computed over all axes common to the Xs and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma; whitespace around each specification is ignored. Defaults to
      'B,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument.

      A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
      Note that a constant term is not included by default. The following
      parameters can be returned:

      * 'B' (or 'beta'): Linear coefficients :math:`\beta_i` of each
        regressor; the variable is named ``beta``
      * 'r2': Fraction of the variance in Y explained by all Xs
        (:math:`R^2`); the variable is named ``R2``
      * 'p': p-value of regression; see notes.
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      'B', 'p', and 'sb' carry an extra trailing ``regressor`` axis with one
      entry per regressor; 'covb' carries two (``regressor``, ``regressor2``).

    Raises
    ======
    ValueError
      If ``output`` names none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not shared by the Xs and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers
    1999, section 8.4. The p-value 'p' is computed using the t-statistic
    appropriate for the multi-variate normal estimator :math:`\hat{\vec{a}}`
    given in section 8.4.2; it corresponds to the probability of obtaining the
    regression coefficient under the null hypothesis that there is no linear
    relationship. Note this may not be the best way to determine if a given
    parameter is contributing a significant fraction to the explained variance
    of Y. The variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the
    square root of the diagonal elements of
    :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and Zwiers,
    respectively. The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now.  'B' is the documented (and default) spelling
    # for the coefficients, so treat it as an alias of the internal 'beta';
    # previously it was silently discarded.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    output = [o.strip() for o in output.split(',')]
    output = [aliases.get(o, o) for o in output]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple regression. Possible outputs are %s.'
            % str(ovars))

    # Accept any sequence of regressors, not only a list.
    Xs = list(Xs)
    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes:
                ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        # Shared axes that were not requested become output axes.
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr, )
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape1, 'd')
    xx = np.zeros(oshape2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; mirrored below.
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx at every output point; np.linalg.inv operates on the trailing
    # two dimensions of a stacked array, so no Python loop is needed.
    xxinv = np.linalg.inv(xx)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    # One array (shaped like the output view) per regressor.
    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. * (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        # Put the regressor dimension last to match oaxes + (ra,).  The former
        # np.transpose(..., [Nd] + range(Nd)) applied the inverse permutation,
        # which is only correct for fewer than two output axes.
        p = np.stack(p, axis=-1)
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        # Same axis ordering requirement as for 'p'.
        sigbeta = np.stack(sigbeta, axis=-1)
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(oshape2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
def correlate(X, Y, axes=None, pbar=None): # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      ``pygeode.progress.PBar`` instance is created.

    Returns
    =======
    rho, p : :class:`Var`
      The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.
      Both are defined on the axes that are not reduced over, and are set to
      NaN wherever no valid (non-NaN) pair of values was found.

    Raises
    ======
    KeyError
      If one of the requested ``axes`` is not shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.

    Pairs in which either value is NaN are excluded from all sums. The value
    returned as ``p`` is the cumulative probability of the t-distribution
    evaluated at :math:`|t|`, multiplied by the sign of :math:`\rho_{XY}`.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested are kept as output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays; they start as NaN and are accumulated with nansum
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata*ydata

        # Expand both operands to the shape of their product so that the same
        # mask of missing pairs can be applied to each of them
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A pair is missing if either of its members is NaN
        missing = np.isnan(xydata)
        xdata[missing] = np.nan
        ydata[missing] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights (number of valid pairs)
        Na[outsl] = np.nansum([Na[outsl], npnansum(~missing, siaxes)], 0)

    eps = 1e-14
    # Points at which at least one valid pair was accumulated
    imsk = ~(Na < eps)

    # Remove the means to obtain the centred sums of squares and products
    xx[imsk] -= (x*x)[imsk]/Na[imsk]
    yy[imsk] -= (y*y)[imsk]/Na[imsk]
    xy[imsk] -= (x*y)[imsk]/Na[imsk]

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx*yy)[imsk])
    # den is only positive inside imsk, where it equals sqrt(xx*yy)
    pos = den > 0.
    rho[pos] = xy[pos] / den[pos]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.)/den[imsk])
    p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])
    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
    P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
    return Rho, P
def paired_difference(X, Y, axes, alpha=0.05, N_fac=None, pbar=None): # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual
    elements of X and Y can be directly paired. In contrast to :func:`difference`,
    X and Y must have the same size along the reduction axes.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer, optional
      A factor by which to rescale the number of samples; the effective number
      is the number of valid (non-NaN) differences floor-divided by ``N_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      ``pygeode.progress.PBar`` instance is created.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The mean of the differences, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The probability of the computed difference if the population difference was zero
      * The confidence interval of the difference at the level specified by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is
    used to test the hypothesis. The number of degrees of freedom is the sample
    size scaled by N_fac, less one. This provides a means of taking into account
    serial correlation in the data (see sections 6.6.7-9), but the appropriate
    number of effective degrees of freedom are not calculated explicitly by this
    routine. The p-value and confidence interval are computed based on the
    t-statistic in eq (6.21).

    NaN differences are excluded from the sums. The probability returned is
    the cumulative t-distribution probability of the absolute t-statistic,
    multiplied by the sign of the mean difference.'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Nominal number of elements along the reduction axes of each variable
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays; they start as NaN and are accumulated with nansum
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Sum of weights (kludge to get masking right): each valid difference
        # contributes 1, each NaN difference stays NaN and is ignored
        N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    dd = (dd - d**2/N) / (N - 1)
    # Divide by the number of valid samples (not the nominal size) so that the
    # mean stays consistent with the variance when some data are missing
    d /= N

    if N_fac is not None:
        eN = N//N_fac
    else:
        eN = N

    den = np.sqrt(dd/(eN - 1))

    p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, eN - 1) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn:
        name = xn
    else:
        name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=eN-1, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else: # Degenerate case
        return d, eN-1, p, ci
def difference(X, Y, axes, alpha=0.05, Nx_fac=None, Ny_fac=None, pbar=None): # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means.

    alpha : float
      Confidence level for which to compute confidence interval.

    Nx_fac : integer, optional
      A factor by which to rescale the estimated number of degrees of freedom of
      X; the effective number will be given by the number of valid (non-NaN)
      values floor-divided by ``Nx_fac``.

    Ny_fac : integer, optional
      A factor by which to rescale the estimated number of degrees of freedom of
      Y; the effective number will be given by the number of valid (non-NaN)
      values floor-divided by ``Ny_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      ``pygeode.progress.PBar`` instance is created.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The difference in the means, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The probability of the computed difference if the population difference was zero
      * The confidence interval of the difference at the level specified by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20) of
    von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
    Nx_fac and Ny_fac, respectively. This provides a means of taking into account
    serial correlation in the data (see sections 6.6.7-9), but the number of
    effective degrees of freedom are not calculated explicitly by this routine.
    The p-value and confidence interval are computed based on the t-statistic in
    eq (6.19).

    NaN values are excluded from the sums. The probability returned is the
    cumulative t-distribution probability of the absolute t-statistic,
    multiplied by the sign of the difference.'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Nominal number of elements along the reduction axes of each variable;
    # only used to check that there is something to average over
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    nx_full = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    ny_full = np.prod([len(Y.axes[i]) for i in iyaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert nx_full > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_full > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays; they start as NaN and are accumulated with nansum
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Sum of weights (kludge to get masking right): each valid value
        # contributes 1, each NaN stays NaN and is ignored
        Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0)

    for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Sum of weights (kludge to get masking right)
        Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx = (xx - x**2/Nx) / (Nx - 1)
    yy = (yy - y**2/Ny) / (Ny - 1)
    x /= Nx
    y /= Ny

    if Nx_fac is not None:
        eNx = Nx//Nx_fac
    else:
        eNx = Nx
    if Ny_fac is not None:
        eNy = Ny//Ny_fac
    else:
        eNy = Ny

    d = x - y
    den = np.sqrt(xx/eNx + yy/eNy)
    df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

    p = tdist.cdf(abs(d/den), df)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, df) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn:
        name = xn
    else:
        name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=df, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else: # Degenerate case
        return d, df, p, ci
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'): # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

    Parameters
    ==========
    Xs : sequence of :class:`Var` instances
      Variables to treat as independent regressors. Must have at least one axis
      in common with each other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to the Xs and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      ``pygeode.progress.PBar`` instance is created.

    N_fac : integer, optional
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number of elements along the
      regression axes floor-divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'B,p'.

    Returns
    =======
    results : list
      One entry per item of ``output``, in the order requested. A fit of the
      form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a
      constant term is not included by default. The following parameters can
      be returned:

      * 'B': Linear coefficients :math:`\beta_i` of each regressor
      * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
      * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      If the regression is computed over all axes so that the result is a
      scalar, numerical values are returned; otherwise :class:`Var` instances
      are returned ('covb' is always a :class:`Var`). The outputs 'B', 'p', and
      'sb' each contribute a single entry holding one item per regressor.
      Unrecognized specifications are reported with a printed message and
      otherwise ignored.

    Raises
    ======
    KeyError
      If one of the requested ``axes`` is not shared by the Xs and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.4. The p-value 'p' is computed using the t-statistic appropriate
    for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
    8.4.2; note this may not be the best way to determine if a given parameter is
    contributing a significant fraction to the explained variance of Y. The
    value returned is the cumulative t-distribution probability of the absolute
    t-statistic, multiplied by the sign of the coefficient. The variances 'se'
    and 'sb' are :math:`\hat{\sigma}_E` and the square root of the diagonal
    elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)^{-1}` in von Storch and
    Zwiers, respectively. The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Accept any sequence of regressors (list, tuple, ...)
    Xs = list(Xs)
    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia not in riaxes:
                raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
            ri_new.append(ia)
        # Shared axes that were not requested are kept as output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

    # Construct work arrays
    oshape = oview.shape
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape + (Nr,), 'd')
    xx = np.zeros(oshape + (Nr, Nr), 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl+(i,)] += npsum(xdata[i]*ydata, siaxes)
            # Only the lower triangle is accumulated; xTx is symmetric
            for j in range(i+1):
                xx[outsl+(i,j)] += npsum(xdata[i]*xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx at every output point. np.linalg.inv acts on the last two
    # dimensions, so the whole stack of matrices is inverted in a single call.
    xxinv = np.linalg.inv(xx)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var

    output = output.split(',')
    ret = []

    for o in output:
        if o == 'B':
            if len(oaxes) == 0:
                ret.append(beta)
            else:
                ret.append([Var(oaxes, values=beta[...,i], name='beta_%s' % xns[i]) for i in range(Nr)])
        elif o == 'r':
            vary = (yy - y**2/N)
            R2 = 1 - (yy - vare) / vary
            if len(oaxes) == 0:
                ret.append(R2)
            else:
                ret.append(Var(oaxes, values=R2, name='R2'))
        elif o == 'p':
            ps = [tdist.cdf(np.abs(beta[...,i]/sigbeta[i]), N_eff-Nr) * np.sign(beta[...,i]) for i in range(Nr)]
            if len(oaxes) == 0:
                ret.append(ps)
            else:
                ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
        elif o == 'sb':
            if len(oaxes) == 0:
                ret.append(sigbeta)
            else:
                ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
        elif o == 'covb':
            from .axis import NonCoordinateAxis as nca
            cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
            cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
            # Residual variance times the inverse of xTx, broadcast over the
            # two trailing regressor dimensions
            resid = np.asarray(yy - vare)[..., np.newaxis, np.newaxis]
            sigmat = resid * xxinv / N_eff
            ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
        elif o == 'se':
            se = np.sqrt((yy - vare) / N_eff)
            if len(oaxes) == 0:
                ret.append(se)
            else:
                ret.append(Var(oaxes, values=se, name='sig_resid'))
        else:
            print('multiple_regress: unrecognized output "%s"' % o)

    return ret
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'): # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      ``pygeode.progress.PBar`` instance is created.

    N_fac : integer, optional
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number of elements along the
      regression axes floor-divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'm,b,p'.

    Returns
    =======
    results : list of :class:`Var` instances.
      The return values are specified by the ``output`` argument. A fit of the
      form :math:`Y = m X + b + \epsilon` is assumed, and the following
      parameters can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': Probability of this fit if the true linear coefficient was zero
      * 'sm': Standard deviation of the linear coefficient
      * 'se': Standard deviation of residuals

      The requested quantities are always returned in the order listed above,
      regardless of their order in ``output``.

    Raises
    ======
    KeyError
      If one of the requested ``axes`` is not shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.3. The p-value 'p' is computed using the t-statistic given in
    section 8.3.8, and confidence intervals for the slope and intercept can be
    computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
    The value returned for 'p' is the cumulative t-distribution probability of
    the absolute t-statistic, with the effective number of samples less two as
    degrees of freedom, multiplied by the sign of the slope. The data is
    assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested are kept as output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    y = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    xy = np.zeros(oview.shape, 'd')
    yy = np.zeros(oview.shape, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        x[outsl] += npsum(xdata, siaxes)
        y[outsl] += npsum(ydata, siaxes)
        xx[outsl] += npsum(xdata**2, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        xy[outsl] += npsum(xdata*ydata, siaxes)

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx -= x**2/N
    yy -= y**2/N
    xy -= (x*y)/N

    m = xy/xx
    b = (y - m*x)/float(N)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    sige = (yy - m * xy) / (N_eff - 2.)
    sigm = np.sqrt(sige / xx)
    t = np.abs(m) / sigm
    # Use the effective sample size here as well, so that N_fac affects the
    # degrees of freedom consistently with the residual variance above
    p = tdist.cdf(t, N_eff - 2) * np.sign(m)
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    output = output.split(',')
    ret = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
        ret.append(M)
    if 'b' in output:
        B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
        ret.append(B)
    if 'r' in output:
        ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
    if 'p' in output:
        P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
        ret.append(P)
    if 'sm' in output:
        # NOTE: the values are the standard deviation of the slope; the label
        # is left unchanged so that existing lookups by name keep working
        ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
    if 'se' in output:
        ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

    return ret