def plotting_positions(data, alpha=0.4, beta=0.4):
    """Returns the plotting positions (or empirical percentile points) for the
    data.

    Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
        - i is the rank order statistics
        - n is the number of unmasked values
        - alpha and beta are two parameters.

    Typical values for alpha and beta are:
        - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
        - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function
          (R, type 5)
        - (0,0)    : *p(k) = k/(n+1)* : Weibull (R type 6)
        - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case,
          p(k) = mode[F(x[k])]. That's R default (R type 7)
        - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then
          p(k) ~ median[F(x[k])]. The resulting quantile estimates are
          approximately median-unbiased regardless of the distribution of x.
          (R type 8)
        - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom. The resulting quantile
          estimates are approximately unbiased if x is normally distributed
          (R type 9)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM

:Inputs:
    data : sequence
        Input data. It is converted to a masked array and flattened.
    alpha : float *[0.4]*
        First plotting positions parameter.
    beta : float *[0.4]*
        Second plotting positions parameter.

:Returns:
    A 1D masked array with one entry per input value. Masked inputs stay
    masked (their underlying value is 0).
    """
    # Flatten to 1D so that argsort() never depends on the default axis used
    # for multi-dimensional masked arrays.
    data = masked_array(data, copy=False).ravel()
    n = data.count()
    plpos = numpy.empty(data.size, dtype=float_)
    # Masked entries can sit anywhere in the array, so zero every slot first.
    plpos[:] = 0
    # assumes argsort() orders masked entries last, as the original slicing
    # by [:n] did -- the first n indices are then the unmasked values.
    order = data.argsort()[:n]
    # "n + 1." forces a float division even when alpha and beta are integers.
    plpos[order] = (numpy.arange(1, n+1) - alpha) / (n + 1. - alpha - beta)
    return masked_array(plpos, mask=data._mask)
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
    """Computes quantile estimates with the Harrell-Davis method, where the
    estimates are calculated as a weighted linear combination of order
    statistics.
    If var=True, the variance of the estimate is also returned.
    Depending on var, returns a (p,) array of quantiles or a (2,p) array of
    quantiles and variances.

:Inputs:
    data: ndarray
        Data array.
    prob: Sequence
        List of quantiles to compute.
    axis : integer *[None]*
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : boolean *[False]*
        Whether to return the variance of the estimate.

:Returns:
    A masked array; entries that could not be computed (NaN) are masked.
    Samples with fewer than 2 unmasked values give fully masked results.

:Note:
    The function is restricted to 2D arrays.
    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array."
        # compressed() is already 1D; squeezing it would turn a single-value
        # sample into a 0-d array, on which len() fails.
        xsorted = numpy.sort(data.compressed().view(ndarray))
        n = xsorted.size
        #.........
        hd = empty((2, len(prob)), float_)
        if n < 2:
            hd.flat = numpy.nan
            if var:
                return hd
            return hd[0]
        #.........
        v = arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            # Weights are the increments of the Beta cdf on the grid i/n.
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = dot(w, (xsorted-hd_mean)**2)
        # The Beta parameters are degenerate at p=0 and p=1: use the extremes.
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = numpy.nan
            return hd
        return hd[0]
    # Initialization & checks ---------
    data = masked_array(data, copy=False, dtype=float_)
    p = numpy.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hd_1D(data, p, var)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = apply_along_axis(_hd_1D, axis, data, p, var)
    #
    return masked_array(result, mask=numpy.isnan(result))
def _quantiles1D(data, m, p):
    """Interpolates quantiles of the unmasked values of a 1D masked array.

:Inputs:
    data : MaskedArray
        Input data; only its compressed (unmasked) values are used.
    m : ndarray
        Offset added to n*p to get the (fractional) position in the sorted
        sample.
    p : ndarray
        Probabilities at which the quantiles are evaluated.

:Returns:
    A fully masked array when there is no unmasked value, the single value
    repeated to the shape of p when there is only one, and otherwise the
    linear interpolation between two consecutive sorted values.
    """
    ordered = numpy.sort(data.compressed())
    nobs = len(ordered)
    if nobs == 0:
        blank = numpy.empty(len(p), dtype=float_)
        return masked_array(blank, mask=True)
    if nobs == 1:
        return masked_array(numpy.resize(ordered, p.shape), mask=nomask)
    position = nobs * p + m
    # Index of the upper neighbour, kept inside the valid range [1, nobs-1].
    upper = numpy.floor(position.clip(1, nobs - 1)).astype(int_)
    weight = (position - upper).clip(0, 1)
    below = ordered[(upper - 1).tolist()]
    above = ordered[upper.tolist()]
    return (1. - weight) * below + weight * above
def winsorize(data, alpha=0.2):
    """Returns a Winsorized version of the input array.

    With n the number of unmasked values and k = int(alpha*n), every value is
    clipped to the range spanned by the (k+1)th smallest and the (k+1)th
    largest unmasked values.
    Masked values are skipped. The input array is first flattened.

:Inputs:
    data : sequence
        Input data.
    alpha : float *[0.2]*
        Proportion of the unmasked values used to locate each clipping bound.
    """
    flat = masked_array(data, copy=False).ravel()
    order = flat.argsort()
    total = flat.size
    valid = flat.count()
    ncut = int(alpha * valid)
    # The upper position is negative: it counts back from the end of the
    # sort order, skipping the (total - valid) masked entries
    # (assumes argsort() orders masked entries last).
    low_pos = ncut
    high_pos = valid - total - ncut - 1
    (floor_value, ceil_value) = flat[order[[low_pos, high_pos]]]
    clipped = numpy.clip(flat, floor_value, ceil_value)
    return masked_array(clipped, mask=flat._mask)
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
    """Computes the standard error of the Harrell-Davis quantile estimates by
    jackknife.

:Inputs:
    data: ndarray
        Data array.
    prob: Sequence
        List of quantiles to compute.
    axis : integer *[None]*
        Axis along which to compute the quantiles. If None, use a flattened
        array.

:Returns:
    A flattened masked array of standard errors; entries that could not be
    computed (NaN, e.g. fewer than 2 unmasked values) are masked.

:Note:
    The function is restricted to 2D arrays.
    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = numpy.sort(data.compressed())
        n = len(xsorted)
        #.........
        hdsd = empty(len(prob), float_)
        if n < 2:
            # No jackknife sample can be built: stop here instead of going
            # on to divide by (n-1) == 0.
            hdsd.flat = numpy.nan
            return hdsd
        #.........
        # Each leave-one-out sample has n-1 values, hence the grid i/(n-1)...
        vv = arange(n) / float(n-1)
        betacdf = beta.cdf
        #
        for (i, p) in enumerate(prob):
            # ...and the Harrell-Davis Beta parameters for a sample of size
            # n-1, i.e. ((n-1)+1)*p = n*p and n*(1-p).
            _w = betacdf(vv, n*p, n*(1-p))
            w = _w[1:] - _w[:-1]
            # Harrell-Davis estimate with the k-th sorted value left out.
            mx_ = numpy.array([dot(w, numpy.delete(xsorted, k))
                               for k in range(n)], dtype=float_)
            # Jackknife standard error, sqrt((n-1)/n * sum of squared
            # deviations), with mx_.var() being that sum divided by n.
            hdsd[i] = sqrt(mx_.var() * (n - 1))
        return hdsd
    # Initialization & checks ---------
    data = masked_array(data, copy=False, dtype=float_)
    p = numpy.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        # The helper compresses the data itself; compressing here as well
        # would hand it an object without a compressed() method.
        result = _hdsd_1D(data, p)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = apply_along_axis(_hdsd_1D, axis, data, p)
    #
    return masked_array(result, mask=numpy.isnan(result)).ravel()
def trim_both(data, proportiontocut=0.2, axis=None):
    """Trims the data by masking the int(proportiontocut*n) smallest and
    int(proportiontocut*n) largest values of data along the given axis, where
    n is the number of unmasked values.

:Inputs:
    data : MaskedArray
        Data to trim.
    proportiontocut : float *[0.2]*
        Proportion of the data to mask at each end. If n is the number of
        unmasked values before trimming, about (1-2*proportiontocut)*n values
        are left after trimming.
    axis : integer *[None]*
        Axis along which to perform the trimming. If None, the data is
        flattened first.
    """
    #...................
    def _trim_1D(data, trim):
        "Private function: return a trimmed 1D array."
        ncounts = data.count()
        ntrim = int(trim * ncounts)
        idxsort = data.argsort()
        data[idxsort[:ntrim]] = masked
        # Positive start index on purpose: a start counted from the end
        # degenerates to 0 when nothing is masked and ntrim is 0, which
        # would mask every value.  The entries from ncounts onwards are the
        # already-masked ones (assumes argsort() orders them last).
        data[idxsort[ncounts-ntrim:]] = masked
        return data
    #...................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    if (axis is None):
        return _trim_1D(data.ravel(), proportiontocut)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trim_1D, axis, data, proportiontocut)
def median_cihs(data, alpha=0.05, axis=None):
    """Computes the alpha-level confidence interval for the median of the data,
    following the Hettmasperger-Sheather method.

:Inputs:
    data : sequence
        Input data. Masked values are discarded. The input should be 1D only
    alpha : float *[0.05]*
        Confidence degree.
    axis : integer *[None]*
        Axis along which to compute the interval. If None, all the unmasked
        values are used.

:Returns:
    The (lower, upper) limits of the interval when axis is None, otherwise
    the result of applying the 1D computation along the axis.
    """
    def _cihs_1D(data, alpha):
        data = numpy.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1-alpha)
        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        if gk < 1-alpha:
            # Coverage too small: widen the interval by one order statistic.
            k -= 1
            gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
        # Interpolate between the two nested intervals of coverage gk and gkk.
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims
    data = masked_array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        # The helper takes (data, alpha) and compresses the data itself.
        result = _cihs_1D(data, alpha)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = apply_along_axis(_cihs_1D, axis, data, alpha)
    #
    return result
def trimmed_stde(data, proportiontocut=0.2, axis=None):
    """Returns the standard error of the trimmed mean for the input data,
    along the given axis. Trimming is performed on both ends of the
    distribution.

:Inputs:
    data : MaskedArray
        Data to trim.
    proportiontocut : float *[0.2]*
        Proportion of the data to cut from each side of the data .
        As a result, (2*proportiontocut*n) values are actually trimmed.
    axis : integer *[None]*
        Axis along which to perform the trimming. If None, the data is
        flattened first.
    """
    #........................
    def _trimmed_stde_1D(data, trim=0.2):
        "Returns the standard error of the trimmed mean for a 1D input data."
        # Winsorize with the requested proportion; calling winsorize(data)
        # alone would always use its default of 0.2, whatever trim is.
        winsorized = winsorize(data, alpha=trim)
        nsize = winsorized.count()
        # NOTE(review): stdu() comes from the masked-array class in use;
        # presumably the unbiased standard deviation -- confirm.
        winstd = winsorized.stdu()
        return winstd / ((1-2*trim) * numpy.sqrt(nsize))
    #........................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    if (axis is None):
        return _trimmed_stde_1D(data.ravel(), proportiontocut)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trimmed_stde_1D, axis, data, proportiontocut)
def rsh(data, points=None):
    """Evaluates Rosenblatt's shifted histogram estimators for each point of
    'points' on the dataset 'data'.

:Inputs:
    data : sequence
        Input data. It must be 1D; masked values are not counted.
    points : sequence *[None]*
        Points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

:Raises:
    AttributeError if the data is not 1D.
    """
    data = masked_array(data, copy=False)
    points = (data if points is None
              else numpy.array(points, copy=False, ndmin=1))
    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")
    nobs = data.count()
    # Half-width of the window, scaled from the ideal-fourths spread.
    width = 1.2 * idealfourths(data) / nobs**(1./5)
    column = data[:,None]
    row = points[None,:]
    # Number of data values at or below the upper edge of each window...
    at_or_below = (column <= row + width).sum(0)
    # ...minus those strictly below the lower edge.
    strictly_below = (column < row - width).sum(0)
    return (at_or_below - strictly_below) / (2.*nobs*width)
def mquantiles(data, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None):
    """Computes empirical quantiles for a *1xN* data array.
    Samples quantile are defined by:
    *Q(p) = (1-g).x[i] +g.x[i+1]*
    where *x[j]* is the jth order statistic,
    with *i = (floor(n*p+m))*, *m=alphap+p*(1-alphap-betap)* and
    *g = n*p + m - i*.

    Typical values of (alphap,betap) are:

    - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
    - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
    - (0,0)    : *p(k) = k/(n+1)* : (R type 6)
    - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
      That's R default (R type 7)
    - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
      The resulting quantile estimates are approximately median-unbiased
      regardless of the distribution of x. (R type 8)
    - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
      The resulting quantile estimates are approximately unbiased
      if x is normally distributed (R type 9)
    - (.4,.4)  : approximately quantile unbiased (Cunnane)
    - (.35,.35): APL, used with PWM

:Parameters:
    data : Sequence
        Input data, as a sequence or array of dimension at most 2.
    prob : Sequence *[(0.25, 0.5, 0.75)]*
        List of quantiles to compute.
    alphap : Float (*[0.4]*)
        Plotting positions parameter.
    betap : Float (*[0.4]*)
        Plotting positions parameter.
    axis : Integer *[None]*
        Axis along which to compute quantiles. If *None*, uses the whole
        (flattened/compressed) dataset.
    """
    def _interpolate_1d(series, offset, probs):
        "Interpolates the quantiles of the unmasked values of a 1D series."
        ordered = numpy.sort(series.compressed())
        nobs = len(ordered)
        if nobs == 0:
            blank = numpy.empty(len(probs), dtype=float_)
            return masked_array(blank, mask=True)
        if nobs == 1:
            return masked_array(numpy.resize(ordered, probs.shape),
                                mask=nomask)
        position = nobs * probs + offset
        # Index of the upper neighbour, kept inside [1, nobs-1].
        upper = numpy.floor(position.clip(1, nobs - 1)).astype(int_)
        weight = (position - upper).clip(0, 1)
        below = ordered[(upper - 1).tolist()]
        above = ordered[upper.tolist()]
        return (1. - weight) * below + weight * above

    # Initialization & checks ---------
    data = masked_array(data, copy=False)
    p = narray(prob, copy=False, ndmin=1)
    m = alphap + p*(1.-alphap-betap)
    # Computes quantiles along axis (or globally)
    if axis is None:
        return _interpolate_1d(data, m, p)
    assert data.ndim <= 2, "Array should be 2D at most !"
    return apply_along_axis(_interpolate_1d, axis, data, m, p)
def mmedian(data, axis=None):
    """Returns the median of data along the given axis. Missing data are
    discarded.

:Inputs:
    data : sequence
        Input data. A copy is converted to a masked array.
    axis : integer *[None]*
        Axis along which to compute the median. If None, all the unmasked
        values are used.

:Returns:
    The median of the unmasked values, or the masked constant when there is
    no unmasked value.
    """
    def _median_of_valid(series):
        "Median of the unmasked values of a 1D series."
        valid = numpy.sort(series.compressed())
        if valid.size:
            return numpy.median(valid)
        return masked

    data = masked_array(data, subok=True, copy=True)
    if axis is not None:
        return apply_along_axis(_median_of_valid, axis, data)
    return _median_of_valid(data)
def stde_median(data, axis=None):
    """Returns the McKean-Schrader estimate of the standard error of the sample
    median along the given axis.

:Inputs:
    data : sequence
        Input data. Masked values are discarded.
    axis : integer *[None]*
        Axis along which to compute the estimate. If None, all the unmasked
        values are used.
    """
    def _stde_1d(series):
        "McKean-Schrader estimate for the unmasked values of a 1D series."
        ordered = numpy.sort(series.compressed())
        nobs = len(ordered)
        zscore = 2.5758293035489004
        rank = int(round((nobs+1)/2. - zscore * sqrt(nobs/4.), 0))
        # Spread between two order statistics placed symmetrically around
        # the median, scaled by twice the z-score.
        spread = ordered[nobs-rank] - ordered[rank-1]
        return (spread/(2.*zscore))
    #
    data = masked_array(data, copy=False, subok=True)
    if axis is not None:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_stde_1d, axis, data)
    return _stde_1d(data)
def idealfourths(data, axis=None):
    """Returns an estimate of the interquartile range of the data along the
    given axis, as computed with the ideal fourths.

:Inputs:
    data : sequence
        Input data. Masked values are discarded.
    axis : integer *[None]*
        Axis along which to compute the estimate. If None, all the unmasked
        values are used.

:Returns:
    The difference between the upper and the lower fourths, or NaN when
    fewer than 3 unmasked values are available.
    """
    def _idf(data):
        x = numpy.sort(data.compressed())
        n = len(x)
        if n < 3:
            # Too few values to locate both fourths inside the sample.
            return numpy.nan
        (j, h) = divmod(n/4. + 5/12., 1)
        # divmod() returns floats: an integer is needed to index x.
        j = int(j)
        # Lower fourth: between the j-th and (j+1)-th order statistics, which
        # are x[j-1] and x[j] with 0-based indexing.
        qlo = (1-h)*x[j-1] + h*x[j]
        # Upper fourth: the mirror image, counted from the top of the sample.
        k = n - j
        qup = (1-h)*x[k] + h*x[k-1]
        return qup - qlo
    data = masked_array(data, copy=False)
    if (axis is None):
        return _idf(data)
    else:
        return apply_along_axis(_idf, axis, data)
def rank_data(data, axis=None, use_missing=False):
    """Returns the rank (also known as order statistics) of each data point
    along the given axis.
    If some values are tied, their rank is averaged.
    If some values are masked, their rank is set to 0 if use_missing is False,
    or set to the average rank of the unmasked values if use_missing is True.

:Inputs:
    data : sequence
        Input data. The data is transformed to a masked array
    axis : integer *[None]*
        Axis along which to perform the ranking. If None, the array is first
        flattened and the ranks are reshaped to the shape of the input.
    use_missing : boolean *[False]*
        Flag indicating whether the masked values have a rank of 0 (False) or
        equal to the average rank of the unmasked values (True)
    """
    #
    def _rank_series(series, use_missing=False):
        "Ranks the values of a 1D masked series."
        nvalid = series.count()
        ranks = numpy.empty(series.size, dtype=float_)
        order = series.argsort()
        ranks[order[:nvalid]] = numpy.arange(1, nvalid+1)
        # Rank given to the entries past the first nvalid ones in the sort
        # order, i.e. the masked ones.
        missing_rank = (nvalid+1)/2. if use_missing else 0
        ranks[order[nvalid:]] = missing_rank
        # Tied values share the mean of the ranks they occupy.
        tied_values = find_repeats(series)[0]
        for value in tied_values:
            same = (series==value).filled(False)
            ranks[same] = ranks[same].mean()
        return ranks
    #
    data = masked_array(data, copy=False)
    if axis is not None:
        return apply_along_axis(_rank_series, axis, data, use_missing)
    if data.ndim > 1:
        flat_ranks = _rank_series(data.ravel(), use_missing)
        return flat_ranks.reshape(data.shape)
    return _rank_series(data, use_missing)
def trim_tail(data, proportiontocut=0.2, tail='left', axis=None):
    """Trims the data by masking int(proportiontocut*n) values from ONE tail of
    the data along the given axis, where n is the number of unmasked values.

:Inputs:
    data : MaskedArray
        Data to trim.
    proportiontocut : float *[0.2]*
        Proportion of the data to mask in the selected tail. If n is the
        number of unmasked values before trimming, about
        (1-proportiontocut)*n values are left after trimming.
    tail : string *['left']*
        Tail to trim: a string starting with 'l' (smallest values) or with
        'r' (largest values), whatever the case.
    axis : integer *[None]*
        Axis along which to perform the trimming. If None, the data is
        flattened first.

:Raises:
    TypeError if tail is not a string, ValueError if it starts with neither
    'l' nor 'r'.
    """
    #...................
    def _trim_1D(data, trim, left):
        "Private function: return a trimmed 1D array."
        ncounts = data.count()
        ntrim = int(trim * ncounts)
        idxsort = data.argsort()
        if left:
            data[idxsort[:ntrim]] = masked
        else:
            # Positive start index on purpose: a start counted from the end
            # degenerates to 0 when nothing is masked and ntrim is 0, which
            # would mask every value.  The entries from ncounts onwards are
            # the already-masked ones (assumes argsort() orders them last).
            data[idxsort[ncounts-ntrim:]] = masked
        return data
    #...................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    #
    if not isinstance(tail, str):
        raise TypeError("The tail argument should be in ('left','right')")
    # Slicing (rather than indexing) lets an empty string reach the
    # ValueError below instead of failing with an IndexError.
    tail = tail.lower()[:1]
    if tail == 'l':
        left = True
    elif tail == 'r':
        left = False
    else:
        raise ValueError("The tail argument should be in ('left','right')")
    #
    if (axis is None):
        return _trim_1D(data.ravel(), proportiontocut, left)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trim_1D, axis, data, proportiontocut, left)
def trimmed_mean_ci(data, proportiontocut=0.2, alpha=0.05, axis=None):
    """Returns the selected confidence interval of the trimmed mean along the
    given axis.

:Inputs:
    data : sequence
        Input data. The data is transformed to a masked array
    proportiontocut : float *[0.2]*
        Proportion of the data to cut from each side of the data .
        As a result, (2*proportiontocut*n) values are actually trimmed.
    alpha : float *[0.05]*
        Confidence level of the intervals
    axis : integer *[None]*
        Axis along which to cut.

:Returns:
    An array holding the lower limit(s) first and the upper limit(s) second.
    """
    data = masked_array(data, copy=False)
    trimmed = trim_both(data, proportiontocut=proportiontocut, axis=axis)
    center = trimmed.mean(axis)
    stderr = trimmed_stde(data, proportiontocut=proportiontocut, axis=axis)
    # Degrees of freedom: number of values left after trimming, minus one.
    dof = trimmed.count(axis) - 1
    critical = t.ppf(1-alpha/2., dof)
    lower = center - critical*stderr
    upper = center+critical*stderr
    return numpy.array((lower, upper))
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

:Input:
    data : sequence
        Input data. Masked values are discarded.
    prob : sequence *[0.25,0.5,0.75]*
        Sequence of quantiles whose standard error must be estimated.
    axis : integer *[None]*
        Axis along which to compute the standard error. If None, all the
        unmasked values are used.
    """
    def _mjci_series(series, p):
        "Maritz-Jarrett estimates for the unmasked values of a 1D series."
        valid = series.compressed()
        ordered = numpy.sort(valid)
        nobs = valid.size
        # Quantiles turned into integer ranks, int(p*n + 0.5).
        ranks = (numpy.array(p) * nobs + 0.5).astype(int_)
        betacdf = beta.cdf
        #
        stderrs = empty(len(ranks), float_)
        upper_edges = arange(1, nobs+1, dtype=float_) / nobs
        lower_edges = upper_edges - 1./nobs
        for (i, rank) in enumerate(ranks):
            (a, b) = (rank-1, nobs-rank)
            # Weight of each order statistic: increment of the Beta cdf over
            # its cell of the grid.
            weights = betacdf(upper_edges, a, b) - betacdf(lower_edges, a, b)
            first = numpy.dot(weights, ordered)
            second = numpy.dot(weights, ordered**2)
            stderrs[i] = sqrt(second - first**2)
        return stderrs
    #
    data = masked_array(data, copy=False)
    assert data.ndim <= 2, "Array should be 2D at most !"
    p = numpy.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if axis is not None:
        return apply_along_axis(_mjci_series, axis, data, p)
    return _mjci_series(data, p)
def test_addfield(self):
    "Tests addfield"
    (d, m, mrec, dlist, dates, ts, mts) = self.data
    # New field: the base data shifted by 10, with the base mask reversed.
    extra_field = masked_array(d+10, mask=m[::-1])
    mts = addfield(mts, extra_field)
    # The checks read the new field back as attribute f2.
    added = mts.f2
    assert_equal(added, d+10)
    assert_equal(added._mask, m[::-1])
def fromtextfile(fname, delimitor=None, commentchar="#", missingchar="",
                 dates_column=None, varnames=None, vartypes=None, dates=None):
    """Creates a multitimeseries from data stored in the file `fname`.

:Parameters:
    - `fname` : file name/handle
      Passed as is to `openfile`.
    - `delimitor` : Character *None*
      Alphanumeric character used to separate columns in the file.
      If None, any (group of) white spacestring(s) will be used.
    - `commentchar` : String *['#']*
      Alphanumeric character used to mark the start of a comment.
    - `missingchar` : String *['']*
      String indicating missing data, and used to create the masks.
    - `dates_column` : Integer *[None]*
      Position of the columns storing dates. If None, a position will be
      estimated from the variable names ('dates' or '_dates', whatever the
      case).
    - `varnames` : Sequence *[None]*
      Sequence of the variable names. If None, a list will be created from
      the first non empty line of the file.
    - `vartypes` : Sequence *[None]*
      Sequence of the variables dtypes. If None, the sequence will be
      estimated from the first data line.
    - `dates` : *[None]*
      Passed as is to the dates helper, along with the dates read from the
      file.

:Raises:
    ValueError if no header line with more than one column is found, or if
    `dates_column` is not a valid column position.

    Ultra simple: the varnames are in the header, one line"""
    # Try to open the file ......................
    f = openfile(fname)
    # Get the first non-empty line as the varnames
    while True:
        line = f.readline()
        if not line:
            # readline() returns an empty string at the end of the file:
            # without this check, the loop would never stop.
            raise ValueError("No header line with more than one column "
                             "found in %r" % (fname,))
        # Keep what precedes the comment marker.  Splitting (rather than
        # slicing at find()) keeps the whole line when there is no marker.
        if commentchar:
            line = line.split(commentchar, 1)[0]
        firstline = line.strip()
        _varnames = firstline.split(delimitor)
        if len(_varnames) > 1:
            break
    if varnames is None:
        varnames = _varnames
    # Get the data ..............................
    _variables = MA.asarray([line.strip().split(delimitor) for line in f
                             if line[0] != commentchar and len(line) > 1])
    (nvars, nfields) = _variables.shape
    # Check if we need to get the dates..........
    if dates_column is None:
        dates_column = [i for (i, n) in enumerate(list(varnames))
                        if n.lower() in ["_dates", "dates"]]
    elif isinstance(dates_column, (int, float)):
        # Columns are counted from 0: nfields itself is already out of range.
        if dates_column >= nfields:
            raise ValueError("Invalid column number: %i >= %i"
                             % (dates_column, nfields))
        dates_column = [dates_column]
    if len(dates_column) > 0:
        # Columns left once the dates columns are taken out.
        cols = [i for i in range(nfields) if i not in dates_column]
        newdates = date_array(_variables[:, dates_column[-1]])
        _variables = _variables[:, cols]
        varnames = [varnames[i] for i in cols]
        if vartypes is not None:
            vartypes = [vartypes[i] for i in cols]
        nfields = len(cols)
    else:
        newdates = None
    # Try to guess the dtype ....................
    if vartypes is None:
        vartypes = _guessvartypes(_variables[0])
    else:
        vartypes = [numeric.dtype(v) for v in vartypes]
        if len(vartypes) != nfields:
            msg = "Attempting to %i dtypes for %i fields!"
            msg += " Reverting to default."
            warnings.warn(msg % (len(vartypes), nfields))
            vartypes = _guessvartypes(_variables[0])
    # Construct the descriptor ..................
    mdescr = list(zip(varnames, vartypes))
    # Get the data and the mask .................
    # We just need a list of masked_arrays. It's easier to create it like that:
    _mask = _variables.T == missingchar
    _datalist = [masked_array(a, mask=m, dtype=t)
                 for (a, m, t) in zip(_variables.T, _mask, vartypes)]
    #
    newdates = __getdates(dates=dates, newdates=newdates, length=nvars,
                          freq=None, start_date=None)
    return MultiTimeSeries(_datalist, dates=newdates, dtype=mdescr)