def barplot(self, func=None, *args, **kwargs):
    """
    Plots a bar chart comparing the phases for each period, as transformed
    by the ``func`` function.

    Parameters
    ----------
    func : function, optional
        Function to apply.
        By default, use the :func:`numpy.ma.mean` function.
    args : var
        Mandatory arguments of function ``func``.
    kwargs : var
        Optional arguments of function ``func``.
    """
    func = func or ma.mean
    width = 0.2
    colordict = ENSOcolors['fill']
    barlist = []
    pos = self.positions - 3 * width
    for (i, s) in zip(['W', -1, 0, 1],
                      [self.series, self.cold, self.neutral, self.warm]):
        pos += width
        series = ma.apply_along_axis(func, 0, s, *args, **kwargs)
        b = self.bar(pos, series, width=width, bottom=0,
                     color=colordict[i], ecolor='k', capsize=3)
        barlist.append(b[0])
    self.barlist = barlist
    self.figure.axes.append(self)
    self.format_xaxis()
    return barlist
def hdquantiles_sd(data, prob=list([.25, .5, .75]), axis=None):
    """Computes the standard error of the Harrell-Davis quantile estimates by
    jackknife.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Notes
    -----
    The function is restricted to 2D arrays.
    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)
        hdsd = np.empty(len(prob), float_)
        if n < 2:
            hdsd.flat = np.nan
        vv = np.arange(n) / float(n - 1)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            mx_ = np.fromiter(
                [np.dot(w, xsorted[np.r_[list(range(0, k)),
                                         list(range(k + 1, n))].astype(int_)])
                 for k in range(n)], dtype=float_)
            mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n - 1)
            hdsd[i] = float(n - 1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
        return hdsd
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
    return ma.fix_invalid(result, copy=False).ravel()
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None): """ The standard error of the Harrell-Davis quantile estimates by jackknife. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of quantiles to compute. axis : int, optional Axis along which to compute the quantiles. If None, use a flattened array. Returns ------- hdquantiles_sd : MaskedArray Standard error of the Harrell-Davis quantile estimates. See Also -------- hdquantiles """ def _hdsd_1D(data, prob): "Computes the std error for 1D arrays." xsorted = np.sort(data.compressed()) n = len(xsorted) hdsd = np.empty(len(prob), float_) if n < 2: hdsd.flat = np.nan vv = np.arange(n) / float(n-1) betacdf = beta.cdf for (i,p) in enumerate(prob): _w = betacdf(vv, (n+1)*p, (n+1)*(1-p)) w = _w[1:] - _w[:-1] mx_ = np.fromiter([w[:k] @ xsorted[:k] + w[k:] @ xsorted[k+1:] for k in range(n)], dtype=float_) # mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / (n - 1) # hdsd[i] = (n - 1) * np.sqrt(mx_var / n) hdsd[i] = np.sqrt(mx_.var() * (n - 1)) return hdsd # Initialization & checks data = ma.array(data, copy=False, dtype=float_) p = np.array(prob, copy=False, ndmin=1) # Computes quantiles along axis (or globally) if (axis is None): result = _hdsd_1D(data, p) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hdsd_1D, axis, data, p) return ma.fix_invalid(result, copy=False).ravel()
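# Example usage of hdquantiles_sd above -- a minimal sketch, assuming the
# snippet's own imports are in scope (numpy as np, numpy.ma as ma,
# float_ = np.float64, and beta from scipy.stats).
import numpy as np

data = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
# jackknife standard errors for the default quantiles (.25, .5, .75)
print(hdquantiles_sd(data))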
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None): """ The standard error of the Harrell-Davis quantile estimates by jackknife. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of quantiles to compute. axis : int, optional Axis along which to compute the quantiles. If None, use a flattened array. Returns ------- hdquantiles_sd : MaskedArray Standard error of the Harrell-Davis quantile estimates. See Also -------- hdquantiles """ def _hdsd_1D(data, prob): "Computes the std error for 1D arrays." xsorted = np.sort(data.compressed()) n = len(xsorted) hdsd = np.empty(len(prob), float_) if n < 2: hdsd.flat = np.nan vv = np.arange(n) / float(n-1) betacdf = beta.cdf for (i,p) in enumerate(prob): _w = betacdf(vv, (n+1)*p, (n+1)*(1-p)) w = _w[1:] - _w[:-1] mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)), list(range(k+1,n))].astype(int_)]) for k in range(n)], dtype=float_) mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1) hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n)) return hdsd # Initialization & checks data = ma.array(data, copy=False, dtype=float_) p = np.array(prob, copy=False, ndmin=1) # Computes quantiles along axis (or globally) if (axis is None): result = _hdsd_1D(data, p) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hdsd_1D, axis, data, p) return ma.fix_invalid(result, copy=False).ravel()
def hdquantiles_sd(data, prob=list([0.25, 0.5, 0.75]), axis=None):
    """Computes the standard error of the Harrell-Davis quantile estimates by
    jackknife.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Notes
    -----
    The function is restricted to 2D arrays.
    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)
        hdsd = np.empty(len(prob), float_)
        if n < 2:
            hdsd.flat = np.nan
        vv = np.arange(n) / float(n - 1)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            mx_ = np.fromiter(
                [np.dot(w, xsorted[np.r_[range(0, k), range(k + 1, n)].astype(int_)])
                 for k in range(n)], dtype=float_)
            mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n - 1)
            hdsd[i] = float(n - 1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
        return hdsd
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if axis is None:
        result = _hdsd_1D(data, p)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
    return ma.fix_invalid(result, copy=False).ravel()
def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.
    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha / 2., n, 0.5))
        gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        gkk = binom.cdf(n - k - 1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha) / (gk - gkk)
        lambd = (n - k) * I / float(k + (n - 2 * k) * I)
        lims = (lambd * data[k] + (1 - lambd) * data[k - 1],
                lambd * data[n - k - 1] + (1 - lambd) * data[n - k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
    return result
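# Example usage of median_cihs above -- a minimal sketch, assuming numpy as np,
# numpy.ma as ma and binom from scipy.stats are imported as the function expects.
import numpy as np
import numpy.ma as ma

data = ma.masked_invalid([2., 4., np.nan, 7., 1., 9., 3., 5.])
lo, hi = median_cihs(data, alpha=0.05)   # 95% Hettmansperger-Sheather interval
print(lo, hi)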
def get_psi_bar(self, V=None, zpoint='F'):
    """Compute the meridional overturning streamfunction from the
    time-averaged meridional velocity.

    If ``V`` is None, read 'VVEL' from 'Tav.nc' (masked by HFacS). The
    zonally averaged meridional volume flux is cumulatively summed in the
    vertical; ``zpoint`` selects whether psi is returned on 'F' points or
    averaged onto 'C' points.
    """
    if V is None:
        V = self.mnc('Tav.nc', 'VVEL', mask=self.HFacS[:])
    vflux = V * self.dzf[:, np.newaxis, np.newaxis]
    Vdx = vflux * self.HFacS
    Vdx = ma.mean(Vdx, axis=2) * self.Lx
    psi = ma.cumsum(Vdx, axis=0)
    if zpoint == 'F':
        return psi
    elif zpoint == 'C':
        psi = ma.apply_along_axis(np.vstack, 1, [np.zeros(self.Ny + 1), psi])
        return 0.5 * (psi[1:] + psi[:-1])
def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.
    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha / 2.0, n, 0.5))
        gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        gkk = binom.cdf(n - k - 1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha) / (gk - gkk)
        lambd = (n - k) * I / float(k + (n - 2 * k) * I)
        lims = (lambd * data[k] + (1 - lambd) * data[k - 1],
                lambd * data[n - k - 1] + (1 - lambd) * data[n - k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if axis is None:
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
    return result
def Mstep_part2(X, K, Mu, P, Var, post, minVariance=0.25):
    n, d = np.shape(X)   # n data points of dimension d
    N = np.zeros(K)
    mask = (X[:] == 0)
    post_T = np.transpose(post)
    X_T = np.transpose(X)
    X_Cu = ma.array(X, mask=mask)
    # X_Cu = get_partial_X(X)
    N = np.sum(post, axis=0)
    P = np.divide(N, n)
    # update means
    Mu_new = np.dot(post_T, X_Cu)
    Mu_bot = np.dot(post_T, ~mask)
    mask_2 = (Mu_bot[:] < 1)
    Mu_bot = ma.array(Mu_bot, mask=mask_2)   # masks out values < 1
    Mu_new = np.divide(Mu_new, Mu_bot)
    Mu = (Mu * mask_2) + Mu_new.filled(0)    # keep original masked-out values
    # update variances
    nonzeros = np.apply_along_axis(np.count_nonzero, 1, ~mask)
    sig_denoms = np.sum(post * np.transpose([nonzeros]), axis=0)
    for j in range(K):
        norm = lambda x: LA.norm(x - Mu[j]) ** 2
        Var[j] = max(minVariance,
                     np.sum(np.multiply(post_T[j],
                                        ma.apply_along_axis(norm, 1, X_Cu)))
                     / sig_denoms[j])
    # for j in range(K):
    #     # Update parameters
    #     N[j] = math.fsum(post_T[j])
    #     P[j] = N[j]/n
    #     for l in range(d):
    #         Mu_bot = math.fsum([post[t][j] for t in range(n) if X[t][l] > 0])
    #         Mu_top = math.fsum([post[t][j] * X[t][l] for t in range(n) if X[t][l] > 0])
    #         if Mu_bot >= 1:
    #             Mu[j][l] = Mu_top / Mu_bot
    #     variances = np.array([variance(X_Cu[t], Mu[j]) for t in range(n)])
    #     var_top = np.dot(post_T[j], variances)
    #     var_bot = math.fsum([post[t][j] * len(X_Cu[t]) for t in range(n)])
    #     Var[j] = max(var_top / var_bot, minVariance)
    return (Mu, P, Var)
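# A standalone toy illustration (hypothetical data, not part of the function
# above) of the masked mean-update idiom Mstep_part2 relies on: weighted column
# means computed over observed (nonzero) entries only.
import numpy as np
import numpy.ma as ma

X = np.array([[5., 0., 3.],
              [4., 2., 0.],
              [0., 1., 5.]])           # 0 marks a missing entry
post = np.array([[0.9, 0.1],
                 [0.5, 0.5],
                 [0.2, 0.8]])          # soft assignments to K=2 components
mask = (X == 0)
num = np.dot(post.T, ma.array(X, mask=mask).filled(0))   # weighted sums over observed entries
den = np.dot(post.T, ~mask)                              # total posterior weight per entry
Mu = np.where(den >= 1, num / np.maximum(den, 1e-12), 0.)  # update only well-supported entries
print(Mu)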
def mjci(data, prob=[0.25, 0.5, 0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf
        mj = np.empty(len(prob), float_)
        x = np.arange(1, n + 1, dtype=float_) / n
        y = x - 1. / n
        for (i, m) in enumerate(prob):
            (m1, m2) = (m - 1, n - m)
            W = betacdf(x, m1, m2) - betacdf(y, m1, m2)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
def generate_histograms(self): if self.state < 8: raise InvalidContextBuilderState("Cannot generate histograms without indices") self.logger.debug("10) Generating histograms") if self.point_coords is self.environment_coords: builder = lambda arr: np.bincount(arr[~arr.mask], minlength=self.num_bins) self.histograms = ma.apply_along_axis(builder, -1, self.indices) self.histograms = self.histograms.data.astype(np.uint16) else: builder = lambda arr: np.bincount(arr, minlength=self.num_bins) self.histograms = np.apply_along_axis(builder, -1, self.indices) self.histograms = self.histograms.astype(np.uint16) self.state = 10 return self.histograms
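# A self-contained sketch (hypothetical shapes and values) of the
# bincount-per-row idiom generate_histograms uses to turn an index array into
# fixed-width histograms.
import numpy as np

num_bins = 5
indices = np.array([[0, 1, 1, 4],
                    [2, 2, 3, 0]])
histograms = np.apply_along_axis(
    lambda row: np.bincount(row, minlength=num_bins), -1, indices)
print(histograms.astype(np.uint16))
# [[1 2 0 0 1]
#  [1 0 2 1 0]]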
def mjci(data, prob=[0.25, 0.5, 0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf
        mj = np.empty(len(prob), float_)
        x = np.arange(1, n + 1, dtype=float_) / n
        y = x - 1. / n
        for (i, m) in enumerate(prob):
            (m1, m2) = (m - 1, n - m)
            W = betacdf(x, m1, m2) - betacdf(y, m1, m2)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
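# Example usage of mjci above -- a minimal sketch, assuming numpy as np,
# numpy.ma as ma, int_/float_ from numpy and beta from scipy.stats in scope.
import numpy as np

data = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
# Maritz-Jarrett standard errors for the .25, .5 and .75 quantiles
print(mjci(data))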
def idealfourths(data, axis=None):
    """Returns an estimate of the lower and upper quartiles of the data along
    the given axis, as computed with the ideal fourths.
    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]
        (j, h) = divmod(n / 4. + 5 / 12., 1)
        j = int(j)   # divmod returns a float quotient; indexing needs an int
        qlo = (1 - h) * x[j - 1] + h * x[j]
        k = n - j
        qup = (1 - h) * x[k] + h * x[k - 1]
        return [qlo, qup]
    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)
def idealfourths(data, axis=None): """ Returns an estimate of the lower and upper quartiles. Uses the ideal fourths algorithm. Parameters ---------- data : array_like Input array. axis : int, optional Axis along which the quartiles are estimated. If None, the arrays are flattened. Returns ------- idealfourths : {list of floats, masked array} Returns the two internal values that divide `data` into four parts using the ideal fourths algorithm either along the flattened array (if `axis` is None) or along `axis` of `data`. """ def _idf(data): x = data.compressed() n = len(x) if n < 3: return [np.nan, np.nan] (j, h) = divmod(n / 4. + 5 / 12., 1) j = int(j) qlo = (1 - h) * x[j - 1] + h * x[j] k = n - j qup = (1 - h) * x[k] + h * x[k - 1] return [qlo, qup] data = ma.sort(data, axis=axis).view(MaskedArray) if (axis is None): return _idf(data) else: return ma.apply_along_axis(_idf, axis, data)
def idealfourths(data, axis=None): """ Returns an estimate of the lower and upper quartiles. Uses the ideal fourths algorithm. Parameters ---------- data : array_like Input array. axis : int, optional Axis along which the quartiles are estimated. If None, the arrays are flattened. Returns ------- idealfourths : {list of floats, masked array} Returns the two internal values that divide `data` into four parts using the ideal fourths algorithm either along the flattened array (if `axis` is None) or along `axis` of `data`. """ def _idf(data): x = data.compressed() n = len(x) if n < 3: return [np.nan, np.nan] (j, h) = divmod(n / 4.0 + 5 / 12.0, 1) j = int(j) qlo = (1 - h) * x[j - 1] + h * x[j] k = n - j qup = (1 - h) * x[k] + h * x[k - 1] return [qlo, qup] data = ma.sort(data, axis=axis).view(MaskedArray) if axis is None: return _idf(data) else: return ma.apply_along_axis(_idf, axis, data)
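# Example usage of idealfourths above -- a minimal sketch, assuming numpy as np,
# numpy.ma as ma and MaskedArray from numpy.ma in scope, as the function expects.
import numpy as np
import numpy.ma as ma

x = ma.masked_invalid(np.r_[np.arange(20.), np.nan])
print(idealfourths(x))    # -> [4.4166..., 14.5833...], roughly the quartiles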
def median_cihs(data, alpha=0.05, axis=None):
    """Computes the alpha-level confidence interval for the median of the data,
    following the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : sequence
        Input data. Masked values are discarded. The input should be 1D only,
        or axis should be set to None.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha / 2., n, 0.5))
        gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        gkk = binom.cdf(n - k - 1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha) / (gk - gkk)
        lambd = (n - k) * I / float(k + (n - 2 * k) * I)
        lims = (lambd * data[k] + (1 - lambd) * data[k - 1],
                lambd * data[n - k - 1] + (1 - lambd) * data[n - k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
    return result
def median_cihs(data, alpha=0.05, axis=None):
    """Computes the alpha-level confidence interval for the median of the data,
    following the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : sequence
        Input data. Masked values are discarded. The input should be 1D only,
        or axis should be set to None.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha / 2.0, n, 0.5))
        gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        gkk = binom.cdf(n - k - 1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha) / (gk - gkk)
        lambd = (n - k) * I / float(k + (n - 2 * k) * I)
        lims = (lambd * data[k] + (1 - lambd) * data[k - 1],
                lambd * data[n - k - 1] + (1 - lambd) * data[n - k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if axis is None:
        result = _cihs_1D(data, alpha)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
    return result
def hdquantiles_sd(data, prob=list([.25, .5, .75]), axis=None): """ The standard error of the Harrell-Davis quantile estimates by jackknife. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of quantiles to compute. axis : int, optional Axis along which to compute the quantiles. If None, use a flattened array. Returns ------- hdquantiles_sd : MaskedArray Standard error of the Harrell-Davis quantile estimates. See Also -------- hdquantiles """ def _hdsd_1D(data, prob): "Computes the std error for 1D arrays." xsorted = np.sort(data.compressed()) n = len(xsorted) hdsd = np.empty(len(prob), float_) if n < 2: hdsd.flat = np.nan vv = np.arange(n) / float(n - 1) betacdf = beta.cdf for (i, p) in enumerate(prob): _w = betacdf(vv, n * p, n * (1 - p)) w = _w[1:] - _w[:-1] # cumulative sum of weights and data points if # ith point is left out for jackknife mx_ = np.zeros_like(xsorted) mx_[1:] = np.cumsum(w * xsorted[:-1]) # similar but from the right mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1] hdsd[i] = np.sqrt(mx_.var() * (n - 1)) return hdsd # Initialization & checks data = ma.array(data, copy=False, dtype=float_) p = np.array(prob, copy=False, ndmin=1) # Computes quantiles along axis (or globally) if (axis is None): result = _hdsd_1D(data, p) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hdsd_1D, axis, data, p) return ma.fix_invalid(result, copy=False).ravel()
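# A standalone numeric check (not part of the function above) that the cumsum
# construction in _hdsd_1D reproduces the naive leave-one-out weighted sums of
# the jackknife.
import numpy as np

rng = np.random.default_rng(0)
xsorted = np.sort(rng.normal(size=10))
n = len(xsorted)
w = rng.random(n - 1)                       # stand-in for the beta-cdf weights

mx_ = np.zeros_like(xsorted)
mx_[1:] = np.cumsum(w * xsorted[:-1])       # contributions left of the held-out point
mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]   # contributions to its right

naive = np.array([np.dot(w, np.delete(xsorted, k)) for k in range(n)])
assert np.allclose(mx_, naive)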
def quantile(x, probs=DEF_PROBS, typ=DEF_TYPE, method=DEF_METHOD,
             limit=DEF_LIMIT, na_rm=DEF_NARM, is_sorted=False):
    """Compute the sample quantiles of any vector distribution.

        >>> quantile(x, probs=DEF_PROBS, typ=DEF_TYPE, method=DEF_METHOD,
        ...          limit=DEF_LIMIT, na_rm=DEF_NARM, is_sorted=False)
    """
    ## various parameter checkings
    # check the data
    if isinstance(x, (pd.DataFrame, pd.Series)):
        try:
            x = x.values
        except:
            raise TypeError("conversion type error for input dataset")
    elif not isinstance(x, np.ndarray):
        try:
            x = np.asarray(x)
        except:
            raise TypeError("wrong type for input dataset")
    ndim = x.ndim
    if ndim > 2:
        raise ValueError("array should be 2D at most !")
    # check the probs
    if isinstance(probs, (pd.DataFrame, pd.Series)):
        try:
            probs = probs.values
        except:
            raise TypeError("conversion type error for input probabilities")
    elif isinstance(probs, (list, tuple)):
        try:
            probs = np.array(probs, copy=False, ndmin=1)
        except:
            raise TypeError("conversion type error for input probabilities")
    elif not isinstance(probs, np.ndarray):
        raise TypeError("wrong type for input probabilities")
    # adjust the values: this is taken from the R implementation, where values
    # up to 2e-14 outside [0,1] are accepted and moved to the nearby endpoint
    eps = 100 * np.finfo(np.double).eps
    if (probs < -eps).any() or (probs > 1 + eps).any():
        raise ValueError("probs values outside [0,1]")
    probs = np.maximum(0, np.minimum(1, probs))
    #weights = np.ones(x)
    ## check the weights
    #if isinstance(weights, (pd.DataFrame,pd.Series)):
    #    try:    weights = weights.values
    #    except: raise TypeError("conversion type error for input weights")
    #elif not isinstance(weights, np.ndarray):
    #    try:    weights = np.asarray(weights)
    #    except: raise TypeError("wrong type for input weights")
    #if x.shape != weights.shape:
    #    raise ValueError("the length of data and weights must be the same")
    # check parameter typ value
    if typ not in TYPES:
        raise ValueError("typ should be one of {}!".format(TYPES))
    # check parameter method value
    if method not in METHODS:
        raise ValueError("method should be in {}!".format(METHODS))
    # check parameter is_sorted
    if not isinstance(is_sorted, bool):
        raise TypeError("wrong type for boolean flag is_sorted!")
    # check parameter na_rm
    if not isinstance(na_rm, bool):
        raise TypeError("wrong type for boolean flag na_rm!")
    # check parameter limit
    if not isinstance(limit, (list, tuple, np.ndarray)):
        raise TypeError("wrong type for parameter limit!")
    if len(limit) != 2:
        raise ValueError("the length of limit must be 2")

    ## algorithm implementation
    def gamma_indice(g, j, typ):
        gamma = np.zeros(len(j))
        if typ == 1:
            gamma[np.where(g > 0)] = 1
            # gamma[np.where(g <= 0)] = 0
        elif typ == 2:
            gamma[np.where(g > 0)] = 1
            gamma[np.where(g <= 0)] = 0.5
        elif typ == 3:
            gamma[np.where(np.logical_or(g != 0, j % 2 == 1))] = 1
        elif typ >= 4:
            gamma = g
        return gamma

    def _canonical_quantile1D(typ, sorted_x, probs):
        """Compute the quantile of a 1D numpy array using the canonical/direct
        approach derived from the original algorithms from Hyndman & Fan,
        Cunnane and Filliben.
        """
        # inspired by the _quantiles1D function of mquantiles
        N = len(sorted_x)   # sorted_x.count()
        m_indice = lambda p, i: {1: 0, 2: 0, 3: -0.5, 4: 0, 5: 0.5,
                                 6: p, 7: 1-p, 8: (p+1)/3, 9: (2*p+3)/8,
                                 10: .4 + .2 * p, 11: .3175 + .365*p}[i]
        j_indice = lambda p, n, m: np.int_(np.floor(n * p + m))
        g_indice = lambda p, n, m, j: p * n + m - j
        m = m_indice(probs, typ)
        j = j_indice(probs, N, m)
        j_1 = j - 1
        # adjust for the bounds
        j_1[j_1 < 0] = 0
        j[j > N - 1] = N - 1
        x1 = sorted_x[j_1]   # indexes start at 0...
        x2 = sorted_x[j]
        g = g_indice(probs, N, m, j)
        gamma = gamma_indice(g, j, typ)
        return (1 - gamma) * x1 + gamma * x2

    def _mquantile1D(typ, sorted_x, probs):
        """Compute the quantiles of a 1D numpy array following the
        implementation of the _quantiles1D function of mquantiles.
        source: https://github.com/scipy/scipy/blob/master/scipy/stats/mstats_basic.py
        """
        N = len(sorted_x)   # sorted_x.count() # though ndarray's have no 'count' attribute
        if N == 0:
            return np_ma.array(np.empty(len(probs), dtype=float), mask=True)
        elif N == 1:
            return np_ma.array(np.resize(sorted_x, probs.shape), mask=np_ma.nomask)
        # note that, wrt the original implementation (see source code mentioned
        # above), we also added the definitions of (alphap,betap) for typ in [1,2,3]
        abp_indice = lambda typ: {1: (0, 1), 2: (0, 1), 3: (-.5, -1.5), 4: (0, 1),
                                  5: (.5, .5), 6: (0, 0), 7: (1, 1), 8: (1/3, 1/3),
                                  9: (3/8, 3/8), 10: (.4, .4), 11: (.3175, .3175)}[typ]
        alphap, betap = abp_indice(typ)
        m = alphap + probs * (1. - alphap - betap)
        aleph = (probs * N + m)
        j = np.floor(aleph.clip(1, N - 1)).astype(int)
        g = (aleph - j).clip(0, 1)
        gamma = gamma_indice(g, j, typ)
        return (1. - gamma) * sorted_x[(j - 1).tolist()] + gamma * sorted_x[j.tolist()]

    def _wquantile1D(typ, x, probs, weights):   # not used
        """Compute the weighted quantile of a 1D numpy array.
        """
        # Check the data
        ind_sorted = np.argsort(x)
        sorted_x = x[ind_sorted]
        sorted_weights = weights[ind_sorted]
        # Compute the auxiliary arrays
        Sn = np.cumsum(sorted_weights)
        #assert Sn != 0, "The sum of the weights must not be zero"
        Pn = (Sn - 0.5 * sorted_weights) / np.sum(sorted_weights)
        # Get the value of the weighted median
        return np.interp(probs, Pn, sorted_x)

    ## actual calculation
    # select method
    if method == 'DIRECT':
        _quantile1D = _canonical_quantile1D
    elif method == 'INHERIT':
        _quantile1D = _mquantile1D
    # define input data
    if na_rm is True:
        data = np_ma.array(x, copy=True, mask=np.isnan(x))
        # weights = np_ma.array(x, copy=True, mask=np.isnan(x))
    elif np.isnan(x).any():
        raise ValueError("missing values and NaN's not allowed if 'na_rm' is FALSE")
    else:
        data = np_ma.array(x, copy=False)
    # filter the input data
    if limit:
        condition = (limit[0] < data) & (data < limit[1])
        data[~condition.filled(True)] = np_ma.masked
    # sort if not already the case
    if not is_sorted:
        sorted_data = np_ma.sort(data.compressed()) if ndim == 1 else np_ma.sort(data, axis=1)
    else:
        sorted_data = data
    # Computes quantiles along axis (or globally); apply_along_axis passes each
    # row as the first argument, so close over typ and probs
    if ndim == 1:
        return _quantile1D(typ, sorted_data, probs)
    else:
        return np_ma.apply_along_axis(lambda row: _quantile1D(typ, row, probs),
                                      1, sorted_data)
def hdquantiles(data, prob=list([.25, .5, .75]), axis=None, var=False, ): """ Computes quantile estimates with the Harrell-Davis method. The quantile estimates are calculated as a weighted linear combination of order statistics. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of quantiles to compute. axis : int or None, optional Axis along which to compute the quantiles. If None, use a flattened array. var : bool, optional Whether to return the variance of the estimate. Returns ------- hdquantiles : MaskedArray A (p,) array of quantiles (if `var` is False), or a (2,p) array of quantiles and variances (if `var` is True), where ``p`` is the number of quantiles. """ def _hd_1D(data, prob, var): "Computes the HD quantiles for a 1D array. Returns nan for invalid data." xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) # Don't use length here, in case we have a numpy scalar n = xsorted.size hd = np.empty((2, len(prob)), float_) if n < 2: hd.flat = np.nan if var: return hd return hd[0] v = np.arange(n + 1) / float(n) betacdf = beta.cdf for (i, p) in enumerate(prob): _w = betacdf(v, (n + 1) * p, (n + 1) * (1 - p)) w = _w[1:] - _w[:-1] hd_mean = np.dot(w, xsorted) hd[0, i] = hd_mean # hd[1, i] = np.dot(w, (xsorted - hd_mean) ** 2) # hd[0, prob == 0] = xsorted[0] hd[0, prob == 1] = xsorted[-1] if var: hd[1, prob == 0] = hd[1, prob == 1] = np.nan return hd return hd[0] # Initialization & checks data = ma.array(data, copy=False, dtype=float_) p = np.array(prob, copy=False, ndmin=1) # Computes quantiles along axis (or globally) if (axis is None) or (data.ndim == 1): result = _hd_1D(data, p, var) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hd_1D, axis, data, p, var) return ma.fix_invalid(result, copy=False)
def hdquantiles(data, prob=list([.25, .5, .75]), axis=None, var=False):
    """Computes quantile estimates with the Harrell-Davis method, where the
    estimates are calculated as a weighted linear combination of order
    statistics.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    var : boolean
        Whether to return the variance of the estimate.

    Returns
    -------
    A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of
    quantiles and variances (if ``var`` is True), where ``p`` is the number
    of quantiles.

    Notes
    -----
    The function is restricted to 2D arrays.
    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size
        hd = np.empty((2, len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]
        v = np.arange(n + 1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(v, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = np.dot(w, (xsorted - hd_mean)**2)
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
    return ma.fix_invalid(result, copy=False)
def hdquantiles(data, prob=list([0.25, 0.5, 0.75]), axis=None, var=False): """ Computes quantile estimates with the Harrell-Davis method. The quantile estimates are calculated as a weighted linear combination of order statistics. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of quantiles to compute. axis : int or None, optional Axis along which to compute the quantiles. If None, use a flattened array. var : bool, optional Whether to return the variance of the estimate. Returns ------- hdquantiles : MaskedArray A (p,) array of quantiles (if `var` is False), or a (2,p) array of quantiles and variances (if `var` is True), where ``p`` is the number of quantiles. """ def _hd_1D(data, prob, var): "Computes the HD quantiles for a 1D array. Returns nan for invalid data." xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) # Don't use length here, in case we have a numpy scalar n = xsorted.size hd = np.empty((2, len(prob)), float_) if n < 2: hd.flat = np.nan if var: return hd return hd[0] v = np.arange(n + 1) / float(n) betacdf = beta.cdf for (i, p) in enumerate(prob): _w = betacdf(v, (n + 1) * p, (n + 1) * (1 - p)) w = _w[1:] - _w[:-1] hd_mean = np.dot(w, xsorted) hd[0, i] = hd_mean # hd[1, i] = np.dot(w, (xsorted - hd_mean) ** 2) # hd[0, prob == 0] = xsorted[0] hd[0, prob == 1] = xsorted[-1] if var: hd[1, prob == 0] = hd[1, prob == 1] = np.nan return hd return hd[0] # Initialization & checks data = ma.array(data, copy=False, dtype=float_) p = np.array(prob, copy=False, ndmin=1) # Computes quantiles along axis (or globally) if (axis is None) or (data.ndim == 1): result = _hd_1D(data, p, var) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hd_1D, axis, data, p, var) return ma.fix_invalid(result, copy=False)
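# Example usage of hdquantiles above -- a minimal sketch, assuming the
# snippet's imports (numpy as np, numpy.ma as ma, ndarray/float_ from numpy,
# beta from scipy.stats) are in scope.
import numpy as np

data = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
print(hdquantiles(data))             # Harrell-Davis estimates of .25, .5, .75
print(hdquantiles(data, var=True))   # row 0: quantiles, row 1: their variances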
def whiskerbox(series, fsp=None, positions=None, mode='mquantiles', width=0.8,
               wisk=None, plot_mean=False, logscale=None, color=None,
               outliers=None):
    """
    Draws a whisker plot.
    The bottom and top of the boxes correspond to the lower and upper quartiles
    respectively (25th and 75th percentiles).

    Parameters
    ----------
    series : Sequence
        Input data. If the sequence is 2D, each column is assumed to represent
        a different variable.
    fsp : :class:`Subplot`
        Subplot where to draw the data. If None, uses the current axes.
    positions : {None, sequence}, optional
        Positions along the x-axis.
        If None, use a scale from 1 to the number of columns.
    mode : {'mquantiles', 'hdquantiles'}, optional
        Type of algorithm used to compute the quantiles.
        If 'mquantiles', use the classical form :func:`~scipy.stats.mstats.mquantiles`.
        If 'hdquantiles', use the Harrell-Davis estimators of the function
        :func:`~scipy.stats.mstats.hdquantiles`.
    wisk : {None, float}, optional
        Whisker size, as a multiplier of the inter-quartile range.
        If None, the whiskers are drawn between the 5th and 95th percentiles.
    plot_mean : {False, True}, optional
        Whether to overlay the mean on the box.
    color : {None, string}, optional
        Color of the main box.
    outliers : {dictionary}, optional
        Options for plotting outliers.
        By default, the dictionary uses
        ``dict(marker='x', ms=4, mfc='#999999', ls='')``
    """
    outliers = outliers or dict(marker='x', ms=4, mfc='#999999',
                                mec='#999999', ls='')
    if fsp is None:
        fsp = pyplot.gca()
    if not fsp._hold:
        fsp.cla()
    # Make sure the series is a masked array
    series = ma.array(series, copy=False, subok=False)
    # Reshape the series ...................
    if series.ndim == 1:
        series = series.reshape(-1, 1)
    elif series.ndim > 2:
        series = np.swapaxes(series, 1, -1).reshape(-1, series.shape[1])
    if positions is None:
        positions = np.arange(1, series.shape[1] + 1)
    # Get the quantiles ....................
    plist = [0.05, 0.25, 0.5, 0.75, 0.95]
    # Harrell-Davis ........
    if mode == 'hdquantiles':
        # 1D data ...........
        if series.ndim == 0:
            (qb, ql, qm, qh, qt) = mstats.hdquantiles(series.ravel(), plist)
        # 2D data ...........
        else:
            (qb, ql, qm, qh, qt) = ma.apply_along_axis(mstats.hdquantiles, 0,
                                                       series, plist)
    # Basic quantiles .......
    else:
        (qb, ql, qm, qh, qt) = mstats.mquantiles(series, plist, axis=0)
    # Get the heights, bottoms, and whisker positions
    heights = qh - ql
    bottoms = ql
    if wisk is not None:
        hival = qh + wisk * heights
        loval = ql - wisk * heights
    else:
        (hival, loval) = (qt, qb)
    # Plot the whiskers and outliers .......
    for i, pos, xh, xl in np.broadcast(np.arange(len(positions)), positions,
                                       hival, loval):
        x = series[:, i]
        # Get high extreme ..
        wisk_h = x[(x <= xh).filled(False)]
        if len(wisk_h) == 0:
            wisk_h = qh[i]
        else:
            wisk_h = max(wisk_h)
        # Low extremes ......
        wisk_l = x[(x >= xl).filled(False)]
        if len(wisk_l) == 0:
            wisk_l = ql[i]
        else:
            wisk_l = min(wisk_l)
        fsp.plot((pos, pos), (wisk_l, wisk_h), dashes=(1, 1), c='k', zorder=1)
        fsp.plot((pos - 0.25 * width, pos + 0.25 * width),
                 (wisk_l, wisk_l), '-', c='k')
        fsp.plot((pos - 0.25 * width, pos + 0.25 * width),
                 (wisk_h, wisk_h), '-', c='k')
        # Outliers, if any...
        if outliers is not None and len(outliers) > 0:
            flh = x[(x > xh).filled(False)].view(ndarray)
            fll = x[(x < xl).filled(False)].view(ndarray)
            if len(flh) > 0 and len(fll) > 0:
                fsp.plot([pos] * (len(flh) + len(fll)), np.r_[flh, fll],
                         **outliers)
        # Plot the median....
        fsp.plot((pos - 0.5 * width, pos + 0.5 * width), (qm[i], qm[i]),
                 ls='-', c='k', lw=1.2, zorder=99)
        # Plot the mean......
        if plot_mean:
            fsp.plot((pos - 0.5 * width, pos + 0.5 * width),
                     (x.mean(), x.mean()),
                     ls=':', dashes=(1, 1), c='#000000', lw=1.1, zorder=99)
            # fsp.plot((pos,), (x.mean(),), marker='o', color=color, zorder=99)
    # Plot the boxes .......................
    bars = fsp.bar(positions - 0.5 * width, heights, width=width,
                   bottom=bottoms, color=color, yerr=None, xerr=None,
                   ecolor='k', capsize=3, zorder=50)
    if logscale:
        fsp.set_yscale('log')
    return bars
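# A hypothetical usage sketch of whiskerbox above (assumes matplotlib.pyplot as
# pyplot, scipy.stats.mstats as mstats, numpy.ma as ma and ndarray from numpy
# in scope). Note the body reads the private Axes attribute `_hold`, which was
# removed in recent matplotlib versions, so this runs as-is only on older
# matplotlib.
import numpy as np
import numpy.ma as ma

rng = np.random.default_rng(42)
series = ma.masked_invalid(rng.normal(size=(100, 4)))
whiskerbox(series, width=0.6, wisk=1.5, plot_mean=True, color='#cccccc')
pyplot.show()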
def hdquantiles(data, prob=list([0.25, 0.5, 0.75]), axis=None, var=False):
    """Computes quantile estimates with the Harrell-Davis method, where the
    estimates are calculated as a weighted linear combination of order
    statistics.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles.
        If None, use a flattened array.
    var : boolean
        Whether to return the variance of the estimate.

    Returns
    -------
    A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of
    quantiles and variances (if ``var`` is True), where ``p`` is the number
    of quantiles.

    Notes
    -----
    The function is restricted to 2D arrays.
    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size
        hd = np.empty((2, len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]
        v = np.arange(n + 1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(v, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = np.dot(w, (xsorted - hd_mean) ** 2)
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
    return ma.fix_invalid(result, copy=False)
def plotting_positions(data, alpha=0.4, beta=0.4, axis=0, masknan=False):
    """Returns the plotting positions (or empirical percentile points) for the
    data.
    Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
        - i is the rank order statistics (starting at 1)
        - n is the number of unmasked values along the given axis
        - alpha and beta are two parameters.

    Typical values for alpha and beta are:
        - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
        - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function
          (R, type 5) (Bliss 1967: "Rankit")
        - (0,0)    : *p(k) = k/(n+1)* : Weibull (R type 6),
          (Van der Waerden 1952)
        - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
          That's R default (R type 7)
        - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (R type 8), (Tukey 1962)
        - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. The resulting quantile estimates
          are approximately unbiased if x is normally distributed (R type 9)
          (Blom 1958)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM

    Parameters
    ----------
    data : sequence
        Input data, as a sequence or array of dimension at most 2.
    alpha : {0.4, float} optional
        Plotting positions parameter.
    beta : {0.4, float} optional
        Plotting positions parameter.
    axis : {0, int} optional
        Axis along which the plotting positions are computed.
        If None, use a flattened array.
    masknan : {False, bool} optional
        Whether to mask NaNs in the input; the positions of NaN entries are
        returned as NaN.

    Notes
    -----
    I think the adjustments assume that there are no ties in order to be a
    reasonable approximation to a continuous density function. TODO: check this

    References
    ----------
    unknown, dates to original papers from Beasley, Erickson, Allison 2009
    Behav Genet
    """
    if isinstance(data, np.ma.MaskedArray):
        if axis is None or data.ndim == 1:
            return stats.mstats.plotting_positions(data, alpha=alpha, beta=beta)
        else:
            return ma.apply_along_axis(stats.mstats.plotting_positions, axis,
                                       data, alpha=alpha, beta=beta)
    if masknan:
        nanmask = np.isnan(data)
        if nanmask.any():
            marr = ma.array(data, mask=nanmask)
            # code duplication:
            if axis is None or data.ndim == 1:
                marr = stats.mstats.plotting_positions(marr, alpha=alpha,
                                                       beta=beta)
            else:
                marr = ma.apply_along_axis(stats.mstats.plotting_positions,
                                           axis, marr, alpha=alpha, beta=beta)
            return ma.filled(marr, fill_value=np.nan)

    data = np.asarray(data)
    if data.size == 1:    # use helper function instead
        data = np.atleast_1d(data)
        axis = 0
    if axis is None:
        data = data.ravel()
        axis = 0
    n = data.shape[axis]
    if data.ndim == 1:
        plpos = np.empty(data.shape, dtype=float)
        plpos[data.argsort()] = (np.arange(1, n + 1) - alpha) / (n + 1. - alpha - beta)
    else:
        # nd assignment instead of second argsort doesn't look easy
        plpos = (data.argsort(axis).argsort(axis) + 1. - alpha) / (n + 1. - alpha - beta)
    return plpos
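# Example usage of plotting_positions above -- a minimal sketch, assuming
# numpy as np, numpy.ma as ma and scipy.stats as stats are imported as the
# function expects.
import numpy as np

x = np.array([3., 1., np.nan, 2.])
print(plotting_positions(x, masknan=True))
# -> [0.8125, 0.1875, nan, 0.5] : Cunnane positions (i - 0.4)/(n + 0.2), n = 3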
def mquantiles(a, prob=list([.25, .5, .75]), alphap=.4, betap=.4, axis=None,
               limit=()):
    """
    Computes empirical quantiles for a data array.

    Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
    where ``x[j]`` is the j-th order statistic, and gamma is a function of
    ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
    ``g = n*p + m - j``.

    Reinterpreting the above equations to compare to **R** leads to the
    equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``

    Typical values of (alphap,betap) are:
        - (0,1)    : ``p(k) = k/n`` : linear interpolation of cdf
          (**R** type 4)
        - (.5,.5)  : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
          (**R** type 5)
        - (0,0)    : ``p(k) = k/(n+1)`` : (**R** type 6)
        - (1,1)    : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
          (**R** type 7, **R** default)
        - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (**R** type 8)
        - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
          The resulting quantile estimates are approximately unbiased
          if x is normally distributed (**R** type 9)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM

    Parameters
    ----------
    a : array_like
        Input data, as a sequence or array of dimension at most 2.
    prob : array_like, optional
        List of quantiles to compute.
    alphap : float, optional
        Plotting positions parameter, default is 0.4.
    betap : float, optional
        Plotting positions parameter, default is 0.4.
    axis : int, optional
        Axis along which to perform the trimming.
        If None (default), the input array is first flattened.
    limit : tuple
        Tuple of (lower, upper) values.
        Values of `a` outside this open interval are ignored.

    Returns
    -------
    mquantiles : MaskedArray
        An array containing the calculated quantiles.

    Notes
    -----
    This formulation is very similar to **R** except the calculation of
    ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
    with each type.

    References
    ----------
    .. [1] *R* statistical software at http://www.r-project.org/

    Examples
    --------
    >>> from scipy.stats.mstats import mquantiles
    >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
    >>> mquantiles(a)
    array([ 19.2,  40. ,  42.8])

    Using a 2D array, specifying axis and limit.

    >>> data = np.array([[  6.,   7.,   1.],
                         [ 47.,  15.,   2.],
                         [ 49.,  36.,   3.],
                         [ 15.,  39.,   4.],
                         [ 42.,  40., -999.],
                         [ 41.,  41., -999.],
                         [  7., -999., -999.],
                         [ 39., -999., -999.],
                         [ 43., -999., -999.],
                         [ 40., -999., -999.],
                         [ 36., -999., -999.]])
    >>> mquantiles(data, axis=0, limit=(0, 50))
    array([[ 19.2 ,  14.6 ,   1.45],
           [ 40.  ,  37.5 ,   2.5 ],
           [ 42.8 ,  40.05,   3.55]])

    >>> data[:, 2] = -999.
    >>> mquantiles(data, axis=0, limit=(0, 50))
    masked_array(data =
     [[19.2 14.6 --]
     [40.0 37.5 --]
     [42.8 40.05 --]],
                 mask =
     [[False False True]
      [False False True]
      [False False True]],
           fill_value = 1e+20)

    """
    def _quantiles1D(data, m, p):
        x = np.sort(data.compressed())
        n = len(x)
        if n == 0:
            return ma.array(np.empty(len(p), dtype=float), mask=True)
        elif n == 1:
            return ma.array(np.resize(x, p.shape), mask=nomask)
        aleph = (n * p + m)
        k = np.floor(aleph.clip(1, n - 1)).astype(int)
        gamma = (aleph - k).clip(0, 1)
        return (1. - gamma) * x[(k - 1).tolist()] + gamma * x[k.tolist()]

    # Initialization & checks
    data = ma.array(a, copy=False)
    if data.ndim > 2:
        raise TypeError("Array should be 2D at most !")

    if limit:
        condition = (limit[0] < data) & (data < limit[1])
        data[~condition.filled(True)] = masked

    p = np.array(prob, copy=False, ndmin=1)
    m = alphap + p * (1. - alphap - betap)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _quantiles1D(data, m, p)

    return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
def whiskerbox(
    series,
    fsp=None,
    positions=None,
    mode="mquantiles",
    width=0.8,
    wisk=None,
    plot_mean=False,
    logscale=None,
    color=None,
    outliers=None,
):
    """
    Draws a whisker plot.
    The bottom and top of the boxes correspond to the lower and upper quartiles
    respectively (25th and 75th percentiles).

    Parameters
    ----------
    series : Sequence
        Input data. If the sequence is 2D, each column is assumed to represent
        a different variable.
    fsp : :class:`Subplot`
        Subplot where to draw the data. If None, uses the current axes.
    positions : {None, sequence}, optional
        Positions along the x-axis.
        If None, use a scale from 1 to the number of columns.
    mode : {'mquantiles', 'hdquantiles'}, optional
        Type of algorithm used to compute the quantiles.
        If 'mquantiles', use the classical form :func:`~scipy.stats.mstats.mquantiles`.
        If 'hdquantiles', use the Harrell-Davis estimators of the function
        :func:`~scipy.stats.mstats.hdquantiles`.
    wisk : {None, float}, optional
        Whisker size, as a multiplier of the inter-quartile range.
        If None, the whiskers are drawn between the 5th and 95th percentiles.
    plot_mean : {False, True}, optional
        Whether to overlay the mean on the box.
    color : {None, string}, optional
        Color of the main box.
    outliers : {dictionary}, optional
        Options for plotting outliers.
        By default, the dictionary uses
        ``dict(marker='x', ms=4, mfc='#999999', ls='')``
    """
    outliers = outliers or dict(marker="x", ms=4, mfc="#999999", mec="#999999", ls="")
    if fsp is None:
        fsp = pyplot.gca()
    if not fsp._hold:
        fsp.cla()
    # Make sure the series is a masked array
    series = ma.array(series, copy=False, subok=False)
    # Reshape the series ...................
    if series.ndim == 1:
        series = series.reshape(-1, 1)
    elif series.ndim > 2:
        series = np.swapaxes(series, 1, -1).reshape(-1, series.shape[1])
    if positions is None:
        positions = np.arange(1, series.shape[1] + 1)
    # Get the quantiles ....................
    plist = [0.05, 0.25, 0.5, 0.75, 0.95]
    # Harrell-Davis ........
    if mode == "hdquantiles":
        # 1D data ...........
        if series.ndim == 0:
            (qb, ql, qm, qh, qt) = mstats.hdquantiles(series.ravel(), plist)
        # 2D data ...........
        else:
            (qb, ql, qm, qh, qt) = ma.apply_along_axis(mstats.hdquantiles, 0, series, plist)
    # Basic quantiles .......
    else:
        (qb, ql, qm, qh, qt) = mstats.mquantiles(series, plist, axis=0)
    # Get the heights, bottoms, and whisker positions
    heights = qh - ql
    bottoms = ql
    if wisk is not None:
        hival = qh + wisk * heights
        loval = ql - wisk * heights
    else:
        (hival, loval) = (qt, qb)
    # Plot the whiskers and outliers .......
    for i, pos, xh, xl in np.broadcast(np.arange(len(positions)), positions, hival, loval):
        x = series[:, i]
        # Get high extreme ..
        wisk_h = x[(x <= xh).filled(False)]
        if len(wisk_h) == 0:
            wisk_h = qh[i]
        else:
            wisk_h = max(wisk_h)
        # Low extremes ......
        wisk_l = x[(x >= xl).filled(False)]
        if len(wisk_l) == 0:
            wisk_l = ql[i]
        else:
            wisk_l = min(wisk_l)
        fsp.plot((pos, pos), (wisk_l, wisk_h), dashes=(1, 1), c="k", zorder=1)
        fsp.plot((pos - 0.25 * width, pos + 0.25 * width), (wisk_l, wisk_l), "-", c="k")
        fsp.plot((pos - 0.25 * width, pos + 0.25 * width), (wisk_h, wisk_h), "-", c="k")
        # Outliers, if any...
        if outliers is not None and len(outliers) > 0:
            flh = x[(x > xh).filled(False)].view(ndarray)
            fll = x[(x < xl).filled(False)].view(ndarray)
            if len(flh) > 0 and len(fll) > 0:
                fsp.plot([pos] * (len(flh) + len(fll)), np.r_[flh, fll], **outliers)
        # Plot the median....
        fsp.plot((pos - 0.5 * width, pos + 0.5 * width), (qm[i], qm[i]),
                 ls="-", c="k", lw=1.2, zorder=99)
        # Plot the mean......
        if plot_mean:
            fsp.plot(
                (pos - 0.5 * width, pos + 0.5 * width),
                (x.mean(), x.mean()),
                ls=":",
                dashes=(1, 1),
                c="#000000",
                lw=1.1,
                zorder=99,
            )
            # fsp.plot((pos,), (x.mean(),), marker='o', color=color, zorder=99)
    # Plot the boxes .......................
    bars = fsp.bar(
        positions - 0.5 * width,
        heights,
        width=width,
        bottom=bottoms,
        color=color,
        yerr=None,
        xerr=None,
        ecolor="k",
        capsize=3,
        zorder=50,
    )
    if logscale:
        fsp.set_yscale("log")
    return bars
def plotting_positions(data, alpha=0.4, beta=0.4, axis=0, masknan=False):
    """Returns the plotting positions (or empirical percentile points) for the
    data.
    Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
        - i is the rank order statistics (starting at 1)
        - n is the number of unmasked values along the given axis
        - alpha and beta are two parameters.

    Typical values for alpha and beta are:
        - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
        - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function
          (R, type 5) (Bliss 1967: "Rankit")
        - (0,0)    : *p(k) = k/(n+1)* : Weibull (R type 6),
          (Van der Waerden 1952)
        - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
          That's R default (R type 7)
        - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (R type 8), (Tukey 1962)
        - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. The resulting quantile estimates
          are approximately unbiased if x is normally distributed (R type 9)
          (Blom 1958)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM

    Parameters
    ----------
    data : sequence
        Input data, as a sequence or array of dimension at most 2.
    alpha : {0.4, float} optional
        Plotting positions parameter.
    beta : {0.4, float} optional
        Plotting positions parameter.
    axis : {0, int} optional
        Axis along which the plotting positions are computed.
        If None, use a flattened array.
    masknan : {False, bool} optional
        Whether to mask NaNs in the input; the positions of NaN entries are
        returned as NaN.

    Notes
    -----
    I think the adjustments assume that there are no ties in order to be a
    reasonable approximation to a continuous density function. TODO: check this

    References
    ----------
    unknown, dates to original papers from Beasley, Erickson, Allison 2009
    Behav Genet
    """
    if isinstance(data, np.ma.MaskedArray):
        if axis is None or data.ndim == 1:
            return stats.mstats.plotting_positions(data, alpha=alpha, beta=beta)
        else:
            return ma.apply_along_axis(stats.mstats.plotting_positions, axis,
                                       data, alpha=alpha, beta=beta)
    if masknan:
        nanmask = np.isnan(data)
        if nanmask.any():
            marr = ma.array(data, mask=nanmask)
            # code duplication:
            if axis is None or data.ndim == 1:
                marr = stats.mstats.plotting_positions(marr, alpha=alpha,
                                                       beta=beta)
            else:
                marr = ma.apply_along_axis(stats.mstats.plotting_positions,
                                           axis, marr, alpha=alpha, beta=beta)
            return ma.filled(marr, fill_value=np.nan)

    data = np.asarray(data)
    if data.size == 1:    # use helper function instead
        data = np.atleast_1d(data)
        axis = 0
    if axis is None:
        data = data.ravel()
        axis = 0
    n = data.shape[axis]
    if data.ndim == 1:
        plpos = np.empty(data.shape, dtype=float)
        plpos[data.argsort()] = (np.arange(1, n + 1) - alpha) / (n + 1. - alpha - beta)
    else:
        # nd assignment instead of second argsort does not look easy
        plpos = (data.argsort(axis).argsort(axis) + 1. - alpha) / (n + 1. - alpha - beta)
    return plpos
def mquantiles(a, prob=list([.25, .5, .75]), alphap=.4, betap=.4, axis=None,
               limit=()):
    """
    Computes empirical quantiles for a data array.

    Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
    where ``x[j]`` is the j-th order statistic, and gamma is a function of
    ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
    ``g = n*p + m - j``.

    Reinterpreting the above equations to compare to **R** leads to the
    equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``

    Typical values of (alphap,betap) are:
        - (0,1)    : ``p(k) = k/n`` : linear interpolation of cdf
          (**R** type 4)
        - (.5,.5)  : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
          (**R** type 5)
        - (0,0)    : ``p(k) = k/(n+1)`` : (**R** type 6)
        - (1,1)    : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
          (**R** type 7, **R** default)
        - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (**R** type 8)
        - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
          The resulting quantile estimates are approximately unbiased
          if x is normally distributed (**R** type 9)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM

    Parameters
    ----------
    a : array_like
        Input data, as a sequence or array of dimension at most 2.
    prob : array_like, optional
        List of quantiles to compute.
    alphap : float, optional
        Plotting positions parameter, default is 0.4.
    betap : float, optional
        Plotting positions parameter, default is 0.4.
    axis : int, optional
        Axis along which to perform the trimming.
        If None (default), the input array is first flattened.
    limit : tuple
        Tuple of (lower, upper) values.
        Values of `a` outside this open interval are ignored.

    Returns
    -------
    mquantiles : MaskedArray
        An array containing the calculated quantiles.

    Notes
    -----
    This formulation is very similar to **R** except the calculation of
    ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
    with each type.

    References
    ----------
    .. [1] *R* statistical software at http://www.r-project.org/

    Examples
    --------
    >>> from scipy.stats.mstats import mquantiles
    >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
    >>> mquantiles(a)
    array([ 19.2,  40. ,  42.8])

    Using a 2D array, specifying axis and limit.

    >>> data = np.array([[  6.,   7.,   1.],
                         [ 47.,  15.,   2.],
                         [ 49.,  36.,   3.],
                         [ 15.,  39.,   4.],
                         [ 42.,  40., -999.],
                         [ 41.,  41., -999.],
                         [  7., -999., -999.],
                         [ 39., -999., -999.],
                         [ 43., -999., -999.],
                         [ 40., -999., -999.],
                         [ 36., -999., -999.]])
    >>> mquantiles(data, axis=0, limit=(0, 50))
    array([[ 19.2 ,  14.6 ,   1.45],
           [ 40.  ,  37.5 ,   2.5 ],
           [ 42.8 ,  40.05,   3.55]])

    >>> data[:, 2] = -999.
    >>> mquantiles(data, axis=0, limit=(0, 50))
    masked_array(data =
     [[19.2 14.6 --]
     [40.0 37.5 --]
     [42.8 40.05 --]],
                 mask =
     [[False False True]
      [False False True]
      [False False True]],
           fill_value = 1e+20)

    """
    def _quantiles1D(data, m, p):
        x = np.sort(data.compressed())
        n = len(x)
        if n == 0:
            return ma.array(np.empty(len(p), dtype=float), mask=True)
        elif n == 1:
            return ma.array(np.resize(x, p.shape), mask=nomask)
        aleph = (n*p + m)
        k = np.floor(aleph.clip(1, n-1)).astype(int)
        gamma = (aleph-k).clip(0, 1)
        return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]

    # Initialization & checks
    data = ma.array(a, copy=False)
    if data.ndim > 2:
        raise TypeError("Array should be 2D at most !")

    if limit:
        condition = (limit[0] < data) & (data < limit[1])
        data[~condition.filled(True)] = masked

    p = np.array(prob, copy=False, ndmin=1)
    m = alphap + p*(1.-alphap-betap)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _quantiles1D(data, m, p)

    return ma.apply_along_axis(_quantiles1D, axis, data, m, p)