def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None,
                   nmin=10, tb=1, alpha=0.05, plotflag=False):
    '''Return Dispersion Index vs threshold

    Parameters
    ----------
    data, t : array_like
        data values and sampled times, respectively.
        (t defaults to arange(len(data)))
    u : array-like
        threshold values (default linspace(umin, umax, nu))
    umin, umax : real scalars
        Minimum and maximum threshold, respectively
        (default min(data), max(data)).
    nu : scalar integer
        number of threshold values (default min(N-nmin,100))
    nmin : scalar integer
        Minimum number of extremes to include. (Default 10).
    tb : Real scalar
        Block period (same unit as the sampled times)  (default 1)
    alpha : real scalar
        Confidence coefficient (default 0.05)
    plotflag: bool
        If True the result is plotted before it is returned.

    Returns
    -------
    DI : PlotData object
        Dispersion index as function of the thresholds. The confidence
        limits are stored in DI.children and the settings in DI.workspace.
    b_u : real scalar
        threshold where the number of exceedances in a fixed period (Tb) is
        consistent with a Poisson process (None if no such threshold).
    ok_u : array-like
        all thresholds where the number of exceedances in a fixed period (Tb)
        is consistent with a Poisson process (None if no such threshold).

    Notes
    -----
    DISPRSNIDX estimate the Dispersion Index (DI) as function of threshold.
    DI measures the homogeneity of data and the purpose of DI is to determine
    the threshold where the number of exceedances in a fixed period (Tb) is
    consistent with a Poisson process. For a Poisson process the DI is one.
    Thus the threshold should be so high that DI is not significantly
    different from 1.

    The Poisson hypothesis is not rejected if the estimated DI is between:

        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1 - alpha/2, M-1)/(M-1)

    where M is the total number of fixed periods/blocks - generally
    the total number of years in the sample.

    Thresholds with no exceedances at all get DI = NaN and are never
    reported as acceptable.

    Example
    -------
    >>> import wafo.data
    >>> xn = wafo.data.sea()
    >>> t, data = xn.T
    >>> Ie = findpot(data,t,0,5);
    >>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
    >>> h = di.plot() # a threshold around 1 seems appropriate.
    >>> round(u*100)/100
    1.03

    vline(u)

    See also
    --------
    reslife,
    fitgenparrange,
    extremal_idx

    References
    ----------
    Ribatet, M. A., (2006),
    A User's Guide to the POT Package (Version 1.0),
    August, http://cran.r-project.org/

    Cunnane, C. (1979) Note on the poisson assumption in
    partial duration series model. Water Resource Research, 15(2):489--494.
    '''
    n = len(data)
    d = arr(data)

    # Map every sample time to the index of the block (period of length tb)
    # it falls into; times are shifted so that the first block is number 0.
    if t is None:
        ti = arange(n)
    else:
        ti = arr(t) - min(t)
    t1 = np.floor(ti / tb).astype(int)

    if u is None:
        sd = np.sort(data)
        nmin = max(nmin, 0)
        if 2 * nmin > n:
            warnings.warn('nmin possibly too large!')
        # sd[-0] is sd[0] (the smallest value), which would collapse the
        # threshold range to a single point, so nmin == 0 maps to the maximum.
        sdmax = sd[-nmin] if nmin > 0 else sd[-1]
        sdmin = sd[0]
        umax = sdmax if umax is None else min(umax, sdmax)
        umin = sdmin if umin is None else max(umin, sdmin)
        if nu is None:
            nu = min(n - nmin, 100)
        u = linspace(umin, umax, nu)
    else:
        # Accept any array-like (e.g. a plain list) as promised above.
        u = arr(u)
    nu = len(u)

    mint = int(min(t1))  # 0 by construction, kept for robustness
    maxt = int(max(t1))
    M = maxt - mint + 1  # total number of blocks
    block = t1 - mint

    di = np.empty(nu)
    di.fill(np.nan)
    for ix, tresh in enumerate(u.tolist()):
        excess = (d > tresh)
        n_excess = excess.sum()
        if n_excess == 0:
            # Mean rate is zero: the index is undefined (0/0), leave it NaN.
            continue
        # Number of exceedances in each block, empty blocks included.
        occ = np.bincount(block[excess], minlength=M)
        # float() keeps this a true division on Python 2 as well.
        lambda_ = n_excess / float(M)
        di[ix] = occ.var() / lambda_

    p = 1 - alpha
    # NOTE(review): the ordering below assumes _invchi2(q, df) returns the
    # chi2 value exceeded with probability q - confirm against its definition.
    diLo = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
    diUp = _invchi2(alpha / 2, M - 1) / (M - 1)

    # Find appropriate threshold: among the thresholds inside the confidence
    # band, take the first one whose DI is above the mean of the accepted DI.
    k1, = np.where((diLo < di) & (di < diUp))
    if len(k1) > 0:
        ok_u = u[k1]
        b_di = (di[k1].mean() < di[k1])
        k = b_di.argmax()
        b_u = ok_u[k]
    else:
        b_u = ok_u = None

    CItxt = '%d%s CI' % (100 * p, '%')
    titleTxt = 'Dispersion Index plot'

    # xlab/ylab are the label keywords used by every other PlotData call in
    # this module (the confidence-band child below and reslife).
    res = PlotData(di, u, title=titleTxt,
                   xlab='Threshold', ylab='Dispersion Index')
    res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
    res.children = [
        PlotData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u,
                 xlab='Threshold', title=CItxt)]
    res.plot_args_children = ['--r']
    if plotflag:
        # NOTE(review): reslife calls plot() without arguments; confirm that
        # passing di here is intended.
        res.plot(di)
    return res, b_u, ok_u
def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None,
                   nmin=10, tb=1, alpha=0.05, plotflag=False):
    '''Return Dispersion Index vs threshold

    Parameters
    ----------
    data, t : array_like
        data values and sampled times, respectively.
        (t defaults to arange(len(data)))
    u : array-like
        threshold values (default linspace(umin, umax, nu))
    umin, umax : real scalars
        Minimum and maximum threshold, respectively
        (default min(data), max(data)).
    nu : scalar integer
        number of threshold values (default min(N-nmin,100))
    nmin : scalar integer
        Minimum number of extremes to include. (Default 10).
    tb : Real scalar
        Block period (same unit as the sampled times)  (default 1)
    alpha : real scalar
        Confidence coefficient (default 0.05)
    plotflag: bool
        If True the result is plotted before it is returned.

    Returns
    -------
    DI : PlotData object
        Dispersion index as function of the thresholds. The confidence
        limits are stored in DI.children and the settings in DI.workspace.
    b_u : real scalar
        threshold where the number of exceedances in a fixed period (Tb) is
        consistent with a Poisson process (None if no such threshold).
    ok_u : array-like
        all thresholds where the number of exceedances in a fixed period (Tb)
        is consistent with a Poisson process (None if no such threshold).

    Notes
    -----
    DISPRSNIDX estimate the Dispersion Index (DI) as function of threshold.
    DI measures the homogeneity of data and the purpose of DI is to determine
    the threshold where the number of exceedances in a fixed period (Tb) is
    consistent with a Poisson process. For a Poisson process the DI is one.
    Thus the threshold should be so high that DI is not significantly
    different from 1.

    The Poisson hypothesis is not rejected if the estimated DI is between:

        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1 - alpha/2, M-1)/(M-1)

    where M is the total number of fixed periods/blocks - generally
    the total number of years in the sample.

    Thresholds with no exceedances at all get DI = NaN and are never
    reported as acceptable.

    Example
    -------
    >>> import wafo.data
    >>> xn = wafo.data.sea()
    >>> t, data = xn.T
    >>> Ie = findpot(data,t,0,5);
    >>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
    >>> h = di.plot() # a threshold around 1 seems appropriate.
    >>> round(u*100)/100
    1.03

    vline(u)

    See also
    --------
    reslife,
    fitgenparrange,
    extremal_idx

    References
    ----------
    Ribatet, M. A., (2006),
    A User's Guide to the POT Package (Version 1.0),
    August, http://cran.r-project.org/

    Cunnane, C. (1979) Note on the poisson assumption in
    partial duration series model. Water Resource Research, 15(2):489--494.
    '''
    n = len(data)
    d = arr(data)

    # Map every sample time to the index of the block (period of length tb)
    # it falls into; times are shifted so that the first block is number 0.
    if t is None:
        ti = arange(n)
    else:
        ti = arr(t) - min(t)
    t1 = np.floor(ti / tb).astype(int)

    if u is None:
        sd = np.sort(data)
        nmin = max(nmin, 0)
        if 2 * nmin > n:
            warnings.warn('nmin possibly too large!')
        # sd[-0] is sd[0] (the smallest value), which would collapse the
        # threshold range to a single point, so nmin == 0 maps to the maximum.
        sdmax = sd[-nmin] if nmin > 0 else sd[-1]
        sdmin = sd[0]
        umax = sdmax if umax is None else min(umax, sdmax)
        umin = sdmin if umin is None else max(umin, sdmin)
        if nu is None:
            nu = min(n - nmin, 100)
        u = linspace(umin, umax, nu)
    else:
        # Accept any array-like (e.g. a plain list) as promised above.
        u = arr(u)
    nu = len(u)

    mint = int(min(t1))  # 0 by construction, kept for robustness
    maxt = int(max(t1))
    M = maxt - mint + 1  # total number of blocks
    block = t1 - mint

    di = np.empty(nu)
    di.fill(np.nan)
    for ix, tresh in enumerate(u.tolist()):
        excess = (d > tresh)
        n_excess = excess.sum()
        if n_excess == 0:
            # Mean rate is zero: the index is undefined (0/0), leave it NaN.
            continue
        # Number of exceedances in each block, empty blocks included.
        occ = np.bincount(block[excess], minlength=M)
        # float() keeps this a true division on Python 2 as well.
        lambda_ = n_excess / float(M)
        di[ix] = occ.var() / lambda_

    p = 1 - alpha
    # NOTE(review): the ordering below assumes _invchi2(q, df) returns the
    # chi2 value exceeded with probability q - confirm against its definition.
    diLo = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
    diUp = _invchi2(alpha / 2, M - 1) / (M - 1)

    # Find appropriate threshold: among the thresholds inside the confidence
    # band, take the first one whose DI is above the mean of the accepted DI.
    k1, = np.where((diLo < di) & (di < diUp))
    if len(k1) > 0:
        ok_u = u[k1]
        b_di = (di[k1].mean() < di[k1])
        k = b_di.argmax()
        b_u = ok_u[k]
    else:
        b_u = ok_u = None

    CItxt = '%d%s CI' % (100 * p, '%')
    titleTxt = 'Dispersion Index plot'

    # xlab/ylab are the label keywords used by every other PlotData call in
    # this module (the confidence-band child below and reslife).
    res = PlotData(di, u, title=titleTxt,
                   xlab='Threshold', ylab='Dispersion Index')
    res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
    res.children = [
        PlotData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u,
                 xlab='Threshold', title=CItxt)]
    res.plot_args_children = ['--r']
    if plotflag:
        # NOTE(review): reslife calls plot() without arguments; confirm that
        # passing di here is intended.
        res.plot(di)
    return res, b_u, ok_u
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05,
            plotflag=False):
    '''
    Return Mean Residual Life, i.e., mean excesses vs thresholds

    Parameters
    ----------
    data : array_like
        vector of data of length N.
    u : array-like
        threshold values (default linspace(umin, umax, nu))
    umin, umax : real scalars
        Minimum and maximum threshold, respectively
        (default min(data), max(data)).
    nu : scalar integer
        number of threshold values (default min(N-nmin,100))
    nmin : scalar integer
        Minimum number of extremes to include. (Default 3).
    alpha : real scalar
        Confidence coefficient (default 0.05)
    plotflag: bool
        If True the result is plotted before it is returned.

    Returns
    -------
    mrl : PlotData object
        Mean residual life values, i.e., mean excesses over thresholds, u.
        The confidence limits are stored in mrl.children and the number of
        exceedances per threshold in mrl.workspace['numdata'].

    Notes
    -----
    RESLIFE estimate mean excesses over thresholds. The purpose of MRL is
    to determine the threshold where the upper tail of the data can be
    approximated with the generalized Pareto distribution (GPD). The GPD is
    appropriate for the tail, if the MRL is a linear function of the
    threshold, u. Theoretically in the GPD model

        E(X-u0|X>u0) = s0/(1+k)
        E(X-u |X>u)  = s/(1+k) = (s0 -k*u)/(1+k)   for u>u0

    where k,s is the shape and scale parameter, respectively.
    s0 = scale parameter for threshold u0<u.

    The confidence interval is the Student-t interval for a mean with
    unknown variance, so the spread of the excesses is estimated with the
    sample standard deviation (ddof=1). Thresholds with fewer than two
    exceedances have no defined interval and get NaN limits; thresholds
    with no exceedances get a NaN mean as well.

    Example
    -------
    >>> import wafo
    >>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
    >>> mrl = reslife(R,nu=20)
    >>> h = mrl.plot()

    See also
    --------
    genpareto
    fitgenparrange, disprsnidx
    '''
    if u is None:
        sd = np.sort(data)
        n = len(data)
        nmin = max(nmin, 0)
        if 2 * nmin > n:
            warnings.warn('nmin possibly too large!')
        # sd[-0] is sd[0] (the smallest value), which would collapse the
        # threshold range to a single point, so nmin == 0 maps to the maximum.
        sdmax = sd[-nmin] if nmin > 0 else sd[-1]
        sdmin = sd[0]
        umax = sdmax if umax is None else min(umax, sdmax)
        umin = sdmin if umin is None else max(umin, sdmin)
        if nu is None:
            nu = min(n - nmin, 100)
        u = linspace(umin, umax, nu)
    else:
        # Accept any array-like (e.g. a plain list) as promised above.
        u = arr(u)
    nu = len(u)

    dat = arr(data)
    mrl = np.empty(nu)   # mean excess per threshold
    srl = np.empty(nu)   # sample standard deviation of the excesses
    num = np.zeros(nu)   # number of exceedances per threshold
    mrl.fill(np.nan)
    srl.fill(np.nan)
    for ix, tresh in enumerate(u.tolist()):
        excess = dat[dat > tresh] - tresh
        num[ix] = excess.size
        if excess.size > 0:
            mrl[ix] = excess.mean()
        if excess.size > 1:
            # ddof=1 matches the num - 1 degrees of freedom used for Za below.
            srl[ix] = excess.std(ddof=1)

    p = 1 - alpha
    alpha2 = alpha / 2

    # Approximate P% confidence interval
    # (a normal quantile would be used here if the mean were known)
    Za = -_invt(alpha2, num - 1)  # unknown mean
    half_width = Za * srl / sqrt(num)
    mrlu = mrl + half_width
    mrll = mrl - half_width

    titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
    res = PlotData(mrl, u, xlab='Threshold',
                   ylab='Mean Excess', title=titleTxt)
    res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu,
                         nmin=nmin, alpha=alpha)
    res.children = [
        PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
    res.plot_args_children = [':r']
    if plotflag:
        res.plot()
    return res
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05,
            plotflag=False):
    '''
    Return Mean Residual Life, i.e., mean excesses vs thresholds

    Parameters
    ----------
    data : array_like
        vector of data of length N.
    u : array-like
        threshold values (default linspace(umin, umax, nu))
    umin, umax : real scalars
        Minimum and maximum threshold, respectively
        (default min(data), max(data)).
    nu : scalar integer
        number of threshold values (default min(N-nmin,100))
    nmin : scalar integer
        Minimum number of extremes to include. (Default 3).
    alpha : real scalar
        Confidence coefficient (default 0.05)
    plotflag: bool
        If True the result is plotted before it is returned.

    Returns
    -------
    mrl : PlotData object
        Mean residual life values, i.e., mean excesses over thresholds, u.
        The confidence limits are stored in mrl.children and the number of
        exceedances per threshold in mrl.workspace['numdata'].

    Notes
    -----
    RESLIFE estimate mean excesses over thresholds. The purpose of MRL is
    to determine the threshold where the upper tail of the data can be
    approximated with the generalized Pareto distribution (GPD). The GPD is
    appropriate for the tail, if the MRL is a linear function of the
    threshold, u. Theoretically in the GPD model

        E(X-u0|X>u0) = s0/(1+k)
        E(X-u |X>u)  = s/(1+k) = (s0 -k*u)/(1+k)   for u>u0

    where k,s is the shape and scale parameter, respectively.
    s0 = scale parameter for threshold u0<u.

    The confidence interval is the Student-t interval for a mean with
    unknown variance, so the spread of the excesses is estimated with the
    sample standard deviation (ddof=1). Thresholds with fewer than two
    exceedances have no defined interval and get NaN limits; thresholds
    with no exceedances get a NaN mean as well.

    Example
    -------
    >>> import wafo
    >>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
    >>> mrl = reslife(R,nu=20)
    >>> h = mrl.plot()

    See also
    --------
    genpareto
    fitgenparrange, disprsnidx
    '''
    if u is None:
        sd = np.sort(data)
        n = len(data)
        nmin = max(nmin, 0)
        if 2 * nmin > n:
            warnings.warn('nmin possibly too large!')
        # sd[-0] is sd[0] (the smallest value), which would collapse the
        # threshold range to a single point, so nmin == 0 maps to the maximum.
        sdmax = sd[-nmin] if nmin > 0 else sd[-1]
        sdmin = sd[0]
        umax = sdmax if umax is None else min(umax, sdmax)
        umin = sdmin if umin is None else max(umin, sdmin)
        if nu is None:
            nu = min(n - nmin, 100)
        u = linspace(umin, umax, nu)
    else:
        # Accept any array-like (e.g. a plain list) as promised above.
        u = arr(u)
    nu = len(u)

    dat = arr(data)
    mrl = np.empty(nu)   # mean excess per threshold
    srl = np.empty(nu)   # sample standard deviation of the excesses
    num = np.zeros(nu)   # number of exceedances per threshold
    mrl.fill(np.nan)
    srl.fill(np.nan)
    for ix, tresh in enumerate(u.tolist()):
        excess = dat[dat > tresh] - tresh
        num[ix] = excess.size
        if excess.size > 0:
            mrl[ix] = excess.mean()
        if excess.size > 1:
            # ddof=1 matches the num - 1 degrees of freedom used for Za below.
            srl[ix] = excess.std(ddof=1)

    p = 1 - alpha
    alpha2 = alpha / 2

    # Approximate P% confidence interval
    # (a normal quantile would be used here if the mean were known)
    Za = -_invt(alpha2, num - 1)  # unknown mean
    half_width = Za * srl / sqrt(num)
    mrlu = mrl + half_width
    mrll = mrl - half_width

    titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
    res = PlotData(mrl, u, xlab='Threshold',
                   ylab='Mean Excess', title=titleTxt)
    res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu,
                         nmin=nmin, alpha=alpha)
    res.children = [
        PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
    res.plot_args_children = [':r']
    if plotflag:
        res.plot()
    return res