from scipy.stats import norm, genpareto


def HybridNormalGPDCDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric CDF to.
        u: threshold at which to switch from the Gaussian CDF fit in the
           center to GPD tail fitting.
        mu: mean of the data.
        sigma: standard deviation of the data.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        an array that would result from xs.apply(semiparametric_fittedfunction),
        i.e. F_n(xs) where F_n is the fitted CDF.
    '''
    out = list()
    l = mu - abs(u - mu)  # lower tail threshold, mirrored about the mean
    h = mu + abs(u - mu)  # upper tail threshold
    for x in xs:
        if x < l:
            # Lower tail: reflected GPD survival weighted by the normal mass below l.
            nrm = norm.cdf(l, mu, sigma)
            out.append(nrm * (1 - genpareto.cdf(l - x, shape, loc=loc, scale=scale)))
        elif x >= h:
            # Upper tail: normal mass below h plus the GPD mass above it.
            nrm = norm.cdf(h, mu, sigma)
            out.append((1 - nrm) * genpareto.cdf(x - h, shape, loc=loc, scale=scale) + nrm)
        else:
            # Body: plain Gaussian CDF.
            out.append(norm.cdf(x, mu, sigma))
    return out
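# A minimal usage sketch (all values below are synthetic and illustrative):
# take the 95th percentile as the threshold u and, standing in for the
# least-squares estimates, fit the GPD tail to the excesses by maximum likelihood.
import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(0)
data = rng.normal(loc=0.0, scale=1.0, size=2000)
u = np.quantile(data, 0.95)
excesses = data[data > u] - u
shape, loc, scale = genpareto.fit(excesses, floc=0)
fitted = HybridNormalGPDCDF(data, u, data.mean(), data.std(), shape, loc, scale)
print(min(fitted), max(fitted))  # fitted CDF values should lie in [0, 1]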
from scipy.stats import genpareto
from rpy2.robjects import FloatVector
from rpy2.robjects.packages import importr

eva = importr('eva')  # R package 'eva' for GPD fitting and goodness-of-fit tests


def get_pvalue(sorted_scores, stat, n):
    # Approximate the GPD tail, starting from the 250 largest permutation
    # scores (sorted_scores is assumed to be sorted in descending order).
    n_exceed = 250
    is_gpd_fitted = False
    while n_exceed >= 10:
        exceedances = sorted_scores[:n_exceed]
        # Check whether the n_exceed largest permutation values follow a GPD
        # using the Anderson-Darling goodness-of-fit test.
        try:
            ad = eva.gpdAd(FloatVector(exceedances))
            ad_pval = ad.rx2('p.value')[0]
        except Exception:
            n_exceed -= 10
            continue
        # H0: the exceedances come from a GPD.
        if ad_pval > 0.05:
            is_gpd_fitted = True
            break
        n_exceed -= 10
    if not is_gpd_fitted:
        # A good GPD fit is never reached - signal the caller to use the ECDF instead.
        return None
    # Compute the exceedance threshold t.
    t = float((sorted_scores[n_exceed] + sorted_scores[n_exceed - 1]) / 2)
    # Estimate the shape and scale parameters by maximum likelihood.
    gpd_fit = eva.gpdFit(FloatVector(sorted_scores), threshold=t, method='mle')
    scale, shape = gpd_fit.rx2('par.ests')[0], gpd_fit.rx2('par.ests')[1]
    # Compute the GPD p-value for the observed statistic.
    f_gpd = genpareto.cdf(x=stat - t, c=shape, scale=scale)
    return n_exceed / n * (1 - f_gpd)
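# For reference, a pure-SciPy sketch of the same tail-extrapolation idea with
# no R/'eva' dependency (synthetic scores, illustrative statistic; SciPy's MLE
# fit replaces the Anderson-Darling model check performed above):
import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(1)
perm_scores = np.sort(rng.normal(size=10000))[::-1]  # descending order
stat = 4.2  # observed test statistic (illustrative)
n_exceed = 250
t = (perm_scores[n_exceed] + perm_scores[n_exceed - 1]) / 2
shape, _, scale = genpareto.fit(perm_scores[:n_exceed] - t, floc=0)
p_value = n_exceed / perm_scores.size * genpareto.sf(stat - t, shape, scale=scale)
print(p_value)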
import numpy as np
from scipy.stats import genpareto


def gpd_ad(x, tp):
    # Anderson-Darling statistic for a GPD fitted to the excesses of x over
    # the threshold implied by tail probability tp (get_excesses and gpd_fit
    # are helpers defined elsewhere).
    u, y = get_excesses(x, tp)
    xi, sigma = gpd_fit(y)
    # Probability-integral transform: under H0 the z values are Uniform(0, 1).
    z = genpareto.cdf(y, xi, 0, sigma)
    z = np.sort(z)
    n = len(z)
    i = np.linspace(1, n, n)
    stat = -n - (1 / n) * np.sum(
        (2 * i - 1) * (np.log(z) + np.log1p(-z[::-1])))
    return u, stat, xi, sigma
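# Sanity-check sketch: with data drawn from an actual GPD and parameters
# fitted by SciPy's MLE (standing in for the external gpd_fit helper), the
# statistic computed by the same formula should be small.
import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(2)
y = genpareto.rvs(0.2, scale=1.0, size=500, random_state=rng)
xi, _, sigma = genpareto.fit(y, floc=0)
z = np.sort(genpareto.cdf(y, xi, 0, sigma))
n = len(z)
i = np.arange(1, n + 1)
print(-n - np.sum((2 * i - 1) * (np.log(z) + np.log1p(-z[::-1]))) / n)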
from scipy.stats import genpareto as gp, expon as expdist  # aliases assumed by this method


def _margin_tail_cdf(self, x, i):
    # CDF of the GP approximation for component i
    # (no need to weight it by p; that's done elsewhere).
    if self.shapes[i] != 0:
        return gp.cdf(x, c=self.shapes[i], loc=self.u[i], scale=self.scales[i])
    else:
        # A zero shape parameter reduces the GPD to the exponential distribution.
        return expdist.cdf(x, loc=self.u[i], scale=self.scales[i])
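# Why the zero-shape branch exists: as the shape parameter tends to 0, the GPD
# converges to the exponential distribution, so the two branches agree at the
# boundary (threshold and scale values below are illustrative):
import numpy as np
from scipy.stats import genpareto, expon

x = np.linspace(0.0, 5.0, 6)
print(genpareto.cdf(x, 1e-9, loc=0.0, scale=2.0))
print(expon.cdf(x, loc=0.0, scale=2.0))  # numerically identical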
from scipy.stats import genpareto, genextreme


def EstimaProbabilidade(self, Magnitude, Parametros):
    # Partial-duration series use the GPD; annual-maximum series use the GEV.
    if self.tipoSerie == 'Parcial':
        probabilidade = genpareto.cdf(Magnitude, Parametros[0],
                                      loc=Parametros[1], scale=Parametros[2])
    elif self.tipoSerie == 'Anual':
        probabilidade = genextreme.cdf(Magnitude, Parametros[0],
                                       loc=Parametros[1], scale=Parametros[2])
    return probabilidade
import math as mt
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import genpareto


def survival_function(sample, threshold, fit_method, alpha):
    # Plot the survival function (1 - CDF) of the GPD fit with empirical points.
    [shape, scale, sample, sample_excess, sample_over_thresh] = gpdfit(
        sample, threshold, fit_method)
    n = len(sample_over_thresh)
    y_surv = 1 - np.arange(1, n + 1) / n

    # Index of the first observation above the threshold.
    i_initial = 0
    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break

    # Confidence bands from the Dvoretzky–Kiefer–Wolfowitz inequality.
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        e = ((mt.log(2 / alpha)) / (2 * len(sample_over_thresh)))**0.5
        F1.append(y_surv[i - i_initial] - e)
        F2.append(y_surv[i - i_initial] + e)

    x_points = np.arange(0, max(sample), 0.001)
    surv_func = 1 - genpareto.cdf(x_points, shape, loc=threshold, scale=scale)

    # Plot the theoretical and empirical survival functions.
    plt.figure(9)
    plt.plot(x_points, surv_func, color='black',
             label='Theoretical Survival Function')
    plt.xlabel('Data')
    plt.ylabel('Survival Function')
    plt.title('Data Survival Function Plot')
    plt.scatter(sorted(sample_over_thresh), y_surv,
                label='Empirical Survival Function')
    plt.plot(sorted(sample_over_thresh), F1, linestyle='--', color='red',
             alpha=0.8, lw=0.9,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(sorted(sample_over_thresh), F2, linestyle='--', color='red',
             alpha=0.8, lw=0.9)
    plt.legend()
    plt.show()
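# The Dvoretzky–Kiefer–Wolfowitz half-width used for the bands above, as a
# standalone helper (a sketch; n is the number of exceedances):
import math

def dkw_epsilon(n, alpha=0.05):
    # P(sup |F_n - F| > eps) <= alpha  for  eps = sqrt(ln(2/alpha) / (2n))
    return math.sqrt(math.log(2 / alpha) / (2 * n))

print(dkw_epsilon(200))  # ~0.096: a 95% band with 200 exceedances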
from scipy.stats import genpareto


def GeneralizedPareto_CDF(x):
    '''
    Generalized Pareto fit.
    Returns the cumulative probability function evaluated at x.
    '''
    # Fit a generalized Pareto distribution and extract its parameters
    # (the fitted location is discarded; the CDF is evaluated with loc=0).
    shape, _, scale = genpareto.fit(x)
    # Evaluate the generalized Pareto CDF at x.
    cdf = genpareto.cdf(x, shape, scale=scale)
    return cdf
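# A minimal usage sketch on synthetic exceedances (parameter values illustrative):
import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(3)
x = genpareto.rvs(0.3, scale=2.0, size=1000, random_state=rng)
cdf = GeneralizedPareto_CDF(x)
print(cdf.min(), cdf.max())  # values in (0, 1), approaching 1 in the tail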
from statistics import mean
from scipy.stats import genpareto


def HybridSemiParametricGPDCDF(xs, u, ydata, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric CDF to.
        u: threshold at which to switch from Gaussian kernel estimation in the
           center to GPD tail fitting.
        ydata: data used to build the kernel CDF smoother and whose mean
               centers the tail thresholds.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        xs, the fitted CDF values F_n(xs), the sorted evaluation grid, the
        kernel CDF smoother values, and the bandwidth used.
    '''
    out = list()
    mu = mean(ydata)
    l = mu - abs(u - mu)  # lower tail threshold, mirrored about the mean
    h = mu + abs(u - mu)  # upper tail threshold
    # Evaluate the kernel CDF smoother on the data plus both thresholds.
    srtdxs = sorted(list(xs) + [l, h])
    bandwidth = 0.2
    cdf_smoother, bandwidth = kde_statsmodels_m_cdf_output(
        ydata, srtdxs, bandwidth=bandwidth)
    d = dict(zip(srtdxs, cdf_smoother))
    for x in xs:
        if x < l:
            # Lower tail: reflected GPD survival weighted by the kernel mass below l.
            nrm = d[l]
            out.append(nrm * (1 - genpareto.cdf(l - x, shape, loc=loc, scale=scale)))
        elif x >= h:
            # Upper tail: kernel mass below h plus the GPD mass above it.
            nrm = d[h]
            out.append((1 - nrm) * genpareto.cdf(x - h, shape, loc=loc, scale=scale) + nrm)
        else:
            # Body: kernel-smoothed empirical CDF.
            out.append(d[x])
    return xs, out, srtdxs, cdf_smoother, bandwidth
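# The kernel CDF smoother used above is not shown; a stand-in sketch with
# statsmodels (the helper name, signature, and bandwidth handling are
# assumptions made for illustration):
import numpy as np
import statsmodels.api as sm

def kde_statsmodels_m_cdf_output(ydata, grid, bandwidth=0.2):
    # Fit a Gaussian KDE and interpolate its CDF onto the requested grid.
    kde = sm.nonparametric.KDEUnivariate(np.asarray(ydata, dtype=float))
    kde.fit(bw=bandwidth, gridsize=1024)
    return np.interp(grid, kde.support, kde.cdf), bandwidth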
import math as mt
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import genpareto


def gpdcdf(sample, threshold, fit_method, alpha):
    # Plot the GPD CDF with the empirical points.
    [shape, scale, sample, sample_excess, sample_over_thresh] = gpdfit(
        sample, threshold, fit_method)  # fit the data
    n = len(sample_over_thresh)
    y = np.arange(1, n + 1) / n  # empirical probabilities

    # Index of the first observation above the threshold.
    i_initial = 0
    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break

    # Confidence interval via the Dvoretzky–Kiefer–Wolfowitz method,
    # based on the empirical points.
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        e = ((mt.log(2 / alpha)) / (2 * len(sample_over_thresh)))**0.5
        F1.append(y[i - i_initial] - e)
        F2.append(y[i - i_initial] + e)

    x_points = np.arange(0, max(sample), 0.001)  # points at which to evaluate the CDF
    cdf = genpareto.cdf(x_points, shape, loc=threshold, scale=scale)  # theoretical CDF

    # Plot the CDF.
    plt.figure(7)
    plt.plot(x_points, cdf, color='black', label='Theoretical CDF')
    plt.xlabel('Data')
    plt.ylabel('CDF')
    plt.title('Data Cumulative Distribution Function')
    plt.scatter(sorted(sample_over_thresh), y, label='Empirical CDF')
    plt.plot(sorted(sample_over_thresh), F1, linestyle='--', color='red',
             alpha=0.8, lw=0.9,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(sorted(sample_over_thresh), F2, linestyle='--', color='red',
             alpha=0.8, lw=0.9)
    plt.legend()
    plt.show()
import math as mt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import genpareto


def ppplot(sample, threshold, fit_method, alpha):
    # Probability-probability plot for model diagnostics.
    [shape, scale, sample, sample_excess, sample_over_thresh] = gpdfit(
        sample, threshold, fit_method)  # fit the data
    n = len(sample_over_thresh)

    # Empirical probabilities.
    y = np.arange(1, n + 1) / n
    # Theoretical probabilities.
    cdf_pp = genpareto.cdf(sample_over_thresh, shape, loc=threshold, scale=scale)

    # Confidence intervals via the Dvoretzky–Kiefer–Wolfowitz method.
    i_initial = 0
    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        e = ((mt.log(2 / alpha)) / (2 * len(sample_over_thresh)))**0.5
        F1.append(y[i - i_initial] - e)
        F2.append(y[i - i_initial] + e)

    # Plot the P-P diagram.
    plt.figure(6)
    sns.regplot(y, cdf_pp, ci=None,
                line_kws={'color': 'black', 'label': 'Regression Line'})
    plt.plot(y, F1, linestyle='--', color='red', alpha=0.5, lw=0.8,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(y, F2, linestyle='--', color='red', alpha=0.5, lw=0.8)
    plt.legend()
    plt.title('P-P Plot')
    plt.xlabel('Empirical Probability')
    plt.ylabel('Theoretical Probability')
    plt.show()
from scipy.stats import genpareto, genextreme


def EstimaFrequencias(self, Parametros):
    if self.tipoSerie == 'Parcial':
        # Partial-duration series: extract the peaks over threshold and apply the GPD.
        limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
        Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
        datasP, PicosParciais = se.Series(Parciais).separaDados()
        PicosParciais.sort(reverse=True)
        print(PicosParciais)
        frequencias = genpareto.cdf(PicosParciais, Parametros[0],
                                    loc=Parametros[1], scale=Parametros[2])
    elif self.tipoSerie == 'Anual':
        # Annual-maximum series: apply the GEV.
        Anuais = se.Series(self.dadoSerie).serieMaxAnual()
        datasA, PicosAnuais = se.Series(Anuais).separaDados()
        PicosAnuais.sort(reverse=True)
        print(PicosAnuais)
        frequencias = genextreme.cdf(PicosAnuais, Parametros[0],
                                     loc=Parametros[1], scale=Parametros[2])
    return frequencias
import numpy as np
from scipy.stats import genpareto, genextreme


def extremal_distribution_fit(data, var_name, sample, threshold, fit_type,
                              x_min, x_max, n_points, loc=None, scale=None,
                              cumulative=True):
    # Initialization of the output variables.
    param = None
    x = None
    y = None
    y_rp = None

    if fit_type == 'gpd':
        # Fit the exceedances over the threshold to a Generalized Pareto distribution.
        param = generalized_pareto_distribution_fit(sample, threshold, loc, scale)

        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genpareto.cdf(x, param[0], param[1], param[2])
            # Calculate the number of extreme peaks per year.
            n_peaks_year = len(sample) / len(data[var_name].index.year.unique())
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genpareto.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'coles':
        # Fit the exceedances over the threshold to a Generalized Pareto distribution.
        param = generalized_pareto_distribution_fit(sample, threshold, loc, scale)

        x = np.arange(1, 501)
        u = param[1]
        sigma = param[2]
        xi = param[0]

        # Mean number of data points in a year.
        n_y = len(data[var_name]) / len(data[var_name].index.year.unique())
        # Total number of POT / total number of data points.
        z_u = len(sample) / len(data[var_name])
        # n_y * z_u is the number of POT per year.
        y_rp = u + (sigma / xi) * (((x * n_y * z_u)**xi) - 1)

    elif fit_type == 'gev':
        param = generalized_extreme_value_distribution_fit(sample, loc, scale)

        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genextreme.cdf(x, param[0], param[1], param[2])
            # Annual maxima: one extreme peak per year.
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'poisson':
        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)

        # Fit the exceedances over the threshold to a Generalized Pareto distribution.
        gpd_param = generalized_pareto_distribution_fit(sample, threshold, loc, scale)

        # Poisson parameter (number of extreme events per year).
        poisspareto_param = len(sample) / len(data[var_name].index.year.unique())
        # Poisson-Pareto parameters: [lambda, shape, scale, threshold].
        poisspareto_param = [
            poisspareto_param, gpd_param[0], gpd_param[2], gpd_param[1]
        ]
        # Equivalent GEV parameters (scipy's genextreme uses c = -shape).
        param = [0, 0, 0]
        param[0] = -poisspareto_param[1]
        param[1] = poisspareto_param[2] * (poisspareto_param[0]**poisspareto_param[1])
        param[2] = poisspareto_param[3] + (
            (poisspareto_param[2] / poisspareto_param[1])
            * ((poisspareto_param[0]**poisspareto_param[1]) - 1))
        if cumulative:
            y = genextreme.cdf(x, param[0], param[2], param[1])
            # Annual maxima: one extreme peak per year.
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[2], param[1])

    return param, x, y, y_rp
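# A quick Monte Carlo check of the Poisson-GPD to GEV equivalence used in the
# 'poisson' branch (all parameter values below are hypothetical): the annual
# maximum of a Poisson-distributed number of GPD exceedances over u follows a
# GEV with exactly the parameters computed above.
import numpy as np
from scipy.stats import poisson, genpareto, genextreme

rng = np.random.default_rng(4)
lam, xi, sigma, u = 3.0, 0.2, 1.0, 10.0
n_years = 5000
maxima = np.full(n_years, u)  # years with no exceedances stay at the threshold
for year in range(n_years):
    k = poisson.rvs(lam, random_state=rng)
    if k > 0:
        maxima[year] = u + genpareto.rvs(xi, scale=sigma, size=k,
                                         random_state=rng).max()

loc_gev = u + (sigma / xi) * (lam**xi - 1)
scale_gev = sigma * lam**xi
x0 = 15.0
print(np.mean(maxima <= x0))                        # empirical
print(genextreme.cdf(x0, -xi, loc_gev, scale_gev))  # theoretical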
# Excerpt from an experiment loop (the surrounding definitions - j, i, dw,
# gev_window, hpp, w, etc. - are not shown): fit a GPD to the peaks-over-
# threshold values and score each sample by its tail probability.
fit = genpareto.fit(poted_values, floc=poted_values[-1])
# Re-fit with location and scale fixed, so only the shape is re-estimated.
fit = genpareto.fit(poted_values, floc=fit[1], fscale=fit[2])
if j == 0:
    mu_check.append(poted_values[-1])
gamma = fit[0]
mu = fit[1]
sigma = fit[2]
if dw[i, j] >= fit[1]:
    # Survival probability under the fitted GPD; the 1e-50 floor avoids log(0).
    hpp[i - gev_window, j] = 1 - genpareto.cdf(
        dw[i, j], fit[0], fit[1], fit[2]) + 1e-50

totalhpp1 = -np.log10(np.prod(hpp, axis=1))
min_index = np.argmax(totalhpp1)
le = pa.detection.learning_entropy(w, m=1200, order=1)
snr[seed_counter] = 10 * np.log10(
    (np.std(desired_output[gev_window:])**2) / (noise_sigma**2))
# Count the detection as correct if it falls in the expected index window.
if min_index > 199 and min_index < 211:
    gpd_result[seed_counter] = 1
max_index_elbnd = np.argmax(elbnd[-400:])
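# What the two-stage fit above does, in isolation (synthetic data): the first
# call fixes the location at the sample minimum; the second call re-fits only
# the shape, with location and scale both held at the first fit's values.
import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(5)
poted_values = np.sort(genpareto.rvs(0.1, loc=2.0, scale=1.0, size=300,
                                     random_state=rng))[::-1]  # descending
fit = genpareto.fit(poted_values, floc=poted_values[-1])
fit = genpareto.fit(poted_values, floc=fit[1], fscale=fit[2])
print(fit)  # (shape, loc, scale)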
import numpy as np
from numpy import log
from scipy.stats import genpareto


def genpareto_gradient_cdf(x, c, scale):
    """Gradient of the generalized Pareto distribution function with respect
    to the shape and scale parameters.

    :param x: array_like, quantiles
    :param c: positive number, shape parameter
    :param scale: positive number, scale parameter (default=1)
    :return: (2 x n)-matrix, where n is the size of x.
        The first row is the gradient of the CDF w.r.t. the shape parameter
        evaluated at x; the second row is the gradient of the CDF w.r.t. the
        scale parameter evaluated at x.
    """
    output = np.zeros(shape=(2, x.size))
    # The gradient is non-zero only where the GPD has support.
    cond = 0 < (1 + c * x / scale)
    output[0] = np.where(
        cond,
        (-1 / c**2 * log(1 + c * x / scale)
         + x / (c * (scale + c * x))) * (1 - genpareto.cdf(x, c, scale=scale)),
        0)
    output[1] = -x / scale * genpareto.pdf(x, c, scale=scale)
    return output
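# Finite-difference check of the analytic gradient (illustrative parameter
# values); both rows should match the numerical derivatives closely.
import numpy as np
from scipy.stats import genpareto

x = np.linspace(0.5, 5.0, 5)
c, scale, eps = 0.2, 1.5, 1e-6
grad = genpareto_gradient_cdf(x, c, scale)
num_c = (genpareto.cdf(x, c + eps, scale=scale)
         - genpareto.cdf(x, c - eps, scale=scale)) / (2 * eps)
num_s = (genpareto.cdf(x, c, scale=scale + eps)
         - genpareto.cdf(x, c, scale=scale - eps)) / (2 * eps)
print(np.allclose(grad[0], num_c, atol=1e-5),
      np.allclose(grad[1], num_s, atol=1e-5))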
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# First four moments of the distribution.
c = 0.1
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')

# Display the probability density function.
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')

# Freeze the distribution to fix the shape parameter.
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check the accuracy of cdf and ppf.
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))

# Generate random numbers and compare the histogram.
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
# Fragment: the closing arguments of a Tkinter dialog built earlier in the
# script, followed by the probability plot.
           padx=4)
quantileroot.mainloop()

# Probability plot: empirical percentiles against model probabilities.
line2 = [.01 * x for x in range(100)]
plt.title('Probability Plot')
plt.ylabel('Model')
plt.xlabel('Empirical')
plt.scatter(listofpercents, [
    genparetodist(x, Threshold, Maxsigma, Maxxis)
    for x in xvalues['x' + str(threshposition)]
], s=.5)
plt.scatter(listofpercents, [
    genpareto.cdf(y, Maxxis, loc=0, scale=Maxsigma)
    for y in yvalues['y' + str(threshposition)]
], s=.5, label='Simulation Data')
plt.axis([0, 1, 0, 1])
plt.plot(line2, line2, 'b', label='Best Model Fit')
plt.legend(bbox_to_anchor=(0., 1.1, 1., .102), loc=3, ncol=1,
           mode="expand", borderaxespad=0.)
plt.show()
print('The dotted line (real data) should reasonably agree '
      'with the model for a linear fit.')
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, gamma, genpareto

gpd = genpareto  # alias used for the tail fit below

# Excerpt: compare tail approximations for a compound Poisson sum - CLT
# (normal), shifted gamma, and GPD peaks-over-threshold - on a log-log plot
# (compound_poisson_distribution and emp_cdf_tail are helpers defined elsewhere).
gamma_k = lamda * mean_X - gamma_alpha / gamma_beta
sort_comp_poisson_rnd = compound_poisson_distribution(lamda, num_values, mu, sigma)
sort_comp_poisson_rnd.sort()

alpha_low = .99
alpha_high = .99999
low_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_low)]
high_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_high)]
mu_gp = low_val

# Fit the GPD to the excesses over the 99th-percentile threshold.
sample_data = compound_poisson_distribution(lamda, num_values, mu, sigma)
data_gp = sample_data[sample_data > mu_gp] - mu_gp
gpd_value = gpd.fit(data_gp)

cdf_values = np.arange(low_val, high_val + (high_val - low_val) / 1000,
                       (high_val - low_val) / 1000)
norm_cdf_tail = 1 - norm.cdf(cdf_values, mean_SN, var_SN**(1 / 2))
gamma_cdf_tail = 1 - gamma.cdf(
    cdf_values - gamma_k, gamma_alpha, scale=1 / gamma_beta)
# 99% of the mass lies below mu_gp; the GPD models the remaining 1%.
GP_cdf_tail = 1 - (genpareto.cdf(
    cdf_values - low_val, gpd_value[0], scale=gpd_value[2]) * 0.01 + 0.99)
emp_cdf_tails = emp_cdf_tail(sample_data, cdf_values)

plt.loglog(cdf_values, norm_cdf_tail, label='CLT')
plt.loglog(cdf_values, gamma_cdf_tail, label='GAMMA')
plt.loglog(cdf_values, GP_cdf_tail, label='GP')
plt.loglog(cdf_values, emp_cdf_tails, label='EMP')
plt.title('Log-log plot of 1 - F_SN(x) vs x')
plt.legend()
plt.show()