def Plot_Dist_Train_Extreme(r_err, GT_val, bin1=500, bin2=500,
                            interval1=0.95, interval2=0.99):
    """Fit a GEV distribution to ``r_err["Err"]`` and plot it with two histograms.

    The GEV is fitted with its location fixed at the sample median, and the
    equal-tailed intervals for ``interval1`` and ``interval2`` are marked on
    the plot with short dotted red lines.

    Args:
        r_err: Mapping-like object whose ``"Err"`` entry is the sample to fit.
        GT_val: Mapping-like object whose ``"Err"`` entry is drawn as a second
            (red) histogram and whose median is printed.
        bin1: Bins for the ``r_err`` histogram.
        bin2: Bins for the ``GT_val`` histogram.
        interval1: First interval level (fraction of probability mass).
        interval2: Second interval level.

    Returns:
        Tuple ``(c, loc, scale, fig, ax)``: the fitted GEV parameters and the
        matplotlib figure and axes.
    """
    covMat = np.array(r_err["Err"], dtype=float)
    median = np.median(covMat)
    c, loc, scale = genextreme.fit(covMat, floc=median)
    min_extreme1, max_extreme1 = genextreme.interval(interval1, c, loc, scale)
    min_extreme2, max_extreme2 = genextreme.interval(interval2, c, loc, scale)

    x = np.linspace(min(covMat), max(covMat), 2000)
    fig, ax = plt.subplots(figsize=(30, 10))
    plt.xlim(0, 0.4)
    # NOTE(review): the curve uses a second, unconstrained fit, whereas the
    # interval bounds and the returned parameters come from the fit with the
    # location fixed at the median -- confirm this is intended.
    plt.plot(x, genextreme.pdf(x, *genextreme.fit(covMat)), linewidth=5)
    plt.hist(covMat, bins=bin1, alpha=0.3, density=True, edgecolor='black',
             facecolor='gray', linewidth=3, histtype='stepfilled')
    plt.hist(np.asarray(GT_val["Err"]), bins=bin2, alpha=0.3, density=True,
             edgecolor='red', facecolor='red', linewidth=3,
             histtype='stepfilled')
    plt.xlabel('Lengths Counts')
    plt.ylabel('Probability')
    plt.title(r'max_extreme1=%.3f,max_extreme2=%.3f'
              % (max_extreme1, max_extreme2))
    ax.tick_params(left=False, bottom=False)

    # ``interval(level)`` is equal-tailed, so its bounds are the
    # (1 - level) / 2 and (1 + level) / 2 quantiles; label them accordingly.
    for level, lower, upper in ((interval1, min_extreme1, max_extreme1),
                                (interval2, min_extreme2, max_extreme2)):
        tail_pct = 100.0 * (1.0 - level) / 2.0
        for bound, pct in ((lower, tail_pct), (upper, 100.0 - tail_pct)):
            ax.axvline(bound, alpha=0.9, ymax=0.20, linestyle=":",
                       linewidth=3, color="red")
            ax.text(bound, 8, "%gth" % pct, size=20, alpha=0.8, color="red")

    print("%g%% CI upper bound:" % (100 * interval1), max_extreme1)
    print("%g%% CI upper bound:" % (100 * interval2), max_extreme2)
    print("Median RE:", np.median(np.array(GT_val["Err"], dtype=float)))
    return c, loc, scale, fig, ax
def getZScoreDistExpFunction(y_data):
    """Fit a GEV distribution to *y_data* and return its PDF as a callable.

    The returned function takes ``x`` and evaluates the fitted density there.
    """
    from scipy.stats import genextreme as ge

    fitted = ge.fit(y_data)
    shape, mean, sigma = fitted[0], fitted[1], fitted[2]
    return lambda x: ge.pdf(x, shape, loc=mean, scale=sigma)
def plotPDF(x, gevfit, bins, xLabel, Title, fname=None):
    '''
    Plot a density histogram of data x with a GEV PDF drawn on top.
    ----------------------------------------------------------
    Input:
        x: Pandas series
        gevfit: Tuple with the three fitted GEV parameters
        bins: Integer indicating number of bins or a numpy array with the
            bin edges
        xLabel: Str label to use for x-axis
        Title: Str chart title
        fname: (Optional) Full path to filename to save the figure in
            *.png format; when omitted the figure is shown instead
    '''
    fig, ax = plt.subplots(1, 1)
    ax.hist(x, bins, density=True, color=[0, 1, 1], edgecolor='k',
            linewidth=.5, facecolor=colors[0])
    # Evaluate the curve on sorted abscissae: drawing a line through the
    # points in their original (unsorted) order zig-zags across the plot.
    x_sorted = sorted(x)
    ax.plot(x_sorted,
            genextreme.pdf(x_sorted, gevfit[0], gevfit[1], gevfit[2]),
            color='k')
    plt.xlabel(xLabel)
    plt.ylabel('Probability density [-]')
    plt.title(Title)
    if fname:
        plt.savefig(fname, dpi=600.)
    else:
        plt.show()
def gevfit(sr):
    """Fit a GEV distribution to *sr*, print a summary and plot PDF and CDF.

    Args:
        sr: Sample to fit. The code reads ``sr.name`` for the plot titles,
            so a pandas Series is presumably expected -- confirm with callers.

    Prints the fitted parameters, the median, mean, standard deviation and
    95% interval of the fitted distribution, then shows a two-panel figure:
    density histogram with the fitted PDF, and cumulative histogram with the
    fitted CDF.
    """
    c, mu, sigma = gev.fit(sr)[:3]
    # Arguments must follow the order of the placeholders (c, mu, sigma).
    print("""
    GEV Fit Parameters:
        shape parameter c: %s
        location parameter mu: %s
        scale parameter sigma: %s
    """ % (c, mu, sigma))
    print("Median", gev.median(c, mu, sigma))
    print("Mean", gev.mean(c, mu, sigma))
    print("Std dev", gev.std(c, mu, sigma))
    print("95% interval: ", gev.interval(0.95, c, mu, sigma))

    # Evaluation grid: the data range padded by 5 units on each side.
    x = np.linspace(np.min(sr) - 5, np.max(sr) + 5, 500)
    gev_pdf = gev.pdf(x, c, mu, sigma)
    gev_cdf = gev.cdf(x, c, mu, sigma)

    plt.figure(figsize=(12, 6))

    ax1 = plt.subplot(1, 2, 1)
    # ``density`` replaces the ``normed`` keyword removed from matplotlib.
    plt.hist(sr, density=True, alpha=0.2, label='Raw Data', bins='auto')
    plt.plot(x, gev_pdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax1.set_title('%s_Probability Density Fraction' % (sr.name))
    ax1.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax1.set_ylabel('Probability')
    ax1.grid()

    ax2 = plt.subplot(1, 2, 2)
    plt.hist(sr, density=True, alpha=0.2, label='Raw Data', cumulative=True,
             bins='auto')
    plt.plot(x, gev_cdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax2.set_title('%s_Cumulative Density Fraction' % (sr.name))
    ax2.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax2.set_ylabel('Density')
    ax2.grid()

    plt.show()
def test_with_scipy(self):
    """Compare ``flib.gev`` against the log of SciPy's GEV density.

    Note the shape argument has opposite sign in the two calls
    (``-.3`` for SciPy, ``.3`` for ``flib``).
    """
    if not SP:
        raise nose.SkipTest("SciPy not installed.")
    x = [1, 2, 3, 4]
    flib_y = [flib.gev(i, .3, 4, 2) for i in x]
    scipy_y = log(genextreme.pdf(x, -.3, 4, 2))
    assert_array_almost_equal(scipy_y, flib_y, 5)
def density(self, x, data, distribution):
    """Return the density at *x* for the named distribution.

    ``'gev'`` reads ``data`` as (shape, loc, scale); ``'LN'`` and ``'TN'``
    pass ``data[0]`` and ``data[1]`` to the matching method on ``self``.
    Any other name yields ``0.0``.
    """
    if distribution == 'gev':
        return gev.pdf(x, data[0], loc=data[1], scale=data[2])
    if distribution == 'LN':
        return self.lognormal_pdf(x, data[0], data[1])
    if distribution == 'TN':
        return self.TN_pdf(x, data[0], data[1])
    return 0.0
def srednie(plik_in):
    """Fit a GEV to the values in *plik_in* and plot it over a histogram.

    The second whitespace-separated column of every line in *plik_in* is
    collected and fitted with ``genextreme``. The histogram itself is drawn
    from the chain-length column of ``Danio_reviewed_out.txt`` (read from the
    current working directory), and the fitted density is overlaid on it.

    Args:
        plik_in: Path of the whitespace-separated input file; every line
            must have at least three columns, the 2nd and 3rd numeric.
    """
    listy = []
    # The file is only read, so open it read-only rather than 'r+'.
    with open(plik_in, 'r') as f:
        for line in f:
            pola = line.split()
            listy.append(float(pola[1]))
            # Third column is parsed only so malformed lines still fail.
            float(pola[2])

    # -------------------------DANIO RERIO REVIEWED----------------------
    data4 = pd.read_csv(
        'Danio_reviewed_out.txt',
        sep='\t',
        names=['Nazwa białka', 'Długość łańcucha', 'Liczba domen'])

    # histogram of the chain length
    dwiekolumny4 = data4[data4.columns[1:3]]
    np.seterr(divide='ignore', invalid='ignore')
    dwiekolumny4.hist(column='Długość łańcucha', bins=100, figsize=(10, 10),
                      color='mediumvioletred', density=True)

    # -1 is passed to ``fit`` as the starting guess for the shape parameter.
    p = genextreme.fit(listy, -1)
    print(p)

    siatka = np.linspace(0, 3500)
    plt.plot(siatka,
             genextreme.pdf(siatka, p[0], p[1], p[2]),
             'b--', lw=3, label='Generalized extreme value distribution ')
    plt.title('Danio rerio reviewed - Histogram długości łańucha',
              color='black')
    plt.xlabel('Długość łańcucha')
    plt.ylabel('Liczebność')
    plt.legend(loc='upper right')
    pylab.xlim([-10, 3500])
    plt.show()
def plot_GEV_fit(series, shape, loc, scale):
    """Plot a histogram of *series* with a GEV density on top.

    Args:
        series: Observed data; must provide ``.plot.hist`` (e.g. a pandas
            Series).
        shape: GEV shape parameter, in SciPy's convention.
        loc: GEV location parameter.
        scale: GEV scale parameter.

    The density is evaluated at 71 points spanning 35 units starting just
    above ``loc``.
    """
    # Start marginally above ``loc``; the original referenced an undefined
    # name ``l`` here.
    start = loc + 0.00001
    xx = np.linspace(start, start + 35, num=71)
    # SciPy's positional order is (x, shape, loc, scale).
    yy = gev.pdf(xx, shape, loc, scale)
    fig, ax = plt.subplots()
    # plot histogram of observed data
    series.plot.hist(ax=ax)
    ax.plot(xx, yy, 'ro')
    plt.show()
def plotgev(dados, ndivh, titulo):
    """Draw a density histogram of *dados* with a fitted GEV PDF over it.

    Args:
        dados: Sample to fit and plot.
        ndivh: Passed to ``plt.hist`` as ``bins``.
        titulo: Title prefix; " | GEV" is appended.
    """
    plt.figure()
    parametros = gev.fit(dados)
    shape, loc, scale = parametros
    plt.hist(dados, bins=ndivh, density=True)

    # Evaluate the fitted density across the x-range chosen by the histogram.
    limites = plt.xlim()
    grade = np.linspace(limites[0], limites[1], num=100)
    densidade = gev.pdf(grade, shape, loc, scale)

    plt.title(titulo + " | GEV")
    plt.xlabel("")
    plt.ylabel("")
    plt.plot(grade, densidade, 'orange')
    plt.draw()
def graDensidade(self, dados, forma, posicao, escala):
    """Plot the GEV density at the points in *dados*.

    Args:
        dados: Sample values; sorted IN PLACE before plotting.
        forma: GEV shape parameter.
        posicao: GEV location parameter.
        escala: GEV scale parameter.
    """
    # In-place sort: the caller's object is reordered as well.
    dados.sort()
    legenda = 'Forma: %s\nPosicao: %s\nEscala: %s' % (forma, posicao, escala)
    densidades = gev.pdf(dados, forma, posicao, escala)
    plt.plot(dados, densidades, '-r', label=legenda)
    plt.ylabel('Densidade')
    plt.xlabel('Vazão(m³/s)')
    plt.legend(numpoints=1, loc="best")
    plt.show()
def plot_probability_density(annual_max, station_id):
    """Plot a fitted GEV density over a normalized histogram of *annual_max*.

    Args:
        annual_max: Sequence of values to fit and plot.
        station_id: String prefixed to the x-axis label.
    """
    # The 0 is passed to ``fit`` as the starting guess for the shape.
    xi, mu, sigma = genextreme.fit(sorted(annual_max), 0)[:3]

    # Plot range: the data range padded by 0.5 on each side.
    min_x = min(annual_max) - 0.5
    max_x = max(annual_max) + 0.5
    x = np.linspace(min_x, max_x, num=100)
    y = genextreme.pdf(x, xi, loc=mu, scale=sigma)

    fig = plt.figure(figsize=(12, 6))
    axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    axes.set_title("Probability Density & Normalized Histogram")
    axes.set_xlabel(station_id + " - Annual Max Wind Speed (m/s)")
    axes.plot(x, y, color='Red')
    # 11 edges give 10 bins that include ``max_x``; ``arange`` stopped one
    # step short and dropped the largest values from the histogram.
    # ``density`` replaces the ``normed`` keyword removed from matplotlib.
    axes.hist(annual_max, bins=np.linspace(min_x, max_x, 11), density=True,
              color='Yellow')
def gev(self, x):
    """Cost function: negative mean log GEV density of the observations.

    For each observation the location and scale come from ``ngr_mean`` and
    ``ngr_var``; the shape is ``x[4]``. Machine epsilon is added to every
    density before the log. The sum is divided by ``self.sample_size`` and
    negated (negative log likelihood, a strictly proper score).
    """
    total = 0.0
    for i, observation in enumerate(self.obs):
        mu = ngr_mean(x, self.hres[i], self.ctrl[i], self.mean[i],
                      self.var[i])
        sigma = ngr_var(x, self.hres[i], self.ctrl[i], self.mean[i],
                        self.var[i])
        shape_para = x[4]
        likelihood = gev.pdf(observation, shape_para, loc=mu, scale=sigma)
        total += np.log(likelihood + sys.float_info.epsilon)
    return -1 * total / self.sample_size
def CalIgnoranceScore(self, data, obs_list, distribution=None,
                      discrete=False):
    """Return the ignorance score (negative log density) of each observation.

    Args:
        data: With ``discrete`` set, one ensemble per observation, scored
            through ``KernelDensity.kde_gaussian``. Otherwise one parameter
            sequence per observation: (shape, loc, scale) for ``'gev'``,
            (mu, sigma) for ``'TN'`` and ``'LN'``.
        obs_list: The observations.
        distribution: ``'gev'``, ``'TN'`` or ``'LN'``; only used when
            ``discrete`` is falsy.
        discrete: Use the kernel-density estimate of the ensemble instead
            of a parametric distribution.

    Returns:
        List with ``-log(pdf + machine epsilon)`` for every observation.

    Raises:
        ValueError: ``discrete`` is falsy and ``distribution`` is not one
            of the supported names.
    """
    sample_size = len(obs_list)
    IgnoranceList = [None] * sample_size
    eps = sys.float_info.epsilon

    if discrete:
        # Dawid-Sebastiani
        for sample_index in range(sample_size):
            pdf = KernelDensity.kde_gaussian(obs_list[sample_index],
                                             data[sample_index])
            IgnoranceList[sample_index] = -1.0 * np.log(pdf + eps)
        return IgnoranceList

    if distribution == 'gev':
        for i in range(sample_size):
            shape, mu, sigma = data[i][0], data[i][1], data[i][2]
            pdf = gev.pdf(obs_list[i], c=shape, loc=mu, scale=sigma)
            IgnoranceList[i] = -1.0 * np.log(pdf + eps)
    elif distribution in ('TN', 'LN'):
        # The original test ``distribution == 'TN' or 'LN'`` was always true,
        # so unsupported names fell through to an unbound ``pdf``.
        for sample_index in range(sample_size):
            mu = data[sample_index][0]
            sigma = data[sample_index][1]
            if distribution == 'TN':
                pdf = TN_PDF(obs_list[sample_index], mu, sigma,
                             a=0.0, b=np.inf)
            else:
                pdf = LN_PDF(obs_list[sample_index], mu, sigma)
            IgnoranceList[sample_index] = -1.0 * np.log(pdf + eps)
    else:
        raise ValueError(
            "unsupported distribution %r; expected 'gev', 'TN' or 'LN'"
            % (distribution,))
    return IgnoranceList
def plot_histograma_e_gev(str_fam_sinal, df_sinais, c, loc, scale,
                          num_inicio, num_final, num_total,
                          nome_coluna='valor'):
    """Plot a 20-bin count histogram of a column with a GEV PDF on a twin axis.

    The figure is saved to ``./histograma_familia_<str_fam_sinal>.png``,
    shown, and closed.

    Args:
        str_fam_sinal: Label used in the title and in the output file name.
        df_sinais: Table providing the column (``df[col].to_numpy()``).
        c: GEV shape parameter.
        loc: GEV location parameter.
        scale: GEV scale parameter.
        num_inicio: Start of the grid on which the PDF is evaluated.
        num_final: End of that grid.
        num_total: Number of grid points.
        nome_coluna: Name of the column to histogram.
    """
    arr_valores_atuais = df_sinais[nome_coluna].to_numpy()
    histogram, bins_edge = np.histogram(arr_valores_atuais, bins=20)
    width = 0.7 * (bins_edge[1] - bins_edge[0])
    center = (bins_edge[:-1] + bins_edge[1:]) / 2

    # histogram
    fig, ax1 = plt.subplots()
    plt.bar(center, histogram, align='center', width=width)
    plt.title('Histograma da Série {}'.format(str_fam_sinal))
    plt.xlabel("bin")
    plt.ylabel("Quantidade")
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # PDF on a second axes sharing the same x-axis; its ticks are hidden
    # because the density scale differs from the counts.
    ax2 = ax1.twinx()
    x = np.linspace(num_inicio, num_final, num_total)
    ax2.get_yaxis().set_ticks([])
    ax2.plot(x, genextreme.pdf(x, c, loc, scale), 'r-', lw=5, alpha=0.6,
             label='genextreme pdf')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig("./histograma_familia_{}.png".format(str_fam_sinal))
    plt.show()
    plt.close()
def plot_probability_density(annual_max, station_id):
    """Plot a fitted GEV density over a normalized histogram of *annual_max*.

    Args:
        annual_max: Sequence of values to fit and plot.
        station_id: String prefixed to the x-axis label.
    """
    # The 0 is passed to ``fit`` as the starting guess for the shape.
    xi, mu, sigma = genextreme.fit(sorted(annual_max), 0)[:3]

    # Plot range: the data range padded by 0.5 on each side.
    min_x = min(annual_max) - 0.5
    max_x = max(annual_max) + 0.5
    x = np.linspace(min_x, max_x, num=100)
    y = genextreme.pdf(x, xi, loc=mu, scale=sigma)

    fig = plt.figure(figsize=(12, 6))
    axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    axes.set_title("Probability Density & Normalized Histogram")
    axes.set_xlabel(station_id + " - Annual Max Wind Speed (m/s)")
    axes.plot(x, y, color='Red')
    # 11 edges give 10 bins that include ``max_x``; ``arange`` stopped one
    # step short and dropped the largest values from the histogram.
    # ``density`` replaces the ``normed`` keyword removed from matplotlib.
    axes.hist(annual_max, bins=np.linspace(min_x, max_x, 11), density=True,
              color='Yellow')
def plotajuste_completo(dados, ndivh, titulo):
    """Draw a density histogram of *dados* with GEV and Gaussian fits.

    Args:
        dados: Sample to fit and plot.
        ndivh: Passed to ``plt.hist`` as ``bins``.
        titulo: Title prefix.
    """
    plt.figure()

    # Histogram first: its x-limits define the evaluation grid.
    plt.hist(dados, bins=ndivh, density=True, label='histogram')
    esquerda, direita = plt.xlim()
    grade = np.linspace(esquerda, direita, num=100)

    # GEV fit
    shape, loc, scale = gev.fit(dados)
    plt.plot(grade, gev.pdf(grade, shape, loc, scale), 'orange', label='GEV')

    # Gaussian fit
    mean, std = nrm.fit(dados)
    plt.plot(grade, nrm.pdf(grade, mean, std), 'green', label='Gaussian')

    plt.title(titulo + " | GEV (orange) - Gaussian (green)")
    plt.draw()
def f_p50_prior(p50):
    """Return the log of the GEV density at ``-p50`` (floored by 1e-20).

    The GEV uses shape 1.08, location -2.86 and scale 2.92.
    """
    densidade = gev.pdf(-p50, 1.08, -2.86, 2.92)
    return np.log(densidade + 1e-20)


p50_init = 2.86
def f_p50_prior(p50):
    """Return the log of the GEV density at ``-p50`` (floored by 1e-20).

    The GEV uses shape 0.76, location -3.64 and scale 2.55.
    """
    densidade = gev.pdf(-p50, 0.76, -3.64, 2.55)
    return np.log(densidade + 1e-20)


p50_init = 3.64
def Plot_FitSim_GevFit(data_fit, data_sim, vn, xds_GEV_Par, kma_fit,
                       n_bins=30,
                       color_1='white', color_2='skyblue',
                       alpha_1=0.7, alpha_2=0.4,
                       label_1='Historical', label_2='Simulation',
                       gs_1=1, gs_2=1, n_clusters=1, vlim=1,
                       show=True):
    '''
    Plots fit vs sim histograms and gev fit by clusters for variable "vn"

    One subplot is drawn per cluster (weather type 1..n_clusters) on a
    gs_1 x gs_2 grid. Historical samples are selected with kma_fit.bmus,
    simulated ones with data_sim.DWT. The GEV curve is drawn from its
    0.001 quantile up to vlim. Returns the figure.
    '''

    # figure and grid
    fig = plt.figure(figsize=(_fsize * gs_2 / 2, _fsize * gs_1 / 2.3))
    gs = gridspec.GridSpec(gs_1, gs_2)

    histogram_style = dict(
        density=True, n_bins=n_bins,
        color_1=color_1, color_2=color_2,
        alpha_1=alpha_1, alpha_2=alpha_2,
        label_1=label_1, label_2=label_2,
    )

    for wt in range(1, n_clusters + 1):

        # rows of this weather type in the historical and simulated data
        rows_hist = np.where(kma_fit.bmus == wt)[0]
        rows_sim = np.where(data_sim.DWT == wt)[0]
        dh = data_fit[vn].values[:][rows_hist]
        ds = data_sim[vn].values[:][rows_sim]

        # TODO: problem if gumbell?
        # GEV parameters of this weather type
        pars_GEV = xds_GEV_Par[vn]
        sha = pars_GEV.sel(parameter='shape').sel(n_cluster=wt).values
        sca = pars_GEV.sel(parameter='scale').sel(n_cluster=wt).values
        loc = pars_GEV.sel(parameter='location').sel(n_cluster=wt).values

        # compare histograms
        ax = fig.add_subplot(gs[wt - 1])
        axplot_compare_histograms(
            ax, dh, ds, ttl='WT: {0}'.format(wt), **histogram_style
        )

        # GEV fit; the stored shape is negated before it is given to scipy
        shape_scipy = -1 * sha
        x = np.linspace(genextreme.ppf(0.001, shape_scipy, loc, sca),
                        vlim, 100)
        ax.plot(x, genextreme.pdf(x, shape_scipy, loc, sca), label='GEV fit')

        # customize axis
        ax.legend(prop={'size': 8})

    # show and return figure
    if show:
        plt.show()
    return fig
def f_p50_prior(p50):
    """Return the log of the GEV density at ``-p50`` (floored by 1e-20).

    The GEV uses shape 0.77, location -1.86 and scale 1.25.
    """
    densidade = gev.pdf(-p50, 0.77, -1.86, 1.25)
    return np.log(densidade + 1e-20)


p50_init = 1.86
def calc_ocean_parameter(FP_MANAGER, fp, datasource, recalc=False): """ http://www.jamesphoughton.com/2013/08/making-gif-animations-with-matplotlib.html """ print "calcOceanStatistics function start" db_ocean = DB_connector("default") # chembl cursor = db_ocean.cursor ds = DataSources.objects.get(name=datasource.name) if recalc: print "delete rnd set items for fp",fp Rnd_set_comparison.objects.all().filter(fp=fp).filter(datasource=ds).delete() print "done" print "delete parameter entries for fp",fp FP_Parameter.objects.all().filter(fp_id=fp).filter(datasource=ds).delete() print "done" if not recalc and Rnd_set_comparison.objects.all().filter(fp=fp).filter(datasource=ds).count()==0: return "no entries for fp %d, try ?recalc=True" % fp repeats = settings.CALC_OCEAN_PARAMETER_REPEATS start = settings.CALC_OCEAN_PARAMETER_START end = settings.CALC_OCEAN_PARAMETER_END steps = settings.CALC_OCEAN_PARAMETER_STEPS thresh_start = settings.CALC_OCEAN_PARAMETER_THRESH_START thresh_end = settings.CALC_OCEAN_PARAMETER_THRESH_END thresh_steps = settings.CALC_OCEAN_PARAMETER_THRESH_STEPS animatedGif = True try: from PIL import Image from images2gif import writeGif except: print >> sys.stderr, "Couldn't import Image from PIL or writeGif from images2gif, so plotting is deactivated now" animatedGif = False plotting = True try: import matplotlib.pyplot as plt except: plotting = True animatedGif = False processes = settings.PARALLEL_PROCESSES if recalc: walker = Pool(processes=processes) thresh_list = np.arange(thresh_start,thresh_end,thresh_steps) molecule_ids = np.asarray(FP_MANAGER[datasource][fp].keys()) ds = DataSources.objects.get(name=datasource.name) for runde in range(repeats): if not recalc: continue print "runde %d" % runde result = {} rand_lists1 = createRandLists(start,end,steps,molecule_ids) rand_lists2 = createRandLists(start,end,steps,molecule_ids) tasks = [([FP_MANAGER[datasource][fp].get(x1) for x1 in rand_lists1[i]],[FP_MANAGER[datasource][fp].get(x2) for x2 in 
rand_lists2[i]]) for i in range(len(rand_lists2))] if processes>1: np.random.shuffle(tasks) result2 = {} for data_entry in walker.imap_unordered(get_tc_list_para,tasks,20): result2[data_entry[0]] = data_entry[1] print "addet %d of %d" % (len(result2),len(tasks)) else: result2 = {} while (len(tasks)>0): task = tasks.pop() score = get_tc_list_para(task) result2[score[0]] = score[1] print "addet %d of %d" % (len(result2),len(tasks)) print "create %d Result-Objects for DB-Table rnd_set_comparison" % (len(thresh_list) * len(result2)) with transaction.atomic(): buffer = [] for threshold in thresh_list: for key,value in result2.iteritems(): raw_score = np.sum(value[value>=threshold]) item = (key**2,fp,threshold,raw_score) buffer.append(item) print "created %d buffered items" % len(buffer) for w,x,y,z in buffer: obj = Rnd_set_comparison(setsize=w,fp=x,threshold=y,rawscore=z,datasource=ds) obj.save() figures = [] data_cache = {} min_mean = None max_mean = None min_stddev = None max_stddev = None for threshold in thresh_list: if db_ocean.db_type=='postgre': query = "select setsize,threshold, round(stddev_pop(rawscore)::numeric,2) as stddev_pop,round(avg(rawscore)::numeric,2) as mean from ocean_rnd_set_comparison where fp=%d and threshold=%f and datasource_id=%d group by setsize,threshold order by setsize" % (fp,threshold,ds.id) else: query = "select setsize,threshold,round(stddev(rawscore),2) as stddev,round(avg(rawscore),2) as mean from ocean_rnd_set_comparison where fp=%d and threshold=%f and datasource_id=%d group by setsize,threshold order by setsize" % ( fp, threshold, ds.id) cursor.execute(query) x_data = [] stddev_data = [] mean_data = [] for result in cursor.fetchall(): x_data.append(float(result[0])) mean_data.append(float(result[3])) stddev_data.append(float(result[2])) if min_mean is None: if len(mean_data) > 0: min_mean,max_mean = min(mean_data),max(mean_data) if len(stddev_data) > 0: min_stddev,max_stddev = min(stddev_data),max(stddev_data) else: if 
len(mean_data) > 0: min_mean, max_mean = min([min_mean,min(mean_data)]), max([max_mean,max(mean_data)]) if len(stddev_data) > 0: min_stddev, max_stddev = min([min_stddev, min(stddev_data)]), max([max_stddev, max(stddev_data)]) data_cache[threshold] = (x_data,mean_data,stddev_data) skip_3_to_6 = True for threshold in thresh_list: x_data,mean_data,stddev_data = data_cache[threshold] if len(x_data) == 0 or len(mean_data)==0 or len(stddev_data)==0: continue if plotting: plt.clf() if plotting: if skip_3_to_6: fig,(r0,r1,r2,r6) = plt.subplots(nrows=4,figsize=(12,14)) else: fig,(r0,r1,r2,r3,r4,r5,r6) = plt.subplots(nrows=7,figsize=(6,14)) raw_mean_func = Calculator.getRawScoreExpFunction(x_data,mean_data) print "\nmean function for threshold: %f is [%s]" % (threshold,raw_mean_func.func_name) exp_mean_data = [raw_mean_func(en) for en in x_data] if plotting: r0.plot(np.array(x_data),np.array(mean_data),linewidth=1.0) r0.plot(x_data,exp_mean_data,alpha=0.5,linewidth=2.5) r0.set_title("Mean, Threshold: %.2f" % threshold) r0.set_ylim((min_mean,max_mean)) r1.set_ylim((min_stddev,max_stddev)) r2.set_xlim((-1,1.5)) r2.set_ylim((0,2.5)) new_std_function = Calculator.getRawScoreStdDevExpFunction(x_data,stddev_data) print "stddev function for threshold: %f is [%s]" % (threshold,new_std_function.func_name) newdata2 = new_std_function(x_data) if plotting: r1.plot(x_data,stddev_data) r1.plot(x_data, newdata2, alpha=0.8, linewidth=2.0) r1.set_title("StdDev") z_Scores = Calculator.getZScores(x_data,mean_data,raw_mean_func,new_std_function) histo_bins = 50 counts,bin_edges = np.histogram(z_Scores,histo_bins,normed=True) bin_centres = (bin_edges[:-1] + bin_edges[1:])/2. 
if plotting: n,bins,patches = r2.hist(z_Scores,bins=histo_bins,normed=True,alpha=0.5) r2.set_title("z-Scores") e_val_function = Calculator.getZScoreDistExpFunction(z_Scores) e_val_data_x = np.linspace(min(z_Scores),max(z_Scores),num=500) e_val_data = [e_val_function(entry) for entry in e_val_data_x] if plotting: if not skip_3_to_6: r3.plot(e_val_data_x,e_val_data,alpha=0.5) c=-0.1 for c in [-0.05]: x_ls = np.linspace(ge.ppf(0.01,c),ge.ppf(0.99,c),100) if plotting: if not skip_3_to_6: r4.plot(x_ls,ge.pdf(x_ls,c),linewidth=1.6-c*4) (shape_evd,loc_evd,scale_evd) = ge.fit(z_Scores) loc_norm,scale_norm = norm.fit(z_Scores) x = ge.pdf(bin_centres,shape_evd,loc=loc_evd,scale=scale_evd) if plotting: evd_plot, = r2.plot(bin_centres,x,'b',color='black',label='Extreme Value Distribution') ndist = norm.pdf(bin_centres,loc=loc_norm,scale=scale_norm) if plotting: norm_plot, = r2.plot(bin_centres,ndist,'b',color="red",label='Normal Distribution') r2.legend([evd_plot,norm_plot],['Extreme Value Distribution','Normal Distribution'],loc=1) def getDecNpArray(value): return np.asarray(value).astype(float) expected_evd = getDecNpArray(x) expected_norm = getDecNpArray(ndist) observed = getDecNpArray(counts) def normalizedChisquare(observed,expected): if len(observed) != len(expected): raise Exception("len of observed and expected has to be the same") zipped = zip(observed,expected) fun = lambda input: ((input[0]-input[1])**2 / (input[0]+input[1])) result = sum(map(fun,zipped)) return result chisq_mean = normalizedChisquare(observed,expected_norm) chisq_evd = normalizedChisquare(observed,expected_evd) print "chisquare_norm",chisq_mean print "chisquare_evd",chisq_evd #django doesn't like inf or -inf in float-fields of oracle database, so we change it.. 
if isinf(chisq_mean) or isnan(chisq_mean): print "chisquare_norm seems to be inf or nan (%s), change to -1.0" % str(chisq_mean) chisq_mean = -1.0 if isinf(chisq_evd) or isnan(chisq_evd): print "chisquare_evd seems to be inf or nan (%s), change to -1.0" % str(chisq_evd) chisq_evd = -1.0 if plotting: if not skip_3_to_6: n,bins,patches = r5.hist(z_Scores,bins=histo_bins,normed=True,alpha=0.75)#,bins=20) if not skip_3_to_6: import matplotlib.mlab as mlab y = mlab.normpdf(bins,loc_evd,scale_evd) fp_parameter = FP_Parameter(fp_id=fp, threshold=threshold, formula_raw_mean=raw_mean_func.func_name, formula_raw_stddev=new_std_function.func_name, chisquare_mean=chisq_mean, chisquare_evd=chisq_evd, datasource=ds) fp_parameter.save() if plotting: if not skip_3_to_6: r5.plot(bins,y) if threshold==thresh_list[-1]: #this is last round print "last round" query = "select threshold,chisquare_mean,chisquare_evd from ocean_fp_parameter where fp_id=%d and datasource_id=%d order by threshold" % (fp,ds.id) cursor.execute(query) data_chi2_mean = [] data_chi2_evd = [] x_chidata = [] for val in cursor.fetchall(): x_chidata.append(float(val[0])) data_chi2_mean.append(float(val[1])) data_chi2_evd.append(float(val[2])) print x_chidata,data_chi2_mean,data_chi2_evd if plotting: if not skip_3_to_6: r6.plot(x_chidata,data_chi2_mean,'o') if not skip_3_to_6: r6.plot(x_chidata,data_chi2_evd,'.') chi2_mean, = r6.plot(x_chidata,data_chi2_mean,'o') chi2_evd, = r6.plot(x_chidata,data_chi2_evd,'.') r6.legend([chi2_mean,chi2_evd],['ChiSquare Normal Distribution','ChiSquare Extreme Value Distribution'],loc=1) def fitfunc(p,x): if p[0]==0: return np.exp(-np.exp(-x))*np.exp(-x) else: print p[0],type(x) return np.exp(-(1-p[0]*x)**(1/p[0]))*(1-p[0]*x)**(1/p[0]-1) errfunc = lambda p,x,y: (y-fitfunc(p,x)) init = [0.2] bins = bins[:-1] bins = np.array(bins) n = np.array(n) if plotting: plt.tight_layout() filename = "%f.png" % threshold plt.savefig(filename) figures.append(filename) if animatedGif: file_names = 
figures print "d",file_names images = [Image.open(fn) for fn in file_names] writeGif("animation_mean_stddev.gif",images,duration=0.5) for image in images: image.close()
def gev_pdf(x):
    """Evaluate the GEV density at *x*.

    Shape ``xi``, location ``mu`` and scale ``sigma`` are not parameters;
    they are looked up in the surrounding scope at call time.
    """
    density = genextreme.pdf(x, xi, loc=mu, scale=sigma)
    return density
# Example script: plot the PDF of a generalized extreme value distribution
# with shape c = -0.1 and sanity-check cdf/ppf round-tripping.
# NOTE(review): ``np`` is used below but not imported in this snippet --
# presumably numpy is imported elsewhere in the file; confirm.
from scipy.stats import genextreme
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:
c = -0.1
mean, var, skew, kurt = genextreme.stats(c, moments='mvsk')

# Display the probability density function (``pdf``):
# the grid spans the 1% to 99% quantiles of the distribution.
x = np.linspace(genextreme.ppf(0.01, c), genextreme.ppf(0.99, c), 100)
ax.plot(x, genextreme.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genextreme pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = genextreme(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
# (the result of ``allclose`` is not stored or asserted)
vals = genextreme.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genextreme.cdf(vals, c))
# True

# Generate random numbers:
def extremal_distribution_fit(data, var_name, sample, threshold, fit_type,
                              x_min, x_max, n_points, loc=None, scale=None,
                              cumulative=True):
    """Fit an extreme-value model to *sample* and evaluate it on a grid.

    Args:
        data: Table indexed by time; ``data[var_name].index.year`` is used
            to count the number of years.
        var_name: Column of *data* the sample was taken from.
        sample: The extreme values to fit.
        threshold: Threshold passed to the Generalized Pareto fit.
        fit_type: ``'gpd'``, ``'coles'``, ``'gev'`` or ``'poisson'``.
        x_min: Lower end of the evaluation grid.
        x_max: Upper end of the evaluation grid.
        n_points: Number of grid points.
        loc: Optional location passed through to the fitting helper.
        scale: Optional scale passed through to the fitting helper.
        cumulative: Evaluate the cdf (and the return-period curve) instead
            of the pdf.

    Returns:
        Tuple ``(param, x, y, y_rp)``. Entries that a branch does not
        compute stay ``None``; an unknown *fit_type* yields four ``None``.
        For ``'coles'``, ``x`` is 1..500 and only ``y_rp`` is computed.
        For ``'poisson'``, ``param`` holds the equivalent GEV parameters
        in the order (shape, scale, location).
    """
    # Initialization of the output variables
    param = None
    x = None
    y = None
    y_rp = None

    if fit_type == 'gpd':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genpareto.cdf(x, param[0], param[1], param[2])
            # Calculate the number of extreme peaks per year
            n_peaks_year = len(sample) / len(
                data[var_name].index.year.unique())
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genpareto.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'coles':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)
        x = np.arange(1, 501)
        u = param[1]
        sigma = param[2]
        xi = param[0]

        # Mean number of data points in a year
        n_y = len(data[var_name]) / len(data[var_name].index.year.unique())
        # Fraction of the data that are peaks over threshold (POT)
        z_u = len(sample) / len(data[var_name])
        # n_y * z_u is therefore the number of POT per year
        y_rp = u + (sigma / xi) * (((x * n_y * z_u)**xi) - 1)

    elif fit_type == 'gev':
        param = generalized_extreme_value_distribution_fit(sample, loc, scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genextreme.cdf(x, param[0], param[1], param[2])
            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            # The pdf must come from the same distribution as the cdf above
            # (this branch used to evaluate the Generalized Pareto pdf).
            y = genextreme.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'poisson':
        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        # Fit the exceedances over threshold to Generalized Pareto distribution
        gpd_param = generalized_pareto_distribution_fit(
            sample, threshold, loc, scale)
        # Poisson parameter (number of extreme events per year)
        events_per_year = len(sample) / len(
            data[var_name].index.year.unique())
        # Poisson-Pareto parameters: rate, GPD shape, GPD scale, GPD location
        poisspareto_param = [
            events_per_year, gpd_param[0], gpd_param[2], gpd_param[1]
        ]

        # Equivalent gev parameters, stored as (shape, scale, location)
        param = [0, 0, 0]
        param[0] = -poisspareto_param[1]
        param[1] = poisspareto_param[2] * (poisspareto_param[0]**
                                           poisspareto_param[1])
        param[2] = poisspareto_param[3] + (
            (poisspareto_param[2] / poisspareto_param[1]) *
            ((poisspareto_param[0]**poisspareto_param[1]) - 1))

        if cumulative:
            y = genextreme.cdf(x, param[0], param[2], param[1])
            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[2], param[1])

    return param, x, y, y_rp
def sea_levels_gev_pdf(x):
    """Evaluate the GEV density at *x*.

    Shape ``xi``, location ``mu`` and scale ``sigma`` are not parameters;
    they are looked up in the surrounding scope at call time.
    """
    density = genextreme.pdf(x, xi, loc=mu, scale=sigma)
    return density
def f_p50_prior(p50):
    """Return the log of the GEV density at ``-p50`` (floored by 1e-20).

    The GEV uses shape 0.71, location -2.23 and scale 1.49.
    """
    densidade = gev.pdf(-p50, 0.71, -2.23, 1.49)
    return np.log(densidade + 1e-20)


p50_init = 2.23
def plot_histogram(site,
                   data1,
                   data2,
                   label1='Data1',
                   label2='Data2',
                   subset_label=None,
                   variable=None):
    """
    Plot normalized histograms of two samples with a GEV curve fitted to each,
    print POD and FAR, and save the figure as a PNG under config['PLOT_DIR'].

    :param site: site string (used in the title and the file name)
    :param data1: array of data from reference period
    :param data2: array of data from new (warmer climate) period
    :param label1: string label for data1
    :param label2: string label for data2
    :param subset_label: string label for the subset of data (e.g. month/season)
    :param variable: appended to the output file name
    :return: None
    """
    # sample sizes
    print('Ref data: {}'.format(len(data1)))
    print('New data: {}'.format(len(data2)))

    # common histogram range: combined min/max pushed out to multiples of 10
    combined = np.hstack((data1, data2))
    lowest = np.nanmin(combined)
    highest = np.nanmax(combined)
    range_min = lowest - lowest % 10
    range_max = highest + (10 - highest % 10)
    bins = int(range_max - range_min)

    # density histograms on the shared range
    hist_kwargs = dict(bins=bins, range=(range_min, range_max), density=True)
    hist1, bin_edges = np.histogram(data1, **hist_kwargs)
    hist2, bin_edges = np.histogram(data2, **hist_kwargs)

    # gev fitting--use function to try a couple times to get a good fit
    shape1, loc1, scale1 = get_gev_fit(data1)
    shape2, loc2, scale2 = get_gev_fit(data2)
    x_gev = np.linspace(range_min, range_max, bins * 10 + 1)
    y1_gev = gev.pdf(x_gev, shape1, loc1, scale1)
    y2_gev = gev.pdf(x_gev, shape2, loc2, scale2)

    # moments of the fitted distributions (thresholds use the reference fit)
    mean1 = gev.mean(shape1, loc=loc1, scale=scale1)
    mean2 = gev.mean(shape2, loc=loc2, scale=scale2)
    std1 = np.sqrt(gev.var(shape1, loc=loc1, scale=scale1))
    std2 = np.sqrt(gev.var(shape2, loc=loc2, scale=scale2))

    # calculate a, b, and c params from Durran 2019:
    # grid points above 2.0 / 2.5 / 3.5 reference sigmas
    sig20_thres = np.where((x_gev > mean1 + 2.0 * std1))
    sig25_thres = np.where((x_gev > mean1 + 2.5 * std1))
    sig35_thres = np.where((x_gev > mean1 + 3.5 * std1))
    c_val = np.sum(y1_gev[sig25_thres])
    a_val = np.sum(y2_gev[sig25_thres]) - c_val
    b_val = (np.sum(y2_gev[sig20_thres]) - np.sum(y1_gev[sig20_thres])
             - a_val)
    pod = a_val / (a_val + b_val)
    far = c_val / (a_val + c_val)
    print('POD = {} FAR = {}'.format(pod, far))

    fig = plt.figure()
    fig.set_size_inches(6, 4)

    # sample statistics (the means are drawn as dotted vertical lines)
    mu1 = np.mean(data1)
    sigma1 = np.std(data1)
    mu2 = np.mean(data2)
    sigma2 = np.std(data2)

    left_edges = bin_edges[:-1]
    plt.bar(left_edges, hist1, width=1, align='edge', color='blue',
            alpha=0.5, label=label1)
    plt.bar(left_edges, hist2, width=1, align='edge', color='red',
            alpha=0.5, label=label2)
    plt.plot(x_gev, y1_gev, color='blue')
    plt.plot(x_gev, y2_gev, color='red')

    # vertical markers at the first grid point past each threshold
    first20 = sig20_thres[0][0]
    first25 = sig25_thres[0][0]
    plt.plot([x_gev[first20], x_gev[first20]], [0, y2_gev[first20]],
             color='k', lw=1.0)
    plt.plot([x_gev[first25], x_gev[first25]], [0, y2_gev[first25]],
             color='k', lw=1.0)

    plt.plot([mu1, mu1], [0, 1], color='blue', linestyle=':')
    plt.plot([mu2, mu2], [0, 1], color='red', linestyle=':')
    plt.ylabel('PDF')
    plt.xlabel('Temperature')
    tallest = np.max(
        (np.max(hist1), np.max(hist2), np.max(y1_gev), np.max(y2_gev)))
    plt.ylim(0, tallest + 0.02)
    plt.legend()
    plt.title('{} {}'.format(site, subset_label))
    plt.savefig('{}{}_{}{}.png'.format(config['PLOT_DIR'], site,
                                       subset_label, variable),
                bbox_inches='tight',
                dpi=200)
    print('Plotted histogram for {}'.format(site))
    return
# Two-panel figure of standard GEV densities for several shape values:
# the left panel uses non-negative shapes, the right panel non-positive ones.
from matplotlib import pyplot as plt
from matplotlib import rc
from scipy.stats import genextreme as gev

# set up fonts for plotting
rc('text', usetex=True)
rc('font', family='serif')
rc('font', size=12)

x = np.linspace(-4, 7, 100)  # to evaluate GEVs over

fig = plt.figure(1, figsize=(7, 3.25))
fig.clf()

# Labels are raw strings so the LaTeX backslashes are not read as
# (invalid) Python escape sequences.
ax1 = fig.add_subplot(121)
ax1.plot(x, gev.pdf(x, 0), '-r', label=r'$\gamma=0$')
ax1.plot(x, gev.pdf(x, 0.28), '-g', label=r'$\gamma=0.28$')
ax1.plot(x, gev.pdf(x, 0.56), '-b', label=r'$\gamma=0.56$')
ax1.set_xlim([-4, 7])
ax1.set_xlabel(r'$x$')
ax1.set_ylabel(r'$G_{\mathrm{GEV}}(x)$')
ax1.legend(loc='upper right', frameon=False, handletextpad=0)
ax1.yaxis.set_ticks([0, 0.1, 0.2, 0.3, 0.4])

ax2 = fig.add_subplot(122)
ax2.plot(x, gev.pdf(x, 0), '-r', label=r'$\gamma=0$')
# Shape now matches its label (it was -0.26 under a -0.28 label).
ax2.plot(x, gev.pdf(x, -0.28), '-g', label=r'$\gamma=-0.28$')
ax2.plot(x, gev.pdf(x, -0.56), '-b', label=r'$\gamma=-0.56$')
ax2.set_xlim([-4, 7])
ax2.set_xlabel(r'$x$')
ax2.legend(loc='upper right', frameon=False, handletextpad=0)
def f_p50_prior(p50):
    """Return the log of the GEV density at ``-p50`` (floored by 1e-20).

    The GEV uses shape 0.65, location -4.43 and scale 1.94.
    """
    densidade = gev.pdf(-p50, 0.65, -4.43, 1.94)
    return np.log(densidade + 1e-20)


p50_init = 4.43
def _warming_pod_far(data, range_min, range_max, sigma_array, warming_levels):
    """Fit a GEV to *data* and tabulate POD/FAR for shifted copies of it.

    For every warming level the fitted location is shifted by
    ``level * std`` (shape and scale unchanged) and ``get_pod_far_curve`` is
    evaluated for every entry of *sigma_array*.

    :return: tuple ``(x_gev, y_ref, y_curves, mean, std, pod, far)`` where
        ``pod``/``far`` are DataFrames in percent indexed by sigma (rows) and
        warming level (columns) and ``y_curves`` holds one shifted pdf per
        warming level (rows) over ``x_gev`` (columns).
    """
    bins = int(range_max - range_min)

    # gev fitting--use function to try a couple times to get a good fit
    shape, loc, scale = get_gev_fit(data)
    x_gev = np.linspace(range_min, range_max, bins * 10 + 1)
    y_ref = gev.pdf(x_gev, shape, loc, scale)

    # moments of the fitted reference distribution
    mean = gev.mean(shape, loc=loc, scale=scale)
    std = np.sqrt(gev.var(shape, loc=loc, scale=scale))

    pod = pd.DataFrame(index=sigma_array, columns=warming_levels, dtype=float)
    far = pd.DataFrame(index=sigma_array, columns=warming_levels, dtype=float)
    y_curves = pd.DataFrame(index=warming_levels, columns=x_gev, dtype=float)

    for level in warming_levels:
        y_shifted = gev.pdf(x_gev, shape, loc + level * std, scale)
        y_curves.loc[level] = y_shifted
        for sigma in sigma_array:
            pod_val, far_val = get_pod_far_curve(x_gev, y_ref, y_shifted,
                                                 mean, std, sigma,
                                                 sig_thresh=2.0)
            # Single .loc call: chained ``frame[level][sigma] = ...`` writes
            # to a temporary object under pandas Copy-on-Write.
            pod.loc[sigma, level] = pod_val * 100.
            far.loc[sigma, level] = far_val * 100.

    return x_gev, y_ref, y_curves, mean, std, pod, far


def _plot_pod_vs_far_figure(far, pod, warming_levels, labels, cmap,
                            legend_loc, title, out_path):
    """Draw one POD-vs-FAR curve per warming level and save the figure."""
    plt.figure(figsize=(6, 4))
    for i, level in enumerate(warming_levels):
        plt.plot(far[level], pod[level],
                 color=cmap(level - 0.05),
                 marker='o',
                 lw=4,
                 label=r'$\mu$+{}$\sigma$'.format(np.around(level, 1)))
        # Only the third curve (i == 2) is annotated with the sigma labels.
        if i == 2:
            for j, ind in enumerate(far.index):
                plt.text(far.loc[ind, level] + 2,
                         pod.loc[ind, level] - 4,
                         labels[j],
                         color='black',
                         fontsize=8)
    plt.ylabel('POD (%)')
    plt.xlabel('FAR (%)')
    plt.ylim(0, 100)
    plt.xlim(0, 100)
    plt.legend(fontsize=5, loc=legend_loc)
    plt.title(title)
    plt.savefig(out_path, bbox_inches='tight', dpi=200)
    plt.close()


def _plot_shifted_pdfs(x_gev, y_ref, y_curves, warming_levels, mean, std,
                       cmap, mean_color, title, out_path):
    """Plot the reference pdf and its shifted copies, then save the figure."""
    plt.figure(figsize=(6, 4))
    plt.plot(x_gev, y_ref, color='black', label='1950-1979')
    for level in warming_levels:
        plt.plot(x_gev, y_curves.loc[level].values,
                 color=cmap(level - 0.05),
                 label=r'$\mu$+{}$\sigma$'.format(np.around(level, 1)))
    # vertical markers: reference mean, then mean + 2 and mean + 2.5 std
    plt.plot([mean, mean], [0, 1], color=mean_color, linestyle=':')
    for n_sigma in (2, 2.5):
        marker_x = mean + std * n_sigma
        plt.plot([marker_x, marker_x], [0, 1], color='black', linestyle=':')
    plt.ylabel('PDF')
    plt.xlabel('Temperature')
    plt.ylim(0, np.max(y_curves.values) + 0.02)
    plt.legend(fontsize=5)
    plt.title(title)
    plt.savefig(out_path, bbox_inches='tight', dpi=200)
    plt.close()


def plot_pod_vs_far(site, data1_hi, data1_lo, subset_label=None):
    """
    Compare POD and FAR for sigma thresholds from 2 to 5 (steps of 0.5), for
    high and low temperatures, when the fitted reference GEV is shifted by
    0.1 to 1.0 reference standard deviations.

    Four PNG files are written to ``config['PLOT_DIR']``: POD-vs-FAR for
    high and low temperatures, and the shifted pdfs for high and low
    temperatures. POD/FAR values come from ``get_pod_far_curve`` and are
    plotted in percent.

    :param site: site string (used in titles and file names)
    :param data1_hi: array of high temp data from reference period
    :param data1_lo: array of low temp data from reference period
    :param subset_label: string label for the subset of data (e.g.
        month/season); formatted into titles and file names as given
    :return: None
    """
    sigma_array = np.linspace(2, 5, 7)  # do 30 for longer one
    warming_levels = np.linspace(0.1, 1, 10)
    # one annotation label per sigma threshold: '2.0$\sigma$' ... '5.0$\sigma$'
    labels = [r'{:.1f}$\sigma$'.format(sigma) for sigma in sigma_array]

    # get histogram parameters: lower bound rounded down to a multiple of
    # 10, upper bound rounded up to the next multiple of 10 plus padding
    # (20 for highs, 10 for lows)
    flat_hi = np.hstack(data1_hi)
    min_hi = np.nanmin(flat_hi)
    max_hi = np.nanmax(flat_hi)
    range_min_hi = min_hi - min_hi % 10
    range_max_hi = max_hi + (10 - max_hi % 10 + 20)

    flat_lo = np.hstack(data1_lo)
    min_lo = np.nanmin(flat_lo)
    max_lo = np.nanmax(flat_lo)
    range_min_lo = min_lo - min_lo % 10
    range_max_lo = max_lo + (10 - max_lo % 10) + 10

    (x_gev_hi, y1_gev_hi, y_curves_hi, mean1_hi, std1_hi, pod_hi,
     far_hi) = _warming_pod_far(data1_hi, range_min_hi, range_max_hi,
                                sigma_array, warming_levels)
    (x_gev_lo, y1_gev_lo, y_curves_lo, mean1_lo, std1_lo, pod_lo,
     far_lo) = _warming_pod_far(data1_lo, range_min_lo, range_max_lo,
                                sigma_array, warming_levels)

    # POD vs FAR plots
    pod_far_title = r'POD vs FAR {} {} (2.0-$\sigma$ threshold)'.format(
        site, subset_label)
    _plot_pod_vs_far_figure(
        far_hi, pod_hi, warming_levels, labels, plt.cm.Reds, 'upper left',
        pod_far_title,
        '{}pod_vs_far_warming_hi_{}_{}.png'.format(config['PLOT_DIR'], site,
                                                   subset_label))
    print('Plotted pod_vs_far for {}'.format(site))

    _plot_pod_vs_far_figure(
        far_lo, pod_lo, warming_levels, labels, plt.cm.Blues, 'upper right',
        pod_far_title,
        '{}pod_vs_far_warming_lo_{}_{}.png'.format(config['PLOT_DIR'], site,
                                                   subset_label))
    print('Plotted pod_vs_far for {}'.format(site))

    # plot the different temperature curves
    pdf_title = '{} {}'.format(site, subset_label)
    _plot_shifted_pdfs(
        x_gev_hi, y1_gev_hi, y_curves_hi, warming_levels, mean1_hi, std1_hi,
        plt.cm.Reds, 'red', pdf_title,
        '{}shift_mean_hi_{}_{}.png'.format(config['PLOT_DIR'], site,
                                           subset_label))
    _plot_shifted_pdfs(
        x_gev_lo, y1_gev_lo, y_curves_lo, warming_levels, mean1_lo, std1_lo,
        plt.cm.Blues, 'blue', pdf_title,
        '{}shift_mean_lo_{}_{}.png'.format(config['PLOT_DIR'], site,
                                           subset_label))
    return
# NOTE(review): the next two statements use the index ``i`` and look like the
# tail of a loop whose header lies outside this chunk -- confirm, and
# re-indent them under that loop if so.
print(i)
x[i, :] = data_series[i:i + n]

# Build the learning model (NLMS adaptive filter with n taps, all initial
# weights set to one), run it on (d, x) and store the second returned array.
f = pa.filters.FilterNLMS(n, mu=1., w=np.ones(n))
y, e, w = f.run(d, x)
np.save('e_data', e)

# Select one column of w (cislo_vahy is Czech for "weight number").
cislo_vahy = 1
w_pokus = w[1:12000, cislo_vahy]
print('SELEKCE VAHY:', w_pokus.shape)

# Fit a GEV to part of the selected values, then evaluate the fitted pdf on
# all of them, multiplied by the fitted scale.
fit = genextreme.fit(w_pokus[1:9400])
print('FIT:', fit)
hpp = genextreme.pdf(w_pokus, *fit) * fit[2]
print('minimum:', min(hpp[0:12000]))
print('minimum index:', np.argmin(hpp[0:12000]))

# Same procedure on the absolute row-to-row differences of w.
# Row 0 of dw keeps the copied values because np.diff yields one row fewer.
dw = np.copy(w)
dw[1:] = np.abs(np.diff(dw, axis=0))
dw = dw[:, cislo_vahy]  # np.sum(dw, axis=1)
print(dw.shape)

fit2 = genextreme.fit(dw[10:13000])
print('FIT2:', fit2)
hpp2 = genextreme.pdf(dw[10:13000], *fit2) * fit2[2]
print('odhad hpp2:')
print('minimum2:', min(hpp2))
print('minimum index2:', np.argmin(hpp2))
# Build the learning model (NLMS adaptive filter with n taps, all initial
# weights set to one) and run it on (d, x).
f = pa.filters.FilterNLMS(n, mu=1., w=np.ones(n))
y, e, w = f.run(d, x)
print(w.shape)

# Absolute row-to-row differences of w.
# Row 0 of dw keeps the copied values because np.diff yields one row fewer.
dw = np.copy(w)
dw[1:] = np.abs(np.diff(dw, axis=0))
dw_count = int(dw.shape[0])
print(dw_count)

# For each row i >= gev_window and each column j: fit a GEV to the preceding
# gev_window values of that column, then store the fitted pdf at dw[i, j]
# times the fitted scale. Results are written at row i - gev_window, so the
# last gev_window rows of hpp stay zero.
hpp = np.zeros((dw_count, n))
for i in range(gev_window, dw_count):
    print(str(datetime.now()), " processing: ", i)
    for j in range(n):
        fit = genextreme.fit(dw[i - gev_window:i, j])
        hpp[i - gev_window, j] = genextreme.pdf(dw[i, j], *fit) * fit[2]

np.save('hpp_data' + str(gev_window), hpp)
def StatisticalProperties(self, PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False, Distibution="GEV",
                          EstimateParameters=False, Quartile=0,
                          RIMResults=False, SignificanceLevel=0.1):
    """
    Compute descriptive statistics and return-period discharges per node.

    Reads daily discharge time series for a set of computational nodes,
    drops the warm-up period, fits a GEV or Gumbel distribution to the
    annual maxima of each node and writes two CSV files to ``SavePath``.
    The code assumes a daily temporal resolution and a hydrological year
    of 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

    Parameters
    ----------
    PathNodes : [String]
        file containing the IDs of the computational nodes to analyse;
        the time series file(s) must hold the discharge of these nodes in
        the same order.
    PathTS : [String]
        the whitespace-separated result file (when SeparateFiles is
        False) or the directory holding one ``<ID>.txt`` file per node
        (when SeparateFiles is True).
    StartDate : [String]
        beginning date of the time series, formatted "%Y-%m-%d".
    WarmUpPeriod : [Integer]
        number of days to neglect at the beginning of the time series.
    SavePlots : [Bool]
        if True, two figures per node are saved as
        ``SavePath/Figures/<ID>.png`` and ``SavePath/Figures/F-<ID>.png``
        (the ``Figures`` directory is not created here).
    SavePath : [String]
        directory where the CSV files (and figures) are written.
    SeparateFiles : [Bool]
        True if each node's discharge is stored in its own file.
        Default is False.
    Filter : [Bool/Number]
        when not a bool, annual maxima equal to this value are dropped
        before fitting (used for gap-fill values). Default is False.
    Distibution : [String]
        "GEV" fits ``genextreme``; any other value fits a Gumbel
        distribution. Default is "GEV".
    EstimateParameters : [Bool]
        if True the Gumbel parameters come from
        ``Gumbel.EstimateParameter`` instead of the scipy ``fit`` method;
        not implemented for "GEV". Default is False.
    Quartile : [Float]
        quantile of the annual maxima passed as threshold to
        ``Gumbel.EstimateParameter``. Default is 0.
    RIMResults : [Bool]
        with SeparateFiles, read each file with ``self.ReadRIMResult``
        instead of ``np.loadtxt``. Default is False.
    SignificanceLevel : [Float]
        passed to ``ProbapilityPlot``; the confidence-limit curves are
        labelled (1 - SignificanceLevel) * 100 %. Default is 0.1.

    Returns
    -------
    None. Side effects:
    Statistical Properties.csv : mean, std, min, 5%, 25%, median, 75%,
        95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50, q100,
        q200, q500, q1000 per node (also stored in self.StatisticalPr).
    DistributionProperties.csv : fitted distribution parameters per node
        (also stored in self.DistributionPr).

    Raises
    ------
    NotImplementedError
        if EstimateParameters is True and Distibution is "GEV".
    """
    use_gev = Distibution == "GEV"

    # ndmin=1 keeps a file with a single node ID iterable
    ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16, ndmin=1)
    StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")

    # hydrographs
    if SeparateFiles:
        TS = pd.DataFrame()
        for node in ComputationalNodes:
            node_id = int(node)
            node_file = PathTS + "/" + str(node_id) + '.txt'
            if RIMResults:
                TS.loc[:, node_id] = self.ReadRIMResult(node_file)
            else:
                TS.loc[:, node_id] = np.loadtxt(node_file)
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        TS.index = pd.date_range(StartDate, EndDate)
    else:
        TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        TS.index = pd.date_range(StartDate, EndDate, freq="D")
        # delete the first two columns
        del TS[0], TS[1]
        TS.columns = ComputationalNodes

    # neglect the warm-up period at the start of the time series
    TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

    # Columns of the output table: general statistics plus return periods.
    col_csv = [
        'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
        't_beg', 't_end', 'nyr'
    ]
    rp_name = [
        'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500',
        'q1000'
    ]
    col_csv = col_csv + rp_name

    StatisticalPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                 columns=col_csv)
    # t_beg/t_end receive timestamps; make those columns object-typed up
    # front instead of writing timestamps into float columns.
    StatisticalPr = StatisticalPr.astype({'t_beg': object, 't_end': object})
    StatisticalPr.index.name = 'ID'

    dist_cols = ['loc', 'scale', 'c'] if use_gev else ['loc', 'scale']
    DistributionPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                  columns=dist_cols)
    DistributionPr.index.name = 'ID'

    # Required return periods, one per entry of rp_name. A duplicated 50
    # used to shift every quantile from q100 onwards by one position.
    T = np.array([1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000])
    # Non-exceedance probability (F) of the chosen return periods
    F = 1 - (1 / T)

    # Iteration over all the gauge numbers.
    for i in ComputationalNodes:
        QTS = TS.loc[:, i]
        # Annual maxima over the hydrological year (1-Nov/31-Oct).
        # NOTE(review): newer pandas deprecates the 'A-OCT' alias in favour
        # of 'YE-OCT' -- confirm against the supported pandas versions.
        amax = QTS.resample('A-OCT').max().values

        if not isinstance(Filter, bool):
            amax = amax[amax != Filter]

        if EstimateParameters:
            if use_gev:
                # This path used to print a message and then fail on an
                # unbound variable; fail explicitly instead.
                raise NotImplementedError(
                    "EstimateParameters=True is not implemented for the "
                    "GEV distribution yet.")
            # estimate the parameters through an optimization
            threshold = np.quantile(amax, Quartile)
            param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                             threshold)
            param_dist = [param[1], param[2]]
        elif use_gev:
            # maximum likelihood fit: (shape, loc, scale)
            param_dist = genextreme.fit(amax)
        else:
            # maximum likelihood fit of a Gumbel distribution: (loc, scale)
            param_dist = gumbel_r.fit(amax)

        # Freeze the fitted distribution once so ppf/pdf/cdf all use the
        # same parameters (the GEV pdf used to receive the scale as loc).
        if use_gev:
            DistributionPr.loc[i, 'c'] = param_dist[0]
            DistributionPr.loc[i, 'loc'] = param_dist[1]
            DistributionPr.loc[i, 'scale'] = param_dist[2]
            fitted = genextreme(param_dist[0], loc=param_dist[1],
                                scale=param_dist[2])
            probability_plot = GEV.ProbapilityPlot
        else:
            DistributionPr.loc[i, 'loc'] = param_dist[0]
            DistributionPr.loc[i, 'scale'] = param_dist[1]
            fitted = gumbel_r(loc=param_dist[0], scale=param_dist[1])
            probability_plot = Gumbel.ProbapilityPlot

        # discharge corresponding to the return periods
        Qrp = fitted.ppf(F)

        # sorted annual maxima and their plotting positions
        amax = np.sort(amax)
        cdf_Weibul = ST.Weibul(amax)
        # theoretical values and confidence limits from the project helper
        Qth, Qupper, Qlower = probability_plot(param_dist, cdf_Weibul, amax,
                                               SignificanceLevel)

        if SavePlots:
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            pdf_fitted = fitted.pdf(Qx)
            cdf_fitted = fitted.cdf(Qx)

            # Histogram with fitted pdf, and empirical vs fitted cdf.
            fig = plt.figure(60, figsize=(20, 10))
            gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.plot(Qx, pdf_fitted, 'r-')
            ax1.hist(amax, density=True)
            ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax1.set_ylabel('pdf', fontsize=15)

            ax2 = fig.add_subplot(gs[0, 1])
            ax2.plot(Qx, cdf_fitted, 'r-')
            ax2.plot(amax, cdf_Weibul, '.-')
            ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax2.set_ylabel('cdf', fontsize=15)
            plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                        format='png')
            plt.close()

            # Theoretical vs observed annual maxima.
            fig = plt.figure(70, figsize=(10, 8))
            plt.plot(Qth, amax, 'd', color='#606060', markersize=12,
                     label='Gumbel Distribution')
            plt.plot(Qth, Qth, '^-.', color="#3D59AB",
                     label="Weibul plotting position")
            # confidence limits are only drawn for the Gumbel distribution
            if not use_gev:
                ci_suffix = str(int(
                    (1 - SignificanceLevel) * 100)) + " % CI)"
                plt.plot(Qth, Qlower, '*--', color="#DC143C", markersize=12,
                         label='Lower limit (' + ci_suffix)
                plt.plot(Qth, Qupper, '*--', color="#DC143C", markersize=12,
                         label='Upper limit (' + ci_suffix)

            plt.legend(fontsize=15, framealpha=1)
            plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
            plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
            plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                        format='png')
            plt.close()

        t_beg = QTS.index.min()
        t_end = QTS.index.max()
        node_stats = {
            'mean': QTS.mean(),
            'std': QTS.std(),
            'min': QTS.min(),
            '5%': QTS.quantile(0.05),
            '25%': QTS.quantile(0.25),
            'median': QTS.quantile(0.50),
            '75%': QTS.quantile(0.75),
            '95%': QTS.quantile(0.95),
            'max': QTS.max(),
            't_beg': t_beg,
            't_end': t_end,
            'nyr': (t_end - t_beg).days / 365.25,
        }
        for stat_name, stat_value in node_stats.items():
            StatisticalPr.loc[i, stat_name] = stat_value

        for irp, irp_name in zip(Qrp, rp_name):
            StatisticalPr.loc[i, irp_name] = irp

        # Print for prompt and check progress.
        print("Gauge", i, "done.")

    # Output files
    StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
    self.StatisticalPr = StatisticalPr
    DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
    self.DistributionPr = DistributionPr