def sim_data(n: int, j: int, theta: np.array) -> dict:
    """Simulate discrete-choice data with alternative-specific covariates.

    Every household/alternative pair gets its own covariate vector, drawn
    from a standard normal distribution. The utility of an alternative is
    ``x @ theta`` plus a shock obtained by inverse-CDF sampling from
    ``genextreme`` with ``c=0``, and each household picks the alternative
    with the highest utility. The number of covariates k is inferred from
    the length of theta.

    Args:
        n (int): Number of households.
        j (int): Number of choices.
        theta (np.array): The true value of the coefficients.

    Returns:
        dict: Key "y" holds the index of the utility-maximising choice of
            each household, key "x" the covariates of shape (n, j, k).
    """
    n_covariates = theta.size
    covariates = rng.normal(size=(n, j, n_covariates))

    # Uniform draws pushed through the quantile function give the shocks.
    shocks = genextreme.ppf(rng.uniform(size=(n, j)), c=0)
    utility = covariates @ theta + shocks

    # Find which choice that maximises value.
    return {'y': utility.argmax(axis=1), 'x': covariates}
def sim_data(n: int, j: int, theta: np.array) -> dict:
    """Simulate discrete-choice data with household-specific covariates.

    The covariate matrix holds a constant followed by k - 1 standard
    normal draws, where k is inferred as ``theta.size / (j - 1)``. The
    first choice is the reference category with a systematic utility of
    zero; choice i (i >= 1) uses column i - 1 of theta. Shocks come from
    inverse-CDF sampling of ``genextreme`` with ``c=0`` and each household
    picks the choice with the highest utility.

    Args:
        n (int): Number of households.
        j (int): Number of choices.
        theta (np.array): The true value of the coefficients, indexed as
            ``theta[:, choice - 1]``.

    Returns:
        dict: Key "y" holds the index of the utility-maximising choice of
            each household, key "x" the covariates of shape (n, k).
    """
    n_regressors = int(theta.size / (j - 1))

    # Intercept column first, then the random regressors.
    draws = rng.normal(size=(n, n_regressors - 1))
    x = np.hstack((np.ones((n, 1)), draws))

    # Column 0 belongs to the reference category and stays at zero.
    v = np.zeros((n, j))
    for theta_col in range(j - 1):
        v[:, theta_col + 1] = x @ theta[:, theta_col]

    shocks = genextreme.ppf(rng.uniform(size=(n, j)), c=0)

    # Find which choice that maximises value.
    chosen = (v + shocks).argmax(axis=1)
    return {'y': chosen, 'x': x}
def value(self, p, estimador=None):
    """Return the GEV quantile for the non-exceedance probability ``p``.

    Uses the fitted parameters ``self.shape``, ``self.loc`` and
    ``self.scale``. When they are not available yet (attribute access
    fails), the estimator method named by ``estimador`` is run first and
    the quantile is then computed from the parameters it stored.

    Args:
        p: Probability (scalar or array-like) handed to ``genextreme.ppf``.
        estimador: Name of the estimator method to run when the
            distribution parameters are missing. Must be listed in
            ``self.estimadores``.

    Returns:
        The value(s) returned by ``genextreme.ppf``.

    Raises:
        ValueError: If the parameters are missing and ``estimador`` is not
            one of ``self.estimadores``.
        AttributeError: If the estimator ran but did not set the
            parameters (previously this recursed without end).
    """
    try:
        return genextreme.ppf(p, c=self.shape, loc=self.loc, scale=self.scale)
    except AttributeError:
        if estimador not in self.estimadores:
            raise ValueError('Estimador não existe')
        # Dispatch by attribute lookup; eval() on a built string would run
        # arbitrary expressions if the name were ever malformed.
        getattr(self, estimador)()
        return genextreme.ppf(p, c=self.shape, loc=self.loc, scale=self.scale)
def get_coverage_interval(self, COEFF, distribution, coverage):
    """Return one [lower, upper] quantile pair per coefficient set.

    ``coverage`` is split evenly between the two tails: the lower bound
    is the ``coverage / 2`` quantile and the upper bound the
    ``1 - coverage / 2`` quantile. (Both bounds used to be evaluated at
    ``coverage / 2``, which made every interval collapse to one point.)

    Args:
        COEFF: Sequence of coefficient triples read as
            ``(shape, mu, sigma)`` from positions 0, 1 and 2.
        distribution: Only ``'gev'`` is handled; any other value leaves
            every entry of the result as ``None``.
        coverage: Total probability mass left outside the interval,
            convertible with ``float()``.

    Returns:
        list: Same length as ``COEFF``; each entry is ``[lower, upper]``
        for ``'gev'`` and ``None`` otherwise.
    """
    tail = float(coverage) / 2.0
    lower_q = tail
    upper_q = 1.0 - tail

    interval_list = [None] * len(COEFF)
    if distribution == 'gev':
        for sample_index, coeff in enumerate(COEFF):
            shape = coeff[0]
            mu = coeff[1]
            sigma = coeff[2]
            # NOTE(review): arguments are passed as (mu, sigma, shape). If
            # `gev` is scipy.stats.genextreme, whose ppf signature is
            # (q, c, loc, scale), this order looks wrong - confirm.
            interval_list[sample_index] = [
                gev.ppf(lower_q, mu, sigma, shape),
                gev.ppf(upper_q, mu, sigma, shape),
            ]
    return interval_list
def plot_histograma_e_gev(str_fam_sinal, df_sinais, c, loc, scale, num_inicio, num_final, num_total, nome_coluna='valor'):
    """Plot a 20-bin histogram of a column with a GEV pdf overlaid.

    The histogram is drawn as bars on the left axis. The pdf of
    ``genextreme(c, loc, scale)`` is evaluated on
    ``np.linspace(num_inicio, num_final, num_total)`` and drawn on a twin
    axis whose ticks are hidden. The figure is saved to
    ``./histograma_familia_<str_fam_sinal>.png``, shown and closed.

    Args:
        str_fam_sinal: Label used in the title and in the file name.
        df_sinais: Object indexable by ``nome_coluna`` whose result offers
            ``to_numpy()`` (presumably a pandas DataFrame - confirm).
        c, loc, scale: Parameters passed to ``genextreme.pdf``.
        num_inicio, num_final, num_total: Start, stop and number of points
            of the grid on which the pdf is drawn.
        nome_coluna: Name of the column holding the values.
    """
    arr_valores_atuais = df_sinais[nome_coluna].to_numpy()
    histogram, bins_edge = np.histogram(arr_valores_atuais, bins=20)
    width = 0.7 * (bins_edge[1] - bins_edge[0])
    center = (bins_edge[:-1] + bins_edge[1:]) / 2

    # Histogram on the primary axis.
    fig, ax1 = plt.subplots()
    ax1.bar(center, histogram, align='center', width=width)
    ax1.set_title('Histograma da Série {}'.format(str_fam_sinal))
    ax1.set_xlabel("bin")
    ax1.set_ylabel("Quantidade")
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # PDF on a second axes that shares the same x-axis. The grid is given
    # by the caller; the former ppf-based grid was overwritten before use.
    ax2 = ax1.twinx()
    x = np.linspace(num_inicio, num_final, num_total)
    ax2.get_yaxis().set_ticks([])
    ax2.plot(x, genextreme.pdf(x, c, loc, scale), 'r-', lw=5, alpha=0.6, label='genextreme pdf')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig("./histograma_familia_{}.png".format(str_fam_sinal))
    plt.show()
    plt.close()
def calculate_required_effort(data_before_capture, n_bootstrapping, success_probability):
    """Bootstrap the effort needed to reach a given success probability.

    The effort per sighting derived from ``data_before_capture`` is
    resampled ``n_bootstrapping`` times with ``np.random.choice`` (same
    sample size as the original). A GEV distribution is fitted to each
    resample and its ``success_probability`` quantile is stored.

    Returns:
        np.array: One quantile per bootstrap replicate.
    """
    observed_effort = calculate_effort_per_sighting(data_before_capture)
    sample_size = len(observed_effort)

    required_effort: np.array = np.zeros(n_bootstrapping)
    for replicate in range(n_bootstrapping):
        bootstrap_sample = np.random.choice(observed_effort, sample_size)
        shape, loc, scale = genextreme.fit(bootstrap_sample)
        required_effort[replicate] = genextreme.ppf(
            success_probability, shape, loc, scale)
    return required_effort
def plot(self, FAPname, Nlevels, cheat=True):
    '''Plot the periodogram with bootstrap significance levels.

    The power values come from ``self.search()``. Significance levels are
    quantiles of a GEV distribution fitted to the numbers stored in
    ``FAPname + 'FAPNormTest.txt'``; the first ``Nlevels`` of the 50, 90,
    95, 99 and 99.9 percent levels are drawn as green horizontal lines,
    each labelled with its false alarm probability. The highest peak is
    circled and the corresponding period is printed.

    With ``cheat`` equal to True a green vertical line is added at
    ``1 / self.cheat`` together with red horizontal lines at the levels
    returned by ``self.APFAP``.
    '''
    power = self.search()
    peak_power = np.amax(power)
    peak_positions = np.where(power == peak_power)[0]
    peak_freq = self.flist[peak_positions[0]]

    percent_levels = np.array([50, 90, 95, 99, 99.9])[:Nlevels]

    # Quantiles of the GEV fitted to the bootstrap maxima.
    bootstrap_values = np.loadtxt(FAPname + 'FAPNormTest.txt')
    gev_params = gev.fit(bootstrap_values)
    power_levels = gev.ppf(percent_levels / 100, *gev_params)

    freq_span = self.fmax - self.fmin

    plt.figure(figsize=(20, 14))
    plt.hlines(power_levels, self.fmin, self.fmax, 'g')
    plt.plot(self.flist, power)
    # The caption is anchored at the automatic upper y-limit, so it has to
    # be placed before the limits are overridden.
    plt.text(self.fmax - freq_span / 2.25, plt.ylim()[1],
             'False alarm probability')
    plt.ylim(0, peak_power + 0.1)

    label_x = self.fmax - freq_span / 3
    for idx in range(Nlevels):
        false_alarm = np.round(1 - percent_levels[idx] / 100, 3)
        plt.text(label_x, power_levels[idx] + 0.003, str(false_alarm))

    plt.plot(peak_freq, peak_power, 'r', marker='o', linestyle='none',
             markerfacecolor='none', markersize=35)

    if cheat == True:
        plt.vlines(1 / self.cheat, 0, peak_power, 'g')
        reference_levels = self.APFAP(1 - percent_levels / 100)
        plt.hlines(reference_levels, self.fmin, self.fmax, 'r')

    plt.xlabel('Frequency [1/day]')
    plt.ylabel('Lomb-Scargle Power')
    plt.title(
        'Lomb - Scargle Periodogram for {planet}'.format(planet=self.name))
    print('Highest probability of period = {p} days'.format(
        p=round(1 / peak_freq, 3)))
def sim_data(N, J, theta) -> dict:
    """Simulate discrete-choice data with alternative-specific means.

    Covariates are standard normal draws of shape (N, J, k), with k taken
    from ``theta.size``, shifted per alternative by J evenly spaced
    offsets between 3 and 5. Systematic utility comes from
    ``utiliy(theta, x)``; shocks are drawn by inverse-CDF sampling from
    ``genextreme`` with ``c=0``.

    Args:
        N: Number of decision makers.
        J: Number of alternatives.
        theta: Coefficient array.

    Returns:
        dict: Key "y" holds the index of the utility-maximising
        alternative per decision maker, key "x" the covariates. (The
        annotation used to say ``tuple`` although a dict was returned.)
    """
    k = theta.size
    alternative_shift = np.linspace(3, 5, J).reshape(1, J, 1)
    x = random.normal(size=(N, J, k)) + alternative_shift

    v = utiliy(theta, x)
    e = genextreme.ppf(random.uniform(size=(N, J)), c=0)
    u = v + e  # utility

    # Find which choice that maximizes utility.
    y = u.argmax(axis=1)
    return {'y': y, 'x': x}
def doGev(dis, retPerYr):
    """Compute GEV return levels for every column of ``dis``.

    For each column the NaN entries are dropped. If more than 15 values
    remain, a GEV is fitted to the negated series and the return levels
    for the return periods ``retPerYr`` are obtained by negating the
    quantiles at ``1 - 1 / retPerYr``. A column's levels are only stored
    when none of them is negative.

    Returns:
        Array of shape (number of columns, number of return periods);
        rows that were skipped or rejected stay NaN.
    """
    quantile_prob = 1 - 1 / retPerYr
    n_points = dis.shape[1]
    n_periods = len(retPerYr)
    retLev = np.ones([n_points, n_periods]) * np.nan

    for point in range(n_points):
        column = dis[:, point]
        valid = column[~np.isnan(column)]
        if len(valid) <= 15:
            # too few samples for a fit; the row stays NaN
            continue
        shape, loc, scale = gev.fit(-valid)
        levels = -gev.ppf(quantile_prob, shape, loc=loc, scale=scale)
        n_negative = sum(levels < 0)
        if n_negative == 0:
            retLev[point, :] = levels
    return retLev
def ProbapilityPlot(param, cdf, data, SignificanceLevel):
    """
    ===================================================================
        ProbapilityPlot(param, cdf, data, SignificanceLevel)
    ===================================================================
    Calculate the theoretical GEV quantiles for the given cdf values and
    an approximate confidence band around them.

    Still not finished: the standard error is the Gumbel expression and
    has to be replaced by the GEV equations. It is scaled by the scale
    parameter ``param[2]`` (it used to pick ``param[1]``, which is the
    location in the three-parameter list this function receives).

    Parameters
    ----------
    param : [list]
        distribution parameters [shape, loc, scale].
    cdf : [list]
        theoretical cdf calculated using weibul or using the
        distribution cdf function.
    data : [list/array]
        list of the values; only its length is used.
    SignificanceLevel : [float]
        value between 0 and 1.

    Returns
    -------
    Qth : [array]
        theoretical values, ``genextreme.ppf`` evaluated at ``cdf``.
    Qupper : [list]
        upper bound coresponding to the confidence interval.
    Qlower : [list]
        lower bound coresponding to the confidence interval.
    """
    shape, loc, scale = param[0], param[1], param[2]
    Qth = genextreme.ppf(cdf, c=shape, loc=loc, scale=scale)

    # Gumbel reduced variate for every plotting position.
    Y = [-np.log(-np.log(j)) for j in cdf]
    n_sqrt = np.sqrt(len(data))
    StdError = [
        (scale / n_sqrt) * np.sqrt(1.1087 + 0.5140 * j + 0.6079 * j**2)
        for j in Y
    ]

    # Two-sided normal quantile for the requested significance level.
    v = norm.ppf(1 - SignificanceLevel / 2)
    Qupper = [Qth[j] + v * StdError[j] for j in range(len(data))]
    Qlower = [Qth[j] - v * StdError[j] for j in range(len(data))]
    return Qth, Qupper, Qlower
def EstimaMagnitudes(self, Parametros):
    """Compute and print the magnitudes for a fixed set of return periods.

    For the return periods 1.000111, 2, 5, 10, 20 and 50 the quantile at
    ``1 - 1/TR`` is evaluated with ``genpareto`` when ``self.tipoSerie``
    is 'Parcial' and with ``genextreme`` when it is 'Anual'. For any
    other series type nothing is computed.

    Args:
        Parametros: Sequence read as (shape, loc, scale).

    Returns:
        list: The quantiles in return-period order (empty for an unknown
        series type).
    """
    series_type = self.tipoSerie
    if series_type == 'Parcial':
        distribution, template = genpareto, 'PARETO=> Magnitude: %.2f'
    elif series_type == 'Anual':
        distribution, template = genextreme, 'GEV=> Magnitude: %.2f'
    else:
        return []

    shape, loc, scale = Parametros[0], Parametros[1], Parametros[2]
    Quantis = []
    for TR in (1.000111, 2, 5, 10, 20, 50):
        Quantil = distribution.ppf(1 - (1 / TR), shape, loc=loc, scale=scale)
        Quantis.append(Quantil)
        print('Tempo de Retorno: %i ' % TR)
        print(template % (Quantil))
    return Quantis
def fitGEV(x, Tmax):
    '''
    Fit a GEV distribution to the data in x and evaluate its inverse CDF
    on 100000 return periods evenly spaced between 1 and Tmax.
    ---------------------------------------------------------------------
    Input:
        x: Pandas series of maxima
        Tmax: Maximum return period to evaluate the fitted distribution for
    ---------------------------------------------------------------------
    Returns:
        gev_fit: Tuple of GEV fit parameters
        gev_inv: Inverse of CDF for each T
    '''
    return_periods = np.linspace(1, Tmax, 100000)
    exceedance = 1 / return_periods

    # the fit starts from a shape parameter of 0
    shape_guess = 0
    gev_fit = genextreme.fit(x, shape_guess)

    shape, loc, scale = gev_fit[0], gev_fit[1], gev_fit[2]
    gev_inv = genextreme.ppf(1 - exceedance, shape, loc, scale)
    return gev_fit, gev_inv
def sim_data(n: int, j: int, theta: np.array) -> dict:
    """Takes input values n and j to specify the shape of the output data.
    The k dimension is inferred from the size of theta. Creates a y column
    vector that are the choice that maximises utility, and a x matrix that
    are the covariates (a constant plus draws from a random normal
    distribution).

    Args:
        n (int): Number of households.
        j (int): Number of choices.
        theta (np.array): The true value of the coefficients, indexed as
            ``theta[:, choice - 1]``.

    Returns:
        dict: Returns a dict with keys "y" and "x".
    """
    k = int(theta.size / (j - 1))
    const = np.ones((n, 1))
    x0 = rng.normal(size=(n, k - 1))
    x = np.hstack((const, x0))

    # The v matrix has shape (n, j) and starts out filled with zeros. The
    # first column belongs to the reference category and is left at zero;
    # every other column is x times the matching column of theta.
    # (This part was left as "FILL IN", so v was undefined below.)
    v = np.zeros((n, j))
    for i in range(1, j):
        v[:, i] = x @ theta[:, i - 1]

    e = genextreme.ppf(rng.uniform(size=(n, j)), c=0)
    u = v + e

    # Find which choice that maximises value.
    u_index = u.argmax(axis=1)
    label = ['y', 'x']
    return dict(zip(label, [u_index, x]))
def StatisticalProperties(self, PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False, Distibution="GEV",
                          EstimateParameters=False, Quartile=0,
                          RIMResults=False, SignificanceLevel=0.1):
    """
    =============================================================================
      StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod, SavePlots,
                            SavePath, SeparateFiles=False, Filter=False,
                            Distibution="GEV", EstimateParameters=False,
                            Quartile=0, RIMResults=False, SignificanceLevel=0.1)
    =============================================================================

    StatisticalProperties method reads the SWIM output file (.dat file) that
    contains the time series of discharge for some computational nodes
    and calculate some statistical properties.

    The code assumes that the time series are of a daily temporal resolution,
    and that the hydrological year is 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

    Parameters
    ----------
        1-PathNodes : [String]
            the name of the file which contains the ID of the computational
            nodes you want to do the statistical analysis for, the ObservedFile
            should contain the discharge time series of these nodes in order.
        2-PathTS : [String]
            the name of the SWIM result file (the .dat file), or the folder
            holding one ``<node id>.txt`` file per node when SeparateFiles
            is True.
        3-StartDate : [string]
            the begining date of the time series, format "%Y-%m-%d".
        4-WarmUpPeriod : [integer]
            the number of days you want to neglect at the begining of the
            Simulation (warm up period).
        5-SavePlots : [Bool]
            if True, the pdf/cdf figure and the theoretical-vs-observed
            figure of every node are saved under ``SavePath/Figures``.
        6-SavePath : [String]
            the path where you want to save the statistical properties.
        7-SeparateFiles: [Bool]
            if the discharge data are stored in separate files not all in one file
            SeparateFiles should be True, default [False].
        8-Filter: [Bool/value]
            for observed or RIMresult data it has gaps of times where the
            model did not run or gaps in the observed data. If these gap days
            are filled with a specific value and you want to ignore it, give
            Filter = that value; annual maxima equal to it are dropped.
        9-Distibution: [String]
            "GEV" fits scipy's genextreme, anything else fits gumbel_r.
        10-EstimateParameters: [Bool]
            if True the parameters are estimated with
            Gumbel.EstimateParameter instead of scipy's ``fit``
            (not implemented for GEV).
        11-Quartile: [float]
            quantile of the annual maxima used as threshold when
            EstimateParameters is True.
        12-RIMResults: [Bool]
            If the files are results form RIM or observed, as the format
            differes between the two. default [False]
        13-SignificanceLevel: [float]
            passed to the ProbapilityPlot helper and used in the labels of
            the confidence limits.

    Returns
    -------
        1-Statistical Properties.csv:
            file containing some statistical properties like mean, std, min, 5%, 25%,
            median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50,
            q100, q200, q500, q1000. The same table is stored in
            self.StatisticalPr.
        2-DistributionProperties.csv:
            file containing the fitted distribution parameters of every node.
            The same table is stored in self.DistributionPr.
    """
    ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)

    # hydrographs
    if SeparateFiles:
        TS = pd.DataFrame()
        if RIMResults:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = self.ReadRIMResult(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')
        else:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')

        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        ind = pd.date_range(StartDate, EndDate)
        TS.index = ind
    else:
        TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        TS.index = pd.date_range(StartDate, EndDate, freq="D")
        # delete the first two columns
        del TS[0], TS[1]
        TS.columns = ComputationalNodes

    # neglect the warm up period at the start of the time series
    TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

    # List of the table output, including some general data and the return periods.
    col_csv = [
        'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
        't_beg', 't_end', 'nyr'
    ]
    rp_name = [
        'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500',
        'q1000'
    ]
    col_csv = col_csv + rp_name

    # Declare a dataframe for the output file, with as index the gauge numbers
    # and as columns all the output names.
    StatisticalPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                 columns=col_csv)
    StatisticalPr.index.name = 'ID'

    DistributionPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                  columns=['loc', 'scale'])
    DistributionPr.index.name = 'ID'

    # Required return periods, one per entry of rp_name. The list used to
    # contain 50 twice, which shifted every quantile from q100 onwards to
    # the value of the previous return period.
    T = np.array([1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000])
    # Non Exceedance probability (F) of the chosen return periods
    F = 1 - (1 / T)

    # Iteration over all the gauge numbers.
    for i in ComputationalNodes:
        QTS = TS.loc[:, i]
        # The time series is resampled to the annual maxima, and turned into a
        # numpy array.
        # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
        amax = QTS.resample('A-OCT').max().values

        if type(Filter) != bool:
            amax = amax[amax != Filter]

        if EstimateParameters:
            # estimate the parameters through an optimization
            threshold = np.quantile(amax, Quartile)
            if Distibution == "GEV":
                # This path only printed a message and then failed further
                # down on the unset parameters; fail with a clear error.
                raise NotImplementedError(
                    "Parameter estimation by optimization is not "
                    "implemented for the GEV distribution yet")
            else:
                param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                                 threshold)
                param_dist = [param[1], param[2]]
        else:
            # estimate the parameters through a maximum likelihood method
            if Distibution == "GEV":
                param_dist = genextreme.fit(amax)
            else:
                # A gumbel distribution is fitted to the annual maxima
                param_dist = gumbel_r.fit(amax)

        if Distibution == "GEV":
            DistributionPr.loc[i, 'c'] = param_dist[0]
            DistributionPr.loc[i, 'loc'] = param_dist[1]
            DistributionPr.loc[i, 'scale'] = param_dist[2]
        else:
            DistributionPr.loc[i, 'loc'] = param_dist[0]
            DistributionPr.loc[i, 'scale'] = param_dist[1]

        # get the Discharge coresponding to the return periods
        if Distibution == "GEV":
            Qrp = genextreme.ppf(F, param_dist[0], loc=param_dist[1],
                                 scale=param_dist[2])
        else:
            Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])

        # to get the Non Exceedance probability for a specific Value
        # sort the amax
        amax.sort()
        # plotting positions of the sorted maxima (weibul)
        cdf_Weibul = ST.Weibul(amax)

        # ProbapilityPlot calculates the theoretical values based on the
        # distribution parameters and the weibul cdf, and the confidence
        # interval
        if Distibution == "GEV":
            Qth, Qupper, Qlower = GEV.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            # to calculate the F theoretical
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            # loc is param_dist[1]; it used to be passed param_dist[2]
            pdf_fitted = genextreme.pdf(Qx, param_dist[0],
                                        loc=param_dist[1],
                                        scale=param_dist[2])
            cdf_fitted = genextreme.cdf(Qx, param_dist[0],
                                        loc=param_dist[1],
                                        scale=param_dist[2])
        else:
            Qth, Qupper, Qlower = Gumbel.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            # to calculate the F theoretical
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            pdf_fitted = gumbel_r.pdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])
            cdf_fitted = gumbel_r.cdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])

        if SavePlots:
            fig = plt.figure(60, figsize=(20, 10))
            gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
            # Plot the histogram and the fitted distribution, save it for
            # each gauge.
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.plot(Qx, pdf_fitted, 'r-')
            ax1.hist(amax, density=True)
            ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax1.set_ylabel('pdf', fontsize=15)

            ax2 = fig.add_subplot(gs[0, 1])
            ax2.plot(Qx, cdf_fitted, 'r-')
            ax2.plot(amax, cdf_Weibul, '.-')
            ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax2.set_ylabel('cdf', fontsize=15)

            plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                        format='png')
            plt.close()

            fig = plt.figure(70, figsize=(10, 8))
            plt.plot(Qth, amax, 'd', color='#606060', markersize=12,
                     label='Gumbel Distribution')
            plt.plot(Qth, Qth, '^-.', color="#3D59AB",
                     label="Weibul plotting position")
            # the confidence limits are only drawn for the Gumbel fit
            if Distibution != "GEV":
                plt.plot(Qth, Qlower, '*--', color="#DC143C", markersize=12,
                         label='Lower limit (' + str(int(
                             (1 - SignificanceLevel) * 100)) + " % CI)")
                plt.plot(Qth, Qupper, '*--', color="#DC143C", markersize=12,
                         label='Upper limit (' + str(int(
                             (1 - SignificanceLevel) * 100)) + " % CI)")

            plt.legend(fontsize=15, framealpha=1)
            plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
            plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
            plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                        format='png')
            plt.close()

        StatisticalPr.loc[i, 'mean'] = QTS.mean()
        StatisticalPr.loc[i, 'std'] = QTS.std()
        StatisticalPr.loc[i, 'min'] = QTS.min()
        StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
        StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
        StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
        StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
        StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
        StatisticalPr.loc[i, 'max'] = QTS.max()
        StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
        StatisticalPr.loc[i, 't_end'] = QTS.index.max()
        StatisticalPr.loc[i, 'nyr'] = (
            StatisticalPr.loc[i, 't_end'] -
            StatisticalPr.loc[i, 't_beg']).days / 365.25

        for irp, irp_name in zip(Qrp, rp_name):
            StatisticalPr.loc[i, irp_name] = irp

        # Print for prompt and check progress.
        print("Gauge", i, "done.")

    # Output files
    StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
    self.StatisticalPr = StatisticalPr
    DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
    self.DistributionPr = DistributionPr
def Plot_Fit_QQ(data_fit, vn, xds_GEV_Par, kma_fit, color='black', gs_1=1, gs_2=1, n_clusters=1, show=True):
    """Plot QQ (empirical vs GEV) panels for variable vn, one per cluster.

    For every cluster 1..n_clusters the NaN-free samples of ``vn`` whose
    ``kma_fit.bmus`` equals the cluster number are sorted and compared
    with the GEV quantiles at the plotting positions i / (n + 1). The GEV
    parameters are read from ``xds_GEV_Par[vn]`` and the shape is negated
    before it is handed to ``genextreme``.

    The panels are laid out on a gs_1 x gs_2 grid. The figure is shown
    when ``show`` is True and always returned.
    """
    figure = plt.figure(figsize=(_fsize * gs_2 / 2, _fsize * gs_1 / 2.3))
    grid = gridspec.GridSpec(gs_1, gs_2)

    for panel in range(n_clusters):
        wt = panel + 1

        # samples of this cluster, NaNs removed
        rows = np.where(kma_fit.bmus == wt)[0]
        samples = data_fit[vn].values[:][rows]
        samples = samples[~np.isnan(samples)]
        n_samples = len(samples)

        # empirical quantiles and their plotting positions
        Q_emp = np.sort(samples)
        ranks = np.linspace(1, n_samples, n_samples)
        pp = ranks / (n_samples + 1)

        # TODO: problem if gumbell?
        # GEV parameters of this cluster
        pars_GEV = xds_GEV_Par[vn]

        def cluster_value(name):
            return pars_GEV.sel(parameter=name).sel(n_cluster=wt).values

        sha = cluster_value('shape')
        sca = cluster_value('scale')
        loc = cluster_value('location')

        # theoretical GEV quantiles
        Q_gev = genextreme.ppf(pp, -1 * sha, loc, sca)

        # scatter plot against the 1:1 line
        ax = figure.add_subplot(grid[panel])
        ax.plot(Q_emp, Q_gev, 'ok', color=color,
                label='N = {0}'.format(n_samples))
        ax.plot([0, 1], [0, 1], '--b', transform=ax.transAxes)

        # customize axis
        ax.set_title('WT: {0}'.format(wt))
        ax.axis('equal')
        ax.set_ylabel('GEV')
        ax.legend(prop={'size': 8})

    # show and return figure
    if show:
        plt.show()
    return figure
def Plot_FitSim_GevFit(data_fit, data_sim, vn, xds_GEV_Par, kma_fit, n_bins=30, color_1='white', color_2='skyblue', alpha_1=0.7, alpha_2=0.4, label_1='Historical', label_2='Simulation', gs_1=1, gs_2=1, n_clusters=1, vlim=1, show=True):
    """Plot fit vs sim histograms with the GEV fit, one panel per cluster.

    For every cluster 1..n_clusters the values of ``vn`` are selected
    from ``data_fit`` (rows where ``kma_fit.bmus`` equals the cluster)
    and from ``data_sim`` (rows where ``data_sim.DWT`` equals the
    cluster) and drawn with ``axplot_compare_histograms``. The GEV pdf
    built from ``xds_GEV_Par[vn]`` (shape negated for ``genextreme``) is
    overlaid from its 0.001 quantile up to ``vlim``.

    The panels are laid out on a gs_1 x gs_2 grid. The figure is shown
    when ``show`` is True and always returned.
    """
    figure = plt.figure(figsize=(_fsize * gs_2 / 2, _fsize * gs_1 / 2.3))
    grid = gridspec.GridSpec(gs_1, gs_2)

    for panel in range(n_clusters):
        wt = panel + 1

        # historical and simulated samples of this cluster
        hist_rows = np.where(kma_fit.bmus == wt)[0]
        sim_rows = np.where(data_sim.DWT == wt)[0]
        hist_values = data_fit[vn].values[:][hist_rows]
        sim_values = data_sim[vn].values[:][sim_rows]

        # TODO: problem if gumbell?
        # GEV parameters of this cluster
        pars_GEV = xds_GEV_Par[vn]

        def cluster_value(name):
            return pars_GEV.sel(parameter=name).sel(n_cluster=wt).values

        sha = cluster_value('shape')
        sca = cluster_value('scale')
        loc = cluster_value('location')

        # compare histograms
        ax = figure.add_subplot(grid[panel])
        axplot_compare_histograms(
            ax, hist_values, sim_values,
            ttl='WT: {0}'.format(wt),
            density=True,
            n_bins=n_bins,
            color_1=color_1, color_2=color_2,
            alpha_1=alpha_1, alpha_2=alpha_2,
            label_1=label_1, label_2=label_2,
        )

        # add gev fit
        x_start = genextreme.ppf(0.001, -1 * sha, loc, sca)
        x = np.linspace(x_start, vlim, 100)
        ax.plot(x, genextreme.pdf(x, -1 * sha, loc, sca), label='GEV fit')

        # customize axis
        ax.legend(prop={'size': 8})

    # show and return figure
    if show:
        plt.show()
    return figure
#========================================
# Compare the quantile estimates of three fitting methods with the
# quantiles of a reference GEV distribution.
c = controle('Banco_Hidro2')
prob = [0.001, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.999]
metodo = ['MML', 'MOM', 'MVS']
tamanhoAmostra = 100

# one result set per estimation method
aux = [
    c.prepGra(metodo=m, tamanhoAmostra=tamanhoAmostra, probabilidade=prob)
    for m in metodo
]

# reference quantiles for every probability
dadosExt = [
    gev.ppf(p, -0.168462, 6286.926278, 1819.961392) for p in prob
]

e = est.estatistica()

# accu[method][probability]: reference quantile minus the value returned
# by calculoAccu for that method and probability
accu = [
    [
        dadosExt[d] - e.calculoAccu(dadosAmostra=aux[me][d],
                                    dadoSintetico=dadosExt[d])
        for d in range(9)
    ]
    for me in range(3)
]
print(accu)

ax1 = plt.subplot(221)
ax2 = plt.subplot(223)
ax3 = plt.subplot(122)

c.plotGraf2(axes=ax1, accu=accu[1], dadosExt=dadosExt, yd=prob,
            quan=aux[1], metodo='MOM', tamanhoAmostra=tamanhoAmostra,
            prob=prob)
c.plotGraf2(axes=ax2, accu=accu[2], dadosExt=dadosExt, yd=prob,
            quan=aux[2], metodo='MVS', tamanhoAmostra=tamanhoAmostra,
            prob=prob)
def calc_ocean_parameter(FP_MANAGER, fp, datasource, recalc=False): """ http://www.jamesphoughton.com/2013/08/making-gif-animations-with-matplotlib.html """ print "calcOceanStatistics function start" db_ocean = DB_connector("default") # chembl cursor = db_ocean.cursor ds = DataSources.objects.get(name=datasource.name) if recalc: print "delete rnd set items for fp",fp Rnd_set_comparison.objects.all().filter(fp=fp).filter(datasource=ds).delete() print "done" print "delete parameter entries for fp",fp FP_Parameter.objects.all().filter(fp_id=fp).filter(datasource=ds).delete() print "done" if not recalc and Rnd_set_comparison.objects.all().filter(fp=fp).filter(datasource=ds).count()==0: return "no entries for fp %d, try ?recalc=True" % fp repeats = settings.CALC_OCEAN_PARAMETER_REPEATS start = settings.CALC_OCEAN_PARAMETER_START end = settings.CALC_OCEAN_PARAMETER_END steps = settings.CALC_OCEAN_PARAMETER_STEPS thresh_start = settings.CALC_OCEAN_PARAMETER_THRESH_START thresh_end = settings.CALC_OCEAN_PARAMETER_THRESH_END thresh_steps = settings.CALC_OCEAN_PARAMETER_THRESH_STEPS animatedGif = True try: from PIL import Image from images2gif import writeGif except: print >> sys.stderr, "Couldn't import Image from PIL or writeGif from images2gif, so plotting is deactivated now" animatedGif = False plotting = True try: import matplotlib.pyplot as plt except: plotting = True animatedGif = False processes = settings.PARALLEL_PROCESSES if recalc: walker = Pool(processes=processes) thresh_list = np.arange(thresh_start,thresh_end,thresh_steps) molecule_ids = np.asarray(FP_MANAGER[datasource][fp].keys()) ds = DataSources.objects.get(name=datasource.name) for runde in range(repeats): if not recalc: continue print "runde %d" % runde result = {} rand_lists1 = createRandLists(start,end,steps,molecule_ids) rand_lists2 = createRandLists(start,end,steps,molecule_ids) tasks = [([FP_MANAGER[datasource][fp].get(x1) for x1 in rand_lists1[i]],[FP_MANAGER[datasource][fp].get(x2) for x2 in 
rand_lists2[i]]) for i in range(len(rand_lists2))] if processes>1: np.random.shuffle(tasks) result2 = {} for data_entry in walker.imap_unordered(get_tc_list_para,tasks,20): result2[data_entry[0]] = data_entry[1] print "addet %d of %d" % (len(result2),len(tasks)) else: result2 = {} while (len(tasks)>0): task = tasks.pop() score = get_tc_list_para(task) result2[score[0]] = score[1] print "addet %d of %d" % (len(result2),len(tasks)) print "create %d Result-Objects for DB-Table rnd_set_comparison" % (len(thresh_list) * len(result2)) with transaction.atomic(): buffer = [] for threshold in thresh_list: for key,value in result2.iteritems(): raw_score = np.sum(value[value>=threshold]) item = (key**2,fp,threshold,raw_score) buffer.append(item) print "created %d buffered items" % len(buffer) for w,x,y,z in buffer: obj = Rnd_set_comparison(setsize=w,fp=x,threshold=y,rawscore=z,datasource=ds) obj.save() figures = [] data_cache = {} min_mean = None max_mean = None min_stddev = None max_stddev = None for threshold in thresh_list: if db_ocean.db_type=='postgre': query = "select setsize,threshold, round(stddev_pop(rawscore)::numeric,2) as stddev_pop,round(avg(rawscore)::numeric,2) as mean from ocean_rnd_set_comparison where fp=%d and threshold=%f and datasource_id=%d group by setsize,threshold order by setsize" % (fp,threshold,ds.id) else: query = "select setsize,threshold,round(stddev(rawscore),2) as stddev,round(avg(rawscore),2) as mean from ocean_rnd_set_comparison where fp=%d and threshold=%f and datasource_id=%d group by setsize,threshold order by setsize" % ( fp, threshold, ds.id) cursor.execute(query) x_data = [] stddev_data = [] mean_data = [] for result in cursor.fetchall(): x_data.append(float(result[0])) mean_data.append(float(result[3])) stddev_data.append(float(result[2])) if min_mean is None: if len(mean_data) > 0: min_mean,max_mean = min(mean_data),max(mean_data) if len(stddev_data) > 0: min_stddev,max_stddev = min(stddev_data),max(stddev_data) else: if 
len(mean_data) > 0: min_mean, max_mean = min([min_mean,min(mean_data)]), max([max_mean,max(mean_data)]) if len(stddev_data) > 0: min_stddev, max_stddev = min([min_stddev, min(stddev_data)]), max([max_stddev, max(stddev_data)]) data_cache[threshold] = (x_data,mean_data,stddev_data) skip_3_to_6 = True for threshold in thresh_list: x_data,mean_data,stddev_data = data_cache[threshold] if len(x_data) == 0 or len(mean_data)==0 or len(stddev_data)==0: continue if plotting: plt.clf() if plotting: if skip_3_to_6: fig,(r0,r1,r2,r6) = plt.subplots(nrows=4,figsize=(12,14)) else: fig,(r0,r1,r2,r3,r4,r5,r6) = plt.subplots(nrows=7,figsize=(6,14)) raw_mean_func = Calculator.getRawScoreExpFunction(x_data,mean_data) print "\nmean function for threshold: %f is [%s]" % (threshold,raw_mean_func.func_name) exp_mean_data = [raw_mean_func(en) for en in x_data] if plotting: r0.plot(np.array(x_data),np.array(mean_data),linewidth=1.0) r0.plot(x_data,exp_mean_data,alpha=0.5,linewidth=2.5) r0.set_title("Mean, Threshold: %.2f" % threshold) r0.set_ylim((min_mean,max_mean)) r1.set_ylim((min_stddev,max_stddev)) r2.set_xlim((-1,1.5)) r2.set_ylim((0,2.5)) new_std_function = Calculator.getRawScoreStdDevExpFunction(x_data,stddev_data) print "stddev function for threshold: %f is [%s]" % (threshold,new_std_function.func_name) newdata2 = new_std_function(x_data) if plotting: r1.plot(x_data,stddev_data) r1.plot(x_data, newdata2, alpha=0.8, linewidth=2.0) r1.set_title("StdDev") z_Scores = Calculator.getZScores(x_data,mean_data,raw_mean_func,new_std_function) histo_bins = 50 counts,bin_edges = np.histogram(z_Scores,histo_bins,normed=True) bin_centres = (bin_edges[:-1] + bin_edges[1:])/2. 
if plotting: n,bins,patches = r2.hist(z_Scores,bins=histo_bins,normed=True,alpha=0.5) r2.set_title("z-Scores") e_val_function = Calculator.getZScoreDistExpFunction(z_Scores) e_val_data_x = np.linspace(min(z_Scores),max(z_Scores),num=500) e_val_data = [e_val_function(entry) for entry in e_val_data_x] if plotting: if not skip_3_to_6: r3.plot(e_val_data_x,e_val_data,alpha=0.5) c=-0.1 for c in [-0.05]: x_ls = np.linspace(ge.ppf(0.01,c),ge.ppf(0.99,c),100) if plotting: if not skip_3_to_6: r4.plot(x_ls,ge.pdf(x_ls,c),linewidth=1.6-c*4) (shape_evd,loc_evd,scale_evd) = ge.fit(z_Scores) loc_norm,scale_norm = norm.fit(z_Scores) x = ge.pdf(bin_centres,shape_evd,loc=loc_evd,scale=scale_evd) if plotting: evd_plot, = r2.plot(bin_centres,x,'b',color='black',label='Extreme Value Distribution') ndist = norm.pdf(bin_centres,loc=loc_norm,scale=scale_norm) if plotting: norm_plot, = r2.plot(bin_centres,ndist,'b',color="red",label='Normal Distribution') r2.legend([evd_plot,norm_plot],['Extreme Value Distribution','Normal Distribution'],loc=1) def getDecNpArray(value): return np.asarray(value).astype(float) expected_evd = getDecNpArray(x) expected_norm = getDecNpArray(ndist) observed = getDecNpArray(counts) def normalizedChisquare(observed,expected): if len(observed) != len(expected): raise Exception("len of observed and expected has to be the same") zipped = zip(observed,expected) fun = lambda input: ((input[0]-input[1])**2 / (input[0]+input[1])) result = sum(map(fun,zipped)) return result chisq_mean = normalizedChisquare(observed,expected_norm) chisq_evd = normalizedChisquare(observed,expected_evd) print "chisquare_norm",chisq_mean print "chisquare_evd",chisq_evd #django doesn't like inf or -inf in float-fields of oracle database, so we change it.. 
if isinf(chisq_mean) or isnan(chisq_mean): print "chisquare_norm seems to be inf or nan (%s), change to -1.0" % str(chisq_mean) chisq_mean = -1.0 if isinf(chisq_evd) or isnan(chisq_evd): print "chisquare_evd seems to be inf or nan (%s), change to -1.0" % str(chisq_evd) chisq_evd = -1.0 if plotting: if not skip_3_to_6: n,bins,patches = r5.hist(z_Scores,bins=histo_bins,normed=True,alpha=0.75)#,bins=20) if not skip_3_to_6: import matplotlib.mlab as mlab y = mlab.normpdf(bins,loc_evd,scale_evd) fp_parameter = FP_Parameter(fp_id=fp, threshold=threshold, formula_raw_mean=raw_mean_func.func_name, formula_raw_stddev=new_std_function.func_name, chisquare_mean=chisq_mean, chisquare_evd=chisq_evd, datasource=ds) fp_parameter.save() if plotting: if not skip_3_to_6: r5.plot(bins,y) if threshold==thresh_list[-1]: #this is last round print "last round" query = "select threshold,chisquare_mean,chisquare_evd from ocean_fp_parameter where fp_id=%d and datasource_id=%d order by threshold" % (fp,ds.id) cursor.execute(query) data_chi2_mean = [] data_chi2_evd = [] x_chidata = [] for val in cursor.fetchall(): x_chidata.append(float(val[0])) data_chi2_mean.append(float(val[1])) data_chi2_evd.append(float(val[2])) print x_chidata,data_chi2_mean,data_chi2_evd if plotting: if not skip_3_to_6: r6.plot(x_chidata,data_chi2_mean,'o') if not skip_3_to_6: r6.plot(x_chidata,data_chi2_evd,'.') chi2_mean, = r6.plot(x_chidata,data_chi2_mean,'o') chi2_evd, = r6.plot(x_chidata,data_chi2_evd,'.') r6.legend([chi2_mean,chi2_evd],['ChiSquare Normal Distribution','ChiSquare Extreme Value Distribution'],loc=1) def fitfunc(p,x): if p[0]==0: return np.exp(-np.exp(-x))*np.exp(-x) else: print p[0],type(x) return np.exp(-(1-p[0]*x)**(1/p[0]))*(1-p[0]*x)**(1/p[0]-1) errfunc = lambda p,x,y: (y-fitfunc(p,x)) init = [0.2] bins = bins[:-1] bins = np.array(bins) n = np.array(n) if plotting: plt.tight_layout() filename = "%f.png" % threshold plt.savefig(filename) figures.append(filename) if animatedGif: file_names = 
figures print "d",file_names images = [Image.open(fn) for fn in file_names] writeGif("animation_mean_stddev.gif",images,duration=0.5) for image in images: image.close()
import numpy as np
from scipy.stats import genextreme
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:
c = -0.1
mean, var, skew, kurt = genextreme.stats(c, moments='mvsk')

# Display the probability density function (``pdf``) between the 1% and
# 99% quantiles:
x = np.linspace(genextreme.ppf(0.01, c), genextreme.ppf(0.99, c), 100)
ax.plot(x, genextreme.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genextreme pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = genextreme(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``: the cdf of the quantiles has to
# give back the probabilities they were computed from.
vals = genextreme.ppf([0.001, 0.5, 0.999], c)
assert np.allclose([0.001, 0.5, 0.999], genextreme.cdf(vals, c))

# Generate random numbers:
] print(' '.join(cdo_cmd)) ret = subprocess.call(cdo_cmd) if not ret == 0: raise Exception('Error with cdo command') with Dataset(maxfile, 'r') as f: fulldata[i, :] = f.variables['IRFroutedRunoff'][0, :] # Calclate GEV fit and return periods for each river segment retperiod_q = np.zeros([len(percentiles), nreaches]) minshape = -0.3 for reach in range(nreaches): qvals = fulldata[:, reach] try: c, loc, scale = genextreme.fit(qvals, -0.01) tmp = genextreme.ppf(percentiles / 100., c, loc, scale) # if min(tmp)<0.: # print('Warning, trying negative shape') # c,loc,scale = genextreme.fit(qvals,-0.01) # tmp = genextreme.ppf(percentiles/100.,c,loc,scale) except Exception as e: print('error fitting', reach, qvals) # try with different shape parameter guess c, loc, scale = genextreme.fit(qvals, 0.0) tmp = genextreme.ppf(percentiles / 100., c, loc, scale) retperiod_q[:, reach] = tmp if tmp.min() < 0 or tmp.max() > 5 * qvals.max(): #print('debug: reach,fit',reach,c,loc,scale) #print('qvals',qvals.min(),np.median(qvals),qvals.max()) #print('fitted vals',tmp) c, loc, scale = genextreme.fit(qvals, f0=minshape)