def plotDiagnostics(data, mu, xi, sigma, figfile):
    """
    Create a 4-panel diagnostics plot of the fitted distribution.

    The panels are a probability (P-P) plot, a quantile (Q-Q) plot, a
    return level plot and a density plot. The probability, quantile and
    density panels use only the values that exceed the threshold ``mu``.

    :param data: :class:`numpy.ndarray` of observed data values (in units
                 of metres/second).
    :param float mu: Selected threshold value.
    :param float xi: Fitted shape parameter.
    :param float sigma: Fitted scale parameter.
    :param str figfile: Path to store the file (includes image format)

    """
    LOG.info("Plotting diagnostics")
    fig, ax = plt.subplots(2, 2)
    axes = ax.flatten()

    sorteddata = np.sort(data)
    sortedmax = np.sort(data[data > mu])
    gpdf = fittedPDF(data, mu, xi, sigma)

    # Probability plot. The model probabilities must come from the fitted
    # GPD; a default ProbPlot would compare the exceedances against a
    # normal distribution instead. The probabilities are computed
    # explicitly so the panel does not depend on how ProbPlot treats
    # loc/scale. Empirical positions are i / (n + 1).
    nexc = len(sortedmax)
    empprob = np.arange(1, nexc + 1) / float(nexc + 1)
    modelprob = genpareto.cdf(sortedmax, xi, loc=mu, scale=sigma)
    axes[0].plot(empprob, modelprob, 'bo')
    axes[0].plot([0, 1], [0, 1], 'r-')
    axes[0].set_xlim(0, 1)
    axes[0].set_ylim(0, 1)
    axes[0].set_xlabel("Empirical")
    axes[0].set_ylabel("Model")
    axes[0].set_title("Probability plot")

    # Quantile plot of the exceedances against the fitted GPD.
    prplot = sm.ProbPlot(sortedmax, genpareto, distargs=(xi,),
                         loc=mu, scale=sigma)
    prplot.qqplot(xlabel="Model", ylabel="Empirical", ax=axes[1], line='45')
    axes[1].set_title("Quantile plot")

    # Return level plot: fitted curve plus empirical return periods.
    ax2 = axes[2]
    rp = np.array([1, 2, 5, 10, 20, 50, 100, 200,
                   500, 1000, 2000, 5000, 10000])
    rate = float(len(sortedmax)) / float(len(data))
    rval = returnLevels(rp, mu, xi, sigma, rate)

    emprp = empiricalReturnPeriod(sorteddata)
    ax2.semilogx(rp, rval, label="Fitted RP curve", color='r')
    # Only empirical points with a return period above 1 are shown.
    ax2.scatter(emprp[emprp > 1], sorteddata[emprp > 1],
                color='b', label="Empirical RP", s=100)
    ax2.legend(loc=2)
    ax2.set_xlabel("Return period")
    ax2.set_ylabel("Return level")
    ax2.set_title("Return level plot")
    ax2.grid(True)

    # Density plot: histogram of exceedances with the fitted PDF overlaid.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the rendered panel is unchanged.
    maxbin = 4 * np.ceil(np.floor(data.max() / 4) + 1)
    sns.distplot(sortedmax, bins=np.arange(mu, maxbin, 2),
                 hist=True, axlabel='Wind speed (m/s)',
                 ax=axes[3])
    axes[3].plot(sortedmax, gpdf, color='r')
    axes[3].set_title("Density plot")

    plt.tight_layout()
    # Save and close this figure explicitly rather than whichever figure
    # happens to be current.
    fig.savefig(figfile)
    plt.close(fig)
def plotFit(data, mu, xi, sigma, title, figfile):
    """
    Plot a fitted distribution, with approximate 95% confidence interval
    and empirical return period values.

    :param data: :class:`numpy.ndarray` of observed data values.
    :param float mu: Selected threshold value.
    :param float xi: Fitted shape parameter.
    :param float sigma: Fitted scale parameter.
    :param str title: Title string for the plot.
    :param str figfile: Path to store the file (includes image format)

    """
    LOG.info("Plotting fitted return period curve")

    rp = np.array([1, 2, 5, 10, 20, 50, 100, 200,
                   500, 1000, 2000, 5000, 10000])
    rate = float(len(data[data > mu])) / float(len(data))
    rval = returnLevels(rp, mu, xi, sigma, rate)

    emprp = empiricalReturnPeriod(data)
    err = returnPeriodUncertainty(data, mu, xi, sigma, rp)

    sortedmax = np.sort(data)

    # +/- 1.96 standard errors is the two-sided 95% normal interval, so
    # the legend entry is labelled 95% to match what is actually drawn.
    halfwidth = 1.96 * err

    fig, ax1 = plt.subplots(1, 1, figsize=(12, 12))
    ax1.semilogx(rp, rval, label="Fitted RP curve")
    ax1.semilogx(rp, rval + halfwidth, label="95% CI",
                 linestyle='--', color='0.5')
    ax1.semilogx(rp, rval - halfwidth, linestyle='--', color='0.5')

    # Only empirical points with a return period above 1 are shown.
    ax1.scatter(emprp[emprp > 1], sortedmax[emprp > 1], s=100,
                color='r', label="Empirical RP")

    title_str = (title + "\n" +
                 r"$\mu$ = {0:.2f}, $\xi$ = {1:.5f}, $\sigma$ = {2:.4f}".
                 format(mu, xi, sigma))
    ax1.set_title(title_str)
    ax1.legend(loc=2)
    ax1.set_ylim((0, 100))
    ax1.set_xlim((1, 10000))
    ax1.set_ylabel('Wind speed (m/s)')
    ax1.set_xlabel('Return period (years)')
    ax1.grid(which='major')
    ax1.grid(which='minor', linestyle='--', linewidth=1)

    # Save and close this figure explicitly rather than whichever figure
    # happens to be current.
    fig.savefig(figfile)
    plt.close(fig)
def plotDiagnostics(data, mu, xi, sigma, figfile):
    """
    Create a 4-panel diagnostics plot of the fitted distribution.

    The panels are a probability (P-P) plot, a quantile (Q-Q) plot, a
    return level plot and a density plot. The probability, quantile and
    density panels use only the values that exceed the threshold ``mu``.

    :param data: :class:`numpy.ndarray` of observed data values (in units
                 of metres/second).
    :param float mu: Selected threshold value.
    :param float xi: Fitted shape parameter.
    :param float sigma: Fitted scale parameter.
    :param str figfile: Path to store the file (includes image format)

    """
    LOG.info("Plotting diagnostics")
    fig, ax = plt.subplots(2, 2)
    axes = ax.flatten()

    sorteddata = np.sort(data)
    sortedmax = np.sort(data[data > mu])
    gpdf = fittedPDF(data, mu, xi, sigma)

    # Probability plot. The model probabilities must come from the fitted
    # GPD; a default ProbPlot would compare the exceedances against a
    # normal distribution instead. The probabilities are computed
    # explicitly so the panel does not depend on how ProbPlot treats
    # loc/scale. Empirical positions are i / (n + 1).
    nexc = len(sortedmax)
    empprob = np.arange(1, nexc + 1) / float(nexc + 1)
    modelprob = genpareto.cdf(sortedmax, xi, loc=mu, scale=sigma)
    axes[0].plot(empprob, modelprob, 'bo')
    axes[0].plot([0, 1], [0, 1], 'r-')
    axes[0].set_xlim(0, 1)
    axes[0].set_ylim(0, 1)
    axes[0].set_xlabel("Empirical")
    axes[0].set_ylabel("Model")
    axes[0].set_title("Probability plot")

    # Quantile plot of the exceedances against the fitted GPD.
    prplot = sm.ProbPlot(sortedmax, genpareto, distargs=(xi,),
                         loc=mu, scale=sigma)
    prplot.qqplot(xlabel="Model", ylabel="Empirical", ax=axes[1], line='45')
    axes[1].set_title("Quantile plot")

    # Return level plot: fitted curve plus empirical return periods.
    ax2 = axes[2]
    rp = np.array([1, 2, 5, 10, 20, 50, 100, 200,
                   500, 1000, 2000, 5000, 10000])
    rate = float(len(sortedmax)) / float(len(data))
    rval = returnLevels(rp, mu, xi, sigma, rate)

    emprp = empiricalReturnPeriod(sorteddata)
    ax2.semilogx(rp, rval, label="Fitted RP curve", color='r')
    # Only empirical points with a return period above 1 are shown.
    ax2.scatter(emprp[emprp > 1], sorteddata[emprp > 1],
                color='b', label="Empirical RP", s=100)
    ax2.legend(loc=2)
    ax2.set_xlabel("Return period")
    ax2.set_ylabel("Return level")
    ax2.set_title("Return level plot")
    ax2.grid(True)

    # Density plot: histogram of exceedances with the fitted PDF overlaid.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the rendered panel is unchanged.
    maxbin = 4 * np.ceil(np.floor(data.max() / 4) + 1)
    sns.distplot(sortedmax, bins=np.arange(mu, maxbin, 2),
                 hist=True, axlabel='Wind speed (m/s)',
                 ax=axes[3])
    axes[3].plot(sortedmax, gpdf, color='r')
    axes[3].set_title("Density plot")

    plt.tight_layout()
    # Save and close this figure explicitly rather than whichever figure
    # happens to be current.
    fig.savefig(figfile)
    plt.close(fig)
def plotFit(data, mu, xi, sigma, title, figfile):
    """
    Plot a fitted distribution, with approximate 95% confidence interval
    and empirical return period values.

    :param data: :class:`numpy.ndarray` of observed data values.
    :param float mu: Selected threshold value.
    :param float xi: Fitted shape parameter.
    :param float sigma: Fitted scale parameter.
    :param str title: Title string for the plot.
    :param str figfile: Path to store the file (includes image format)

    """
    LOG.info("Plotting fitted return period curve")

    rp = np.array([1, 2, 5, 10, 20, 50, 100, 200,
                   500, 1000, 2000, 5000, 10000])
    rate = float(len(data[data > mu])) / float(len(data))
    rval = returnLevels(rp, mu, xi, sigma, rate)

    emprp = empiricalReturnPeriod(data)
    err = returnPeriodUncertainty(data, mu, xi, sigma, rp)

    sortedmax = np.sort(data)

    # +/- 1.96 standard errors is the two-sided 95% normal interval, so
    # the legend entry is labelled 95% to match what is actually drawn.
    halfwidth = 1.96 * err

    fig, ax1 = plt.subplots(1, 1, figsize=(12, 12))
    ax1.semilogx(rp, rval, label="Fitted RP curve")
    ax1.semilogx(rp, rval + halfwidth, label="95% CI",
                 linestyle='--', color='0.5')
    ax1.semilogx(rp, rval - halfwidth, linestyle='--', color='0.5')

    # Only empirical points with a return period above 1 are shown.
    ax1.scatter(emprp[emprp > 1], sortedmax[emprp > 1], s=100,
                color='r', label="Empirical RP")

    title_str = (title + "\n" +
                 r"$\mu$ = {0:.2f}, $\xi$ = {1:.5f}, $\sigma$ = {2:.4f}".
                 format(mu, xi, sigma))
    ax1.set_title(title_str)
    ax1.legend(loc=2)
    ax1.set_ylim((0, 100))
    ax1.set_xlim((1, 10000))
    ax1.set_ylabel('Wind speed (m/s)')
    ax1.set_xlabel('Return period (years)')
    ax1.grid(which='major')
    ax1.grid(which='minor', linestyle='--', linewidth=1)

    # Save and close this figure explicitly rather than whichever figure
    # happens to be current.
    fig.savefig(figfile)
    plt.close(fig)
def selectThreshold(data, minexc=10):
    """
    Select an appropriate threshold for fitting a generalised pareto
    distribution.

    The only constraint placed on the selection is that the shape
    parameter is negative (such that the distribution is bounded).

    Candidate thresholds are scanned upwards from the median of the data
    in steps of 0.002 until fewer than ``minexc`` values exceed the
    candidate or the maximum of the data is reached. For every candidate
    whose fitted shape parameter is below -0.01, the 1000- and
    10000-period return levels are recorded. The selected candidate is
    the one whose return level is nearest to the mean recorded return
    level, inflated by 5% and rounded up; of the 1000- and 10000-period
    selections, the one with the higher threshold is used.

    :param data: :class:`numpy.ndarray` containing the observed values (with
                 missing values removed).
    :param int minexc: Minimum number of exceedances required.

    :returns: tuple of the shape, scale and threshold, all taken from the
              same candidate fit. ``(0, 0, 0)`` is returned if no
              candidate gave a suitable shape parameter.

    """
    sh = []
    sc = []
    t = []
    q1000list = []
    q10000list = []

    eps = -0.01
    step = 0.002
    nobs = len(data)
    datamax = data.max()
    mu = np.median(data)

    while mu < datamax:
        nexc = len(data[data > mu])
        if nexc < minexc:
            break

        # Explicit float division so the rate is not truncated to zero
        # under Python 2 integer division.
        rate = float(nexc) / float(nobs)

        pp = calculateShape(mu, data)
        q1000, q10000 = returnLevels(np.array([1000, 10000]),
                                     mu, pp[0], pp[2], rate)

        # Candidates with undefined return levels are skipped, but the
        # threshold must still advance below or the loop never ends.
        valid = not (np.isnan(q1000) or np.isnan(q10000))
        if valid and pp[0] < eps:
            t.append(mu)
            sh.append(pp[0])
            sc.append(pp[2])
            q1000list.append(q1000)
            q10000list.append(q10000)

        mu += step

    if not t:
        log.warning("No suitable shape parameters identified")
        return 0, 0, 0

    q1000arr = np.array(q1000list)
    q10000arr = np.array(q10000list)

    # Target return levels: mean over all candidates, plus 5%, rounded up.
    Av1000 = np.mean(q1000arr)
    Av10000 = np.mean(q10000arr)
    Av1000 = np.ceil(Av1000 + 0.05 * Av1000)
    Av10000 = np.ceil(Av10000 + 0.05 * Av10000)

    idx1000 = find_nearest_index(q1000arr, Av1000)
    idx10000 = find_nearest_index(q10000arr, Av10000)

    # Use whichever selection has the higher threshold, and return the
    # threshold that the chosen shape and scale were fitted at so the
    # three values always describe the same fit.
    if t[idx1000] > t[idx10000]:
        idx = idx1000
    else:
        idx = idx10000

    return sh[idx], sc[idx], t[idx]
stndf['DataEndYear'][i]) stnName = stndf['stnName'][i].title().strip() + " " + dataRange fitname = pjoin(output_path, '{0}_gpdfit.png'.format(stnNum)) diagname = pjoin(output_path, '{0}_gpddiag.png'.format(stnNum)) if os.path.exists(filename): log.info("Processing {0}".format(stnName)) df = readDataFile(filename) quality = df['QSpeed'].fillna("X").map( lambda x: x in ['Y', 'N', 'X', ' ', np.nan]) dmax = df['Speed'][df['Speed'].notnull() & quality] if len(dmax) == 0: log.info("No valid data") continue xi, sigma, mu = selectThreshold(dmax, minexc=10) log.debug("Parameters: {0}, {1}, {2}".format(xi, sigma, mu)) rate = float(len(dmax[dmax > mu])) / float(len(dmax)) if xi == 0: continue plotFit(dmax, mu, xi, sigma, stnName, fitname) plotDiagnostics(dmax, mu, xi, sigma, diagname) gpdfile.write("{0}, {1}, {2:.6f}, {3:.6f}, {4:.3f}, {5:.4f}\n".format( stnNum, stnName, xi, sigma, mu, rate)) rpvals = returnLevels(rp, mu, xi, sigma, rate) rpstr = ", ".join(['{:.3f}'] * len(rpvals)).format(*rpvals) rpfile.write("{0}, {1}, {2}\n".format(stnNum, stnName, rpstr)) else: log.info("No data file for {0}".format(stnName)) gpdfile.close() rpfile.close()
def selectThreshold(data, minexc=10):
    """
    Select an appropriate threshold for fitting a generalised pareto
    distribution.

    The only constraint placed on the selection is that the shape
    parameter is negative (such that the distribution is bounded).

    Candidate thresholds are scanned upwards from the median of the data
    in steps of 0.002 until fewer than ``minexc`` values exceed the
    candidate or the maximum of the data is reached. For every candidate
    whose fitted shape parameter is below -0.01, the 1000- and
    10000-period return levels are recorded. The selected candidate is
    the one whose return level is nearest to the mean recorded return
    level, inflated by 5% and rounded up; of the 1000- and 10000-period
    selections, the one with the higher threshold is used.

    :param data: :class:`numpy.ndarray` containing the observed values (with
                 missing values removed).
    :param int minexc: Minimum number of exceedances required.

    :returns: tuple of the shape, scale and threshold, all taken from the
              same candidate fit. ``(0, 0, 0)`` is returned if no
              candidate gave a suitable shape parameter.

    """
    sh = []
    sc = []
    t = []
    q1000list = []
    q10000list = []

    eps = -0.01
    step = 0.002
    nobs = len(data)
    datamax = data.max()
    mu = np.median(data)

    while mu < datamax:
        nexc = len(data[data > mu])
        if nexc < minexc:
            break

        # Explicit float division so the rate is not truncated to zero
        # under Python 2 integer division.
        rate = float(nexc) / float(nobs)

        pp = calculateShape(mu, data)
        q1000, q10000 = returnLevels(np.array([1000, 10000]),
                                     mu, pp[0], pp[2], rate)

        # Candidates with undefined return levels are skipped, but the
        # threshold must still advance below or the loop never ends.
        valid = not (np.isnan(q1000) or np.isnan(q10000))
        if valid and pp[0] < eps:
            t.append(mu)
            sh.append(pp[0])
            sc.append(pp[2])
            q1000list.append(q1000)
            q10000list.append(q10000)

        mu += step

    if not t:
        log.warning("No suitable shape parameters identified")
        return 0, 0, 0

    q1000arr = np.array(q1000list)
    q10000arr = np.array(q10000list)

    # Target return levels: mean over all candidates, plus 5%, rounded up.
    Av1000 = np.mean(q1000arr)
    Av10000 = np.mean(q10000arr)
    Av1000 = np.ceil(Av1000 + 0.05 * Av1000)
    Av10000 = np.ceil(Av10000 + 0.05 * Av10000)

    idx1000 = find_nearest_index(q1000arr, Av1000)
    idx10000 = find_nearest_index(q10000arr, Av10000)

    # Use whichever selection has the higher threshold, and return the
    # threshold that the chosen shape and scale were fitted at so the
    # three values always describe the same fit.
    if t[idx1000] > t[idx10000]:
        idx = idx1000
    else:
        idx = idx10000

    return sh[idx], sc[idx], t[idx]
stndf['DataEndYear'][i]) stnName = stndf['stnName'][i].title().strip() + " " + dataRange fitname = pjoin(output_path, '{0}_gpdfit.png'.format(stnNum)) diagname = pjoin(output_path, '{0}_gpddiag.png'.format(stnNum)) if os.path.exists(filename): log.info("Processing {0}".format(stnName)) df = readDataFile(filename) quality = df['QSpeed'].fillna("X").map(lambda x: x in ['Y', 'N', 'X', ' ', np.nan]) dmax = df['Speed'][df['Speed'].notnull() & quality] if len(dmax) == 0: log.info("No valid data") continue xi, sigma, mu = selectThreshold(dmax, minexc=10) log.debug("Parameters: {0}, {1}, {2}".format(xi, sigma, mu)) rate = float(len(dmax[dmax > mu])) / float(len(dmax)) if xi == 0: continue plotFit(dmax, mu, xi, sigma, stnName, fitname) plotDiagnostics(dmax, mu, xi, sigma, diagname) gpdfile.write("{0}, {1}, {2:.6f}, {3:.6f}, {4:.3f}, {5:.4f}\n". format(stnNum, stnName, xi, sigma, mu, rate)) rpvals = returnLevels(rp, mu, xi, sigma, rate) rpstr = ", ".join(['{:.3f}']*len(rpvals)).format(*rpvals) rpfile.write("{0}, {1}, {2}\n".format(stnNum, stnName, rpstr)) else: log.info("No data file for {0}".format(stnName)) gpdfile.close() rpfile.close()