def main():
    """Print RPSS summaries for two ensembles and plot per-year densities.

    Scores ``ensemble`` (labelled NIPA) and ``gen_ensembles().values``
    (labelled NMME) against ``clim_data`` restricted to 1982-2009 using
    ``RPSS(..., 9, 9)``, prints both results, and then fills a 6x5 grid of
    shared-axis panels, one per year, with the two kernel density estimates
    and a vertical line at the observed value.

    Returns
    -------
    tuple
        ``(fig, axes)`` exactly as produced by ``plt.subplots``.
    """
    from scipy.stats import gaussian_kde as gk
    from numpy import arange
    import numpy as np

    models = gen_models()
    clim_data, hindcast, ensemble = combine_models(models)
    prcp = get_lcrb_prcp()  # result is not used below; call kept as-is
    df = gen_ensembles()

    in_period = (clim_data.index.year <= 2009) & (clim_data.index.year >= 1982)
    observed = clim_data[in_period]
    nipa_members = ensemble[:, in_period]

    # After a string ending in a newline the Python 2 print statement emits
    # no separating space, so "%s" right after "\n" reproduces it exactly.
    rps = RPSS(observed, nipa_members, 9, 9)
    print('NIPA RPSS:\n%s' % (rps,))
    rps = RPSS(observed, df.values, 9, 9)
    print('NMME:RPSS\n%s' % (rps,))

    years = np.arange(1982, 2010)
    fig, axes = plt.subplots(6, 5, figsize=(12, 12), sharex=True, sharey=True)
    p_x = np.linspace(0, clim_data.max() + 10, 500)
    panels = axes.ravel()

    for i, n in enumerate(years):
        # NOTE(review): both arrays are indexed by *row* here, whereas RPSS
        # above reads column i of the same arrays -- confirm orientation.
        dNIPA = gk(nipa_members[i], bw_method=0.5)
        dNMME = gk(df.values[i], bw_method=0.5)
        nipa_pdf = dNIPA.pdf(p_x)
        nmme_pdf = dNMME.pdf(p_x)

        ax = panels[i]
        nip, = ax.plot(p_x, nipa_pdf, label='NIPA', linewidth=2)
        ax.fill_between(p_x, 0, nipa_pdf, alpha=0.5)
        nmm, = ax.plot(p_x, nmme_pdf, label='NMME', linewidth=2)
        ax.fill_between(p_x, 0, nmme_pdf, alpha=0.5, color='green')
        obs = ax.vlines(observed[i], 0, 0.005, linewidth=2, label='obs')
        ax.set_title(str(n))

    # Axes are shared, so setting ticks on the current axes applies to all.
    plt.xticks(arange(0, 800, 200))
    plt.yticks(arange(0, 0.010, 0.002))
    # Panel [5, 3] holds no year (28 years in 30 panels); it hosts the legend.
    axes[5, 3].legend((nip, nmm, obs), ('NIPA', 'NMME', 'obs'))
    return fig, axes
def densityPlot(clim_data, ensemble, year):
    """Show forecast and climatological densities for a single year.

    Parameters
    ----------
    clim_data : object with ``.index.year`` that supports boolean indexing
        Observed values; all of them feed the climatological KDE.
    ensemble : 2-D array
        Forecast members; the column(s) selected by ``year`` feed the
        forecast KDE.
    year : int
        Year to select via ``clim_data.index.year == year``.

    Raises
    ------
    ValueError
        If ``year`` does not occur in ``clim_data.index.year``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    from numpy import linspace
    from scipy.stats import gaussian_kde as gk
    from matplotlib import pyplot as plt

    idx = clim_data.index.year == year
    if not idx.any():
        # Without this guard the KDE constructor fails on an empty sample
        # with a message that does not mention the year.
        raise ValueError('year %s not found in clim_data index' % (year,))

    dFcst = gk(ensemble[:, idx].squeeze(), bw_method=0.5)
    dClim = gk(clim_data, bw_method=0.5)

    x = linspace(0, 600, 1200)
    fcst_pdf = dFcst(x)  # evaluated once; reused for the curve and line height
    plt.plot(x, fcst_pdf, label='Forecast')
    plt.plot(x, dClim(x), label='Climatology')
    plt.vlines(clim_data[idx], 0, fcst_pdf.max(), label='Observed')
    plt.title('%i' % year)
    plt.legend()  # the labels above were never displayed without this
    plt.show()
    return
def mcmc_clust(al=np.genfromtxt(ALIGNFILE, delimiter=',').astype(int), imps=IMPS):
    """Run the imputation MCMC and save every recorded state to LC_STATES.

    Parameters
    ----------
    al : 2-D integer array
        Alignment. The default is read from ``ALIGNFILE`` once, when this
        ``def`` statement executes (i.e. at import time), not per call.
    imps : passed through to ``impute.impute`` and ``lclass``.

    The proposal-likelihood KDE is loaded from ``LC_DIST`` when that file
    can be read and rebuilt with ``lclass`` otherwise. Each step draws a
    fresh imputation, computes the acceptance ratio
    ``(prop_lik / old_lik) * (old_plik / prop_plik)``, logs it, and accepts
    the proposal when ``random.random()`` falls below the ratio.
    """
    allen = al.shape[0]
    delclust = clust(al)

    print('Building likelihood distributions...')
    try:
        pdist = gk(np.genfromtxt(LC_DIST, delimiter=','))
    except IOError:
        print('Existing distribution not found, building...')
        pdist = lclass(al, imps)

    print('Starting MCMC:')
    print('Step#\t|New Lik\t|New PropLik\t|Old Lik\t|Old PropLik\t|Accept Prob')
    old = impute.impute(al, imps, orderfunc=ORDERFUNC)
    old_lik = clik((old, delclust, allen))
    old_plik = pdist(old_lik)
    states = [(clust(old), old_lik, old_plik, old_lik, old_plik, 1)]

    for i in range(STEPS):
        prop = impute.impute(al, imps, orderfunc=ORDERFUNC)
        prop_lik = clik((prop, delclust, allen))
        prop_plik = pdist(prop_lik)
        a = (prop_lik / old_lik) * (old_plik / prop_plik)
        # Every proposal is logged against the state it was compared with.
        states.append((clust(old), prop_lik, prop_plik, old_lik, old_plik, a))
        print('%d\t|%2f\t|%2f\t|%2f\t|%2f\t|%e'
              % (i + 1, prop_lik, prop_plik, old_lik, old_plik, a))
        if random.random() < a:
            old, old_lik, old_plik = prop, prop_lik, prop_plik
            # NOTE(review): the original layout was lost; this second record
            # is assumed to belong to the accept branch -- confirm.
            states.append((clust(old), prop_lik, prop_plik, old_lik, old_plik, a))

    np.savetxt(LC_STATES, np.array(states), delimiter=',')
def lclass(al, imps):
    """Build a KDE from ``CCLASS_REPS`` mapped evaluations and return it.

    ``P.map(c, ...)`` is called with ``CCLASS_REPS`` copies of the tuple
    ``(al, clust(al), imps)``; the returned values are written to
    ``OUT_RATIOS`` as CSV and wrapped in ``gk``.

    Parameters
    ----------
    al : alignment array handed to ``clust`` and to every work item.
    imps : forwarded unchanged inside every work item.

    Returns
    -------
    The object returned by ``gk(ratios)``.
    """
    delclust = clust(al)
    work_items = [(al, delclust, imps)] * CCLASS_REPS
    # presumably P is a multiprocessing pool and c its worker -- both are
    # defined outside this block; verify there.
    ratios = P.map(c, work_items)
    np.savetxt(OUT_RATIOS, ratios, delimiter=',')  # Save ratios?
    return gk(ratios)
def nmmecompare():
    """Save a two-panel NMME vs. NIPA density comparison figure.

    For positions 16 and 25 of the 1982-onward selection, plots the kernel
    density estimate (scaled by 1000) of the NMME and NIPA samples, shades
    both, and marks the observed value with a vertical line. The figure is
    written to ``$HOME/Desktop/Feb20Response/images/nmmecompare`` and then
    closed. Returns ``None``.
    """
    from nmme import gen_ensembles
    from data_load import gen_models, combine_models
    from scipy.stats import gaussian_kde as gk

    models = gen_models(cc=0.95, quick=True)
    clim_data, hindcast, ensemble = combine_models(models)
    df = gen_ensembles()

    cmap = mpl.cm.get_cmap('viridis')
    nmme_color, nipa_color = cmap(0.25), cmap(0.7)

    idx = np.arange(1921, 2011) >= 1982
    nipa_members = ensemble[:, idx]
    observed = clim_data[idx]
    obs_years = clim_data.index.year[idx]
    p_x = np.linspace(0, 600, 1000)

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(14, 10))
    for ax, x in ((ax1, 16), (ax2, 25)):
        # NOTE(review): samples are taken by *row* x here, while other code
        # in this project reads forecasts by column -- confirm orientation.
        nipa_pdf = gk(nipa_members[x], bw_method=0.5).pdf(p_x) * 1000
        nmme_pdf = gk(df.values[x], bw_method=0.5).pdf(p_x) * 1000

        ax.plot(p_x, nmme_pdf, color=nmme_color, label='NMME', linewidth=2)
        ax.plot(p_x, nipa_pdf, color=nipa_color, label='NIPA', linewidth=2)
        ax.fill_between(p_x, 0, nipa_pdf, color=nipa_color, alpha=0.5)
        ax.fill_between(p_x, 0, nmme_pdf, color=nmme_color, alpha=0.5)
        ax.vlines(observed[x], 0, 6, linewidth=4, color='k')
        ax.set_title(str(obs_years[x]), fontsize=18, fontweight='bold')

    # Only the left panel carries the y label and the legend.
    ax1.set_ylabel('Probability Density, $10^{-3}$', fontweight='bold')
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels)

    fig.text(0.33, 0.015, 'Total MAMJ Precipitation, mm')
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/nmmecompare')
    plt.close(fig)
    return
def rpssexmp():
    """Save an illustrative figure with two forecast densities and one line.

    Plots the kernel density estimates (scaled by 1000) of ensemble columns
    15 and 20, labelled 'Bad' and 'Worse', plus a vertical line at 100
    labelled 'obs'. The figure is written to
    ``$HOME/Desktop/Feb20Response/images/rpss_exmp`` and then closed, as
    ``nmmecompare`` does, so repeated calls do not accumulate open figures.
    Returns ``None``.
    """
    from data_load import gen_models, combine_models
    from scipy.stats import gaussian_kde as gk

    cmap = mpl.cm.get_cmap('viridis')
    models = gen_models()
    c, h, e = combine_models(models)  # only the ensemble ``e`` is used

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.set_ylabel('Probability Density, $10^{-3}$')
    ax.set_xlabel('Total MAMJ Precipitation, mm')

    p_x = np.linspace(0, 600, 1000)
    curves = ((15, cmap(0.8), 'Bad'), (20, cmap(0.2), 'Worse'))
    for column, color, label in curves:
        pdf = gk(e[:, column], bw_method=0.5).pdf(p_x) * 1000
        ax.plot(p_x, pdf, linewidth=2, color=color, label=label)
        ax.fill_between(p_x, 0, pdf, color=color, alpha=0.5)

    ax.vlines(100, 0, 8, linewidth=2, label='obs')
    ax.legend()
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/rpss_exmp')
    plt.close(fig)
    return
def mcmc_ttmp(al=np.genfromtxt(ALIGNFILE, delimiter=',').astype(int), imps=IMPS):
    """Run the ttratio-based MCMC with worker processes; save to TTMP_STATES.

    Parameters
    ----------
    al : 2-D integer array
        Alignment. The default is read from ``ALIGNFILE`` once, when this
        ``def`` statement executes (i.e. at import time), not per call.
    imps : passed through to ``impute.impute``, ``lclass_ttmp`` and ``gttmp``.

    A normal distribution is fitted to the values in ``RDIST`` and used as
    the likelihood of ``tt.ttratio(alignment)``. The proposal-likelihood KDE
    is loaded from ``TTMP_DIST`` when readable, otherwise rebuilt with
    ``lclass_ttmp``. ``gttmp`` workers feed proposals through a bounded
    queue; this process consumes ``reps * numprocs`` of them and applies
    the accept/reject rule.
    """
    print('Building likelihood distributions...')
    ldist = norm(*norm.fit(np.genfromtxt(RDIST, delimiter=',')))

    def lik(al):
        return ldist.pdf(tt.ttratio(al))

    try:
        pdist = gk(np.genfromtxt(TTMP_DIST, delimiter=','))
    except IOError:
        print('Existing distribution not found, building...')
        pdist = lclass_ttmp(al, imps, lik)

    print('Starting MCMC:')
    print('Step#\tOld Clust\t|New Lik\t|New PropLik\t|Old Lik\t|Old PropLik\t|Accept Prob')
    old = impute.impute(al, imps, orderfunc=ORDERFUNC)
    old_lik = lik(old)
    old_plik = pdist(old_lik)
    old_clust = clust(old)
    states = [(old_clust, old_lik, old_plik, old_lik, old_plik, 1)]

    Q, procs = multiprocessing.Queue(maxsize=MQS), []
    # Leave one core for this consumer, but never drop to zero workers:
    # on a single-core host that made the division below raise.
    numprocs = max(1, multiprocessing.cpu_count() - 1)
    # Ceiling division; ``//`` keeps it integral under true division too.
    reps = -(-STEPS // numprocs)
    for i in range(numprocs):
        p = multiprocessing.Process(
            target=gttmp, args=(lik, al, imps, reps, Q, i, pdist))
        procs.append(p)
        p.start()
    # NOTE(review): workers are never joined; whether join() is safe depends
    # on gttmp, which is not visible here.

    for i in range(reps * numprocs):
        prop, prop_lik, prop_plik, prop_clust = Q.get()
        a = (prop_lik / old_lik) * (old_plik / prop_plik)
        # Every proposal is logged against the state it was compared with.
        states.append((old_clust, prop_lik, prop_plik, old_lik, old_plik, a))
        print('%d\t|%2f\t|%2f\t|%2f\t|%2f\t|%2f\t|%e'
              % (i + 1, old_clust, prop_lik, prop_plik, old_lik, old_plik, a))
        if random.random() < a:
            old, old_lik, old_plik, old_clust = prop, prop_lik, prop_plik, prop_clust
            # NOTE(review): the original layout was lost; this second record
            # is assumed to belong to the accept branch -- confirm.
            states.append((old_clust, prop_lik, prop_plik, old_lik, old_plik, a))

    np.savetxt(TTMP_STATES, np.array(states), delimiter=',')
def lclass_ttmp(al, imps, lik):
    """Collect proposal likelihoods from worker processes and return a KDE.

    Starts one ``gttmp`` process per CPU, each asked for
    ``ceil(CCLASS_REPS / numprocs)`` repetitions, reads that many
    ``(prop, prop_lik, prop_clust)`` triples per worker from the shared
    queue, writes the likelihoods to ``TTMP_DIST`` as CSV and returns
    ``gk`` of them. Progress is printed each time the integer percentage
    increases.

    Parameters
    ----------
    al, imps, lik : forwarded unchanged to every ``gttmp`` worker.
    """
    Q, procs, data = multiprocessing.Queue(maxsize=MQS), [], []
    numprocs = multiprocessing.cpu_count()
    # Ceiling division; ``//`` keeps it integral under true division too.
    reps = -(-CCLASS_REPS // numprocs)
    for i in range(numprocs):
        p = multiprocessing.Process(target=gttmp, args=(lik, al, imps, reps, Q, i))
        procs.append(p)
        p.start()

    total = reps * numprocs
    old_percent = 0
    for i in range(total):
        percent = int(float(i) / total * 100)
        if percent > old_percent:
            print('%d percent' % int(percent))
            old_percent = percent
        prop, prop_lik, prop_clust = Q.get()
        data.append(prop_lik)

    np.savetxt(TTMP_DIST, data, delimiter=',')  # Save ratios?
    return gk(data)
def main():
    """Command-line driver that compares clustering estimates on subsamples.

    For each of ``--repetitions`` random datasets (``--numseqs`` rows and
    ``--numsites`` columns drawn without replacement from the alignment),
    and for each of ``--subsamples`` row subsamples of size ``--subseqs``,
    it computes:

    * the clustering value of the full dataset and of the subsample,
    * the mean clustering value over ``IMPUTATIONS`` imputed states,
    * the mean of the values returned by ``h.npopt``.

    Two PNG plots are written per subsample and a CSV summary is written
    once at the end. Output names are derived from the alignment file name
    with its extension removed and ``_tr`` appended.

    Exits with status 1 if the alignment file cannot be parsed.
    """
    # Parse input
    parser = argparse.ArgumentParser(description='validate MCMC performance')
    parser.add_argument('alignfile', help='file containing alignment to use')
    parser.add_argument('-n', '--numseqs', type=int, required=True,
                        help='number of sequences to use in subsample')
    parser.add_argument('-m', '--numsites', type=int, required=True,
                        help='number of sites to include in subsample')
    parser.add_argument('-q', '--subseqs', type=int, required=True,
                        help='number of sequences to subsample to')
    parser.add_argument('-r', '--repetitions', type=int, required=True,
                        help='number of repetitions to do')
    parser.add_argument('-s', '--subsamples', type=int, required=True,
                        help='number of subsamples per repetition')
    args = parser.parse_args()

    try:
        al = np.genfromtxt(args.alignfile, delimiter=',').astype(int)
    except ValueError:
        print('Invalid alignment file')
        raise SystemExit(1)  # report the failure through the exit status

    reps = args.repetitions
    subsamples = args.subsamples
    allen = args.numseqs
    seqlen = args.numsites
    subseqs = args.subseqs
    # Strip whatever extension is present instead of assuming 4 characters.
    fname = os.path.splitext(args.alignfile)[0] + '_tr'
    imps = allen - subseqs

    print("""
    Loaded data:
    Full alignment is %dx%d sequences & sites.
    Randomly selected datasets will be %dx%d sequences and sites.
    Subsamples will be %dx%d sequences and sites.

    There will be %d repetitions of %d subsamples each.\n
    """ % (al.shape[0], al.shape[1], allen, seqlen, subseqs, seqlen, reps, subsamples))

    results = []
    print('dset\tsubset\ttrue_clust\tsub_clust\tmcmc_clust\timp_clust')

    # Main loop
    for i in range(reps):
        # Generate a dataset to test on (rows are drawn first, then columns)
        rows = np.random.choice(al.shape[0], allen, replace=False)
        cols = np.random.choice(al.shape[1], seqlen, replace=False)
        dataset = al[rows][:, cols]
        trueclust = m.clust(dataset)

        for j in range(subsamples):
            # Generate a subsample of the current dataset to simulate missingness
            keep = np.random.choice(dataset.shape[0], subseqs, replace=False)
            subsample = dataset[keep]
            subclust = m.clust(subsample)

            # Attempt to recover with imputation
            imputed_states = [m.impute.impute(subsample, imps)
                              for k in range(IMPUTATIONS)]
            imputed_clusts = [m.clust(state) for state in imputed_states]
            avg_impclust = np.mean(imputed_clusts)

            # Attempt to recover with KS optimization.
            # (Earlier variants used m.mcmc_ns, or h.build_target + h.opt.)
            tdist = h.nonparametric_target(subsample, 100)
            cc, clust, ks = h.npopt(subsample, imps, tdist)

            # Plot results: congruency classes against the target density
            cclass_hist = plt.hist(cc, density=True, alpha=.5,
                                   label='Sampled congruency classes', color='green')
            xr = np.linspace(0, 1, 1000)
            tpdf = gk(tdist)(xr)
            plt.plot(xr, tpdf, label='Target distribution', color='blue')
            ymax = np.max([np.max(cclass_hist[0]), np.max(tpdf)])
            plt.plot((subclust, subclust), (0, ymax),
                     label='Observed clustering value (congruency class)', color='blue')
            mean_cclass = np.mean(cc)
            plt.plot((mean_cclass, mean_cclass), (0, ymax),
                     label='MCMC average congruency class', color='green')
            plt.legend()
            plt.savefig('%s_%d_%d_cclass.png' % (fname, i, j))
            plt.clf()

            # Plot results: sampled clustering values against the references
            clust_hist = plt.hist(clust, density=True, alpha=.5,
                                  label='Sampled clustering values', color='green')
            ymax = np.max(clust_hist[0])
            plt.plot((subclust, subclust), (0, ymax),
                     label='Observed clustering value', color='blue')
            plt.plot((trueclust, trueclust), (0, ymax),
                     label='True clustering value', color='red')
            plt.plot((avg_impclust, avg_impclust), (0, ymax),
                     label='Imputed point estimate (mean)', color='purple')
            mean_clust = np.mean(clust)
            plt.plot((mean_clust, mean_clust), (0, ymax),
                     label='Mean MCMC estimate', color='green')
            plt.legend()
            plt.savefig('%s_%d_%d_clust.png' % (fname, i, j))
            plt.clf()

            results.append([i, j, trueclust, subclust, mean_clust, avg_impclust])
            print('%d\t%d\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f'
                  % (i, j, trueclust, subclust, mean_clust, avg_impclust))

    np.savetxt('%s_summary.csv' % fname, results, delimiter=',', fmt='%s')
def terciles():
    """Save a two-panel figure of climatological vs. forecast categories.

    The left panel shades the climatological density below ``B``, between
    ``B`` and ``A``, and above ``A``; the right panel does the same for the
    forecast density of ensemble column 20, drawn over the climatology in
    grey. Each panel is annotated with the percentage of probability in the
    three ranges. The figure is written to
    ``$HOME/Desktop/Feb20Response/images/tercile`` and then closed, as
    ``nmmecompare`` does. Returns ``None``.
    """
    from data_load import gen_models, combine_models
    from scipy.stats import gaussian_kde as gk

    def percent(kde, low, high):
        # Probability in [low, high], rounded to 2 decimals, as a percentage.
        return round(kde.integrate_box(low, high), 2) * 100

    models = gen_models(cc=0.95, quick=True)
    c, h, e = combine_models(models)
    cmap = mpl.cm.get_cmap('viridis')
    rgba = [cmap(0.0), cmap(0.35), cmap(0.8)]

    # NOTE(review): positions 24 and 66 of the sorted record are hard-coded
    # as the category boundaries -- confirm they match the record length.
    idx = np.argsort(c)
    B = c[idx][24]
    A = c[idx][66]

    x = 20
    dNIPA = gk(e[:, x], bw_method=0.5)
    dCLIM = gk(c, bw_method=0.5)

    bclim, nclim, aclim = percent(dCLIM, 0, B), percent(dCLIM, B, A), percent(dCLIM, A, 600)
    bnipa, nnipa, anipa = percent(dNIPA, 0, B), percent(dNIPA, B, A), percent(dNIPA, A, 600)

    p_x = np.linspace(0, 600, 1000)
    is_below = p_x < B
    is_middle = (p_x >= B) & (p_x <= A)
    is_above = p_x > A
    below, middle, above = p_x[is_below], p_x[is_middle], p_x[is_above]

    # Evaluate each density once on the grid and slice it per category.
    clim_pdf = dCLIM.pdf(p_x) * 1000
    nipa_pdf = dNIPA.pdf(p_x) * 1000

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

    ax1.plot(p_x, clim_pdf, linewidth=2, color='k')
    ax1.fill_between(below, 0, clim_pdf[is_below], color=rgba[2], alpha=0.5)
    ax1.fill_between(above, 0, clim_pdf[is_above], color=rgba[0], alpha=0.5)
    ax1.fill_between(middle, 0, clim_pdf[is_middle], color=rgba[1], alpha=0.5)
    line = '\n\nB: %.0f%%\nN: %.0f%%\nA: %.0f%%' % (bclim, nclim, aclim)
    ax1.text(400, 3, line)
    ax1.set_title('Climatological PDF', fontsize=22, fontweight='bold')
    ax1.set_ylabel('Probability Density, $10^{-3}$', fontweight='bold')

    ax2.plot(p_x, clim_pdf, linewidth=2, color='grey', alpha=0.7)
    ax2.fill_between(p_x, clim_pdf, label='Climatology', color='grey', alpha=0.2)
    ax2.plot(p_x, nipa_pdf, linewidth=2, color='k')
    ax2.fill_between(below, 0, nipa_pdf[is_below], color=rgba[2],
                     label='Below', alpha=0.5)
    ax2.fill_between(middle, 0, nipa_pdf[is_middle], color=rgba[1],
                     label='Normal', alpha=0.5)
    ax2.fill_between(above, 0, nipa_pdf[is_above], color=rgba[0],
                     label='Above', alpha=0.5)
    line = '\n\nB: %.0f%%\nN: %.0f%%\nA: %.0f%%' % (bnipa, nnipa, anipa)
    ax2.text(400, 3, line)
    ax2.set_title('Forecast PDF', fontsize=22, fontweight='bold')

    handles, labels = ax2.get_legend_handles_labels()
    fig.legend(handles, labels, loc=(0.395, 0.55))
    fig.text(0.33, 0.015, 'Total MAMJ Precipitation, mm')
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/tercile')
    plt.close(fig)
    return
def RPSS(clim_data, ensemble, n_dry=10, n_wet=10, file=None):
    """Median ranked probability skill score for all, dry and wet years.

    Three categories (below / normal / above) are delimited by the integer
    thresholds at which the climatological KDE, integrated upward from 0,
    reaches cumulative probabilities 0.33 and 0.67. For every year ``i``
    the KDE of ``ensemble[:, i]`` yields the forecast probability of each
    category (rounded to 2 decimals), and

        RPS  = 0.5 * sum((cumulative forecast - cumulative observed) ** 2)
        RPSS = 1 - RPS_forecast / RPS_climatology

    rounded to 2 decimals and floored at -9.

    Changes relative to the previous implementation:

    * RPS sums the *squared* differences. It used to square the summed
      differences, which drives the climatological RPS to about 0 for
      normal-category years.
    * The observation is classified against the same thresholds used to
      integrate the forecast. It used to be compared with ``x[threshold]``
      on an unrelated 500-point grid.
    * The thresholds depend only on the climatology, so they are computed
      once instead of once per year.
    * The downward search for the upper threshold starts above the data
      maximum when that exceeds 500.

    Parameters
    ----------
    clim_data : 1-D sequence of observations, one per year.
    ensemble : 2-D array whose column ``i`` holds the members for year ``i``.
    n_dry, n_wet : int
        Number of lowest / highest observation years in the 'dry' / 'wet'
        summaries.
    file : unused; accepted for backward compatibility.

    Returns
    -------
    dict
        ``{'all': ..., 'dry': ..., 'wet': ...}`` of median RPSS values.

    Raises
    ------
    ValueError
        If less than 33% of the climatological density lies above 0, in
        which case the lower threshold does not exist.
    """
    import numpy as np
    from scipy.stats import gaussian_kde as gk

    # Plain arrays make ``obs[i]`` positional whatever index the input has.
    obs = np.asarray(clim_data, dtype=float)
    members = np.asarray(ensemble)
    n = len(obs)

    pcum_clim = (np.array([33., 34., 33.]) / 100).cumsum()
    dClim = gk(obs, bw_method=0.5)

    if dClim.integrate_box_1d(0, np.inf) < pcum_clim[0]:
        raise ValueError('climatology has too little mass above 0 to '
                         'locate the lower category threshold')

    # Lower threshold: last integer before the cumulative probability from 0
    # reaches the first climatological cut.
    j = 1
    while True:
        j += 1
        if dClim.integrate_box_1d(0, j) >= pcum_clim[0]:
            lower = j - 1
            break

    # Upper threshold: largest integer whose cumulative probability from 0
    # does not exceed the second cut, searched downward.
    j = max(500, int(np.ceil(obs.max())) + 10)
    while True:
        j -= 1
        if dClim.integrate_box_1d(0, j) <= pcum_clim[1]:
            upper = j
            break

    rpss = np.zeros(n)
    for i in range(n):
        dFcst = gk(members[:, i], bw_method=0.5)
        prob_fcst = np.round([dFcst.integrate_box_1d(-np.inf, lower),
                              dFcst.integrate_box_1d(lower, upper),
                              dFcst.integrate_box_1d(upper, np.inf)], 2)
        pcum_fcst = prob_fcst.cumsum()

        if obs[i] <= lower:
            pcum_obs = np.array([1, 1, 1])
        elif obs[i] >= upper:
            pcum_obs = np.array([0, 0, 1])
        else:
            pcum_obs = np.array([0, 1, 1])

        rps_fcst = 0.5 * np.sum((pcum_fcst - pcum_obs) ** 2)
        rps_clim = 0.5 * np.sum((pcum_clim - pcum_obs) ** 2)
        rpss[i] = max(np.round(1 - rps_fcst / rps_clim, 2), -9)

    order = np.argsort(obs)
    groups = {'all': order, 'dry': order[:n_dry], 'wet': order[-n_wet:]}
    return {name: np.median(rpss[sel]) for name, sel in groups.items()}
# Demo script: draws 10000 standard-normal samples, fits a Silverman-bandwidth
# gaussian KDE to them, prints a few KDE attributes, and plots the sample
# histogram together with the normal PDF and a degree-2 Hermite polynomial.
import matplotlib.pyplot as plt
import numpy as np
import random
from scipy.special import hermite
from scipy.stats import gaussian_kde as gk
from scipy.stats import norm

from brazil_percent import dados as inflow

# ``norm``, ``gk`` and ``hermite`` were used below without being imported.
# NOTE(review): the scipy origins are inferred from the call signatures
# (``norm.pdf``, ``bw_method='silverman'``, ``hermite(n, monic)``) -- confirm.

dist = random.gauss
X = [dist(0, 1) for _ in range(10000)]
n = len(X)

x_new = np.linspace(-5, 5, 1000)
y = norm.pdf(x_new)

x_sort = sorted(X)
kernel = gk(x_sort, bw_method='silverman')
points = np.linspace(min(x_sort), max(x_sort), 1000)
fit = kernel(points)

print(x_sort)
h = kernel.factor
print(h)
print(kernel.covariance)
print(1 / h)

# Build the polynomial once and evaluate it on the whole grid, instead of
# constructing a new polynomial object for each of the 1000 points.
herm2 = hermite(2, True)
y_herm = herm2(x_new)

plt.hist(x_sort, 100, density=True)
plt.plot(x_new, y, 'k')
plt.plot(x_new, y_herm)
plt.show()