Example #1
def main():
    from scipy.stats import gaussian_kde as gk
    from numpy import arange
    import numpy as np
    import matplotlib.pyplot as plt
    from nmme import gen_ensembles
    from data_load import gen_models, combine_models, RPSS  # get_lcrb_prcp is another project-level helper
    models = gen_models()
    clim_data, hindcast, ensemble = combine_models(models)
    prcp = get_lcrb_prcp()
    df = gen_ensembles()
    idx = (clim_data.index.year <= 2009) & (clim_data.index.year >= 1982)
    rps = RPSS(clim_data[idx], ensemble[:,idx], 9, 9)
    #hss = HSS(clim_data[idx], ensemble[:,idx], 9, 9)
    print('NIPA RPSS:\n', rps)
    #print('NIPA HSS:\n', hss)
    rps = RPSS(clim_data[idx], df.values, 9, 9)
    #hss = HSS(clim_data[idx], df.values, 9, 9)
    print('NMME RPSS:\n', rps)
    #print('NMME HSS:\n', hss)
    years = np.arange(1982,2010)
    fig, axes = plt.subplots(6, 5, figsize = (12,12), sharex=True, sharey=True)
    p_x = np.linspace(0, clim_data.max() + 10, 500)
    for i, n in enumerate(years):
        dNIPA = gk(ensemble[:,idx][i], bw_method = 0.5)
        dNMME = gk(df.values[i], bw_method = 0.5)
        ax = axes.ravel()[i]
        nip, = ax.plot(p_x,dNIPA.pdf(p_x), label = 'NIPA', linewidth = 2)
        ax.fill_between(p_x, 0, dNIPA.pdf(p_x), alpha = 0.5)
        nmm, = ax.plot(p_x,dNMME.pdf(p_x), label = 'NMME', linewidth = 2)
        ax.fill_between(p_x,0,dNMME.pdf(p_x), alpha = 0.5, color = 'green')
        obs = ax.vlines(clim_data[idx][i], 0, 0.005, linewidth = 2, label = 'obs')
        ax.set_title(str(n))
        plt.xticks(arange(0,800,200))
        plt.yticks(arange(0,0.010, 0.002))
    axes[5,3].legend((nip, nmm, obs), ('NIPA', 'NMME', 'obs'))
    return fig, axes
Example #2
def densityPlot(clim_data, ensemble, year):
    from numpy import linspace
    from scipy.stats import gaussian_kde as gk
    from matplotlib import pyplot as plt
    idx = clim_data.index.year == year
    dFcst = gk(ensemble[:,idx].squeeze(), bw_method = 0.5)
    dClim = gk(clim_data, bw_method = 0.5)
    x = linspace(0,600,1200)
    plt.plot(x,dFcst(x), label = 'Forecast')
    plt.plot(x,dClim(x), label = 'Climatology')
    plt.vlines(clim_data[idx], 0, max(dFcst(x)), label = 'Observed')
    plt.title('%i' % year)
    plt.show()

    return
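
All of the examples in this collection lean on the same small gaussian_kde workflow: fit a kernel density to a 1-D sample, evaluate its pdf on a grid, and integrate it over an interval. A minimal, self-contained sketch of that workflow on synthetic data (the sample below is illustrative and not taken from any of the projects above):

import numpy as np
from scipy.stats import gaussian_kde as gk

rng = np.random.default_rng(0)
sample = rng.gamma(shape=2.0, scale=100.0, size=500)  # synthetic precipitation-like values, mm

kde = gk(sample, bw_method=0.5)             # fixed bandwidth factor, as in the examples above
x = np.linspace(0, sample.max() + 10, 500)
density = kde.pdf(x)                        # kde(x) and kde.evaluate(x) give the same values
p_below_200 = kde.integrate_box_1d(0, 200)  # probability mass below 200 mm (1-D form of integrate_box)

print(round(p_below_200, 2))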
Example #3
def mcmc_clust(al=np.genfromtxt(ALIGNFILE,delimiter=',').astype(int), imps=IMPS):
	allen = al.shape[0]
	seqlen = al.shape[1]
	delclust = clust(al)
	
	print('Building likelihood distributions...')
	try:
		pdist = gk(np.genfromtxt(LC_DIST, delimiter=','))
	except IOError:
		print('Existing distribution not found, building...')
		pdist = lclass(al, imps)

	print('Starting MCMC:')
	print('Step#\t|New Lik\t|New PropLik\t|Old Lik\t|Old PropLik\t|Accept Prob')
	old = impute.impute(al,imps, orderfunc=ORDERFUNC)
	old_lik = clik((old,delclust,allen))
	old_plik = pdist(old_lik)

	states = [(clust(old),old_lik,old_plik,old_lik,old_plik,1)]

	for i in range(STEPS):
		prop = impute.impute(al,imps, orderfunc=ORDERFUNC)
		prop_lik = clik((prop,delclust,allen))
		prop_plik = pdist(prop_lik)

		a = (prop_lik/old_lik)*(old_plik/prop_plik)
		states.append((clust(old),prop_lik,prop_plik,old_lik,old_plik,a))
		print('%d\t|%2f\t|%2f\t|%2f\t|%2f\t|%e' % (i+1,prop_lik,prop_plik,old_lik,old_plik,a))
		if random.random()<a:
			old, old_lik, old_plik = prop, prop_lik, prop_plik

	states.append((clust(old),prop_lik,prop_plik,old_lik,old_plik,a))
	np.savetxt(LC_STATES, np.array(states), delimiter=',')
Example #4
def lclass(al, imps):
	allen = al.shape[0]
	seqlen = al.shape[1]
	delclust = clust(al)
	numprocs = multiprocessing.cpu_count()
	reps = [(al,delclust,imps)]*CCLASS_REPS
	# P is presumably a module-level multiprocessing.Pool and c the per-replicate worker in the source project
	ratios = P.map(c,reps)
	np.savetxt(OUT_RATIOS, ratios, delimiter=',')		# Save ratios?
	return gk(ratios)
Example #5
def nmmecompare():
    from nmme import gen_ensembles
    from data_load import gen_models, combine_models, RPSS, HSS
    from scipy.stats import gaussian_kde as gk
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    # EV is presumably a project-level alias for os.environ (used for the save path below)
    models = gen_models(cc = 0.95, quick = True)
    clim_data, hindcast, ensemble = combine_models(models)
    df = gen_ensembles()

    cmap = mpl.cm.get_cmap('viridis')
    rgba = [cmap(0.25), cmap(0.7)]
    idx = np.arange(1921,2011)>=1982
    x = 16
    dNIPA = gk(ensemble[:,idx][x], bw_method = 0.5)
    dNMME = gk(df.values[x], bw_method = 0.5)
    p_x = np.linspace(0, 600, 1000)
    fig, (ax1, ax2) = plt.subplots(1,2,sharey = True, figsize = (14,10))
    ax1.plot(p_x, dNMME.pdf(p_x)*1000, color = rgba[0], label = 'NMME', linewidth = 2)
    ax1.plot(p_x, dNIPA.pdf(p_x)*1000, color = rgba[1], label = 'NIPA', linewidth = 2)
    ax1.fill_between(p_x, 0, dNIPA.pdf(p_x)*1000, color = rgba[1], alpha = 0.5)
    ax1.fill_between(p_x, 0, dNMME.pdf(p_x)*1000, color = rgba[0],alpha = 0.5)
    ax1.vlines(clim_data[idx][x], 0, 6, linewidth = 4, color = 'k')
    ax1.set_title(str(clim_data.index.year[idx][x]),fontsize = 18, fontweight='bold')
    ax1.set_ylabel('Probability Density, $10^{-3}$', fontweight = 'bold')
    h, l = ax1.get_legend_handles_labels()
    ax1.legend(h, l)

    x = 25
    dNIPA = gk(ensemble[:,idx][x], bw_method = 0.5)
    dNMME = gk(df.values[x], bw_method = 0.5)
    p_x = np.linspace(0, 600, 1000)
    ax2.plot(p_x, dNMME.pdf(p_x)*1000, color = rgba[0], label = 'NMME', linewidth = 2)
    ax2.plot(p_x, dNIPA.pdf(p_x)*1000, color = rgba[1], label = 'NIPA', linewidth = 2)
    ax2.fill_between(p_x, 0, dNIPA.pdf(p_x)*1000, color = rgba[1], alpha = 0.5)
    ax2.fill_between(p_x, 0, dNMME.pdf(p_x)*1000, color = rgba[0],alpha = 0.5)
    ax2.vlines(clim_data[idx][x], 0, 6, linewidth = 4, color = 'k')
    ax2.set_title(str(clim_data.index.year[idx][x]), fontsize = 18, fontweight='bold')
    fig.text(0.33, 0.015, 'Total MAMJ Precipitation, mm')
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/nmmecompare')
    plt.close(fig)
    return
Example #6
def rpssexmp():
    from data_load import gen_models, combine_models
    from scipy.stats import gaussian_kde as gk
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    # EV is presumably a project-level alias for os.environ (used for the save path below)
    cmap = mpl.cm.get_cmap('viridis')
    rgba = [cmap(0.8), cmap(0.2)]
    models = gen_models()
    c, h, e = combine_models(models)
    x, x2 = 15, 20
    fig, ax = plt.subplots(1, 1, figsize = (8,6))
    ax.set_ylabel('Probability Density, $10^{-3}$')
    ax.set_xlabel('Total MAMJ Precipitation, mm')
    dNIPA = gk(e[:, x], bw_method = 0.5)
    p_x = np.linspace(0,600,1000)
    ax.plot(p_x, dNIPA.pdf(p_x)*1000, linewidth = 2, color = rgba[0], label = 'Bad')
    ax.fill_between(p_x, 0, dNIPA.pdf(p_x)*1000, color = rgba[0], alpha = 0.5)
    dNIPA = gk(e[:, x2], bw_method = 0.5)
    ax.plot(p_x, dNIPA.pdf(p_x)*1000, linewidth = 2, color = rgba[1], label = 'Worse')
    ax.fill_between(p_x, 0, dNIPA.pdf(p_x)*1000, color =rgba[1] , alpha = 0.5)
    ax.vlines(100, 0,8, linewidth = 2, label = 'obs')

    plt.legend()
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/rpss_exmp')
    return
Example #7
def mcmc_ttmp(al=np.genfromtxt(ALIGNFILE,delimiter=',').astype(int), imps=IMPS):
	allen = al.shape[0]
	seqlen = al.shape[1]
	delclust = clust(al)
	
	print('Building likelihood distributions...')
	ldist = norm(*norm.fit(np.genfromtxt(RDIST, delimiter=',')))
	def lik(al):
		return ldist.pdf(tt.ttratio(al))
	try: 
		pdist = gk(np.genfromtxt(TTMP_DIST, delimiter=','))
	except IOError: 
		print('Existing distribution not found, building...')
		pdist = lclass_ttmp(al, imps, lik)

	print('Starting MCMC:')
	print('Step#\tOld Clust\t|New Lik\t|New PropLik\t|Old Lik\t|Old PropLik\t|Accept Prob')
	old = impute.impute(al,imps, orderfunc=ORDERFUNC)
	old_lik = lik(old)
	old_plik = pdist(old_lik)
	old_clust = clust(old)

	states = [(old_clust,old_lik,old_plik,old_lik,old_plik,1)]

	Q, procs, data = multiprocessing.Queue(maxsize=MQS), [], []
	numprocs = multiprocessing.cpu_count()-1
	reps = -(-STEPS // numprocs)  # ceiling division
	for i in range(numprocs):
		p = multiprocessing.Process(target=gttmp, args=(lik,al,imps,reps,Q,i,pdist))
		procs.append(p)
		p.start()
	for i in range(reps*numprocs):
		prop, prop_lik, prop_plik, prop_clust = Q.get()
		a = (prop_lik/old_lik)*(old_plik/prop_plik)
		states.append((old_clust,prop_lik,prop_plik,old_lik,old_plik,a))
		print('%d\t|%2f\t|%2f\t|%2f\t|%2f\t|%2f\t|%e' % (i+1,old_clust,prop_lik,prop_plik,old_lik,old_plik,a))
		if random.random()<a:
			old, old_lik, old_plik, old_clust = prop, prop_lik, prop_plik, prop_clust

	states.append((old_clust,prop_lik,prop_plik,old_lik,old_plik,a))
	np.savetxt(TTMP_STATES, np.array(states), delimiter=',')
Example #8
def lclass_ttmp(al, imps, lik):
	allen = al.shape[0]
	seqlen = al.shape[1]
	delclust = clust(al)
	Q, procs, data = multiprocessing.Queue(maxsize=MQS), [], []
	numprocs = multiprocessing.cpu_count()
	reps = -(-CCLASS_REPS // numprocs)  # ceiling division
	for i in range(numprocs):
		p = multiprocessing.Process(target=gttmp, args=(lik,al,imps,reps,Q,i))
		procs.append(p)
		p.start()
	old_percent = 0
	for i in range(reps*numprocs):
		percent = int(float(i)/(reps*numprocs) * 100)
		if percent > old_percent: 
			print('%d percent' % int(percent))
			old_percent = percent
		prop, prop_lik, prop_clust = Q.get()
		data.append(prop_lik)
	np.savetxt(TTMP_DIST, data, delimiter=',')		# Save ratios?
	return gk(data)
Example #9
def main():
	# Parse input
	parser = argparse.ArgumentParser(description='validate MCMC performance')
	parser.add_argument('alignfile', help='file containing alignment to use')
	parser.add_argument('-n', '--numseqs', type=int, help='number of sequences to use in subsample', required=True)
	parser.add_argument('-m', '--numsites', type=int, help='number of sites to include in subsample', required=True)
	parser.add_argument('-q', '--subseqs', type=int, help='number of sequences to subsample to', required=True)
	parser.add_argument('-r', '--repetitions', type=int, help='number of repetitions to do', required=True)
	parser.add_argument('-s', '--subsamples', type=int, help='number of subsamples per repetition', required=True)
	args = parser.parse_args()
	try: al = np.genfromtxt(args.alignfile, delimiter=',').astype(int)
	except ValueError: print('Invalid alignment file'); exit()
	reps = args.repetitions
	subsamples = args.subsamples
	allen = args.numseqs
	seqlen = args.numsites
	subseqs = args.subseqs

	fname = args.alignfile[:-4]+'_tr'
	imps = allen-subseqs
	devnull = open(os.devnull, 'w')

	print """
Loaded data:
Full alignment is %dx%d sequences & sites.
Randomly selected datasets will be %dx%d sequences and sites.
Subsamples will be %dx%d sequences and sites.
There will be %d repetitions of %d subsamples each.\n
	""" % (al.shape[0], al.shape[1], allen, seqlen, subseqs, seqlen, reps, subsamples)

	results = []
	print('dset\tsubset\ttrue_clust\tsub_clust\tmcmc_clust\timp_clust')
	# Main loop
	for i in range(reps):
		# Generate a dataset to test on
		dataset = al[np.random.choice(range(al.shape[0]),allen,replace=False)][:,np.random.choice(range(al.shape[1]),seqlen,replace=False)]
		trueclust = m.clust(dataset)
		for j in range(subsamples):
			# Generate a subsample of the current dataset to simulate missingness
			subsample = dataset[np.random.choice(range(dataset.shape[0]),subseqs,replace=False)]
			subclust = m.clust(subsample)

			# Attempt to recover with imputation
			imputed_states = [m.impute.impute(subsample, imps) for k in range(IMPUTATIONS)]
			imputed_clusts = [m.clust(s) for s in imputed_states]
			avg_impclust = np.mean(imputed_clusts)

			# Attempt to recover with KS optimization
			"""m.V_TDIST, m.V_STATES = '%s_%d_%d_target.csv' % (fname,i,j), '%s_%d_%d_states.csv' % (fname,i,j)
			sys.stdout, sys.stderr = devnull, devnull
			states, tdist = m.mcmc_ns(al=subsample, imps=imps)
			sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__"""
#			tdist = h.build_target(subsample, 100,'%s_%d_%d_target.csv' % (fname,i,j))
#			cc, clust, ks = h.opt(subsample,imps,tdist)
			tdist = h.nonparametric_target(subsample, 100)
			cc, clust, ks = h.npopt(subsample,imps,tdist)
			
			# Plot results
			cclass_hist = plt.hist(cc,density=True,alpha=.5, label='Sampled congruency classes', color='green')
#			xr = np.linspace(np.min(states[:,2]),np.max(states[:,2]),1000)
			xr = np.linspace(0,1,1000)
			tpdf = gk(tdist)(xr)
			plt.plot(xr,tpdf, label='Target distribution', color='blue')
			ymax = np.max([np.max(cclass_hist[0]), np.max(tpdf)])
			plt.plot((subclust,subclust),(0,ymax),label='Observed clustering value (congruency class)', color='blue')
			mean_cclass = np.mean(cc)
			plt.plot((mean_cclass,mean_cclass),(0,ymax),label='MCMC average congruency class', color='green')
			plt.legend()
			plt.savefig('%s_%d_%d_cclass.png' % (fname,i,j))
			plt.clf()

			clust_hist = plt.hist(clust,density=True,alpha=.5, label='Sampled clustering values', color='green')
			ymax = np.max(clust_hist[0])
			plt.plot((subclust,subclust),(0,ymax),label='Observed clustering value', color='blue')
			plt.plot((trueclust,trueclust),(0,ymax),label='True clustering value', color='red')
			plt.plot((avg_impclust,avg_impclust),(0,ymax),label='Imputed point estimate (mean)', color='purple')
			mean_clust = np.mean(clust)
			plt.plot((mean_clust,mean_clust),(0,ymax),label='Mean MCMC estimate', color='green')
			plt.legend()
			plt.savefig('%s_%d_%d_clust.png' % (fname,i,j))
			plt.clf()
			results.append([i,j,trueclust,subclust,mean_clust,avg_impclust])
			print('%d\t%d\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f' % (i,j,trueclust,subclust,mean_clust,avg_impclust))
	np.savetxt('%s_summary.csv'%fname, results, delimiter=',', fmt='%s')
Example #10
def terciles():
    from data_load import gen_models, combine_models
    from scipy.stats import gaussian_kde as gk
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    # EV is presumably a project-level alias for os.environ (used for the save path below)

    models = gen_models(cc = 0.95, quick = True)
    c, h, e = combine_models(models)

    cmap = mpl.cm.get_cmap('viridis')
    rgba = [cmap(0.0), cmap(0.35), cmap(0.8)]

    idx = np.argsort(c)

    B = c[idx][24]
    A = c[idx][66]

    x = 20
    dNIPA = gk(e[:,x], bw_method = 0.5)
    dCLIM = gk(c, bw_method = 0.5)

    bclim = round(dCLIM.integrate_box(0,B),2)* 100
    aclim = round(dCLIM.integrate_box(A, 600),2)* 100
    nclim = round(dCLIM.integrate_box(B,A),2)* 100

    bnipa = round(dNIPA.integrate_box(0,B),2)* 100
    anipa = round(dNIPA.integrate_box(A, 600),2)* 100
    nnipa = round(dNIPA.integrate_box(B,A),2)* 100

    p_x = np.linspace(0,600,1000)
    below = p_x[p_x<B]
    middle = p_x[(p_x>=B) & (p_x<=A)]
    above = p_x[p_x>A]

    fig, (ax1, ax2) = plt.subplots(1,2,figsize = (12,6), sharey = True)

    ax1.plot(p_x, dCLIM.pdf(p_x)*1000, linewidth = 2, color = 'k')
    ax1.fill_between(below, 0, dCLIM.pdf(below)*1000, color = rgba[2], alpha = 0.5)
    ax1.fill_between(above, 0, dCLIM.pdf(above)*1000, color = rgba[0], alpha = 0.5)
    ax1.fill_between(middle, 0, dCLIM.pdf(middle)*1000, color = rgba[1], alpha = 0.5)
    line = '\n\nB: %.0f%%\nN: %.0f%%\nA: %.0f%%' % (bclim, nclim, aclim)
    ax1.text(400,3,line)
    ax1.set_title('Climatological PDF', fontsize = 22, fontweight = 'bold')
    ax1.set_ylabel('Probability Density, $10^{-3}$', fontweight = 'bold')

    ax2.plot(p_x, dCLIM.pdf(p_x)*1000, linewidth = 2,
                            color = 'grey', alpha = 0.7)
    ax2.fill_between(p_x, dCLIM.pdf(p_x)*1000, label = 'Climatology', color = 'grey', alpha = 0.2)
    ax2.plot(p_x, dNIPA.pdf(p_x)*1000, linewidth = 2, color = 'k')
    ax2.fill_between(below, 0, dNIPA.pdf(below)*1000, color = rgba[2],
                        label = 'Below', alpha = 0.5)
    ax2.fill_between(middle, 0, dNIPA.pdf(middle)*1000, color = rgba[1],
                        label = 'Normal', alpha = 0.5)
    ax2.fill_between(above, 0, dNIPA.pdf(above)*1000, color = rgba[0],
                        label = 'Above', alpha = 0.5)

    line = '\n\nB: %.0f%%\nN: %.0f%%\nA: %.0f%%' % (bnipa, nnipa, anipa)
    ax2.text(400,3,line)
    ax2.set_title('Forecast PDF', fontsize = 22, fontweight = 'bold')
    h, l = ax2.get_legend_handles_labels()
    fig.legend(h,l, loc = (0.395, 0.55))
    fig.text(0.33, 0.015, 'Total MAMJ Precipitation, mm')
    fig.savefig(EV['HOME'] + '/Desktop/Feb20Response/images/tercile')
    return
Example #11
def RPSS(clim_data, ensemble, n_dry = 10, n_wet = 10, file = None):
    from scipy.stats import gaussian_kde as gk
    from numpy import linspace, array, zeros, inf, round, argsort, median

    n = len(clim_data)
    nc = array([33., 34., 33.])
    prob_clim = nc/100
    pcum_clim = prob_clim.cumsum()
    x = linspace(0, clim_data.max() + 10, 500)
    dClim = gk(clim_data, bw_method = 0.5)
    rpss = zeros((n))
    probs = zeros((n,3))

    for i in range(n):
        dFcst = gk(ensemble[:,i], bw_method = 0.5)
        j = 1
        test = 0
        while test == 0:
            j = j+1
            y = dClim.integrate_box(0,j)
            if y >= pcum_clim[0]:
                lower_ind = j-1
                test = 1

        j = 500
        test = 0
        while test == 0:
            j = j-1
            y = dClim.integrate_box(0,j)
            if (y) <= pcum_clim[1]:
                test = 1
                upper_ind = j

        pf_B = round(dFcst.integrate_box(-inf, lower_ind), 2)
        pf_N = round(dFcst.integrate_box(lower_ind, upper_ind), 2)
        pf_A = round(dFcst.integrate_box(upper_ind, inf), 2)

        prob_fcst = array([pf_B, pf_N, pf_A])
        pcum_fcst = prob_fcst.cumsum()
        probs[i,:] = prob_fcst

        if clim_data[i]<=x[lower_ind]:
            pcum_obs = array([1,1,1])
        elif clim_data[i]>=x[upper_ind]:
            pcum_obs = array([0,0,1])
        else:
            pcum_obs = array([0,1,1])

        # RPS: half the sum of squared differences between cumulative forecast/climatology and observed probabilities
        rps_fcst = 0.5 * sum((pcum_fcst - pcum_obs)**2)

        rps_clim = 0.5 * sum((pcum_clim - pcum_obs)**2)


        rpss[i] = round((1 - rps_fcst/rps_clim), 2)
        if rpss[i] < -9: rpss[i] = -9

    idx = argsort(clim_data)

    index = {   'all' : idx,
                'dry' : idx[:n_dry],
                'wet' : idx[-n_wet:]
                }
    RPSS = {}
    for yrs in index:
        total = len(index[yrs])
        RPSS[yrs] = median(rpss[index[yrs]])
        # print '%s years: total is %i, median rpss is %.2f' \
        # % (yrs, total, RPSS[yrs])

    return RPSS
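
For reference, the ranked probability (skill) score that RPSS computes compares cumulative category probabilities of the forecast, the climatology and the observation over the three tercile classes. A worked sketch with illustrative numbers (not taken from the data above):

import numpy as np

pcum_clim = np.array([0.33, 0.67, 1.00])  # cumulative climatological probabilities (Below / Normal / Above)
pcum_fcst = np.array([0.10, 0.40, 1.00])  # hypothetical cumulative forecast probabilities (leaning wet)
pcum_obs  = np.array([0.00, 0.00, 1.00])  # observation fell in the 'Above' tercile

# RPS = 1/(K-1) * sum of squared differences between cumulative distributions, K = 3 categories
rps_fcst = 0.5 * np.sum((pcum_fcst - pcum_obs) ** 2)   # 0.085
rps_clim = 0.5 * np.sum((pcum_clim - pcum_obs) ** 2)   # about 0.279
rpss = 1.0 - rps_fcst / rps_clim                       # about 0.70: the forecast beats climatology

print(round(rps_fcst, 3), round(rps_clim, 3), round(rpss, 2))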
Example #12
import matplotlib.pyplot as plt

import numpy as np
import random

from scipy.stats import norm, gaussian_kde as gk
from scipy.special import hermite  # presumably the source of hermite(n, monic) used below

from brazil_percent import dados as inflow  # imported but not used below

dist = random.gauss
X = [dist(0, 1) for _ in range(10000)]
n = len(X)

x_new = np.linspace(-5, 5, 1000)
y = norm.pdf(x_new)

x_sort = sorted(X)
kernel = gk(x_sort, bw_method='silverman')
points = np.linspace(min(x_sort), max(x_sort), 1000)
fit = kernel(points)
print(x_sort)
h = kernel.factor

print(h)
print(kernel.covariance)
print(1 / h)

y_herm = [hermite(2, True)(i) for i in x_new]

plt.hist(x_sort, 100, density=True)
plt.plot(x_new, y, 'k')
plt.plot(x_new, y_herm)
plt.show()
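
Example #12 prints kernel.factor and kernel.covariance. For 1-D, unweighted data, gaussian_kde stores the sample covariance of the data scaled by the squared bandwidth factor, so the effective kernel standard deviation is factor times the sample standard deviation. A small check of that relationship on synthetic data (not part of the original script):

import numpy as np
from scipy.stats import gaussian_kde as gk

rng = np.random.default_rng(1)
data = rng.normal(0.0, 1.0, 10000)

kernel = gk(data, bw_method='silverman')
bandwidth = kernel.factor * data.std(ddof=1)  # effective kernel standard deviation

# covariance is stored as factor**2 times the sample covariance of the data
print(kernel.factor, bandwidth)
print(np.isclose(kernel.covariance[0, 0], kernel.factor ** 2 * data.var(ddof=1)))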