def simple_hierarchical_data(n):
    """ Generate data based on the simple one-way hierarchical model
    given in section 3.1.1::

        y[i,j] | alpha[j], sigma^2 ~ N(alpha[j], sigma^2)  i = 1, ..., n_j, j = 1, ..., J;
        alpha[j] | mu, tau^2 ~ N(mu, tau^2)  j = 1, ..., J.

        sigma^2 ~ Inv-Chi^2(5, 20)
        mu ~ N(5, 5^2)
        tau^2 ~ Inv-Chi^2(2, 10)

    Parameters
    ----------
    n : list, len(n) = J, n[j] = num observations in group j
    """
    inv_sigma_sq = mc.rgamma(alpha=2.5, beta=50.0)
    mu = mc.rnormal(mu=5.0, tau=5.0**-2.0)
    inv_tau_sq = mc.rgamma(alpha=1.0, beta=10.0)

    J = len(n)
    alpha = mc.rnormal(mu=mu, tau=inv_tau_sq, size=J)
    y = [mc.rnormal(mu=alpha[j], tau=inv_sigma_sq, size=n[j]) for j in range(J)]

    mu_by_tau = mu * pl.sqrt(inv_tau_sq)
    alpha_by_sigma = alpha * pl.sqrt(inv_sigma_sq)
    alpha_bar = alpha.sum()
    alpha_bar_by_sigma = alpha_bar * pl.sqrt(inv_sigma_sq)

    return vars()
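# A minimal usage sketch (an assumption, not part of the original source): it presumes
# `pymc` is imported as `mc` and `pylab` as `pl`, which is what the function above expects.
import pymc as mc
import pylab as pl

n = [5, 10, 20]                          # three groups with 5, 10 and 20 observations
d = simple_hierarchical_data(n)          # dict of every local, via return vars()
print d['mu'], d['alpha'], [len(yj) for yj in d['y']]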
def pr5km_pop5km(self, pr, pop):
    """ Expects a pr array in 5km and pop array in 5km """

    # lose any trailing dimensionality
    pr = np.squeeze(pr)
    pop = np.squeeze(pop)

    #if pr.shape != (0,pop[::pop_pr_res].shape[1]):
    if pr.shape != pop.shape:
    #if pr.shape[0]!=np.shape(pop[:,::1])[1]:
        raise ValueError, 'PR input has shape %s, but the population input had shape %s.'%(pr.shape, pop.shape)

    # define blank 5km 2-d array to house burden
    burden_5km = np.zeros(np.product(pr.shape)).reshape(pr.shape)

    # extract vector of pr at 5km only where pr is non-zero - if all zero then return blank template
    where_pr_pos_5km = np.where(pr > 0)
    if len(where_pr_pos_5km[0]) == 0:
        return burden_5km
    pr_where_pr_pos_5km = np.atleast_1d(pr[where_pr_pos_5km])

    # initialise 5km zero array for rate
    rate_5km = np.zeros(np.product(pr.shape)).reshape(pr.shape)

    # calculate rate for non-zero PR pixels
    i = np.random.randint(self.n)
    mu = self.f[i](pr_where_pr_pos_5km)
    r = (self.r_int[i] + self.r_lin[i] * pr_where_pr_pos_5km + self.r_quad[i] * pr_where_pr_pos_5km**2) * self.nyr
    rate_where_pr_pos_5km = np.atleast_1d(pm.rgamma(beta=r/mu, alpha=r))

    # re-map these rate values onto full length 5km rate vector
    rate_5km[where_pr_pos_5km] = rate_where_pr_pos_5km

    if np.shape(pop) != np.shape(rate_5km):
        raise ValueError, '5km rate array has shape %s, but the 5km population array has shape %s.'%(np.shape(rate_5km), np.shape(pop))

    # multiply 5km rate by 5km pop array
    popRate = rate_5km * pop

    # extract non-zero pixels (now also excludes zero pop as well as zero rate), and return all zeroes if no non-zero pixels
    where_popRate_pos = np.where(popRate > 0)
    if len(where_popRate_pos[0]) == 0:
        return burden_5km
    popRate_where_popRate_pos = popRate[where_popRate_pos]

    # carry out Poisson draws to define burden in these non-zero pixels
    burden_where_popRate_pos = np.random.poisson(popRate_where_popRate_pos)

    # re-map burden values to full 5km 2-d burden array
    burden_5km[where_popRate_pos] = burden_where_popRate_pos

    #for l in xrange(0,len(where_pos[0])):
    #    j=where_pos[0][l]
    #    out[:,j*pop_pr_res:(j+1)*pop_pr_res] = np.random.poisson(rate[l]*pop[:,j*pop_pr_res:(j+1)*pop_pr_res],size=(pop_pr_res,pop_pr_res))

    return burden_5km
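# A hypothetical call sketch for the method above. The predictor construction and HDF5
# filename are borrowed from the incidence() example below; the rasters are synthetic.
import numpy as np
p2b = BurdenPredictor('CSE_Asia_and_Americas_scale_0.6_model_exp.hdf5', N_year)
pr = np.random.uniform(0., .5, size=(100, 100))       # synthetic 5km PfPR surface
pop = np.random.uniform(0., 5000., size=(100, 100))   # synthetic 5km population surface
burden = p2b.pr5km_pop5km(pr, pop)                    # one stochastic burden draw, same shape as pr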
def incidence(sp_sub,
              two_ten_facs=two_ten_factors,
              p2b=BurdenPredictor('CSE_Asia_and_Americas_scale_0.6_model_exp.hdf5', N_year),
              N_year=N_year):
    pr = sp_sub.copy('F')
    pr = invlogit(pr) * two_ten_facs[np.random.randint(len(two_ten_facs))]

    i = np.random.randint(len(p2b.f))
    mu = p2b.f[i](pr)

    # Uncomment and draw a negative binomial variate to get incidence over a finite time horizon.
    r = (p2b.r_int[i] + p2b.r_lin[i] * pr + p2b.r_quad[i] * pr**2)
    ar = pm.rgamma(beta=r/mu, alpha=r*N_year)

    #out = (1-np.exp(-ar))  # what we originally produced e.g. for Afghanistan
    out = ar                # effectively what we did for burden paper

    out[np.where(out==0)] = 1e-10
    out[np.where(out==1)] = 1-(1e-10)

    return out
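# A small check of the Gamma parameterization assumed in the snippets above: in PyMC 2,
# pm.rgamma(alpha=a, beta=b) uses the shape/rate convention, so its mean is a/b.  Hence
# pm.rgamma(beta=r/mu, alpha=r) has mean mu and shape r -- the gamma-Poisson mixing step
# behind the negative binomial draws mentioned in the comments.  The numbers here are
# illustrative only.
import numpy as np
import pymc as pm

mu, r = 0.3, 2.0
draws = pm.rgamma(beta=r/mu, alpha=r, size=100000)
print np.mean(draws)   # should be close to mu = 0.3
print np.var(draws)    # should be close to mu**2/r = 0.045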
def step(self):
    # We're going to do this in a way that allows easy extension
    # to multivariate beta (and even obs with non-diagonal covariances,
    # for whatever that's worth).
    y = np.atleast_1d(np.squeeze(self.gamma_obs.value))

    if np.alen(y) == 0:
        self.stochastic.random()
        return

    mu_y = getattr(self.gamma_mu, 'value', self.gamma_mu)

    r2 = np.sum(np.square(y - mu_y))

    alpha_post = self.alpha_prior + np.alen(y)/2.
    beta_post = self.beta_prior + r2/2.

    parents_post = {'alpha': alpha_post, 'beta': beta_post}
    self.stochastic.parents_post = parents_post

    self.stochastic.value = pymc.rgamma(**parents_post)
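# The step() method above is a Gibbs update for a Normal precision with a conjugate Gamma
# prior: if y_i ~ N(mu, tau) with tau ~ Gamma(alpha, beta) (rate parameterization), the full
# conditional is Gamma(alpha + n/2, beta + sum((y_i - mu)**2)/2), which is exactly what the
# pymc.rgamma call samples.  A standalone sketch of the same update (names here are
# illustrative, not from the source):
import numpy as np
import pymc

def gamma_precision_gibbs_draw(y, mu, alpha_prior, beta_prior):
    # conjugate posterior draw for the precision of N(mu, tau) data
    y = np.atleast_1d(y)
    r2 = np.sum(np.square(y - mu))
    return pymc.rgamma(alpha=alpha_prior + len(y)/2., beta=beta_prior + r2/2.)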
import pymc
import numpy as np

trans = [[80, 10, 10], [10, 80, 10], [10, 10, 80]]
n_samples = 1000

means = [pymc.rgamma(alpha, beta, size=n_samples)
         for alpha, beta in zip([1, 2, 3], [0.1, 0.2, 0.3])]
variances = [pymc.rgamma(alpha, beta, size=n_samples)
             for alpha, beta in zip([.2, .3, .4], [0.1, 0.1, 0.1])]
transitions = [pymc.rdirichlet(trans_, size=n_samples) for trans_ in trans]

n_gamma = 3
n_modes = n_gamma * 2

mean_params = [pymc.Gamma('mean_param{}'.format(i), alpha=1, beta=.1) for i in range(n_modes)]
var_params = [pymc.Gamma('var_param{}'.format(i), alpha=1, beta=.1) for i in range(n_modes)]
trans_params = [pymc.Beta('trans_params{}'.format(i), alpha=1, beta=1) for i in range(n_gamma*n_gamma)]

mean_obs = []
mean_pred = []
var_obs = []
var_pred = []
trans_obs = []
trans_pred = []

for i in xrange(n_gamma):
    alpha1 = mean_params[i*2]
    beta1 = mean_params[i*2+1]
    mean_obs.append(pymc.Gamma("mean_obs{}".format(i), alpha=alpha1, beta=beta1, value=means[i], observed=True))
    mean_pred.append(pymc.Gamma("mean_pred{}".format(i), alpha=alpha1, beta=beta1))
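# A possible way to fit the model fragment above (a sketch, not from the original source):
# gather the nodes built so far and run PyMC 2's default step methods over them.
mcmc = pymc.MCMC(mean_params + var_params + trans_params + mean_obs + mean_pred)
mcmc.sample(iter=10000, burn=5000)
print mcmc.trace('mean_param0')[:].mean()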
def map_S(S):
    # Make a map
    rast = spherical.mesh_to_map(X, S.value, 501)
    import pylab as pl
    pl.clf()
    pl.imshow(rast, interpolation='nearest')
    pl.colorbar()

S.rand()

lpf = [lambda x: 0 for i in xrange(n)]
lp = 0 * S.value

vals = X[:, 0]
vars = pm.rgamma(4, 4, size=n) / 1000
likelihood_vars = np.vstack((vals, vars)).T
Qobs = sparse.csc_matrix((n, n))
lpf_str = "lkp = -({X}-lv(i,1))**2/2.0D0/lv(i,2)"
Qobs.setdiag(1. / vars)
# lpf_str = "lkp=0"
# Qobs.setdiag(0*vars+1e-8)

import pylab as pl

S.rand()
metro = pymc_objects.GMRFMetropolis(S,
def make_plots(cols, dbname, continent, recs, pr_type, nyr=1):
    samp_size = 1000
    print continent
    if continent.find('Africa') >= 0:
        lims = [.8, 2.5]
    elif continent.find('Asia') >= 0:
        lims = [.5, 1.5]
    elif continent.find('America') >= 0:
        lims = [.2, 1.]
    else:
        lims = [.8, 2.5]

    model_id = dbname + '_' + PR_to_column[pr_type]

    time_scale_fac = time_scaling(recs.pcd, recs.surv_int)

    if pr_type == 'model_exp':
        pr = recs.mbg_pr
    elif pr_type == 'data':
        pr = recs.pr
    elif pr_type == 'mixed':
        pr = recs.mix_pr
    elif pr_type == 'data_untrans':
        pr = recs.pfpr
    else:
        raise ValueError, 'PR type unknown'

    pl.clf()
    envs_post = pm.Matplot.func_envelopes(cols.fplot[:]*samp_size, [.25, .5, .9])
    for env in envs_post:
        env.display(xplot, .8, new=False)

    pl.xlabel(r'Prevalence $(Pf$PR$_{2-10})$')
    pl.ylabel('Incidence (per 1000 p.a.)')
    # ar_data = recs.cases/recs.pyor/np.minimum(1,7./recs.surv_int)
    ar_data = recs.cases/recs.pyor*time_scale_fac
    # print ar_data.min()*samp_size, ar_data.max()*samp_size
    # ar_in = ar_data[np.where((pr<.25) * (pr > .10))]*samp_size
    # print ar_in.min(), ar_in.max()
    pl.plot(pr, ar_data*samp_size, 'r.', label='data')
    # pl.title(continent)
    # pl.legend(loc=2)
    pl.axis([0, lims[0], 0, 2500])
    pl.savefig('../figs/%s_post.png'%model_id)

    # pl.figure()
    # pl.plot(xplot, cols.fplot[:].T*samp_size)
    # pl.plot(pr, ar_data*samp_size, 'r.', label='data')

    pl.figure()
    Nsamps = len(cols.r)
    AR_pred = np.empty((Nsamps*100, len(xplot)))
    for i in xrange(Nsamps):
        this_mu = cols.fplot[i]
        this_r = (cols.r_int[i] + cols.r_lin[i] * xplot + cols.r_quad[i] * xplot**2)
        # Uncomment to make an actual sample
        # AR_pred[i*100:(i+1)*100,:] = pm.rnegative_binomial(r=this_r, mu=this_mu*samp_size, size=100)
        # Uncomment to multiply population-wide AR
        AR_pred[i*100:(i+1)*100,:] = pm.rgamma(beta=this_r*nyr/this_mu, alpha=this_r*nyr, size=100)*1000

    envs_pred = pm.Matplot.func_envelopes(AR_pred, [.25, .5, .9])
    for env in envs_pred:
        env.display(xplot, .8, new=False)

    thirty_index = np.argmin(np.abs(xplot-.3))
    print envs_pred[0].hi[thirty_index], envs_pred[0].lo[thirty_index]

    pl.xlabel(r'Prevalence $(Pf$PR$_{2-10})$')
    pl.ylabel('Incidence (per 1000 p.a.)')
    pl.plot(pr, ar_data*samp_size, 'r.', label='data')
    # pl.title(continent)
    # pl.legend(loc=2)
    pl.axis([0, lims[0], 0, 2500])
    pl.savefig('../figs/%s_pred.png'%model_id)

    # Pdb(color_scheme='Linux').set_trace()

    # if hasattr(recs.lat, 'mask'):
    #     where_lonlat = np.where(1-recs.lat.mask)
    # # else:
    # #     where_lonlat = np.where(1-np.isnan(recs.lat))
    # lat = recs.lat[where_lonlat]
    # lon = recs.lon[where_lonlat]
    mean_dev = np.mean(cols.AR_dev[:], axis=0)  #[where_lonlat]
    # devs = np.rec.fromarrays([mean_dev, recs.lon, recs.lat], names=('mean_deviance','longitude','latitude'))
    # pl.rec2csv(devs, '../figs/%s_deviance.csv'%model_id)
    # pl.close('all')

    return envs_post, envs_pred
def make_model(X):
    neighbors, triangles, trimap, b = spherical.triangulate_sphere(X)
    # spherical.plot_triangulation(X,neighbors)

    # Matrix generation
    triangle_areas = [spherical.triangle_area(X, t) for t in triangles]
    Ctilde = spherical.Ctilde(X, triangles, triangle_areas)
    C = spherical.C(X, triangles, triangle_areas)
    G = spherical.G(X, triangles, triangle_areas)

    # Operator generation
    Ctilde = cholmod.into_matrix_type(Ctilde)
    G = cholmod.into_matrix_type(G)

    # amp is the overall amplitude. It's a free variable that will probably be highly confounded with kappa.
    amp = pm.Exponential('amp', .0001, value=100)

    # A constant mean.
    m = pm.Uninformative('m', value=0)

    @pm.deterministic(trace=False)
    def M(m=m, n=len(X)):
        """The mean vector"""
        return np.ones(n) * m

    kappa = pm.Exponential('kappa', 1, value=3)
    alpha = pm.DiscreteUniform('alpha', 1, 10, value=2., observed=True)

    @pm.deterministic(trace=False)
    def Q(kappa=kappa, alpha=alpha, amp=amp):
        out = operators.mod_frac_laplacian_precision(Ctilde, G, kappa, alpha, cholmod) / np.asscalar(amp)**2
        return out

    # Nailing this ahead of time reduces time to compute logp from .18 to .13s for n=25000.
    pattern_products = cholmod.pattern_to_products(Q.value)
    # @pm.deterministic
    # def pattern_products(Q=Q):
    #     return cholmod.pattern_to_products(Q)

    @pm.deterministic(trace=False)
    def precision_products(Q=Q, p=pattern_products):
        try:
            return cholmod.precision_to_products(Q, **p)
        except cholmod.NonPositiveDefiniteError:
            return None

    S = pymc_objects.SparseMVN('S', M, precision_products, cholmod)

    vars = pm.rgamma(4, 4, size=n)
    vals = X[:, 2]
    data = pm.Normal('data', S, 1./vars, value=vals, observed=True)

    Qobs = sparse.csc_matrix((n, n))
    Qobs.setdiag(1./vars)

    @pm.deterministic(trace=False)
    def true_evidence(Q=Q, M=M, vals=vals, vars=vars):
        C = np.array(Q.todense().I + np.diag(vars))
        return pm.mv_normal_cov_like(vals, M, C)

    # Stuff for the scoring algorithm-based full conditional
    def first_likelihood_derivative(x, vals=vals, vars=vars):
        return -(x - vals) / vars

    def second_likelihood_derivative(x, vals=vals, vars=vars):
        return -1. / vars

    return locals()
    return out

# Nailing this ahead of time reduces time to compute logp from .18 to .13s for n=25000.
pattern_products = cholmod.pattern_to_products(Q.value)
# @pm.deterministic
# def pattern_products(Q=Q):
#     return cholmod.pattern_to_products(Q)

@pm.deterministic
def precision_products(Q=Q, p=pattern_products):
    return cholmod.precision_to_products(Q, **p)

S = pymc_objects.SparseMVN('S', M, precision_products, cholmod)

vals = X[:, 0]
vars = pm.rgamma(4, 4, size=n) / 10
Qobs = sparse.csc_matrix((n, n))
Qobs.setdiag(1./vars)

def vecdiff(v1, v2):
    return np.abs((v2 - v1)).max()

true_mcond, _ = cholmod.conditional_mean_and_precision_products(vals, M, Q.value+Qobs, Qobs, **pattern_products)
# true_mcond_ = M+np.dot(Q.value.todense().I,np.linalg.solve((Q.value.todense().I+np.diag(vars)),(vals-M)))

# Stuff for the scoring algorithm-based full conditional
def first_likelihood_derivative(x, vals=vals, vars=vars):
    return -(x - vals) / vars