def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    """Simulate age-interval data with area random effects, fit it, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        True dispersion multiplier; each observation is drawn from a
        negative binomial with second argument ``delta_true * n * p``.
    sigma_true : list of float
        Passed to ``alpha_true_sim`` and used as the ``true`` column of
        ``model.sigma``.  The default list is shared between calls; this
        function does not modify it itself.
    pi_true : callable
        Maps the array of ages 0..100 to the true age-specific rate.
    smoothness : str
        Stored as ``model.parameters['p']['smoothness']['amount']``.
    heterogeneity : str
        Stored as ``model.parameters['p']['heterogeneity']``.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the
        comparison tables ``input_data``, ``delta``, ``alpha``, ``sigma``
        and ``mu`` attached, plus ``results`` as built by the
        ``data_simulation`` result helpers.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    # load an empty disease model only to obtain its area hierarchy, then
    # attach that hierarchy to a fresh simulation model
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = gbd_model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    # random age intervals; age_end is drawn at or above age_start
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # true rate of each interval from differences of cumulative sums
    age_weights = pl.ones_like(ages)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    p_interval = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end (the ratio above is 0/0)
    point_rows = age_start == age_end
    if pl.any(point_rows):
        p_interval[point_rows] = pi_age_true[age_start[point_rows]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east',
                          'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print(alpha)

    # placeholder keeps the 'true' column ahead of 'area' when it is new
    model.input_data['true'] = pl.nan

    # assign each row to one of the areas with equal probability
    area_probs = pl.ones(len(area_list)) / float(len(area_list))
    model.input_data['area'] = area_list[mc.rcategorical(area_probs, N)]

    # Scale each row's interval rate by exp(sum of the random effects on the
    # path from 'all' to the row's area).  The column is built as a whole
    # and assigned once: element-wise chained assignment
    # (frame[col][row] = x) is not guaranteed to write through to the frame.
    true_rates = []
    for row, area in model.input_data['area'].iteritems():
        path = nx.shortest_path(model.hierarchy, 'all', area)
        total_effect = pl.sum([alpha[node] for node in path if node in alpha])
        true_rates.append(p_interval[row] * pl.exp(total_effect))
    model.input_data['true'] = pandas.Series(true_rates, index=model.input_data.index, dtype=float)

    p = model.input_data['true']
    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    # NOTE(review): 'north_africa_middle_east' is presumably the reference
    # area for the fit -- confirm against data_model.data_model
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    # for a quick run, iter=1005, burn=500, thin=5 was used previously
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    data_simulation.add_quality_metrics(model.delta)

    # random effects, indexed by hierarchy node; rows without both a true
    # value and an estimate are dropped
    alpha_stats = [node.stats() for node in p_vars['alpha']]
    model.alpha = pandas.DataFrame(index=list(nx.traversal.dfs_preorder_nodes(model.hierarchy)))
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([s['mean'] for s in alpha_stats], index=p_vars['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([s['standard deviation'] for s in alpha_stats], index=p_vars['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    sigma_stats = [node.stats() for node in p_vars['sigma_alpha']]
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [s['mean'] for s in sigma_stats]
    model.sigma['sigma_pred'] = [s['standard deviation'] for s in sigma_stats]
    data_simulation.add_quality_metrics(model.sigma)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))

    # age pattern: truth versus posterior of mu_age
    mu_age_stats = p_vars['mu_age'].stats()
    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=mu_age_stats['mean'],
                                     sigma_pred=mu_age_stats['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print(model.results)

    return model
def validate_age_integrating_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    """Simulate age-interval data from a known age pattern, fit it, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        True dispersion multiplier; each observation is drawn from a
        negative binomial with second argument ``delta_true * n * p``.
    pi_true : callable
        Maps the array of ages 0..100 to the true age-specific rate.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``delta``, ``mu`` and ``results`` attached.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    # age mesh and smoothness are left as simple_model provides them
    model = data_simulation.simple_model(N)

    # random age intervals; the upper end is drawn at or above the lower end
    lower_age = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    upper_age = pl.array(mc.runiform(lower_age, 100, size=N), dtype=int)

    # interval rate = difference of cumulative weighted rate divided by
    # difference of cumulative weight
    unit_weights = pl.ones_like(ages)
    cum_rate = pl.cumsum(pi_age_true * unit_weights)
    cum_weight = pl.cumsum(unit_weights)
    rate_mass = cum_rate[upper_age] - cum_rate[lower_age]
    weight_mass = cum_weight[upper_age] - cum_weight[lower_age]
    true_rate = rate_mass / weight_mass

    # correct cases where age_start == age_end
    single_age = lower_age == upper_age
    if pl.any(single_age):
        true_rate[single_age] = pi_age_true[lower_age[single_age]]

    ess = mc.runiform(100, 10000, size=N)

    observations = model.input_data
    observations['age_start'] = lower_age
    observations['age_end'] = upper_age
    observations['effective_sample_size'] = ess
    observations['true'] = true_rate
    observations['value'] = mc.rnegative_binomial(ess * true_rate, delta_true * ess * true_rate) / ess

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    fitted = model.vars['p']
    model.map, model.mcmc = fit_model.fit_data_model(fitted, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(fitted, 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, fitted, {}, 'p')
    pl.plot(ages, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictive summary for every data row
    predictive = fitted['p_pred'].stats()
    observations['mu_pred'] = predictive['mean']
    observations['sigma_pred'] = predictive['standard deviation']
    data_simulation.add_quality_metrics(observations)

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(fitted['eta'].trace())
    model.delta = pandas.DataFrame({'true': [delta_true]})
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    data_simulation.add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    summary = (observations['abs_err'].mean(),
               pl.median(pl.absolute(observations['rel_err'].dropna())),
               observations['covered?'].mean())
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % summary)

    # age pattern: truth versus posterior of mu_age
    age_pattern = fitted['mu_age'].stats()
    model.mu = pandas.DataFrame({'true': pi_age_true,
                                 'mu_pred': age_pattern['mean'],
                                 'sigma_pred': age_pattern['standard deviation']})
    data_simulation.add_quality_metrics(model.mu)

    # collect one summary row per table, then freeze into a DataFrame
    model.results = {'param': [], 'bias': [], 'mare': [], 'mae': [], 'pc': []}
    for table_name in ('delta', 'mu', 'input_data'):
        data_simulation.add_to_results(model, table_name)
    model.results = pandas.DataFrame(model.results, columns=['param', 'bias', 'mae', 'mare', 'pc'])

    print(model.results)

    return model
def validate_covariate_model_fe(N=100, delta_true=3, pi_true=.01, beta_true=[.5, -.5, 0.], replicate=0):
    """Simulate data with fixed effects, fit the model, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        Second argument of the negative-binomial draw that generates
        ``value``; also the ``true`` entry of ``model.delta``.
    pi_true : float
        Baseline rate; each row's true rate is
        ``pi_true * exp(dot(X, beta_true))``.
    beta_true : list of float
        True coefficients for the covariates ``x_0``, ``x_1``, ...  The
        default list is shared between calls and is not modified here.
    replicate : int
        Added to the base random seed so each replicate is reproducible.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``beta``, ``delta`` and ``results`` attached.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    # (PyMC's rnormal takes a precision as its second argument)
    X = mc.rnormal(0., 1.**-2, size=(N, len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i, x_i in enumerate(X.T):
        model.input_data['x_%d' % i] = x_i
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    # True coefficient per fitted covariate column; columns without a
    # simulated effect get 0.  The column is assigned in one step because
    # chained assignment (frame['true'][label] = x) is not guaranteed to
    # write through to the frame.
    model.beta = pandas.DataFrame(index=p_vars['X'].columns)
    true_by_name = dict(('x_%d' % i, float(b)) for i, b in enumerate(beta_true))
    model.beta['true'] = [true_by_name.get(name, 0.) for name in model.beta.index]

    beta_stats = [node.stats() for node in p_vars['beta']]
    model.beta['mu_pred'] = [s['mean'] for s in beta_stats]
    model.beta['sigma_pred'] = [s['standard deviation'] for s in beta_stats]
    add_quality_metrics(model.beta)

    print('\nbeta')
    print(model.beta)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.beta['abs_err'].dropna())),
        model.beta.dropna()['covered?'].mean()))
    add_to_results(model, 'input_data')
    # NOTE(review): 'beta' was already added above, so it is recorded twice
    # in the results; kept as-is to preserve the returned table -- confirm
    # whether the duplicate is intended.
    add_to_results(model, 'beta')
    model.results = pandas.DataFrame(model.results)

    return model
def validate_age_pattern_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    """Simulate single-age data from a known age pattern, fit it, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        True dispersion multiplier; each observation is drawn from a
        negative binomial with second argument ``delta_true * n * p``.
    pi_true : callable
        Maps the array of ages 0..100 to the true age-specific rate.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``delta``, ``mu`` and ``results`` attached.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)

    # every row observes a single age, so age_start and age_end coincide
    observed_age = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    true_rate = pi_age_true[observed_age]
    ess = mc.runiform(100, 10000, size=N)

    observations = model.input_data
    observations['age_start'] = observed_age
    observations['age_end'] = observed_age
    observations['effective_sample_size'] = ess
    observations['true'] = true_rate
    observations['value'] = mc.rnegative_binomial(ess * true_rate, delta_true * ess * true_rate) / ess

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    fitted = model.vars['p']
    model.map, model.mcmc = fit_model.fit_data_model(fitted, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(fitted, 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, fitted, {}, 'p')
    pl.plot(ages, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictive summary for every data row
    predictive = fitted['p_pred'].stats()
    observations['mu_pred'] = predictive['mean']
    observations['sigma_pred'] = predictive['standard deviation']
    data_simulation.add_quality_metrics(observations)

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(fitted['eta'].trace())
    model.delta = pandas.DataFrame({'true': [delta_true]})
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    data_simulation.add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    summary = (observations['abs_err'].mean(),
               pl.median(pl.absolute(observations['rel_err'].dropna())),
               observations['covered?'].mean())
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % summary)

    # age pattern: truth versus posterior of mu_age
    age_pattern = fitted['mu_age'].stats()
    model.mu = pandas.DataFrame({'true': pi_age_true,
                                 'mu_pred': age_pattern['mean'],
                                 'sigma_pred': age_pattern['standard deviation']})
    data_simulation.add_quality_metrics(model.mu)

    # collect one summary row per table, then freeze into a DataFrame
    model.results = {'param': [], 'bias': [], 'mare': [], 'mae': [], 'pc': []}
    for table_name in ('delta', 'mu', 'input_data'):
        data_simulation.add_to_results(model, table_name)
    model.results = pandas.DataFrame(model.results, columns=['param', 'bias', 'mae', 'mare', 'pc'])

    print(model.results)

    return model
def validate_covariate_model_re(N=500, delta_true=.15, pi_true=.01, sigma_true=[.1, .1, .1, .1, .1], ess=1000):
    """Simulate data with area random effects, fit the model, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        True dispersion multiplier; each observation is drawn from a
        negative binomial with second argument ``delta_true * n * p``.
    pi_true : float
        Baseline rate; each row's true rate is ``pi_true`` times the
        exponential of the summed random effects of its area's ancestors.
    sigma_true : list of float
        Passed to ``alpha_true_sim`` and used as the ``true`` column of
        ``model.sigma``.  The default list is shared between calls; this
        function does not modify it itself.
    ess : number
        Effective sample size assigned to every row.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``alpha``, ``sigma``, ``delta`` and ``results``
        attached.
    """
    ## set simulation parameters
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Slightly'  # ensure heterogeneity is slightly

    # every child of 'all', every child of those, and the first five
    # (in sorted order) children of each of the latter
    area_list = []
    for super_region in sorted(model.hierarchy.successors('all')):
        area_list.append(super_region)
        for region in sorted(model.hierarchy.successors(super_region)):
            area_list.append(region)
            area_list += sorted(model.hierarchy.successors(region))[:5]
    area_list = pl.array(area_list)

    ## generate simulation data
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    alpha = alpha_true_sim(model, area_list, sigma_true)

    # choose observed prevalence values
    model.input_data['effective_sample_size'] = ess

    # assign each row to one of the areas with equal probability
    area_probs = pl.ones(len(area_list)) / float(len(area_list))
    model.input_data['area'] = area_list[mc.rcategorical(area_probs, N)]

    # True rate per row: pi_true * exp(sum of the random effects on the path
    # from 'all' to the row's area).  The column is built as a whole and
    # assigned once: element-wise chained assignment (frame[col][row] = x)
    # is not guaranteed to write through to the frame.
    true_rates = []
    for row, area in model.input_data['area'].iteritems():
        path = nx.shortest_path(model.hierarchy, 'all', area)
        total_effect = pl.sum([alpha[node] for node in path if node in alpha])
        true_rates.append(pi_true * pl.exp(total_effect))
    model.input_data['true'] = pandas.Series(true_rates, index=model.input_data.index, dtype=float)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=20000, burn=10000, thin=10, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    # random effects, indexed by hierarchy node
    alpha_stats = [node.stats() for node in p_vars['alpha']]
    model.alpha = pandas.DataFrame(index=list(nx.traversal.dfs_preorder_nodes(model.hierarchy)))
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([s['mean'] for s in alpha_stats], index=p_vars['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([s['standard deviation'] for s in alpha_stats], index=p_vars['U'].columns)
    add_quality_metrics(model.alpha)

    print('\nalpha')
    print(model.alpha.dropna())

    sigma_stats = [node.stats() for node in p_vars['sigma_alpha']]
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [s['mean'] for s in sigma_stats]
    model.sigma['sigma_pred'] = [s['standard deviation'] for s in sigma_stats]
    add_quality_metrics(model.sigma)

    print('sigma_alpha')
    print(model.sigma)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'sigma')

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.alpha['abs_err'].dropna())),
        model.alpha.dropna()['covered?'].mean()))
    add_to_results(model, 'input_data')
    add_to_results(model, 'alpha')
    model.results = pandas.DataFrame(model.results)

    return model
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    """Simulate data with dispersion covariates, fit the model, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        Baseline dispersion; each row's dispersion is
        ``delta_true * exp(dot(Z, zeta_true))``.
    pi_true : float
        True rate, identical for every row.
    zeta_true : list of float
        True coefficients for the binary covariates ``z_0``, ``z_1``, ...
        The default list is shared between calls and is not modified here.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``zeta``, ``delta`` and ``results`` attached.
    """
    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # binary covariates (0.0 or 1.0) that shift the dispersion of each row
    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    # dispersion effect coefficients, indexed by the fitted Z columns
    zeta_stats = p_vars['zeta'].stats()
    model.zeta = pandas.DataFrame(index=p_vars['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = zeta_stats['mean']
    model.zeta['sigma_pred'] = zeta_stats['standard deviation']
    add_quality_metrics(model.zeta)

    print('\nzeta')
    print(model.zeta)

    # baseline dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
        model.zeta.dropna()['covered?'].mean()))

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    return model
def fit_world(id, fast_fit=False, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit consistent for all data in world

    Loads the model for job `id`, fits the consistent model to all of its
    data, stores empirical-prior means and standard deviations on the
    disease model for every region/sex/year, and writes plots, data tables
    and the disease-model json.  Nothing is returned.

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    fast_fit : bool
      If true, call fit_consistent with the short positional settings
      (105, 0, 1, 100) instead of the long keyword settings
    zero_re : bool
      Passed through to dismod3.ism.consistent
    alt_prior : bool
      Passed through to covariate_model.predict_for
    global_heterogeneity : str
      Value written to the 'heterogeneity' entry of every parameter type
      that has one

    Example
    -------
    >>> import fit_world
    >>> fit_world.fit_world(1234)
    """

    # working directory for this job (note: shadows the builtin dir)
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
        dm = dismod3.load_disease_model(id)
    except (IOError, AssertionError):
        # no usable saved copy: rebuild the model from the disease model json
        dm = dismod3.load_disease_model(id)
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        try:
            model.save(dir)
            print 'loaded data from json, saved in new format for next time in %s' % dir
        except IOError:
            print 'loaded data from json, failed to save in new format'

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to global_heterogeneity (default
    # 'Slightly') for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    ### For testing:
    ## speed up computation by reducing number of knots
    ## for t in 'irf':
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 100]

    model.vars += dismod3.ism.consistent(model,
                                         reference_area='all',
                                         reference_sex='total',
                                         reference_year='all',
                                         priors={},
                                         zero_re=zero_re)

    ## fit model to data
    if fast_fit:
        # NOTE(review): positional arguments presumably are iter, burn,
        # thin, tune_interval as in the keyword call below -- confirm
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model,
                                                     iter=50000,
                                                     burn=10000,
                                                     thin=40,
                                                     tune_interval=1000,
                                                     verbose=True)

    dm.model = model

    # borrow strength to inform sigma_alpha between rate types post-hoc
    types_with_re = ['rr', 'f', 'i', 'm', 'smr', 'p', 'r', 'pf', 'm_with', 'X']
    ## first calculate sigma_alpha_bar from posterior draws from each alpha
    alpha_vals = []
    for type in types_with_re:
        if 'alpha' in model.vars[type]:
            for alpha_i in model.vars[type]['alpha']:
                alpha_vals += [a for a in alpha_i.trace() if a != 0]  # remove zeros because areas with no siblings are included for convenience but are pinned to zero
    ## then blend sigma_alpha_i and sigma_alpha_bar for each sigma_alpha_i
    if len(alpha_vals) > 0:
        sigma_alpha_bar = pl.std(alpha_vals)
        for type in types_with_re:
            if 'sigma_alpha' in model.vars[type]:
                for sigma_alpha_i in model.vars[type]['sigma_alpha']:
                    cur_val = sigma_alpha_i.trace()
                    # overwrites the stored trace in place.
                    # NOTE(review): this adds sigma_alpha_bar to the current
                    # draws rather than averaging them -- confirm intended
                    sigma_alpha_i.trace._trace[0] = (cur_val + sigma_alpha_bar) * pl.ones_like(sigma_alpha_i.trace._trace[0])

    # generate and store empirical priors for each rate type
    for t in 'p i r f rr pf m_with'.split():
        param_type = dict(i='incidence', r='remission', f='excess-mortality', p='prevalence', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')[t]
        #graphics.plot_one_type(model, model.vars[t], {}, t)
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    # use the configured level bounds when present,
                    # otherwise only require non-negative predictions
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower=model.parameters[t]['level_bounds']['lower']
                        upper=model.parameters[t]['level_bounds']['upper']
                    else:
                        lower=0
                        upper=pl.inf

                    emp_priors = covariate_model.predict_for(model,
                                                             model.parameters.get(t, {}),
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior,
                                                             model.vars[t], lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))
                    if 'eta' in model.vars[t]:
                        N,A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(model.vars[t]['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        # posterior variance plus a dispersion term scaled
                        # by 1/delta
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

        from fit_emp_prior import store_effect_coefficients
        store_effect_coefficients(dm, model.vars[t], param_type)

        if 'p_pred' in model.vars[t]:
            graphics.plot_one_ppc(model, t)
            pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

        if 'p_pred' in model.vars[t] or 'lb' in model.vars[t]:
            graphics.plot_one_effects(model, t)
            pl.savefig(dir + '/prior-%s-effects.png'%param_type)

    # write a table of data with predictions and residuals per rate type
    for t in 'i r f p rr pf X m_with smr'.split():
        fname = dir + '/empirical_priors/data-%s.csv'%t
        print 'saving tables for', t, 'to', fname
        if 'data' in model.vars[t] and 'p_pred' in model.vars[t]:
            stats = model.vars[t]['p_pred'].stats(batches=5)
            model.vars[t]['data']['mu_pred'] = stats['mean']
            model.vars[t]['data']['sigma_pred'] = stats['standard deviation']

            stats = model.vars[t]['pi'].stats(batches=5)
            model.vars[t]['data']['mc_error'] = stats['mc error']

            model.vars[t]['data']['residual'] = model.vars[t]['data']['value'] - model.vars[t]['data']['mu_pred']
            model.vars[t]['data']['abs_residual'] = pl.absolute(model.vars[t]['data']['residual'])
            #if 'delta' in model.vars[t]:
            #    model.vars[t]['data']['logp'] = [mc.negative_binomial_like(n*p_obs, n*p_pred, n*p_pred*d) for n, p_obs, p_pred, d \
            #                                  in zip(model.vars[t]['data']['effective_sample_size'],
            #                                         model.vars[t]['data']['value'],
            #                                         model.vars[t]['data']['mu_pred'],
            #                                         pl.atleast_1d(model.vars[t]['delta'].stats()['mean']))]
            model.vars[t]['data'].to_csv(fname)

    graphics.plot_fit(model)
    pl.savefig(dir + '/prior.png')

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-convergence.png')

    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-trace.png')

    # save results (do this last, because it removes things from the disease
    # model that plotting functions, etc, might need)
    try:
        dm.save('dm-%d-prior-%s.json' % (dm.id, 'all'))
    except IOError, e:
        print e
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    """Simulate age-interval data with area random effects, fit it, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        True dispersion multiplier; each observation is drawn from a
        negative binomial with second argument ``delta_true * n * p``.
    sigma_true : list of float
        Passed to ``alpha_true_sim`` and used as the ``true`` column of
        ``model.sigma``.  The default list is shared between calls; this
        function does not modify it itself.
    pi_true : callable
        Maps the array of ages 0..100 to the true age-specific rate.
    smoothness : str
        Stored as ``model.parameters['p']['smoothness']['amount']``.
    heterogeneity : str
        Stored as ``model.parameters['p']['heterogeneity']``.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the
        comparison tables ``input_data``, ``delta``, ``alpha``, ``sigma``
        and ``mu`` attached, plus ``results`` as built by the
        ``data_simulation`` result helpers.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    # load an empty disease model only to obtain its area hierarchy, then
    # attach that hierarchy to a fresh simulation model
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = gbd_model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    # random age intervals; age_end is drawn at or above age_start
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # true rate of each interval from differences of cumulative sums
    age_weights = pl.ones_like(ages)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    p_interval = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end (the ratio above is 0/0)
    point_rows = age_start == age_end
    if pl.any(point_rows):
        p_interval[point_rows] = pi_age_true[age_start[point_rows]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east',
                          'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print(alpha)

    # placeholder keeps the 'true' column ahead of 'area' when it is new
    model.input_data['true'] = pl.nan

    # assign each row to one of the areas with equal probability
    area_probs = pl.ones(len(area_list)) / float(len(area_list))
    model.input_data['area'] = area_list[mc.rcategorical(area_probs, N)]

    # Scale each row's interval rate by exp(sum of the random effects on the
    # path from 'all' to the row's area).  The column is built as a whole
    # and assigned once: element-wise chained assignment
    # (frame[col][row] = x) is not guaranteed to write through to the frame.
    true_rates = []
    for row, area in model.input_data['area'].iteritems():
        path = nx.shortest_path(model.hierarchy, 'all', area)
        total_effect = pl.sum([alpha[node] for node in path if node in alpha])
        true_rates.append(p_interval[row] * pl.exp(total_effect))
    model.input_data['true'] = pandas.Series(true_rates, index=model.input_data.index, dtype=float)

    p = model.input_data['true']
    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    # NOTE(review): 'north_africa_middle_east' is presumably the reference
    # area for the fit -- confirm against data_model.data_model
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    # for a quick run, iter=1005, burn=500, thin=5 was used previously
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    data_simulation.add_quality_metrics(model.delta)

    # random effects, indexed by hierarchy node; rows without both a true
    # value and an estimate are dropped
    alpha_stats = [node.stats() for node in p_vars['alpha']]
    model.alpha = pandas.DataFrame(index=list(nx.traversal.dfs_preorder_nodes(model.hierarchy)))
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([s['mean'] for s in alpha_stats], index=p_vars['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([s['standard deviation'] for s in alpha_stats], index=p_vars['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    sigma_stats = [node.stats() for node in p_vars['sigma_alpha']]
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [s['mean'] for s in sigma_stats]
    model.sigma['sigma_pred'] = [s['standard deviation'] for s in sigma_stats]
    data_simulation.add_quality_metrics(model.sigma)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))

    # age pattern: truth versus posterior of mu_age
    mu_age_stats = p_vars['mu_age'].stats()
    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=mu_age_stats['mean'],
                                     sigma_pred=mu_age_stats['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print(model.results)

    return model
def validate_covariate_model_fe(N=100, delta_true=3, pi_true=.01, beta_true=[.5, -.5, 0.], replicate=0):
    """Simulate data with fixed effects, fit the model, and score the fit.

    Parameters
    ----------
    N : int
        Number of simulated rows of input data.
    delta_true : float
        Second argument of the negative-binomial draw that generates
        ``value``; also the ``true`` entry of ``model.delta``.
    pi_true : float
        Baseline rate; each row's true rate is
        ``pi_true * exp(dot(X, beta_true))``.
    beta_true : list of float
        True coefficients for the covariates ``x_0``, ``x_1``, ...  The
        default list is shared between calls and is not modified here.
    replicate : int
        Added to the base random seed so each replicate is reproducible.

    Returns
    -------
    model
        The simulated model with ``vars``, ``map``, ``mcmc`` and the tables
        ``input_data``, ``beta``, ``delta`` and ``results`` attached.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    # (PyMC's rnormal takes a precision as its second argument)
    X = mc.rnormal(0., 1.**-2, size=(N, len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i, x_i in enumerate(X.T):
        model.input_data['x_%d' % i] = x_i
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    p_vars = model.vars['p']

    # posterior predictive summary for every data row
    p_pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = p_pred_stats['mean']
    model.input_data['sigma_pred'] = p_pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    # True coefficient per fitted covariate column; columns without a
    # simulated effect get 0.  The column is assigned in one step because
    # chained assignment (frame['true'][label] = x) is not guaranteed to
    # write through to the frame.
    model.beta = pandas.DataFrame(index=p_vars['X'].columns)
    true_by_name = dict(('x_%d' % i, float(b)) for i, b in enumerate(beta_true))
    model.beta['true'] = [true_by_name.get(name, 0.) for name in model.beta.index]

    beta_stats = [node.stats() for node in p_vars['beta']]
    model.beta['mu_pred'] = [s['mean'] for s in beta_stats]
    model.beta['sigma_pred'] = [s['standard deviation'] for s in beta_stats]
    add_quality_metrics(model.beta)

    print('\nbeta')
    print(model.beta)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    # dispersion: delta is exp(eta)
    delta_draws = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_draws.mean()
    model.delta['sigma_pred'] = delta_draws.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.beta['abs_err'].dropna())),
        model.beta.dropna()['covered?'].mean()))
    add_to_results(model, 'input_data')
    # NOTE(review): 'beta' was already added above, so it is recorded twice
    # in the results; kept as-is to preserve the returned table -- confirm
    # whether the duplicate is intended.
    add_to_results(model, 'beta')
    model.results = pandas.DataFrame(model.results)

    return model
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    """Check recovery of dispersion-covariate effects (zeta) on simulated data.

    Every simulated row shares the true value ``pi_true``; what differs from
    row to row is the over-dispersion, ``delta_true * exp(Z . zeta_true)``,
    where Z holds 0/1 covariates drawn with probability one half and stored
    in the input data as columns ``z_0``, ``z_1``, ...

    N -- number of simulated rows of input data
    delta_true -- baseline dispersion multiplier
    pi_true -- true value shared by all rows
    zeta_true -- true coefficients, one per dispersion covariate

    Returns the model, with ``zeta``, ``delta`` and ``results`` (a DataFrame
    with columns param, bias, mae, mare, pc) attached.
    """
    ## generate simulated data
    # NOTE(review): these two arrays are not read again in this function
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # binary dispersion covariates, converted from bool to float
    n_covariates = len(zeta_true)
    Z = mc.rbernoulli(.5, size=(N, n_covariates)) * 1.0
    row_dispersion = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for j, z_column in enumerate(Z.T):
        model.input_data['z_%d' % j] = z_column

    model.input_data['true'] = pi_true
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    # observed value: negative binomial count divided by the sample size
    ess = model.input_data['effective_sample_size']
    truth = model.input_data['true']
    expected_count = ess * truth
    model.input_data['value'] = mc.rnegative_binomial(expected_count, row_dispersion*ess*truth) / ess

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    p_vars = model.vars['p']
    model.map, model.mcmc = fit_model.fit_data_model(
        p_vars, iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(p_vars, 'p')
    graphics.plot_convergence_diag(model.vars)
    pl.show()

    # posterior predictive summary for each row of data
    model.input_data['mu_pred'] = p_vars['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = p_vars['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)

    # dispersion covariate effects, one row per column of the fitted Z matrix
    model.zeta = pandas.DataFrame(index=p_vars['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = p_vars['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = p_vars['zeta'].stats()['standard deviation']
    add_quality_metrics(model.zeta)

    print('\nzeta')
    print(model.zeta)

    # baseline dispersion is exp(eta), summarized over the MCMC trace
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(p_vars['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(p_vars['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    data_bias = model.input_data['abs_err'].mean()
    data_mare = pl.median(pl.absolute(model.input_data['rel_err'].dropna()))
    data_coverage = model.input_data['covered?'].mean()
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (data_bias, data_mare, data_coverage))

    effect_mae = pl.median(pl.absolute(model.zeta['abs_err'].dropna()))
    effect_coverage = model.zeta.dropna()['covered?'].mean()
    print('effect prediction MAE: %.3f, coverage: %.2f' % (effect_mae, effect_coverage))

    # collect the summary metrics for every scored table
    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    for table_name in ('delta', 'input_data', 'zeta'):
        add_to_results(model, table_name)

    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())
    return model
def validate_covariate_model_re(N=500, delta_true=.15, pi_true=.01, sigma_true=[.1, .1, .1, .1, .1], ess=1000):
    """Simulate data with known area random effects, fit the model, and score it.

    Areas are taken from the hierarchy loaded from an empty dismod3 disease
    JSON.  Each simulated row is assigned one area uniformly at random, and
    its true value is ``pi_true`` times the exponential of the summed random
    effects found on the path from 'all' down to that area.

    N -- number of simulated rows of input data
    delta_true -- dispersion multiplier; the sampler receives delta_true*n*p
    pi_true -- baseline level that the random effects multiply
    sigma_true -- true random-effect standard deviations, passed to
        alpha_true_sim and scored against the fitted sigma_alpha nodes
        (only read here, never modified)
    ess -- effective sample size used for every row

    Returns the model, with ``alpha``, ``sigma``, ``delta`` and ``results``
    (a DataFrame) attached.
    """
    ## set simulation parameters
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Slightly'  # use the 'Slightly' heterogeneity setting

    # areas to simulate: every child of 'all', every child of those, and at
    # most the first five (in sorted order) children of each of those
    area_list = []
    for sr in sorted(model.hierarchy.successors('all')):
        area_list.append(sr)
        for r in sorted(model.hierarchy.successors(sr)):
            area_list.append(r)
            area_list += sorted(model.hierarchy.successors(r))[:5]
    area_list = pl.array(area_list)

    ## generate simulation data
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    alpha = alpha_true_sim(model, area_list, sigma_true)

    # choose observed prevalence values
    model.input_data['effective_sample_size'] = ess

    uniform_weights = pl.ones(len(area_list)) / float(len(area_list))
    model.input_data['area'] = area_list[mc.rcategorical(uniform_weights, N)]

    # Build the whole 'true' column first and assign it once.  Writing the
    # rows one at a time through model.input_data['true'][i] is chained
    # indexing, which pandas does not guarantee to modify the frame itself.
    true_values = []
    for area in model.input_data['area']:
        path = nx.shortest_path(model.hierarchy, 'all', area)
        log_effect = pl.sum([alpha[node] for node in path if node in alpha])
        true_values.append(pi_true * pl.exp(log_effect))
    model.input_data['true'] = true_values

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=20000, burn=10000, thin=10, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    pl.show()

    pred_stats = model.vars['p']['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    # the abs_err / rel_err / covered? columns read below come from this call
    add_quality_metrics(model.input_data)

    # One row per node of the hierarchy.  The true and fitted effects are
    # aligned by label, so nodes without a simulated or fitted effect hold
    # NaN and are dropped before printing and scoring coverage.
    model.alpha = pandas.DataFrame(index=list(nx.traversal.dfs_preorder_nodes(model.hierarchy)))
    model.alpha['true'] = pandas.Series(dict(alpha))
    alpha_stats = [node.stats() for node in model.vars['p']['alpha']]
    effect_names = model.vars['p']['U'].columns
    model.alpha['mu_pred'] = pandas.Series([s['mean'] for s in alpha_stats], index=effect_names)
    model.alpha['sigma_pred'] = pandas.Series([s['standard deviation'] for s in alpha_stats], index=effect_names)
    add_quality_metrics(model.alpha)

    print('\nalpha')
    print(model.alpha.dropna())

    sigma_stats = [node.stats() for node in model.vars['p']['sigma_alpha']]
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [s['mean'] for s in sigma_stats]
    model.sigma['sigma_pred'] = [s['standard deviation'] for s in sigma_stats]
    add_quality_metrics(model.sigma)

    print('sigma_alpha')
    print(model.sigma)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'sigma')

    # the dispersion estimate is exp(eta), summarized over the MCMC trace
    delta_trace = pl.exp(model.vars['p']['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_trace.mean()
    model.delta['sigma_pred'] = delta_trace.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.alpha['abs_err'].dropna())),
        model.alpha.dropna()['covered?'].mean()))

    add_to_results(model, 'input_data')
    add_to_results(model, 'alpha')

    model.results = pandas.DataFrame(model.results)
    return model