def validate_age_group(model, replicate): # set random seed for reproducibility mc.np.random.seed(1234567 + replicate) N = 30 delta_true = 5.0 pi_true = true_rate_function m = simulate_age_group_data(N=N, delta_true=delta_true, pi_true=pi_true) if model == "midpoint_covariate": fit_midpoint_covariate_model(m) if model == "alt_midpoint_covariate": fit_alt_midpoint_covariate_model(m) elif model == "age_standardizing": fit_age_standardizing_model(m) elif model == "age_integrating": fit_age_integrating_model(m) elif model == "midpoint_model": fit_midpoint_model(m) elif model == "disaggregation_model": fit_disaggregation_model(m) else: raise TypeError, 'Unknown model type: "%s"' % model # compare estimate to ground truth import data_simulation m.mu = pandas.DataFrame( dict( true=[pi_true(a) for a in range(101)], mu_pred=m.vars["mu_age"].stats()["mean"], sigma_pred=m.vars["mu_age"].stats()["standard deviation"], ) ) data_simulation.add_quality_metrics(m.mu) print "\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f" % ( m.mu["abs_err"].mean(), pl.median(pl.absolute(m.mu["rel_err"].dropna())), m.mu["covered?"].mean(), ) print data_simulation.add_quality_metrics(m.mu) data_simulation.initialize_results(m) data_simulation.add_to_results(m, "mu") data_simulation.finalize_results(m) return m
def fit(model):
    """Fit a single-rate ('p') data model and compare estimates to truth.

    Builds a 'p' data model using the empirical priors stored on `model`,
    runs MAP + MCMC, plots the fit against `model.pi_age_true`, and
    records quality metrics for the data predictions (input_data), the
    overdispersion delta, and the age pattern mu.

    Assumes `model` carries emp_priors, a, pi_age_true, delta_true,
    input_data, and parameters['p'] -- TODO confirm against the caller
    that constructs it.
    """
    emp_priors = model.emp_priors

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, emp_priors['p', 'mu'], emp_priors['p', 'sigma'])
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=5000, burn=2000, thin=25, tune_interval=100)
    # short run kept for quick debugging:
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=101, burn=0, thin=1, tune_interval=100)

    #graphics.plot_one_ppc(model.vars['p'], 'p')
    #graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], emp_priors, 'p')
    pl.plot(model.a, model.pi_age_true, 'b--', linewidth=3, alpha=.5, label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.title('Heterogeneity %s'%model.parameters['p']['heterogeneity'])
    pl.show()

    # posterior predictive mean/sd for each data row
    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # delta is recovered from eta on the log scale, hence the exp()
    model.delta = pandas.DataFrame(dict(true=[model.delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    # age-pattern estimate vs truth
    model.mu = pandas.DataFrame(dict(true=model.pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print model.results
def validate_age_group(model, replicate):
    """Validate an age-group model against simulated data.

    Simulates N=30 rows of age-group data from the known true rate
    function, fits the requested model, and compares the posterior
    `mu_age` (with 95% HPD bounds) to the truth at each age 0..100.

    NOTE(review): this is the second definition of validate_age_group in
    this file; it shadows the earlier one, and unlike it has no
    'alt_midpoint_covariate' branch.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567+replicate)

    N = 30
    delta_true = 5.
    pi_true = true_rate_function
    m = simulate_age_group_data(N=N, delta_true=delta_true, pi_true=pi_true)

    # dispatch on requested model type
    if model == 'midpoint_covariate':
        fit_midpoint_covariate_model(m)
    elif model == 'age_standardizing':
        fit_age_standardizing_model(m)
    elif model == 'age_integrating':
        fit_age_integrating_model(m)
    elif model == 'midpoint_model':
        fit_midpoint_model(m)
    elif model == 'disaggregation_model':
        fit_disaggregation_model(m)
    else:
        raise TypeError, 'Unknown model type: "%s"' % model

    # compare estimate to ground truth
    import data_simulation
    m.mu = pandas.DataFrame(dict(true=[pi_true(a) for a in range(101)], mu_pred=m.vars['mu_age'].stats()['mean'], lb_pred=m.vars['mu_age'].stats()['95% HPD interval'][:,0], ub_pred=m.vars['mu_age'].stats()['95% HPD interval'][:,1]))
    data_simulation.add_quality_metrics(m.mu)

    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (m.mu['abs_err'].mean(), pl.median(pl.absolute(m.mu['rel_err'].dropna())), m.mu['covered?'].mean())
    print

    # NOTE(review): second add_quality_metrics call recomputes the same
    # columns -- appears redundant but is preserved as-is.
    data_simulation.add_quality_metrics(m.mu)

    data_simulation.initialize_results(m)
    data_simulation.add_to_results(m, 'mu')
    data_simulation.finalize_results(m)

    return m
def store_results(dm, area, sex, year):
    """Plot posterior fits for p/i/r/rr and record quality metrics.

    Plots convergence diagnostics, then one subplot per rate type showing
    data bars, the empirical prior, the truth (dm.true), and the posterior
    mean with 1.96*sd error bars.  Stacks all types into dm.mu and stores
    metrics via data_simulation.

    NOTE(review): `area`, `sex`, and `year` are accepted but never used in
    this body -- presumably kept for a caller's uniform signature; confirm.
    """
    types_to_plot = 'p i r rr'.split()

    graphics.plot_convergence_diag(dm.vars)
    pl.clf()
    for i, t in enumerate(types_to_plot):
        pl.subplot(len(types_to_plot), 1, i+1)
        graphics.plot_data_bars(dm.model.get_data(t))
        pl.plot(range(101), dm.emp_priors[t, 'mu'], linestyle='dashed', color='grey', label='Emp. Prior', linewidth=3)
        pl.plot(range(101), dm.true[t], 'b-', label='Truth', linewidth=3)
        pl.plot(range(101), dm.posteriors[t].mean(0), 'r-', label='Estimate', linewidth=3)
        # 95% interval as 1.96 posterior standard deviations around the mean
        pl.errorbar(range(101), dm.posteriors[t].mean(0), yerr=1.96*dm.posteriors[t].std(0), fmt='r-', linewidth=1, capsize=0)
        pl.ylabel(t)
        graphics.expand_axis()
    pl.legend(loc=(0.,-.95), fancybox=True, shadow=True)
    pl.subplots_adjust(hspace=0, left=.1, right=.95, bottom=.2, top=.95)
    pl.xlabel('Age (Years)')
    pl.show()

    model = dm
    model.mu = pandas.DataFrame()
    # stack truth/posterior for every type into one long frame
    for t in types_to_plot:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=dm.true[t], mu_pred=dm.posteriors[t].mean(0), sigma_pred=dm.posteriors[t].std(0))), ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(), pl.median(pl.absolute(model.mu['rel_err'].dropna())), model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'mu')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    """Validate age-integrating model with hierarchical random effects.

    Simulates N age-interval observations from pi_true with area-level
    random-effect shifts drawn from sigma_true, fits a 'p' data model for
    'north_africa_middle_east', and scores the recovery of the data
    predictions, delta, alpha (random effects), sigma, and the age pattern.

    NOTE(review): this file defines validate_ai_re twice; a later
    duplicate definition shadows this one at import time.
    NOTE: sigma_true's mutable default is never mutated here, so it is safe.
    """
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    # borrow the GBD area hierarchy from a (mostly empty) GBD model
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    # random age intervals with age_end >= age_start
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # interval-averaged true rates via cumulative sums
    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    # simulate true area random effects and apply them along the hierarchy
    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan
    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        # shift by the sum of effects on the path from the root to this area
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    # short run kept for quick debugging:
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # delta recovered from eta on the log scale
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    # random-effect estimates, aligned on hierarchy nodes
    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    """Validate age-integrating model with hierarchical random effects.

    Duplicate (reformatted) definition of validate_ai_re; being later in
    the file, this one shadows the earlier definition at import time.
    Simulates N age-interval observations from pi_true with area-level
    random-effect shifts drawn from sigma_true, fits a 'p' data model for
    'north_africa_middle_east', and scores recovery of the data
    predictions, delta, alpha, sigma, and the age pattern.
    """
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    # borrow the GBD area hierarchy from a (mostly empty) GBD model
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    # random age intervals with age_end >= age_start
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # interval-averaged true rates via cumulative sums
    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    # simulate true area random effects and apply them along the hierarchy
    from validate_covariates import alpha_true_sim
    area_list = pl.array([
        'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT',
        'IRN', 'IRQ', 'JOR', 'SYR'
    ])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan
    model.input_data['area'] = area_list[mc.rcategorical(
        pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        # shift by the sum of effects on the path from the root to this area
        model.input_data['true'][i] = p[i] * pl.exp(
            pl.sum([
                alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a)
                if n in alpha
            ]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    # short run kept for quick debugging:
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # delta recovered from eta on the log scale
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    # random-effect estimates, aligned on hierarchy nodes
    model.alpha = pandas.DataFrame(
        index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series(
        [n.stats()['mean'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series(
        [n.stats()['standard deviation'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=model.vars['p']['mu_age'].stats()['mean'],
             sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    """Validate a rate likelihood model on held-out prevalence data.

    Loads a real prevalence data set (or replaces values with synthetic
    draws per `data_type`), marks ~25% of rows as hold-out by zeroing
    their effective sample size and setting infinite standard error,
    fits a single-rate model with the requested `rate_type` likelihood,
    and scores predictions on the hold-out rows.

    NOTE(review): this file defines validate_rate_model twice; a later
    duplicate definition shadows this one at import time.
    NOTE(review): depends on hard-coded cluster paths -- not portable.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')
    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass
    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')
    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N
    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N * mu, size=len(data.index)) / N
    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125 * mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))
    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma * mu
        data['value'] = pl.exp(mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))
    else:
        raise TypeError, 'Unknown data type "%s"' % data_type

    # sample prevalence data: ~25% of rows become the hold-out set
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    # binomial-style standard error; rows without sample size get inf
    data['standard_error'] = pl.sqrt(data['value'] * (1 - data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    # make hold-out rows uninformative during the fit
    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    # avoid exact zeros, which break log-scale likelihoods
    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], true=dict(i=quadratic, f=constant, r=constant)): types = pl.array(['i', 'r', 'f', 'p']) ## generate simulated data model = data_simulation.simple_model(N) model.input_data['effective_sample_size'] = 1. model.input_data['value'] = 0. # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) for t in 'irf': for i, k_i in enumerate(sim[t]['knots']): sim[t]['gamma'][i].value = pl.log(true[t](k_i)) age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)] a = pl.arange(101) age_weights = pl.ones_like(a) sum_wt = pl.cumsum(age_weights) p = pl.zeros(N) for t in types: mu_t = sim[t]['mu_age'].value sum_mu_wt = pl.cumsum(mu_t*age_weights) p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p_t[i] = mu_t[age_start[i]] # copy part into p p[data_type==t] = p_t[data_type==t] # add covariate shifts import dismod3 import simplejson as json gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json())) model.hierarchy = gbd_model.hierarchy from validate_covariates import alpha_true_sim area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR']) alpha = {} for t in types: alpha[t] = alpha_true_sim(model, area_list, sigma_true) print json.dumps(alpha, indent=2) model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)] for i, a in model.input_data['area'].iteritems(): t = data_type[i] p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in 
nx.shortest_path(model.hierarchy, 'all', a) if n in alpha])) n = mc.runiform(100, 10000, size=N) model.input_data['data_type'] = data_type model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = n model.input_data['true'] = p model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100) model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_convergence_diag(model.vars) graphics.plot_fit(model, model.vars, {}, {}) for i, t in enumerate('i r f p rr pf'.split()): pl.subplot(2, 3, i+1) pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2) pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1) pl.show() model.input_data['mu_pred'] = 0. model.input_data['sigma_pred'] = 0. 
for t in types: model.input_data['mu_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['mean'] model.input_data['sigma_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr'])) model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr'] model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr'] data_simulation.add_quality_metrics(model.delta) model.alpha = pandas.DataFrame() model.sigma = pandas.DataFrame() for t in types: alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)]) alpha_t['true'] = pandas.Series(dict(alpha[t])) alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns) alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns) alpha_t['type'] = t model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True) sigma_t = pandas.DataFrame(dict(true=sigma_true)) sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']] sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']] model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True) data_simulation.add_quality_metrics(model.alpha) data_simulation.add_quality_metrics(model.sigma) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame() for t in types: model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value, mu_pred=model.vars[t]['mu_age'].stats()['mean'], 
sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])), ignore_index=True) data_simulation.add_quality_metrics(model.mu) print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(), pl.median(pl.absolute(model.mu['rel_err'].dropna())), model.mu['covered?'].mean()) print data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.add_to_results(model, 'alpha') data_simulation.add_to_results(model, 'sigma') data_simulation.finalize_results(model) print model.results return model
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    """Validate a rate likelihood model on held-out prevalence data.

    Duplicate definition of validate_rate_model; being later in the file,
    this one shadows the earlier definition at import time.  Loads a real
    prevalence data set (or replaces values with synthetic draws per
    `data_type`), marks ~25% of rows as hold-out, fits a single-rate model
    with the requested `rate_type` likelihood, and scores predictions on
    the hold-out rows.

    NOTE(review): depends on hard-coded cluster paths -- not portable.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')
    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass
    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')
    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N
    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N*mu, size=len(data.index)) / N
    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125*mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))
    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma*mu
        data['value'] = pl.exp(mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))
    else:
        raise TypeError, 'Unknown data type "%s"' % data_type

    # sample prevalence data: ~25% of rows become the hold-out set
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    # binomial-style standard error; rows without sample size get inf
    data['standard_error'] = pl.sqrt(data['value']*(1-data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    # make hold-out rows uninformative during the fit
    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    # avoid exact zeros, which break log-scale likelihoods
    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0,100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
def validate_consistent_model_sim(N=500, delta_true=.5, true=dict(i=quadratic, f=constant, r=constant)): types = pl.array(['i', 'r', 'f', 'p']) ## generate simulated data model = data_simulation.simple_model(N) model.input_data['effective_sample_size'] = 1. model.input_data['value'] = 0. for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) for t in 'irf': for i, k_i in enumerate(sim[t]['knots']): sim[t]['gamma'][i].value = pl.log(true[t](k_i)) age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)] a = pl.arange(101) age_weights = pl.ones_like(a) sum_wt = pl.cumsum(age_weights) p = pl.zeros(N) for t in types: mu_t = sim[t]['mu_age'].value sum_mu_wt = pl.cumsum(mu_t * age_weights) p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p_t[i] = mu_t[age_start[i]] # copy part into p p[data_type == t] = p_t[data_type == t] n = mc.runiform(100, 10000, size=N) model.input_data['data_type'] = data_type model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = n model.input_data['true'] = p model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_convergence_diag(model.vars) graphics.plot_fit(model, 
model.vars, {}, {}) for i, t in enumerate('i r f p rr pf'.split()): pl.subplot(2, 3, i + 1) pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2) pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1) #graphics.plot_one_type(model, model.vars['p'], {}, 'p') #pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = 0. model.input_data['sigma_pred'] = 0. for t in types: model.input_data['mu_pred'][ data_type == t] = model.vars[t]['p_pred'].stats()['mean'] model.input_data['sigma_pred'][data_type == t] = model.vars['p'][ 'p_pred'].stats()['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame( dict(true=[delta_true for t in types if t != 'rr'])) model.delta['mu_pred'] = [ pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr' ] model.delta['sigma_pred'] = [ pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr' ] data_simulation.add_quality_metrics(model.delta) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame() for t in types: model.mu = model.mu.append(pandas.DataFrame( dict(true=sim[t]['mu_age'].value, mu_pred=model.vars[t]['mu_age'].stats()['mean'], sigma_pred=model.vars[t]['mu_age'].stats() ['standard deviation'])), ignore_index=True) data_simulation.add_quality_metrics(model.mu) print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.mu['abs_err'].mean(), pl.median(pl.absolute( model.mu['rel_err'].dropna())), model.mu['covered?'].mean()) print data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.finalize_results(model) print 
model.results return model