def generate_disease_data(condition, cov): """ Generate csv files with gold-standard disease data, and somewhat good, somewhat dense disease data, as might be expected from a condition that is carefully studied in the literature """ age_len = dismod3.MAX_AGE ages = np.arange(age_len, dtype='float') # incidence rate i0 = .005 + .02 * mc.invlogit((ages - 44) / 3) #i0 = np.maximum(0., .001 * (-.125 + np.ones_like(ages) + (ages / age_len)**2.)) # remission rate #r = 0. * ages r = .1 * np.ones_like(ages) # excess-mortality rate #f_init = .085 * (ages / 100) ** 2.5 SMR = 3. * np.ones_like(ages) - ages / age_len # all-cause mortality-rate mort = dismod3.get_disease_model('all-cause_mortality') #age_intervals = [[a, a+9] for a in range(0, dismod3.MAX_AGE-4, 10)] + [[0, 100] for ii in range(1)] age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)] # TODO: take age structure from real data sparse_intervals = dict([[ region, random.sample(age_intervals, (ii**3 * len(age_intervals)) / len(countries_for)**3 / 1) ] for ii, region in enumerate(countries_for)]) dense_intervals = dict( [[region, random.sample(age_intervals, len(age_intervals) / 2)] for ii, region in enumerate(countries_for)]) gold_data = [] noisy_data = [] for ii, region in enumerate(sorted(countries_for)): if region == 'world': continue print region sys.stdout.flush() # introduce unexplained regional variation #i = i0 * (1 + float(ii) / 21) # or not i = i0 for year in [1990, 2005]: for sex in ['male', 'female']: param_type = 'all-cause_mortality' key = dismod3.gbd_key_for(param_type, region, year, sex) m_all_cause = mort.mortality(key, mort.data) # calculate excess-mortality rate from smr f = (SMR - 1.) * m_all_cause ## compartmental model (bins S, C, D, M) import scipy.linalg from dismod3 import NEARLY_ZERO from dismod3.utils import trim SCDM = np.zeros([4, age_len]) p = np.zeros(age_len) m = np.zeros(age_len) SCDM[0, 0] = 1. SCDM[1, 0] = 0. SCDM[2, 0] = 0. SCDM[3, 0] = 0. p[0] = SCDM[1, 0] / (SCDM[0, 0] + SCDM[1, 0] + NEARLY_ZERO) m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1 - NEARLY_ZERO) for a in range(age_len - 1): A = [[-i[a] - m[a], r[a], 0., 0.], [i[a], -r[a] - m[a] - f[a], 0., 0.], [m[a], m[a], 0., 0.], [0., f[a], 0., 0.]] SCDM[:, a + 1] = np.dot(scipy.linalg.expm(A), SCDM[:, a]) p[a + 1] = SCDM[1, a + 1] / (SCDM[0, a + 1] + SCDM[1, a + 1] + NEARLY_ZERO) m[a + 1] = m_all_cause[a + 1] - f[a + 1] * p[a + 1] # duration = E[time in bin C] hazard = r + m + f pr_not_exit = np.exp(-hazard) X = np.empty(len(hazard)) X[-1] = 1 / hazard[-1] for ii in reversed(range(len(X) - 1)): X[ii] = (pr_not_exit[ii] * (X[ii + 1] + 1)) + (1 / hazard[ii] * (1 - pr_not_exit[ii]) - pr_not_exit[ii]) country = countries_for[region][0] params = dict(age_intervals=age_intervals, condition=condition, gbd_region=region, country=country, year=year, sex=sex, effective_sample_size=1000) params['age_intervals'] = [[0, 99]] generate_and_append_data(gold_data, 'prevalence data', p, **params) generate_and_append_data(gold_data, 'incidence data', i, **params) generate_and_append_data(gold_data, 'excess-mortality data', f, **params) generate_and_append_data(gold_data, 'remission data', r, **params) generate_and_append_data(gold_data, 'duration data', X, **params) # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum params['age_intervals'] = [[0, 99]] iX = i * X * (1 - p) * regional_population(key) generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params) params['effective_sample_size'] = 1000 params['cov'] = 0. params['age_intervals'] = age_intervals generate_and_append_data(noisy_data, 'prevalence data', p, **params) generate_and_append_data(noisy_data, 'excess-mortality data', f, **params) generate_and_append_data(noisy_data, 'remission data', r, **params) generate_and_append_data(noisy_data, 'incidence data', i, **params) col_names = sorted(data_dict_for_csv(gold_data[0]).keys()) f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w') csv_f = csv.writer(f_file, dialect='excel-tab') csv_f.writerow(col_names) for d in gold_data: dd = data_dict_for_csv(d) csv_f.writerow([dd[c] for c in col_names]) f_file.close() f_name = OUTPUT_PATH + '%s_data.tsv' % condition f_file = open(f_name, 'w') csv_f = csv.writer(f_file, dialect='excel-tab') csv_f.writerow(col_names) for d in noisy_data: dd = data_dict_for_csv(d) csv_f.writerow([dd[c] for c in col_names]) f_file.close() # upload data file from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL dismod_server_login() twc.go(DISMOD_BASE_URL + 'dismod/data/upload/') twc.formvalue(1, 'tab_separated_values', open(f_name).read()) # TODO: find or set the model number for this model, set the # expert priors and covariates, merge the covariate data into the # model, and add the "ground truth" to the disease json try: url = twc.submit() except Exception, e: print e
if __name__ == "__main__": dm_egypt = hep_c_fit(["egypt"], [1990, 2005], egypt_flag=True) dm_na_me = hep_c_fit(["north_africa_middle_east"], [1990, 2005]) # combine prevalence curves for egypt and rest of north africa for y in [1990, 2005]: for s in ["male", "female"]: key = "prevalence+egypt+%d+%s" % (y, s) prev_1 = neg_binom_model.calc_rate_trace(dm_egypt, key, dm_egypt.vars[key]) pop_1 = neg_binom_model.population_by_age[("EGY", str(y), s)] key = "prevalence+north_africa_middle_east+%d+%s" % (y, s) prev_0 = neg_binom_model.calc_rate_trace(dm_na_me, key, dm_na_me.vars[key]) pop_0 = neg_binom_model.regional_population(key) # generate population weighted average prev = (prev_0 * (pop_0 - pop_1) + prev_1 * pop_1) / pop_0 neg_binom_model.store_mcmc_fit(dm_na_me, key, None, prev) # generate plots of results dismod3.tile_plot_disease_model(dm_na_me, [key], defaults={"ymax": 0.15, "alpha": 0.5}) dm_na_me.savefig("dm-%d-posterior-na_me_w_egypt.%f.png" % (dm_na_me.id, random())) # save results dismod3.post_disease_model(dm_na_me) dm = hep_c_fit( "caribbean latin_america_tropical latin_america_andean latin_america_central latin_america_southern".split(), [1990, 2005],
def generate_disease_data(condition, cov): """ Generate csv files with gold-standard disease data, and somewhat good, somewhat dense disease data, as might be expected from a condition that is carefully studied in the literature """ age_len = dismod3.MAX_AGE ages = np.arange(age_len, dtype='float') # incidence rate i0 = .005 + .02 * mc.invlogit((ages - 44) / 3) #i0 = np.maximum(0., .001 * (-.125 + np.ones_like(ages) + (ages / age_len)**2.)) # remission rate #r = 0. * ages r = .1 * np.ones_like(ages) # excess-mortality rate #f_init = .085 * (ages / 100) ** 2.5 SMR = 3. * np.ones_like(ages) - ages / age_len # all-cause mortality-rate mort = dismod3.get_disease_model('all-cause_mortality') #age_intervals = [[a, a+9] for a in range(0, dismod3.MAX_AGE-4, 10)] + [[0, 100] for ii in range(1)] age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)] # TODO: take age structure from real data sparse_intervals = dict([[region, random.sample(age_intervals, (ii**3 * len(age_intervals)) / len(countries_for)**3 / 1)] for ii, region in enumerate(countries_for)]) dense_intervals = dict([[region, random.sample(age_intervals, len(age_intervals)/2)] for ii, region in enumerate(countries_for)]) gold_data = [] noisy_data = [] for ii, region in enumerate(sorted(countries_for)): if region == 'world': continue print region sys.stdout.flush() # introduce unexplained regional variation #i = i0 * (1 + float(ii) / 21) # or not i = i0 for year in [1990, 2005]: for sex in ['male', 'female']: param_type = 'all-cause_mortality' key = dismod3.gbd_key_for(param_type, region, year, sex) m_all_cause = mort.mortality(key, mort.data) # calculate excess-mortality rate from smr f = (SMR - 1.) * m_all_cause ## compartmental model (bins S, C, D, M) import scipy.linalg from dismod3 import NEARLY_ZERO from dismod3.utils import trim SCDM = np.zeros([4, age_len]) p = np.zeros(age_len) m = np.zeros(age_len) SCDM[0,0] = 1. SCDM[1,0] = 0. SCDM[2,0] = 0. SCDM[3,0] = 0. p[0] = SCDM[1,0] / (SCDM[0,0] + SCDM[1,0] + NEARLY_ZERO) m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1-NEARLY_ZERO) for a in range(age_len - 1): A = [[-i[a]-m[a], r[a] , 0., 0.], [ i[a] , -r[a]-m[a]-f[a], 0., 0.], [ m[a], m[a] , 0., 0.], [ 0., f[a], 0., 0.]] SCDM[:,a+1] = np.dot(scipy.linalg.expm(A), SCDM[:,a]) p[a+1] = SCDM[1,a+1] / (SCDM[0,a+1] + SCDM[1,a+1] + NEARLY_ZERO) m[a+1] = m_all_cause[a+1] - f[a+1] * p[a+1] # duration = E[time in bin C] hazard = r + m + f pr_not_exit = np.exp(-hazard) X = np.empty(len(hazard)) X[-1] = 1 / hazard[-1] for ii in reversed(range(len(X)-1)): X[ii] = (pr_not_exit[ii] * (X[ii+1] + 1)) + (1 / hazard[ii] * (1 - pr_not_exit[ii]) - pr_not_exit[ii]) country = countries_for[region][0] params = dict(age_intervals=age_intervals, condition=condition, gbd_region=region, country=country, year=year, sex=sex, effective_sample_size=1000) params['age_intervals'] = [[0,99]] generate_and_append_data(gold_data, 'prevalence data', p, **params) generate_and_append_data(gold_data, 'incidence data', i, **params) generate_and_append_data(gold_data, 'excess-mortality data', f, **params) generate_and_append_data(gold_data, 'remission data', r, **params) generate_and_append_data(gold_data, 'duration data', X, **params) # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum params['age_intervals'] = [[0,99]] iX = i * X * (1-p) * regional_population(key) generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params) params['effective_sample_size'] = 1000 params['cov'] = 0. params['age_intervals'] = age_intervals generate_and_append_data(noisy_data, 'prevalence data', p, **params) generate_and_append_data(noisy_data, 'excess-mortality data', f, **params) generate_and_append_data(noisy_data, 'remission data', r, **params) generate_and_append_data(noisy_data, 'incidence data', i, **params) col_names = sorted(data_dict_for_csv(gold_data[0]).keys()) f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w') csv_f = csv.writer(f_file, dialect='excel-tab') csv_f.writerow(col_names) for d in gold_data: dd = data_dict_for_csv(d) csv_f.writerow([dd[c] for c in col_names]) f_file.close() f_name = OUTPUT_PATH + '%s_data.tsv' % condition f_file = open(f_name, 'w') csv_f = csv.writer(f_file, dialect='excel-tab') csv_f.writerow(col_names) for d in noisy_data: dd = data_dict_for_csv(d) csv_f.writerow([dd[c] for c in col_names]) f_file.close() # upload data file from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL dismod_server_login() twc.go(DISMOD_BASE_URL + 'dismod/data/upload/') twc.formvalue(1, 'tab_separated_values', open(f_name).read()) # TODO: find or set the model number for this model, set the # expert priors and covariates, merge the covariate data into the # model, and add the "ground truth" to the disease json try: url = twc.submit() except Exception, e: print e