def ds_initialize(model_num, data_type, area, thin, iter, replicate,
                  ds_loc='/homes/peterhm/dismod_spline-20130115/build',
                  save_loc='/homes/peterhm/dismod_spline-20130115/build/fit',
                  bare_bones=False):
    '''Fetch data and build the input files that dismod_spline needs.

    Parameters
    ----------
    model_num : int
        dismod model number
    data_type : str
        one of the epidemiologic parameters allowed
        'p', 'i', 'r', 'f', 'pf', 'csmr', 'rr', 'smr', 'X'
    area : str
        level of hierarchy to keep
    thin : int
        thinning number for MCMC
    iter : int
        number of iterations in MCMC
    replicate : int
        number for random number
    ds_loc : str
        dismod_spline build directory; working directory while running
    save_loc : str
        directory where the generated csv files are written
    bare_bones : bool
        True creates minimalist files via the dismod_spline shell scripts,
        False uses DisMod-MR values, default set to False

    Results
    -------
    gets data and builds necessary files

    .. Note :: If bare_bones is False, parameter files must be filled.
    '''
    cwd = os.getcwd()
    os.chdir(ds_loc)
    try:
        if bare_bones:
            # minimalist path: let the dismod_spline scripts fetch and fit.
            # NOTE(review): os.system with %-interpolation is acceptable for
            # the trusted ints/strings used here, but never for untrusted input.
            os.system('bin/get_data.py %s' % model_num)
            os.system('bin/fit.sh %s %s' % (model_num, c_data.convert_data_type(data_type)))
        else:
            # build the three input tables from the DisMod-MR data structure
            dm3 = mu.load_new_model(model_num, area, data_type)
            data_in = c_data.build_data_in(dm3, data_type, model_num)
            prior_in = c_prior.build_prior_in(dm3, data_type, model_num)
            parameter_in = build_param_in(dm3, thin, iter, replicate)
            # save files
            if not os.path.exists(save_loc):
                os.makedirs(save_loc)
            data_in.to_csv(save_loc + '/data_in.csv', index=False)
            prior_in.to_csv(save_loc + '/prior_in.csv', index=False)
            parameter_in.to_csv(save_loc + '/parameter_in.csv', index=False)
    finally:
        # always return to the caller's working directory, even if the
        # file generation above raised (original code leaked the chdir)
        os.chdir(cwd)
def build_data_in(dm3, data_type, model_num):
    """Build the dismod_spline ``data_in`` table from a DisMod-MR model.

    Converts the rows of ``dm3.input_data`` into the column layout expected
    by dismod_spline (integrand, meas_value/meas_stdev, age/time bounds,
    x_/z_ covariates) and attaches the super-region / region / sub-area
    labels from the 'all' hierarchy.

    Parameters
    ----------
    dm3 : DisMod-MR model object with an ``input_data`` DataFrame
    data_type : str, epidemiologic parameter code (e.g. 'p', 'i', ...)
    model_num : int, dismod model number (used to reload the full hierarchy)

    Returns
    -------
    pandas.DataFrame indexed like ``dm3.input_data``
    """
    # find standard error and use it for standard deviation
    dm3 = mu.create_uncertainty(dm3, "log_normal")
    # create data file skeleton with one row per input_data row
    data_in = empty_data_in(dm3.input_data.index)
    # add covariates: all x_* study-level and z_* covariates present
    # NOTE(review): pandas.DataFrame(cov, columns=[""]) looks odd — in
    # modern pandas it would produce a single empty column; presumably
    # this relied on 2013-era pandas behavior. Verify before upgrading.
    cov = dm3.input_data.filter(like="x_")
    data_in = data_in.join(pandas.DataFrame(cov, columns=[""]))
    cov_z = dm3.input_data.filter(like="z_")
    if len(cov_z.columns) != 0:
        data_in = data_in.join(pandas.DataFrame(cov_z, columns=[""]))
    # add data; ages/years are closed intervals in DisMod-MR, so +1.0
    # converts end bounds to dismod_spline's half-open upper bounds
    data_in["integrand"] = convert_data_type(data_type)
    data_in["meas_value"] = dm3.input_data["value"]
    data_in["meas_stdev"] = dm3.input_data["standard_error"]
    data_in["sex"] = dm3.input_data["sex"]
    data_in["age_lower"] = dm3.input_data["age_start"]
    data_in["age_upper"] = dm3.input_data["age_end"] + 1.0
    data_in["time_lower"] = dm3.input_data["year_start"]
    data_in["time_upper"] = dm3.input_data["year_end"] + 1.0
    # sex covariate coded as +/-0.5 so 'total' sits at 0
    data_in["x_sex"] = dm3.input_data["sex"].map(dict(male=0.5, female=-0.5, total=0))
    # create data hierarchy: reload the model at the 'all' level so the
    # full area tree is available regardless of the `area` the caller used
    model = mu.load_new_model(model_num, "all", data_type)
    # children of 'all' are super-regions; grandchildren are regions;
    # great-grandchildren are countries
    superregion = set(model.hierarchy.neighbors("all"))
    region = set(pl.flatten([model.hierarchy.neighbors(sr) for sr in model.hierarchy.neighbors("all")]))
    country = set(
        pl.flatten(
            [
                [model.hierarchy.neighbors(r) for r in model.hierarchy.neighbors(sr)]
                for sr in model.hierarchy.neighbors("all")
            ]
        )
    )
    # create data area levels: walk up the tree from each row's area.
    # in_edges(node)[0][0] is the node's (single) parent in the hierarchy.
    # NOTE(review): .ix is deprecated/removed in modern pandas — this code
    # is pinned to the old API.
    for i in dm3.input_data.index:
        if dm3.input_data.ix[i, "area"] in country:
            data_in.ix[i, "m_sub"] = dm3.input_data.ix[i, "area"]
            data_in.ix[i, "m_region"] = model.hierarchy.in_edges(dm3.input_data.ix[i, "area"])[0][0]
            data_in.ix[i, "m_super"] = model.hierarchy.in_edges(
                model.hierarchy.in_edges(dm3.input_data.ix[i, "area"])[0][0]
            )[0][0]
        elif dm3.input_data.ix[i, "area"] in region:
            data_in.ix[i, "m_region"] = dm3.input_data.ix[i, "area"]
            data_in.ix[i, "m_super"] = model.hierarchy.in_edges(dm3.input_data.ix[i, "area"])[0][0]
        elif dm3.input_data.ix[i, "area"] in superregion:
            data_in.ix[i, "m_super"] = dm3.input_data.ix[i, "area"]
    return data_in
def prior_m_area(dm3, model_num, data_type):
    """Build area random-effect prior rows for dismod_spline.

    Creates one prior row per unique ``input_data['area']`` with a standard
    normal prior (mean 0, std 1, unbounded) and a ``type`` of 'm_sub',
    'm_region', or 'm_super' according to the area's level in the 'all'
    hierarchy.

    Parameters
    ----------
    dm3 : DisMod-MR model object with an ``input_data`` DataFrame
    model_num : int, dismod model number (used to reload the full hierarchy)
    data_type : str, epidemiologic parameter code

    Returns
    -------
    pandas.DataFrame of prior rows
    """
    # create 'm_sub'/'m_region' from unique input_data['area']
    # NOTE(review): pl.unique(...) returns a numpy array, which has no
    # .index attribute — this looks broken under any numpy I know of.
    # Presumably the intent is an index over the unique areas; confirm
    # against the environment this actually ran in.
    prior_in = empty_prior_in(pl.unique(dm3.input_data['area']).index)
    prior_in['name'] = pl.unique(dm3.input_data['area'])
    prior_in['mean'] = 0.
    prior_in['std'] = 1.
    # bounds written as strings — dismod_spline parses '-inf'/'inf'
    prior_in['lower'] = '-inf'
    prior_in['upper'] = 'inf'
    # create hierarchy (same level sets as build_data_in)
    model = mu.load_new_model(model_num, 'all', data_type)
    superregion = set(model.hierarchy.neighbors('all'))
    region = set(pl.flatten([model.hierarchy.neighbors(sr) for sr in model.hierarchy.neighbors('all')]))
    country = set(pl.flatten([[model.hierarchy.neighbors(r) for r in model.hierarchy.neighbors(sr)] for sr in model.hierarchy.neighbors('all')]))
    # create data area levels
    # NOTE(review): `i` ranges over positions of the unique-area index but
    # is used to index ROWS of dm3.input_data — so the area checked is the
    # i-th data row's area, not the i-th unique area. Looks like a bug
    # (row areas and unique areas only coincide when the first rows happen
    # to contain the unique values in order); verify before reuse.
    for i in pl.unique(dm3.input_data['area']).index:
        if dm3.input_data.ix[i,'area'] in country:
            prior_in.ix[i,'type'] = 'm_sub'
        elif dm3.input_data.ix[i,'area'] in region:
            prior_in.ix[i,'type'] = 'm_region'
        elif dm3.input_data.ix[i,'area'] in superregion:
            prior_in.ix[i,'type'] = 'm_super'
    return prior_in
# create output structures stats = [ "seed", "bias_" + rate_type, "rmse_" + rate_type, "mae_" + rate_type, "mare_" + rate_type, "pc_" + rate_type, "time_" + rate_type, ] output = pandas.DataFrame(pl.zeros((1, len(stats))), columns=stats) output["seed"] = replicate failure = [] # load new model model = mu.load_new_model(model_num, area, data_type) # replace invalid uncertainty with 10% of data set model = mu.create_uncertainty(model, rate_type) # withhold 25% of data model.input_data, test_ix = mu.test_train(model.input_data, data_type, replicate) try: # create pymc nodes for model and fit the model model.vars += dismod3.ism.age_specific_rate(model, data_type, area, "male", 2005, rate_type=rate_type) # fit the model, using a hill-climbing alg to find an initial value # and then sampling from the posterior with MCMC start = time.clock() dismod3.fit.fit_asr(model, data_type, iter=iter, thin=thin, burn=burn) elapsed = time.clock() - start # extract posterior predicted values for data pred = pandas.DataFrame(
# Example/driver script: builds three example DisMod-MR models used to
# exercise model_utilities (mu) — load, uncertainty handling, test/train
# splitting. Python 2 era: uses the `reload` builtin and old pandas APIs.
import sys
sys.path += ['.', '..', '/homes/peterhm/gbd/', '/homes/peterhm/gbd/book']
import model_utilities as mu
reload(mu)
import dismod3
reload(dismod3)
# fixed example configuration
model_num = 40418
test_area = 'europe_western'
data_type = 'p'
rate_type='binom'
# example model0, to test vars and test-train
model = mu.load_new_model(model_num, test_area, data_type)
nan_ix = list(model.input_data['effective_sample_size'][pl.isnan(model.input_data['effective_sample_size'])==1].index) # list of nan in effective sample size
model = mu.create_uncertainty(model, 'binom')
for cv in list(model.input_data.filter(like='x_').columns):
    # fill missing with 0
    # NOTE(review): fillna([0]) passes a LIST — probably meant fillna(0);
    # modern pandas rejects a list here. Left as-is pending confirmation
    # of the old-pandas behavior this ran against.
    model.input_data[cv] = model.input_data[cv].fillna([0])
# example model1, to test test-train
model1 = mu.load_new_model(model_num, test_area, data_type)
model1 = mu.create_uncertainty(model1, 'normal')
# example model2, to test loading and uncertainty
model2 = mu.load_new_model(model_num, test_area, data_type)
non_nan_ix2 = list(model2.input_data['effective_sample_size'][pl.isnan(model2.input_data['effective_sample_size'])==0].index) # list of nan in effective sample size
ten_percent = pl.percentile(model2.input_data.ix[non_nan_ix2, 'effective_sample_size'], 10.)
model2 = mu.create_uncertainty(model2, 'normal')