def write_squeezed(sqzd, location_id, year_id, sex_id, map_file):
    tmap = pd.read_csv(map_file)
    for me_id, df in sqzd.groupby(['me_id']):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid['modelable_entity_id_target'].squeeze()
        try:
            t_meid = int(t_meid)
        except Exception:
            pass
        if not isinstance(t_meid, int):
            continue

        print('Writing squeezed %s to file' % t_meid)
        df['location_id'] = int(float(location_id))
        df['year_id'] = int(float(year_id))
        df['sex_id'] = int(float(sex_id))
        df['measure_id'] = 5
        df['age_group_id'] = df.age_group_id.astype(float).astype(int)
        df["modelable_entity_id"] = t_meid

        pusher = SuperPusher(
            spec={
                'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                 "{measure_id}_{year_id}_{sex_id}.h5"),
                'h5_tablename': 'draws'
            },
            directory=output_dir)
        pusher.push(df, append=False)
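# The map_file consumed above is assumed to be a two-column CSV pairing each
# source ME with its squeeze target. A minimal, self-contained sketch of that
# layout and of the lookup write_squeezed performs (the ME IDs below are
# placeholders, not real GBD entities):
import pandas as pd

tmap = pd.DataFrame({
    "modelable_entity_id_source": [11111, 22222],
    "modelable_entity_id_target": [33333, 44444],
})
t_meid = tmap.query('modelable_entity_id_source == %s' % 11111)
t_meid = int(t_meid['modelable_entity_id_target'].squeeze())
print(t_meid)  # 33333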
def export_summary(self, component, year_type, df):
    if year_type == "single_year":
        pattern = "{measure_id}/single_year/{location_id}/{year_id}.csv"
        index_cols = {
            "cause": [
                "measure_id", "year_id", "location_id", "sex_id",
                "age_group_id", "cause_id", "metric_id"
            ],
            "impairment": [
                "measure_id", "year_id", "location_id", "sex_id",
                "age_group_id", "cause_id", "rei_id", "metric_id"
            ],
            "injuries": [
                "measure_id", "year_id", "location_id", "sex_id",
                "age_group_id", "cause_id", "rei_id", "metric_id"
            ],
            "sequela": [
                "measure_id", "year_id", "location_id", "sex_id",
                "age_group_id", "sequela_id", "metric_id"
            ]
        }
    if year_type == "multi_year":
        pattern = "{measure_id}/multi_year/{location_id}.csv"
        index_cols = {
            "cause": [
                "measure_id", "year_start_id", "year_end_id", "location_id",
                "sex_id", "age_group_id", "cause_id", "metric_id"
            ],
            "impairment": [
                "measure_id", "year_start_id", "year_end_id", "location_id",
                "sex_id", "age_group_id", "cause_id", "rei_id", "metric_id"
            ],
            "injuries": [
                "measure_id", "year_start_id", "year_end_id", "location_id",
                "sex_id", "age_group_id", "cause_id", "rei_id", "metric_id"
            ],
            "sequela": [
                "measure_id", "year_start_id", "year_end_id", "location_id",
                "sex_id", "age_group_id", "sequela_id", "metric_id"
            ]
        }

    df = sort_index_columns(df, index_cols[component])
    df = df[index_cols[component] + ["val", "upper", "lower"]]

    # path to use in summaries
    directory = os.path.join(self.como_version.como_dir, "summaries",
                             component)
    pusher = SuperPusher(directory=directory,
                         spec={"file_pattern": pattern})
    pusher.push(df, index=False)
def export_summary(self, component, year_type, df):
    if year_type == "single_year":
        pattern = "{measure_id}/single_year/{location_id}/{year_id}.csv"
    if year_type == "multi_year":
        pattern = "{measure_id}/multi_year/{location_id}.csv"

    # path to use in summaries
    directory = os.path.join(self.como_version.como_dir, "summaries",
                             component)
    pusher = SuperPusher(directory=directory,
                         spec={"file_pattern": pattern})
    pusher.push(df)
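# The SuperPusher file_pattern is an ordinary format string; assuming it is
# resolved against the columns of the frame being pushed, these are the
# relative paths the two summary patterns above would produce (the IDs are
# placeholders):
single_year_pattern = "{measure_id}/single_year/{location_id}/{year_id}.csv"
multi_year_pattern = "{measure_id}/multi_year/{location_id}.csv"

print(single_year_pattern.format(measure_id=3, location_id=102, year_id=2019))
# 3/single_year/102/2019.csv
print(multi_year_pattern.format(measure_id=3, location_id=102))
# 3/multi_year/102.csv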
class SevSplitter(object):

    def __init__(self, split_version_id, output_dir, decomp_step,
                 location_id=None, year_id=None, age_group_id=None,
                 sex_id=None, measure_id=None, location_set_id=35,
                 gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # read func is derived from static values. we call it to initialize
        # the internal caching
        self._read_func = split_prop_read_func()
        cached_props = self._read_func(params={}, filters=self.ss_filters)

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the
            # main process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
            if -1 in cached_props["age_start"].unique().tolist():
                age_group_id.append(164)
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        sp_formula = SevPropFormula()
        sp_formula.build_custom_draw_source(params={},
                                            read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, int(self.parent_meid))
        src = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=int(self.parent_meid),
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        src.remove_transform(automagic_age_sex_agg)
        if n_draws < 1000:
            src.add_transform(group_and_downsample, n_draws)
        self._epi_draw_source = src

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def ss_filters(self):
        return {"split_version_id": self.split_version_id}

    @property
    def demo_filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id": self.dimensions.index_dim.get_level(
                "age_group_id"),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    @property
    def parent_meid(self):
        df = self._read_func(params={}, filters=self.ss_filters)
        return df.parent_meid.unique()[0]

    @property
    def child_meid(self):
        df = self._read_func(params={}, filters=self.ss_filters)
        return df.child_meid.unique().tolist()

    def split(self):
        # get input draws
        draws = self._epi_draw_source.content(
            filters=self.demo_filters.copy())

        # get split props
        filters = self.ss_filters
        filters.update(self.demo_filters)
        gprops = self._ss_draw_source.content(filters=filters)

        splits = merge_split(draws,
                             gprops,
                             group_cols=self.dimensions.index_names,
                             value_cols=self.dimensions.data_list())
        splits = splits.assign(modelable_entity_id=splits['child_meid'])
        splits = splits[self.dimensions.index_names +
                        ["modelable_entity_id"] +
                        self.dimensions.data_list()]
        splits = splits.fillna(0)
        self.pusher.push(splits, append=False)
    def _q_split(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            print(location_id)
            try:
                self.dimensions.index_dim.replace_level(
                    "location_id", location_id)
                self.split()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_splits_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # Create and feed sim procs
        split_procs = []
        min_procs = min([
            n_processes,
            self.dimensions.index_dim.cardinality("location_id")
        ])
        for i in range(min_procs):
            p = Process(target=self._q_split, args=(inq, outq))
            split_procs.append(p)
            p.start()

        # run the simulations
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in split_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in split_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
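# A minimal usage sketch for SevSplitter, assuming a valid split version,
# decomp step, and writable output directory (all placeholders below), and
# that the parent model's draws are available to the internal draw sources:
if __name__ == "__main__":
    splitter = SevSplitter(
        split_version_id=123,          # placeholder split version
        output_dir="/path/to/output",  # placeholder directory
        decomp_step="iterative",       # placeholder decomp step
    )
    # fan the per-location splits out across worker processes; each location
    # is written as {modelable_entity_id}/{location_id}.h5 by the pusher
    # configured in __init__
    splitter.run_all_splits_mp(n_processes=10)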
def __init__(
        self, split_version_id, output_dir, location_id=[], year_id=[],
        age_group_id=[], sex_id=[], measure_id=[], location_set_id=35,
        gbd_round_id=5, n_draws=1000):

    # static ids
    self.split_version_id = split_version_id
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id for node in dbtrees.loctree(
                location_set_id=location_set_id,
                gbd_round_id=gbd_round_id).leaves()]
    if not year_id:
        year_id = [1990, 1995, 2000, 2005, 2010, 2017]
    if not age_group_id:
        # this has the advantage of instantiating the lru cache in the main
        # process before multiprocessing
        age_group_id = get_age_group_set(12)["age_group_id"].tolist()
    if not sex_id:
        sex_id = [1, 2]
    if not measure_id:
        measure_id = [5, 6]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                         data_dict)

    # read func is derived from static values. we call it to initialize the
    # internal caching
    self._read_func = split_prop_read_func()
    self._read_func(params={}, filters=self.ss_filters)

    # ss draw source is derived from static values
    sp_formula = SevPropFormula(
        location_set_id=location_set_id, n_draws=n_draws)
    sp_formula.build_custom_draw_source(
        params={}, read_func=self._read_func)
    sp_formula.add_transforms()
    self._ss_draw_source = sp_formula.draw_source

    # epi draws source
    self._epi_draw_source = Epi.create_modelable_entity_draw_source(
        n_workers=1,
        modelable_entity_id=self.parent_meid,
        gbd_round_id=gbd_round_id)

    self.pusher = SuperPusher(
        spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
              'h5_tablename': 'draws'},
        directory=output_dir)
class ExAdjust(object):

    def __init__(self, me_map, output_dir, location_id=[], year_id=[],
                 age_group_id=[], sex_id=[], measure_id=[],
                 location_set_id=35, gbd_round_id=5, n_draws=1000,
                 copy_env_inc=False):

        # set static values
        self.me_map = me_map
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.copy_env_inc = copy_env_inc

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            age_group_id = list(range(2, 21)) + [30, 31, 32, 235]
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(me_map["sub"].keys()) + [me_map["env"]]:
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                gbd_round_id=gbd_round_id)
            me_source.remove_transform(automagic_age_sex_agg)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id": self.dimensions.index_dim.get_level(
                "age_group_id"),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    def _import_draws(self):
        gbdizer = gbdize.GBDizeDataFrame(self.dimensions)

        # import draws
        for me_id in self._importers.keys():
            draw_source = self._importers[me_id]
            draws = draw_source.content(filters=self.filters)
            draws = gbdizer.fill_empty_indices(draws, 0)
            self.draws[me_id] = draws.set_index(self.dimensions.index_names)

    def _calc_sigma_sub(self):
        """calculate the sum of the sub sequela"""
        # concatenate all required frames
        sub_dfs = []
        for me_id in self.me_map["sub"].keys():
            sub_dfs.append(self.draws[me_id])
        sub_df = pd.concat(sub_dfs)

        # return the sum
        sub_df.reset_index(inplace=True)
        if self.copy_env_inc:
            draw_cols = self.dimensions.data_dim.get_level("data")
            sub_df.loc[sub_df['measure_id'] == 6, draw_cols] = 0
        return sub_df.groupby(self.dimensions.index_names).sum()

    def _resid(self):
        """calculate the residual numbers"""
        # get the needed data
        sigma_sub_df = self.draws["sigma_sub"]
        env_df = self.draws[self.me_map["env"]]

        # if it is a squeeze type then we use the absolute value of the diff
        resid_df = (env_df - sigma_sub_df)[(sigma_sub_df <= env_df)].fillna(0)
        return resid_df

    def _excess(self, sub_me):
        """calculate the excess proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # now calculate the excess values
        excess_df = ((sigma_sub_df[more] - env_df[more]) * sub_me_df[more] /
                     sigma_sub_df[more]).fillna(value=0)
        return excess_df

    def _squeeze(self, sub_me):
        """calculate the squeezed proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # get the squeezed values when
        squeeze_more = env_df[more] * sub_me_df[more] / sigma_sub_df[more]
        squeeze_less = sub_me_df[~more]
        squeeze_df = squeeze_more.fillna(squeeze_less)
        return squeeze_df

    def _export(self):
        """export all data"""
        # export residual
        me_id = self.me_map["resid"]
        resid_df = self.draws[me_id].reset_index()
        resid_df["modelable_entity_id"] = me_id
        self._pusher.push(resid_df, append=False)

        # export any subcause adjustments
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in self.me_map["sub"][sub_me].keys():
                me_id = self.me_map["sub"][sub_me]["squeeze"]
                squeeze_df = self.draws[me_id].reset_index()
                squeeze_df["modelable_entity_id"] = me_id
                self._pusher.push(squeeze_df, append=False)

            if "excess" in self.me_map["sub"][sub_me].keys():
                me_id = self.me_map["sub"][sub_me]["excess"]
                excess_df = self.draws[me_id].reset_index()
                excess_df["modelable_entity_id"] = me_id
                self._pusher.push(excess_df, append=False)

    def adjust(self):
        """run exclusivity adjustment on all MEs"""
        self._import_draws()
        self.draws["sigma_sub"] = self._calc_sigma_sub()
        self.draws[self.me_map["resid"]] = self._resid()
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in self.me_map["sub"][sub_me].keys():
                self.draws[self.me_map["sub"][sub_me]["squeeze"]] = (
                    self._squeeze(sub_me))
            if "excess" in self.me_map["sub"][sub_me].keys():
                self.draws[self.me_map["sub"][sub_me]["excess"]] = (
                    self._excess(sub_me))
        self._export()

    def _q_adjust(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            try:
                self.dimensions.index_dim.replace_level(
                    "location_id", location_id)
                self.adjust()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_adjustments_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # Create and feed sim procs
        adjust_procs = []
        min_procs = min([
            n_processes,
            self.dimensions.index_dim.cardinality("location_id")
        ])
        for i in range(min_procs):
            p = Process(target=self._q_adjust, args=(inq, outq))
            adjust_procs.append(p)
            p.start()

        # run the simulations
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in adjust_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in adjust_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
class ExAdjust(object):

    def __init__(
            self, process_name, output_dir, decomp_step, location_id=None,
            year_id=None, age_group_id=None, sex_id=None, measure_id=None,
            location_set_id=35, gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):

        # validate decomp_step
        validate_decomp_step("ExAdjust", decomp_step, gbd_round_id)

        this_file = os.path.realpath(__file__)
        this_dir = os.path.dirname(this_file)
        filepath = os.path.join(this_dir, "..", "maps", "final.json")
        with open(filepath, 'r') as f:
            emap = json.load(f)
        me_map = emap[process_name]["kwargs"]["me_map"]

        # set static values
        self.me_map = json_parser(json.dumps(me_map))
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        try:
            self.copy_env_inc = emap[
                process_name]["kwargs"].pop("copy_env_inc")
        except KeyError:
            self.copy_env_inc = False

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the
            # main process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(self.me_map["sub"].keys()) + [self.me_map["env"]]:
            mvid, dstep = (
                get_best_model_version_and_decomp_step(output_dir, me_id)
            )
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                model_version_id=mvid,
                gbd_round_id=gbd_round_id,
                decomp_step=dstep
            )
            me_source.remove_transform(automagic_age_sex_agg)
            if n_draws < 1000:
                me_source.add_transform(group_and_downsample, n_draws)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id": self.dimensions.index_dim.get_level(
                "age_group_id"),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    def _import_draws(self):
        gbdizer = gbdize.GBDizeDataFrame(self.dimensions)

        # import draws
        for me_id in self._importers.keys():
            draw_source = self._importers[me_id]
            draws = draw_source.content(filters=self.filters)
            draws = gbdizer.fill_empty_indices(draws, 0)
            self.draws[me_id] = draws.set_index(self.dimensions.index_names)

    def _calc_sigma_sub(self):
        """calculate the sum of the sub sequela"""
        # concatenate all required frames
        sub_dfs = []
        for me_id in self.me_map["sub"].keys():
            sub_dfs.append(self.draws[me_id])
        sub_df = pd.concat(sub_dfs)

        # return the sum
        sub_df.reset_index(inplace=True)
        if self.copy_env_inc:
            draw_cols = self.dimensions.data_dim.get_level("data")
            sub_df.loc[sub_df['measure_id'] == 6, draw_cols] = 0
        return sub_df.groupby(self.dimensions.index_names).sum()

    def _resid(self):
        """calculate the residual numbers"""
        # get the needed data
        sigma_sub_df = self.draws["sigma_sub"]
        env_df = self.draws[self.me_map["env"]]

        # if it is a squeeze type then we use the absolute value of the diff
        resid_df = (env_df - sigma_sub_df)[(sigma_sub_df <= env_df)].fillna(0)
        return resid_df

    def _excess(self, sub_me):
        """calculate the excess proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # now calculate the excess values
        excess_df = (
            (sigma_sub_df[more] - env_df[more]) * sub_me_df[more] /
            sigma_sub_df[more]
        ).fillna(value=0)
        return excess_df

    def _squeeze(self, sub_me):
        """calculate the squeezed proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # get the squeezed values when
        squeeze_more = env_df[more] * sub_me_df[more] / sigma_sub_df[more]
        squeeze_less = sub_me_df[~more]
        squeeze_df = squeeze_more.fillna(squeeze_less)
        return squeeze_df

    def _export(self):
        """export all data"""
        # export residual
        me_id = self.me_map["resid"]
        resid_df = self.draws[me_id].reset_index()
        resid_df["modelable_entity_id"] = me_id
        self._pusher.push(resid_df, append=False)

        # export any subcause adjustments
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in list(self.me_map["sub"][sub_me].keys()):
                me_id = self.me_map["sub"][sub_me]["squeeze"]
                squeeze_df = self.draws[me_id].reset_index()
                squeeze_df["modelable_entity_id"] = me_id
                self._pusher.push(squeeze_df, append=False)

            if "excess" in list(self.me_map["sub"][sub_me].keys()):
                me_id = self.me_map["sub"][sub_me]["excess"]
                excess_df = self.draws[me_id].reset_index()
                excess_df["modelable_entity_id"] = me_id
                self._pusher.push(excess_df, append=False)

    def adjust(self):
        """run exclusivity adjustment on all MEs"""
        self._import_draws()
        self.draws["sigma_sub"] = self._calc_sigma_sub()
        self.draws[self.me_map["resid"]] = self._resid()
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in list(self.me_map["sub"][sub_me].keys()):
                self.draws[self.me_map["sub"][sub_me]["squeeze"]] = (
                    self._squeeze(sub_me))
            if "excess" in list(self.me_map["sub"][sub_me].keys()):
                self.draws[self.me_map["sub"][sub_me]["excess"]] = (
                    self._excess(sub_me))
        self._export()

    def _q_adjust(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            try:
                self.dimensions.index_dim.replace_level(
                    "location_id", location_id)
                self.adjust()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_adjustments_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # Create and feed sim procs
        adjust_procs = []
        min_procs = min(
            [n_processes,
             self.dimensions.index_dim.cardinality("location_id")]
        )
        for i in range(min_procs):
            p = Process(target=self._q_adjust, args=(inq, outq))
            adjust_procs.append(p)
            p.start()

        # run the simulations
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in adjust_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in adjust_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
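# A matching usage sketch for the decomp-aware ExAdjust; process_name must be
# a key in maps/final.json, and the remaining arguments are placeholders:
if __name__ == "__main__":
    adjuster = ExAdjust(
        process_name="some_process",   # placeholder key into final.json
        output_dir="/path/to/output",  # placeholder directory
        decomp_step="iterative",       # placeholder decomp step
    )
    # run the exclusivity adjustment for every leaf location in parallel and
    # push {modelable_entity_id}/{location_id}.h5 files via SuperPusher
    adjuster.run_all_adjustments_mp(n_processes=10)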
def run_squeeze(location_id, year_id, sex_id):

    ###################################
    # Prepare envelopes
    ###################################
    sequelae_map = pd.read_csv(SOURCE_TARGET_FILE)
    envelope_dict = create_env(location_id, year_id, sex_id)

    ###################################
    # Prepare unsqueezed prevalence
    ###################################
    # Load map of sequelae and their targets
    unsqueezed = get_unsqueezed(sequelae_map, location_id, year_id, sex_id)
    unsqueezed.loc[:, drawcols] = unsqueezed.loc[:, drawcols].clip(lower=0)

    ###################################
    # SQUEEZE
    ###################################
    # Parallelize the squeezing
    pool = Pool(20)
    ages = list(pd.unique(unsqueezed['age_group_id']))
    partial_squeeze = partial(squeeze_age_group,
                              unsqueezed=unsqueezed,
                              env_dict=envelope_dict)
    squeezed = pool.map(partial_squeeze, ages, chunksize=1)
    pool.close()
    pool.join()
    squeezed = pd.concat(squeezed)
    squeezed = squeezed.groupby(
        ['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id']).sum()
    squeezed = squeezed.reset_index()

    ##################################
    # Write to files
    ##################################
    write_squeezed(squeezed, location_id, year_id, sex_id, MAP_FILE)

    ##################################
    # Allocate residuals
    ##################################
    allocate_residuals(unsqueezed, squeezed, location_id, year_id, sex_id,
                       MAP_FILE)

    ###########################################
    # Determine the remainder of the envelopes
    ###########################################
    remains = calc_env_remainders(envelope_dict, squeezed)

    remain_map = {
        'id_bord': 2000,
        'id_mild': 1999,
        'id_mod': 2001,
        'id_sev': 2002,
        'id_prof': 2003
    }
    for key, meid in remain_map.items():
        print('Writing remainder %s to file' % meid)
        try:
            meid = int(meid)
        except Exception:
            pass
        df = remains[key]
        df['location_id'] = int(float(location_id))
        df['year_id'] = int(float(year_id))
        df['sex_id'] = int(float(sex_id))
        df['measure_id'] = 5
        df['age_group_id'] = df.age_group_id.astype(float).astype(int)
        df["modelable_entity_id"] = meid

        pusher = SuperPusher(
            spec={
                'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                 "{measure_id}_{year_id}_{sex_id}.h5"),
                'h5_tablename': 'draws'
            },
            directory=output_dir)
        pusher.push(df[['location_id', 'year_id', 'age_group_id', "sex_id",
                        "modelable_entity_id", "measure_id"] + drawcols],
                    append=False)
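# run_squeeze is driven per location/year/sex; a hypothetical command-line
# wrapper, assuming the module-level globals (SOURCE_TARGET_FILE, MAP_FILE,
# drawcols, output_dir) are already configured:
import sys

if __name__ == "__main__":
    # e.g. python squeeze.py <location_id> <year_id> <sex_id>
    location_id, year_id, sex_id = sys.argv[1:4]
    run_squeeze(int(location_id), int(year_id), int(sex_id))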
def allocate_residuals(usqzd, sqzd, location_id, year_id, sex_id, map_file):
    tmap = pd.read_csv(map_file)

    resids = usqzd.merge(
        sqzd,
        on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
        suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    dscols = ['draw_%s.sqzd' % d for d in range(1000)]
    ducols = ['draw_%s.usqzd' % d for d in range(1000)]
    toalloc = resids[ducols].values - resids[dscols].values
    toalloc = toalloc.clip(min=0)
    resids = resids.join(
        pd.DataFrame(data=toalloc, index=resids.index, columns=drawcols))
    resids = resids[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'resid_target_me.usqzd'
    ] + drawcols]
    resids.rename(columns={'resid_target_me.usqzd': 'resid_target_me'},
                  inplace=True)
    resids = resids.groupby(['resid_target_me', 'age_group_id']).sum()
    resids = resids.reset_index()
    resids = resids[['resid_target_me', 'age_group_id'] + drawcols]

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except Exception:
            pass

        present = True
        try:
            draw_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                gbd_round_id=5)
            draw_source.remove_transform(automagic_age_sex_agg)
            t_df = draw_source.content(
                filters={
                    "location_id": location_id,
                    "year_id": year_id,
                    "sex_id": sex_id,
                    "measure_id": 5
                })
        except NoBestVersionError:
            present = False

        if present:
            t_df = t_df.merge(resid_df,
                              on='age_group_id',
                              suffixes=('#base', '#resid'))
            newvals = (t_df.filter(like="#base").values +
                       t_df.filter(like="#resid").values)
            t_df = t_df.join(
                pd.DataFrame(data=newvals, index=t_df.index,
                             columns=drawcols))

            print('Writing residual %s to file' % t_meid)
            t_df['location_id'] = int(float(location_id))
            t_df['year_id'] = int(float(year_id))
            t_df['sex_id'] = int(float(sex_id))
            t_df['measure_id'] = 5
            t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
            t_df["modelable_entity_id"] = t_meid
            t_df = t_df[['location_id', 'year_id', 'age_group_id', "sex_id",
                         "modelable_entity_id", "measure_id"] + drawcols]

            pusher = SuperPusher(
                spec={
                    'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                     "{measure_id}_{year_id}_{sex_id}.h5"),
                    'h5_tablename': 'draws'
                },
                directory=output_dir)
            pusher.push(t_df, append=False)
        else:
            print('ME ID %s missing' % me_id)

    return resids
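# The '.usqzd'/'.sqzd' columns above come from pandas merge suffixes: every
# non-key column present in both frames is suffixed, which is why the residual
# target column is read back as 'resid_target_me.usqzd'. A small sketch of
# that behavior with toy values:
import pandas as pd

usqzd = pd.DataFrame({"me_id": [1], "draw_0": [0.5], "resid_target_me": [42]})
sqzd = pd.DataFrame({"me_id": [1], "draw_0": [0.3], "resid_target_me": [42]})
merged = usqzd.merge(sqzd, on="me_id", suffixes=(".usqzd", ".sqzd"))
print(merged.columns.tolist())
# ['me_id', 'draw_0.usqzd', 'draw_0.sqzd',
#  'resid_target_me.usqzd', 'resid_target_me.sqzd']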