def copy_and_backfill(self):
    prof_id_cret_old = self.me_map["cretinism"]["srcs"]
    old = self.me_dict[prof_id_cret_old].reset_index()

    # Handle year differences between gbd2016 and gbd2017
    old.loc[old.year_id == 2016, 'year_id'] = 2017

    # Handle Saudi Arabia
    loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=4)
    saudia_id = 152
    saudia_sub_nats = loc_meta.loc[
        loc_meta.parent_id == saudia_id, 'location_id'].tolist()
    # .copy() avoids pandas' SettingWithCopyWarning on the assignment below
    saudi_arabia = old.loc[old.location_id.isin(saudia_sub_nats), :].copy()
    saudi_arabia.loc[:, 'location_id'] = saudia_id
    saudi_arabia = saudi_arabia.drop_duplicates(keep='first')
    old = pd.concat([old, saudi_arabia], axis=0)

    # Handle other location differences between gbd2016 and gbd2017
    data_cols = self.draw_cols
    data_dct = {'data_cols': data_cols}
    index_cols = list(set(old.columns) - set(data_cols))
    index_cols.remove('location_id')
    demo = get_demographics(gbd_team='epi', gbd_round_id=5)
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in old[index_cols].values)),
        'location_id': demo['location_id']
    }
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    new = gbdizer.fill_location_from_nearest_parent(
        old, location_set_id=35, gbd_round_id=5)

    prof_id_cret_new = self.me_map["cretinism"]["trgs"]
    self.me_dict[prof_id_cret_new] = new
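# Illustrative sketch (not part of the pipeline): the Saudi Arabia handling
# above relabels subnational rows to the parent location_id and de-duplicates,
# so a parent-level copy exists before the location backfill. A minimal
# pandas-only toy with hypothetical subnational ids 101 and 102 under
# parent 152:
import pandas as pd

toy = pd.DataFrame({
    'location_id': [101, 102],
    'year_id': [2017, 2017],
    'draw_0': [0.1, 0.1],
})
parent = toy.copy()
parent['location_id'] = 152              # relabel subnationals to the parent
parent = parent.drop_duplicates(keep='first')
toy = pd.concat([toy, parent], axis=0)   # subnational rows plus one parent row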
def fill_square(df, col, gbd_round_id):
    """Make data square across a column for a set of index columns."""
    demo = get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    draw_cols = list(df.filter(like='draw_').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    index_cols.remove(col)

    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        col: demo[col]
    }
    data_dct = {'draw_cols': draw_cols}
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    return gbdizer.fill_empty_indices(df, 0)
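# Illustrative sketch (assumption about fill_empty_indices semantics): the
# squaring above is assumed to behave like a cartesian reindex over the index
# dimensions, with missing cells filled by a constant. A pandas-only toy:
import pandas as pd

toy = pd.DataFrame({
    'location_id': [6, 6],
    'sex_id': [1, 2],
    'draw_0': [0.2, 0.3],
})
full = pd.MultiIndex.from_product(
    [[6, 7], [1, 2]], names=['location_id', 'sex_id'])
square = (toy.set_index(['location_id', 'sex_id'])
             .reindex(full)   # add every missing (location, sex) cell
             .fillna(0)       # fill holes with 0, as in fill_empty_indices(df, 0)
             .reset_index())
# square now has one row per (location_id, sex_id) combination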
def backfill(df, norway_id, code_dir, loc_meta):
    """Backfill Norway subnational locations from the nearest parent.

    Note: ``code_dir`` is currently unused.
    """
    data_cols = ['cases', 'effective_sample_size', 'sample_size']
    data_dct = {'data_cols': data_cols}
    index_cols = list(set(df.columns) - set(data_cols))
    index_cols.remove('location_id')
    norway_subs = loc_meta.loc[
        loc_meta.parent_id == norway_id, 'location_id'].tolist()
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        'location_id': norway_subs
    }
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    backfilled = gbdizer.fill_location_from_nearest_parent(
        df, location_set_id=35, gbd_round_id=5)
    return backfilled
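# Illustrative sketch (assumption about fill_location_from_nearest_parent
# semantics): each requested child location with no data of its own is assumed
# to receive a copy of its nearest parent's rows. A pandas-only toy with
# hypothetical child ids 4910 and 4911 under a parent location 90:
import pandas as pd

parent_rows = pd.DataFrame({'location_id': [90], 'cases': [10.0],
                            'sample_size': [100.0]})
children = [4910, 4911]
backfilled = pd.concat(
    [parent_rows.assign(location_id=child) for child in children],
    ignore_index=True)
# each child now carries a copy of the parent's data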
def fill_square(df, index_cols, square_col, square_col_vals, fill_val=0):
    """Make data square across a column for a set of index columns."""
    # get index dimensions
    index_cols = [col for col in index_cols if col != square_col]
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        square_col: square_col_vals
    }

    # get data dimensions
    data_dct = {
        "non_draw_cols": [col for col in df.columns
                          if col not in index_cols + [square_col]]
    }

    # make it square
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    df = gbdizer.fill_empty_indices(df, fill_val)
    return df
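# Illustrative usage sketch for the fill_square variant above (requires the
# gbdize/dimensionality packages; column names and values are hypothetical):
import pandas as pd

df = pd.DataFrame({
    'location_id': [6, 6],
    'year_id': [2016, 2017],
    'sex_id': [1, 1],
    'mean': [0.5, 0.4],
})
df = fill_square(
    df,
    index_cols=['location_id', 'year_id', 'sex_id'],
    square_col='sex_id',
    square_col_vals=[1, 2],
    fill_val=0)
# rows for sex_id == 2 are presumably created with mean == fill_val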
def __init__(self, split_version_id, output_dir, decomp_step,
             location_id=None, year_id=None, age_group_id=None, sex_id=None,
             measure_id=None, location_set_id=35,
             gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):
    # static ids
    self.split_version_id = split_version_id
    self.decomp_step = decomp_step
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id

    # read func is derived from static values. we call it to initialize the
    # internal caching
    self._read_func = split_prop_read_func()
    cached_props = self._read_func(params={}, filters=self.ss_filters)

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id for node in dbtrees.loctree(
                location_set_id=location_set_id,
                gbd_round_id=gbd_round_id).leaves()]
    if not year_id:
        year_id = estimation_years_from_gbd_round_id(gbd_round_id)
    if not age_group_id:
        # this has the advantage of instantiating the lru cache in the main
        # process before multiprocessing
        age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        # include birth prevalence (age_group_id 164) when the cached
        # proportions contain an age_start of -1
        if -1 in cached_props["age_start"].unique().tolist():
            age_group_id.append(164)
    if not sex_id:
        sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
    if not measure_id:
        measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                         data_dict)

    sp_formula = SevPropFormula()
    sp_formula.build_custom_draw_source(params={}, read_func=self._read_func)
    sp_formula.add_transforms()
    self._ss_draw_source = sp_formula.draw_source

    # epi draws source
    mvid, dstep = get_best_model_version_and_decomp_step(
        output_dir, int(self.parent_meid))
    src = Epi.create_modelable_entity_draw_source(
        n_workers=1,
        modelable_entity_id=int(self.parent_meid),
        model_version_id=mvid,
        gbd_round_id=gbd_round_id,
        decomp_step=dstep)
    src.remove_transform(automagic_age_sex_agg)
    if n_draws < 1000:
        src.add_transform(group_and_downsample, n_draws)
    self._epi_draw_source = src

    self.pusher = SuperPusher(
        spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
              'h5_tablename': 'draws'},
        directory=output_dir)
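# Illustrative sketch (assumption about DataFrameDimensions semantics): the
# index_dict above is treated as the cartesian product of its value lists,
# and data_dict names the value columns. A plain-Python size check:
import itertools

index_dict = {'location_id': [6, 7], 'sex_id': [1, 2], 'measure_id': [5, 6]}
combos = list(itertools.product(*index_dict.values()))
assert len(combos) == 2 * 2 * 2   # one cell per index combination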
def __init__(
        self, split_version_id, output_dir, location_id=[], year_id=[],
        age_group_id=[], sex_id=[], measure_id=[], location_set_id=35,
        gbd_round_id=5, n_draws=1000):
    # static ids
    self.split_version_id = split_version_id
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id for node in dbtrees.loctree(
                location_set_id=location_set_id,
                gbd_round_id=gbd_round_id).leaves()]
    if not year_id:
        # GBD 2017 (round 5) estimation years
        year_id = [1990, 1995, 2000, 2005, 2010, 2017]
    if not age_group_id:
        # this has the advantage of instantiating the lru cache in the main
        # process before multiprocessing
        age_group_id = get_age_group_set(12)["age_group_id"].tolist()
    if not sex_id:
        sex_id = [1, 2]
    if not measure_id:
        measure_id = [5, 6]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                         data_dict)

    # read func is derived from static values. we call it to initialize the
    # internal caching
    self._read_func = split_prop_read_func()
    self._read_func(params={}, filters=self.ss_filters)

    # ss draw source is derived from static values
    sp_formula = SevPropFormula(
        location_set_id=location_set_id, n_draws=n_draws)
    sp_formula.build_custom_draw_source(
        params={}, read_func=self._read_func)
    sp_formula.add_transforms()
    self._ss_draw_source = sp_formula.draw_source

    # epi draws source
    self._epi_draw_source = Epi.create_modelable_entity_draw_source(
        n_workers=1,
        modelable_entity_id=self.parent_meid,
        gbd_round_id=gbd_round_id)

    self.pusher = SuperPusher(
        spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
              'h5_tablename': 'draws'},
        directory=output_dir)
def __init__(self, me_map, output_dir, location_id=[], year_id=[],
             age_group_id=[], sex_id=[], measure_id=[], location_set_id=35,
             gbd_round_id=5, n_draws=1000, copy_env_inc=False):
    # set static values
    self.me_map = me_map
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id
    self.copy_env_inc = copy_env_inc

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id for node in dbtrees.loctree(
                location_set_id=location_set_id,
                gbd_round_id=gbd_round_id).leaves()]
    if not year_id:
        year_id = [1990, 1995, 2000, 2005, 2010, 2017]
    if not age_group_id:
        # list() is required on Python 3, where range() is not a list
        age_group_id = list(range(2, 21)) + [30, 31, 32, 235]
    if not sex_id:
        sex_id = [1, 2]
    if not measure_id:
        measure_id = [5, 6]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                         data_dict)

    # draws that are imported or computed are stored here
    self.draws = {}

    # objects for reading data
    self._importers = {}
    # list() is required on Python 3, where dict.keys() is a view
    for me_id in list(me_map["sub"].keys()) + [me_map["env"]]:
        me_source = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=me_id,
            gbd_round_id=gbd_round_id)
        me_source.remove_transform(automagic_age_sex_agg)
        self._importers[me_id] = me_source

    # object for pushing results to disk
    self._pusher = SuperPusher(
        spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
              'h5_tablename': 'draws'},
        directory=output_dir)
def __init__(
        self, process_name, output_dir, decomp_step, location_id=None,
        year_id=None, age_group_id=None, sex_id=None, measure_id=None,
        location_set_id=35, gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):
    # validate decomp_step
    validate_decomp_step("ExAdjust", decomp_step, gbd_round_id)

    this_file = os.path.realpath(__file__)
    this_dir = os.path.dirname(this_file)
    filepath = os.path.join(this_dir, "..", "maps", "final.json")
    with open(filepath, 'r') as f:
        emap = json.load(f)
    me_map = emap[process_name]["kwargs"]["me_map"]

    # set static values
    self.me_map = json_parser(json.dumps(me_map))
    self.decomp_step = decomp_step
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id
    try:
        self.copy_env_inc = emap[process_name]["kwargs"].pop("copy_env_inc")
    except KeyError:
        self.copy_env_inc = False

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id for node in dbtrees.loctree(
                location_set_id=location_set_id,
                gbd_round_id=gbd_round_id).leaves()]
    if not year_id:
        year_id = estimation_years_from_gbd_round_id(gbd_round_id)
    if not age_group_id:
        # this has the advantage of instantiating the lru cache in the main
        # process before multiprocessing
        age_group_id = get_age_group_set(12)["age_group_id"].tolist()
    if not sex_id:
        sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
    if not measure_id:
        measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                         data_dict)

    # draws that are imported or computed are stored here
    self.draws = {}

    # objects for reading data
    self._importers = {}
    for me_id in list(self.me_map["sub"].keys()) + [self.me_map["env"]]:
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, me_id)
        me_source = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=me_id,
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        me_source.remove_transform(automagic_age_sex_agg)
        if n_draws < 1000:
            me_source.add_transform(group_and_downsample, n_draws)
        self._importers[me_id] = me_source

    # object for pushing results to disk
    self._pusher = SuperPusher(
        spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
              'h5_tablename': 'draws'},
        directory=output_dir)
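# Illustrative usage sketch (hypothetical values; assumes the enclosing class
# is ExAdjust, as suggested by the validate_decomp_step call above; not
# runnable outside the pipeline environment):
adjuster = ExAdjust(
    process_name='some_process',   # hypothetical key in maps/final.json
    output_dir='/path/to/output',  # hypothetical
    decomp_step='step4',           # hypothetical
    n_draws=1000)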