def get_dws(self):
    """Assemble disability weights from every source into ``self.dws``.

    Concatenates standard, custom, injury, epilepsy, MND, autism and
    urology weights, appends an all-zero row for the asymptomatic health
    state (healthstate_id 799), and resamples the draw columns when the
    requested draw count differs from what is stored.
    """
    # Collect the pieces first, then concatenate once: pd.concat replaces
    # the deprecated DataFrame.append (removed in pandas 2.0) and avoids
    # the quadratic copy cost of chained appends.
    pieces = [
        self._get_standard_dws(),
        self._get_custom_dws(),
        self._get_inj_dws(),
        self._get_epi_dws(),
        self._get_mnd_dws(),
        self._get_autism_dws(),
        self._get_uro_dws(),
    ]
    # healthstate_id 799 - Asymptomatic: zero disability in every draw
    asymp_row = {draw_col: 0 for draw_col in self._draw_cols}
    asymp_row['healthstate_id'] = 799
    pieces.append(pd.DataFrame([asymp_row]))
    dws = pd.concat(pieces)
    dws = dws.reset_index(drop=True)
    dws = dws[['healthstate_id'] + self._draw_cols]
    # resample when the requested draw count differs from the stored one
    if len(self._draw_cols) != len(self.dims.data_list()):
        dimensions = deepcopy(self.dims)
        # weights do not vary over these demographic levels
        dimensions.index_dim.drop_level("age_group_id")
        dimensions.index_dim.drop_level("year_id")
        dimensions.index_dim.drop_level("location_id")
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        dws = gbdizer.correlated_percentile_resample(dws)
    self.dws = dws
def copy_and_backfill(self):
    """Copy GBD2016 cretinism draws forward to GBD2017 demographics.

    Remaps 2016 rows to 2017, collapses Saudi Arabia subnationals to the
    national location, and fills any GBD2017 locations missing from the
    old data from their nearest parent in the hierarchy.  The result is
    stored under the target ME id in ``self.me_dict``.
    """
    prof_id_cret_old = self.me_map["cretinism"]["srcs"]
    old = self.me_dict[prof_id_cret_old].reset_index()
    # Handle year differences between gbd2016 and gbd2017
    old.loc[old.year_id == 2016, 'year_id'] = 2017
    # Handle Saudia Arabia
    loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=4)
    saudia_id = 152
    saudia_sub_nats = loc_meta.loc[loc_meta.parent_id == saudia_id,
                                   'location_id'].tolist()
    # .copy() so the relabelling below writes to an independent frame
    # rather than a view of `old` (avoids chained-assignment warnings and
    # accidental mutation of the source rows).
    saudi_arabia = old.loc[old.location_id.isin(saudia_sub_nats), :].copy()
    saudi_arabia.loc[:, 'location_id'] = saudia_id
    # NOTE(review): no `subset` given, so duplicates are judged on every
    # column including draw values — confirm that is intended.
    saudi_arabia = saudi_arabia.drop_duplicates(keep='first')
    old = pd.concat([old, saudi_arabia], axis=0)
    # Handle other location differences between gbd2016 and gbd2017
    data_cols = self.draw_cols
    data_dct = {'data_cols': data_cols}
    index_cols = list(set(old.columns) - set(data_cols))
    index_cols.remove('location_id')
    demo = get_demographics(gbd_team='epi', gbd_round_id=5)
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in old[index_cols].values)),
        'location_id': demo['location_id']
    }
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    new = gbdizer.fill_location_from_nearest_parent(old,
                                                    location_set_id=35,
                                                    gbd_round_id=5)
    prof_id_cret_new = self.me_map["cretinism"]["trgs"]
    self.me_dict[prof_id_cret_new] = new
def get_dws(self):
    """Assemble disability weights from every source into ``self.dws``.

    Concatenates standard, custom, injury, epilepsy, MND, autism and
    urology weights, appends an all-zero row for the asymptomatic health
    state (healthstate_id 799), and resamples the draw columns when the
    target draw count differs from the stored 1000.
    """
    # Collect the pieces first, then concatenate once: pd.concat replaces
    # the deprecated DataFrame.append (removed in pandas 2.0) and avoids
    # the quadratic copy cost of chained appends.
    pieces = [
        self._get_standard_dws(),
        self._get_custom_dws(),
        self._get_inj_dws(),
        self._get_epi_dws(),
        self._get_mnd_dws(),
        self._get_autism_dws(),
        self._get_uro_dws(),
    ]
    # Asymp
    draw_cols = ["draw_{}".format(i) for i in range(1000)]
    asymp_row = {'draw_%s' % i: 0 for i in range(1000)}
    asymp_row['healthstate_id'] = 799
    pieces.append(pd.DataFrame([asymp_row]))
    dws = pd.concat(pieces)
    dws = dws.reset_index(drop=True)
    dws = dws[['healthstate_id'] + draw_cols]
    # resample
    if len(draw_cols) != len(self.dims.data_list()):
        dimensions = deepcopy(self.dims)
        # weights do not vary over these demographic levels
        dimensions.index_dim.drop_level("age_group_id")
        dimensions.index_dim.drop_level("year_id")
        dimensions.index_dim.drop_level("location_id")
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        dws = gbdizer.random_choice_resample(dws)
    self.dws = dws
def _fill_0s(df, draw_source, dimensions):
    """Square ``df`` over the locations the draw source was filtered to,
    filling every missing index combination with zeros."""
    # restrict the location level to the locations actually requested
    requested = draw_source.content_kwargs["filters"].copy()
    square_dims = deepcopy(dimensions)
    square_dims.index_dim.replace_level("location_id",
                                        requested["location_id"])
    # insert a zero row for every index combination absent from df
    filler = gbdize.GBDizeDataFrame(square_dims)
    return filler.fill_empty_indices(df, 0)
def _simulated_aggregate_prevalence(self):
    """Return resampled aggregate-cause prevalence from the simulation.

    Requires ``self.como_sim`` to be set; raises AttributeError otherwise.
    """
    if self.como_sim is None:
        raise AttributeError(
            "cannot access _simulated_aggregate_prevalence without "
            "setting como_sim")
    resampler = gbdize.GBDizeDataFrame(self.dimensions)
    # work on a copy so the simulation's frame is left untouched
    agg = resampler.random_choice_resample(self.como_sim.agg_causes.copy())
    return agg[self.index_cols + self.draw_cols]
def _import_draws(self):
    """Pull draws for every registered ME and store them indexed by the
    simulation dimensions.

    For each modelable entity in ``self._importers``, reads draws with the
    instance filters, zero-fills any missing index combinations, and
    stores the indexed frame in ``self.draws``.
    """
    gbdizer = gbdize.GBDizeDataFrame(self.dimensions)
    # import draws; iterate items() directly rather than keys() + lookup
    for me_id, draw_source in self._importers.items():
        draws = draw_source.content(filters=self.filters)
        # missing demographic combinations become all-zero rows
        draws = gbdizer.fill_empty_indices(draws, 0)
        self.draws[me_id] = draws.set_index(self.dimensions.index_names)
def get_id_dws(self):
    """Load combined intellectual-disability weights into ``self.id_dws``,
    resampling the draws when a non-default draw count is requested."""
    id_dws = pd.read_csv("FILEPATH/combined_id_dws.csv")
    # the weights carry no year or location dimension
    resample_dims = deepcopy(self.dims)
    resample_dims.index_dim.drop_level("year_id")
    resample_dims.index_dim.drop_level("location_id")
    resampler = gbdize.GBDizeDataFrame(resample_dims)
    id_dws = id_dws.reset_index(drop=True)
    # resample only when the requested draw count differs from stored
    if len(self._draw_cols) != len(self.dims.data_list()):
        id_dws = resampler.correlated_percentile_resample(id_dws)
    self.id_dws = id_dws
def read_single_en_injury(self,
                          modelable_entity_id,
                          model_version_id,
                          measure_id=[
                              measures.YLD, measures.INCIDENCE,
                              measures.ST_PREVALENCE,
                              measures.LT_PREVALENCE
                          ]):
    """Read E/N injury draws for one modelable entity / model version.

    Pulls draws from the EN-injuries source, pchip-interpolates any
    requested years that are not estimation years, and resamples the draw
    columns when the simulation dimensions expect a different draw count.

    Raises:
        Exception: if the source returns no data.

    NOTE(review): the mutable list default for ``measure_id`` is shared
    across calls; it is not mutated here, but confirm callers never
    modify the list they receive.
    """
    injury_source = (
        self._ss_factory.get_en_injuries_modelable_entity_source(
            modelable_entity_id, model_version_id))
    dim = self.dimensions.get_simulation_dimensions(measure_id=measure_id,
                                                    at_birth=False)
    # get filters w/ added years if interpolation is needed
    filters = dim.index_dim.to_dict()["levels"]
    req_years = filters["year_id"]
    if not set(req_years).issubset(set(self._estim_years)):
        # widen the read so estimation years are available to interpolate
        filters["year_id"] = list(set(req_years + self._estim_years))
    # read data
    df = injury_source.content(filters=filters)
    if df.empty:
        raise Exception(f"No data returned for ME {modelable_entity_id}, "
                        f"model version {model_version_id}.")
    draw_cols = [col for col in df.columns if "draw_" in col]
    # add indices to dimensions object from draw source transforms
    dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
    dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
    dim.index_dim.add_level("healthstate_id",
                            df.healthstate_id.unique().tolist())
    dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())
    # interpolate missing years, then keep only the requested years
    if not set(df.year_id.unique()).issuperset(set(req_years)):
        interp_df = pchip_interpolate(df=df,
                                      id_cols=dim.index_names,
                                      value_cols=draw_cols,
                                      time_col="year_id",
                                      time_vals=req_years)
        df = df[df.year_id.isin(req_years)]
        df = df.append(interp_df)
    else:
        df = df[df.year_id.isin(req_years)]
    # resample when the simulation expects a different draw count
    if len(dim.data_list()) != len(draw_cols):
        gbdizer = gbdize.GBDizeDataFrame(dim)
        df = gbdizer.correlated_percentile_resample(df)
    return df
def fill_square(df, col, gbd_round_id):
    """Make data square across ``col`` for a set of index columns.

    Missing combinations of the observed index tuples and the full GBD
    demographic values of ``col`` are filled with zeros.
    """
    demo = get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    # draw columns carry the data; everything else except `col` indexes it
    value_cols = list(df.filter(like='draw_').columns)
    idx_cols = list(set(df.columns) - set(value_cols))
    idx_cols.remove(col)
    # observed index tuples crossed with the full demographic set for col
    observed = {tuple(row) for row in df[idx_cols].values}
    dims = dimensionality.DataFrameDimensions(
        {tuple(idx_cols): list(observed), col: demo[col]},
        {'draw_cols': value_cols})
    return gbdize.GBDizeDataFrame(dims).fill_empty_indices(df, 0)
def read_single_en_injury(self,
                          modelable_entity_id,
                          model_version_id,
                          measure_id=[3, 6, 35, 36]):
    """Read E/N injury draws for one modelable entity / model version.

    Pulls draws from the EN-injuries source, pchip-interpolates any
    requested years that are not estimation years, and resamples to the
    target draw count when fewer than 1000 draws are requested.

    Raises:
        Exception: if the source returns no data.

    NOTE(review): the mutable list default for ``measure_id`` is shared
    across calls; it is not mutated here, but confirm callers never
    modify the list they receive.  The magic ids presumably mirror
    YLD/incidence/short- and long-term prevalence — verify against the
    measures enum used elsewhere.
    """
    injury_source = (
        self._ss_factory.get_en_injuries_modelable_entity_source(
            modelable_entity_id, model_version_id))
    dim = self.dimensions.get_simulation_dimensions(measure_id)
    # get filters w/ added years if interpolation is needed
    filters = dim.index_dim.to_dict()["levels"]
    req_years = filters["year_id"]
    if not set(req_years).issubset(set(self._estim_years)):
        # widen the read so estimation years are available to interpolate
        filters["year_id"] = list(set(req_years + self._estim_years))
    # read data
    df = injury_source.content(filters=filters)
    if df.empty:
        raise Exception("No data returned for meid:{} and mvid:{}".format(
            modelable_entity_id, model_version_id))
    # add indices to dimensions object from draw source transforms
    dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
    dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
    dim.index_dim.add_level("healthstate_id",
                            df.healthstate_id.unique().tolist())
    dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())
    # interpolate missing years, then keep only the requested years
    if not set(df.year_id.unique()).issuperset(set(req_years)):
        interp_df = pchip_interpolate(df=df,
                                      id_cols=dim.index_names,
                                      value_cols=self._draw_cols,
                                      time_col="year_id",
                                      time_vals=req_years)
        df = df[df.year_id.isin(req_years)]
        df = df.append(interp_df)
    else:
        df = df[df.year_id.isin(req_years)]
    # resample if ndraws is less than 1000
    if len(dim.data_list()) != 1000:
        gbdizer = gbdize.GBDizeDataFrame(dim)
        df = gbdizer.correlated_percentile_resample(df)
    return df
def backfill(df, norway_id, code_dir, loc_meta):
    """Backfill Norway subnational locations from the nearest parent.

    Builds dimensions from the observed index tuples crossed with every
    Norway subnational, then fills locations absent from ``df`` using the
    location hierarchy (location_set_id 35, GBD round 5).
    """
    value_cols = ['cases', 'effective_sample_size', 'sample_size']
    idx_cols = list(set(df.columns) - set(value_cols))
    idx_cols.remove('location_id')
    # every subnational under the Norway national id
    subnats = loc_meta.loc[loc_meta.parent_id == norway_id,
                           'location_id'].tolist()
    observed = {tuple(row) for row in df[idx_cols].values}
    dims = dimensionality.DataFrameDimensions(
        {tuple(idx_cols): list(observed), 'location_id': subnats},
        {'data_cols': value_cols})
    gbdizer = gbdize.GBDizeDataFrame(dims)
    return gbdizer.fill_location_from_nearest_parent(
        df, location_set_id=35, gbd_round_id=5)
def fill_square(df, index_cols, square_col, square_col_vals, fill_val=0):
    """Make data square across ``square_col`` for a set of index columns.

    Every observed index tuple is crossed with ``square_col_vals``;
    combinations absent from ``df`` are filled with ``fill_val``.
    """
    # split the index into square_col and everything else
    other_idx = [c for c in index_cols if c != square_col]
    observed = {tuple(row) for row in df[other_idx].values}
    # any column that is not an index is carried through as data
    carried = [
        c for c in df.columns if c not in other_idx + [square_col]
    ]
    dims = dimensionality.DataFrameDimensions(
        {tuple(other_idx): list(observed), square_col: square_col_vals},
        {"non_draw_cols": carried})
    squarer = gbdize.GBDizeDataFrame(dims)
    return squarer.fill_empty_indices(df, fill_val)
def get_id_dws(self):
    """Load intellectual-disability weights, normalize ages and draw
    column names, expand to GBD age groups, and resample if needed."""
    draw_cols = ["draw_{}".format(i) for i in range(1000)]
    dws = pd.read_csv("filepath/03_custom/"
                      "combined_id_dws.csv")
    # shift age_end to an exclusive bound; 101 becomes the 200 sentinel
    # (presumably the terminal-age marker — TODO confirm)
    dws['age_end'] = dws['age_end'] + 1
    dws['age_end'] = dws.age_end.replace({101: 200})
    # the csv stores columns as draw0..draw999; restore draw_0..draw_999
    dws.rename(columns={d.replace("_", ""): d for d in draw_cols},
               inplace=True)
    dws = dws[["age_start", "age_end", "healthstate_id"] + draw_cols]
    # the weights carry no year or location dimension
    dimensions = deepcopy(self.dims)
    dimensions.index_dim.drop_level("year_id")
    dimensions.index_dim.drop_level("location_id")
    gbdizer = gbdize.GBDizeDataFrame(dimensions)
    # expand continuous [age_start, age_end) ranges onto GBD age groups
    # (the 12 is an opaque argument to the helper — TODO confirm meaning)
    dws = gbdizer.fill_age_from_continuous_range(dws, 12, "age_start",
                                                 "age_end")
    dws = dws.reset_index(drop=True)
    # resample only when the requested draw count differs from 1000
    if len(draw_cols) != len(self.dims.data_list()):
        dws = gbdizer.random_choice_resample(dws)
    self.id_dws = dws
def get_en_matrices(self):
    """Load E-code/N-code matrices per location/sex/year and store the
    combined, resampled frame in ``self.en_matrices``.

    Requires ``self.dimensions`` and ``self.como_version`` to be set.
    """
    if self.dimensions is None or self.como_version is None:
        raise AttributeError(
            "cannot get ncodes via this method if como_version is None or"
            " dimensions is None")
    df_list = []
    # cap requested years to those with stored matrices
    years = list(
        set(
            cap_val(self.dimensions.index_dim.levels.year_id,
                    [1990, 1995, 2000, 2005, 2010, 2016])))
    parallelism = ["location_id", "sex_id"]
    for slices in self.dimensions.index_slices(parallelism):
        for year in years:
            nemat = pd.read_csv(
                "FILEPATH/"
                "NEmatrix_{location_id}_{year_id}_{sex_id}.csv".format(
                    location_id=slices[0], year_id=year, sex_id=slices[1]))
            # map ecode (acause) onto cause_id
            nemat = nemat.merge(self.como_version.cause_list,
                                left_on="ecode",
                                right_on="acause")
            # map ncode onto sequela via the injury dws lookup
            nemat = nemat.merge(self.como_version.injury_dws_by_sequela,
                                left_on="ncode",
                                right_on="n_code")
            nemat = nemat[["cause_id", "age_group_id", "sequela_id"] +
                          self.dimensions.data_list()]
            # stamp the demographic slice the file was read for
            nemat["location_id"] = slices[0]
            nemat["sex_id"] = slices[1]
            nemat["year_id"] = year
            df_list.append(nemat)
    df = pd.concat(df_list)
    dims = deepcopy(self.dimensions)
    dims.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
    dims.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
    dims.index_dim.drop_level("measure_id")
    gbdizer = gbdize.GBDizeDataFrame(dims)
    df = gbdizer.random_choice_resample(df)
    # fill requested years without a stored matrix from the nearest year
    df = gbdizer.fill_year_from_nearest_neighbor(df)
    self.en_matrices = df
def square_data(df, ds):
    """Square ``df`` against the draw source's filter dimensions,
    zero-filling any missing index combinations.

    Empty input is returned unchanged.
    """
    if df.empty:
        return df
    # collect draw source attributes
    applied_filters = ds.content_kwargs["filters"].copy()
    mvid = ds.params['model_version_id']
    meid = ds.params['modelable_entity_id']
    dim = deepcopy(ds.params["dimensions"])
    # rebuild the index levels from the filters actually applied
    for level, values in list(applied_filters.items()):
        dim.index_dim.replace_level(level, values)
    dim.index_dim.add_level("modelable_entity_id", [meid])
    dim.index_dim.add_level("model_version_id", [mvid])
    # restrict years to those present in the data
    dim.index_dim.replace_level("year_id", df.year_id.unique().tolist())
    # fill in empty
    squarer = gbdize.GBDizeDataFrame(dim)
    squared = squarer.add_missing_index_cols(df)
    squared = squarer.fill_empty_indices(squared, 0)
    return squared
def _get_short_term_EN_annual(self, dim):
    """Read annual short-term E/N injury YLD draws and shape them onto
    the simulation dimensions ``dim``.

    Returns a frame collapsed to
    location/year/age/sex/cause/rei with demographics filled (zeros for
    missing combinations) and draws resampled if needed.
    """
    # get non interpolated values
    annual_sg = SuperGopher(
        {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
        os.path.join("filepath", "FILEPATH"))
    annual_df = annual_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    # clean data: map ecode (acause) -> cause_id, ncode (rei) -> rei_id
    annual_df = annual_df.merge(self.como_version.cause_list,
                                left_on="ecode",
                                right_on="acause")
    annual_df = annual_df.merge(self.como_version.ncode_hierarchy,
                                left_on="ncode",
                                right_on="rei")
    # the files store age as a float; stringify to 2 decimals so the
    # lookup below can map each age onto its GBD age_group_id
    annual_df["age"] = annual_df["age"].round(2).astype(str)
    ridiculous_am = {
        '0.0': 2,
        '0.01': 3,
        '0.1': 4,
        '1.0': 5,
        '5.0': 6,
        '10.0': 7,
        '15.0': 8,
        '20.0': 9,
        '25.0': 10,
        '30.0': 11,
        '35.0': 12,
        '40.0': 13,
        '45.0': 14,
        '50.0': 15,
        '55.0': 16,
        '60.0': 17,
        '65.0': 18,
        '70.0': 19,
        '75.0': 20,
        '80.0': 30,
        '85.0': 31,
        '90.0': 32,
        '95.0': 235
    }
    annual_df["age"] = annual_df["age"].replace(ridiculous_am).astype(int)
    annual_df.rename(columns={"age": "age_group_id"}, inplace=True)
    # transform to rate (metric 3 -> metric 1)
    annual_df = transform_metric(annual_df, 3, 1)
    # collapse inpatient
    annual_df = annual_df.groupby([
        "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
        "rei_id"
    ]).sum().reset_index()
    # fill demographics: add missing index columns, square over age, and
    # zero-fill combinations with no data
    gbdizer = gbdize.GBDizeDataFrame(dim)
    annual_df = gbdizer.add_missing_index_cols(annual_df)
    annual_df = gbdizer.gbdize_any_by_dim(annual_df, "age_group_id")
    annual_df.fillna(0, inplace=True)
    # resample if necessary
    annual_df = self.resample_if_needed(annual_df, dim, gbdizer)
    return annual_df
def _get_short_term_EN_estimation(self, dim):
    """Read estimation-year short-term E/N injury YLD draws, interpolate
    any requested years without stored files, and shape the result onto
    the simulation dimensions ``dim``."""
    # get non interpolated values
    estim_sg = SuperGopher(
        {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
        os.path.join("filepath", "03_outputs/01_draws/ylds"))
    # cap requested years to stored years; 2005 is always read because it
    # serves as the rank frame for interpolation below
    years = list(
        set(
            cap_val(dim.index_dim.levels.year_id,
                    [1990, 1995, 2000, 2005, 2010, 2016]) + [2005]))
    estim_df = estim_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))
    # clean data: map ecode (acause) -> cause_id, ncode (rei) -> rei_id
    estim_df = estim_df.merge(self.como_version.cause_list,
                              left_on="ecode",
                              right_on="acause")
    estim_df = estim_df.merge(self.como_version.ncode_hierarchy,
                              left_on="ncode",
                              right_on="rei")
    # the files store age as a float; stringify to 2 decimals so the
    # lookup below can map each age onto its GBD age_group_id
    estim_df["age"] = estim_df["age"].round(2).astype(str)
    ridiculous_am = {
        '0.0': 2,
        '0.01': 3,
        '0.1': 4,
        '1.0': 5,
        '5.0': 6,
        '10.0': 7,
        '15.0': 8,
        '20.0': 9,
        '25.0': 10,
        '30.0': 11,
        '35.0': 12,
        '40.0': 13,
        '45.0': 14,
        '50.0': 15,
        '55.0': 16,
        '60.0': 17,
        '65.0': 18,
        '70.0': 19,
        '75.0': 20,
        '80.0': 30,
        '85.0': 31,
        '90.0': 32,
        '95.0': 235
    }
    estim_df["age"] = estim_df["age"].replace(ridiculous_am).astype(int)
    estim_df.rename(columns={"age": "age_group_id"}, inplace=True)
    # transform to rate (metric 3 -> metric 1)
    estim_df = transform_metric(estim_df, 3, 1)
    # collapse inpatient
    estim_df = estim_df.groupby([
        "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
        "rei_id"
    ]).sum().reset_index()
    # fill demographics: add missing index columns, square over age, and
    # zero-fill combinations with no data
    data_cols = ["draw_{}".format(i) for i in range(1000)]
    gbdizer = gbdize.GBDizeDataFrame(dim)
    estim_df = gbdizer.add_missing_index_cols(estim_df)
    estim_df = gbdizer.gbdize_any_by_dim(estim_df, "age_group_id")
    estim_df.fillna(0, inplace=True)
    # interpolate any requested years still missing, ranking on 2005,
    # then keep only the requested years
    if gbdizer.missing_values(estim_df, "year_id"):
        estim_df = gbdizer.fill_year_by_interpolating(
            df=estim_df,
            rank_df=estim_df[estim_df["year_id"] == 2005],
            data_cols=data_cols)
        estim_df = estim_df[estim_df.year_id.isin(
            dim.index_dim.get_level("year_id"))]
    # resample if necessary
    estim_df = self.resample_if_needed(estim_df, dim, gbdizer)
    return estim_df
def read_inputs(self):
    """get como draws for a single modelable_entity/model_version

    First pass: read draws for every dimensions object in
    ``self.dimensions_q``, gbdize (fill missing index combinations),
    keep the full-draw copies for interpolation, resample, and queue any
    demographics that came back incomplete.  Second pass: interpolate the
    queued missing demographics from the reference draws, then
    concatenate everything and de-duplicate.
    """
    print('Reading draws for (meid, mvid): ({}, {})'.format(
        self.meid, self.mvid))
    if self.super_gopher is None:
        # lazily discover the file layout of the ME's draw directory
        self.super_gopher = SuperGopher.auto(self.meid_data_dir)
    all_draws = []
    reference_draws = []
    missing_dim_q = []
    for dimensions in self.dimensions_q:
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        try:
            draws = self.super_gopher.content(
                location_id=dimensions.index_dim.get_level("location_id"),
                year_id=dimensions.index_dim.get_level("year_id"),
                sex_id=dimensions.index_dim.get_level("sex_id"),
                measure_id=dimensions.index_dim.get_level("measure_id"),
                age_group_id=dimensions.index_dim.get_level(
                    "age_group_id"))
        except InvalidFilter:
            # nothing stored for this filter combination; treat as empty
            draws = pd.DataFrame(columns=dimensions.index_names)
        if not draws.empty:
            # gbdize. aka fill in missing dimensions
            draws = self.gbdize_dimensions(draws, gbdizer)
            # keep a copy of all 1000 draws for interpolation
            reference_draws.append(draws)
            # resample
            draws = self.resample_if_needed(draws, dimensions, gbdizer)
        if len(draws) != dimensions.total_cardinality:
            # queue the demographics still missing for interpolation
            missing = self.missing_dimensions(draws, dimensions)
            missing_dim_q.append(missing)
        all_draws.append(draws)
    # prep for interpolation of missing demographics
    if len(reference_draws) > 0:
        reference_draws = pd.concat(reference_draws)
    else:
        reference_draws = pd.DataFrame(columns=dimensions.index_names)
    missing_dim_q = list(flatten(missing_dim_q))
    for dimensions in missing_dim_q:
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        interp_draws, rank_df = self.get_interpolation_draws(
            reference_draws, dimensions)
        if not interp_draws.empty:
            # gbdize. aka fill in missing dimensions
            interp_draws = self.gbdize_dimensions(interp_draws, gbdizer)
            rank_df = self.gbdize_dimensions(rank_df, gbdizer)
            # case where years are stored as floats, breaks interpolate
            interp_draws['year_id'] = interp_draws['year_id'].astype(int)
            try:
                data_cols = ["draw_{}".format(i) for i in range(1000)]
                interp_draws = gbdizer.fill_year_by_interpolating(
                    interp_draws, rank_df, data_cols)
            except MissingGBDemographics:
                # cannot interpolate; log the gap and fall back to
                # gbdizing the year dimension directly
                print(
                    "(meid: {meid}, mvid: {mvid}) "
                    " Could not interpolate for years: {years}, "
                    "measure: {meas} "
                    "location_id: {loc} "
                    "sex_id: {sex}".format(
                        meid=self.meid,
                        mvid=self.mvid,
                        years=dimensions.index_dim.get_level("year_id"),
                        meas=dimensions.index_dim.get_level("measure_id"),
                        loc=dimensions.index_dim.get_level("location_id"),
                        sex=dimensions.index_dim.get_level("sex_id")))
                interp_draws = self.gbdize_dimensions(
                    interp_draws, gbdizer, "year_id")
            # append draws to reference
            reference_draws = reference_draws.append(interp_draws,
                                                     ignore_index=True)
            draws = interp_draws.loc[interp_draws['year_id'].isin(
                dimensions.index_dim.get_level('year_id'))]
            # resample
            draws = self.resample_if_needed(draws, dimensions, gbdizer)
            all_draws.append(draws)
            # if dimensions overlap, drop duplicates from reference draws
            reference_draws.drop_duplicates(subset=dimensions.index_names,
                                            inplace=True)
    # concatenate all the results
    draws = pd.concat(all_draws)
    # in case dimensions overlap, drop duplicates
    draws.drop_duplicates(inplace=True)
    draws['modelable_entity_id'] = self.meid
    return draws.reset_index(drop=True)