def normalize_age(data: pd.DataFrame, fill_value: Real, cols_to_fill: List[str]) -> pd.DataFrame:
    """Standardize the ``age_group_id`` dimension of ``data`` against the GBD age groups.

    Parameters
    ----------
    data
        Data whose age dimension should be normalized.
    fill_value
        Value used to pad rows for age groups missing from ``data``.
    cols_to_fill
        Value columns that receive ``fill_value`` for padded rows.

    Returns
    -------
    Data with one row per GBD age group where an age dimension applies.
    """
    observed_ages = (
        set(data.age_group_id.unique()) if "age_group_id" in data.columns else set()
    )
    all_gbd_ages = set(utility_data.get_age_group_ids())

    if not observed_ages:
        # Data does not correspond to individuals, so no age column is necessary.
        return data

    if observed_ages == {SPECIAL_AGES["all_ages"]}:
        # Data applies uniformly to all ages: replicate it once per GBD age group.
        replicas = []
        for age_id in all_gbd_ages:
            replica = data.copy()
            replica.loc[:, "age_group_id"] = age_id
            replicas.append(replica)
        data = pd.concat(replicas, ignore_index=True)
    elif observed_ages < all_gbd_ages:
        # Data covers a strict subset of ages: pad the missing groups with fill_value.
        index_cols = list(data.columns.difference(cols_to_fill))
        index_cols.remove("age_group_id")
        full_index = pd.MultiIndex.from_product(
            [data[c].unique() for c in index_cols] + [all_gbd_ages],
            names=index_cols + ["age_group_id"],
        )
        data = (
            data.set_index(index_cols + ["age_group_id"])
            .reindex(full_index, fill_value=fill_value)
            .reset_index()
        )
    # Otherwise observed_ages == all_gbd_ages: already normalized, nothing to do.
    return data
def get_disability_weight(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    """Compute disability weight data for a cause or a sequela.

    For a cause, the disability weight is the prevalence-weighted average of
    its sequelae's weights: sum over sequelae of (prevalence * weight),
    divided by the cause-level prevalence. For a sequela, the weight is
    extracted directly, falling back to all zeros when no data exists.

    Parameters
    ----------
    entity
        Cause or sequela to pull disability weight data for.
    location_id
        Location to pull data for.

    Returns
    -------
    Disability weight data for the entity.
    """
    if entity.kind == "cause":
        # Start from an all-zero demographic frame and accumulate sequela contributions.
        data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
        data = data.set_index(
            utilities.get_ordered_index_cols(data.columns.difference(DRAW_COLUMNS))
        )
        if entity.sequelae:
            for sequela in entity.sequelae:
                try:
                    prevalence = get_data(sequela, "prevalence", location_id)
                except DataDoesNotExistError:
                    # sequela prevalence does not exist so no point continuing with this sequela
                    continue
                disability = get_data(sequela, "disability_weight", location_id)
                # Force the location level to match so the index-aligned
                # multiplication/addition below lines up with `data`.
                disability.index = disability.index.set_levels(
                    [location_id], level="location_id"
                )
                data += prevalence * disability
        cause_prevalence = get_data(entity, "prevalence", location_id)
        # Convert the weighted sum to an average; zero prevalence yields NaN,
        # which is replaced with 0.
        data = (data / cause_prevalence).fillna(0).reset_index()
    else:  # entity.kind == 'sequela'
        try:
            data = extract.extract_data(entity, "disability_weight", location_id)
            data = utilities.normalize(data)
            # The parent cause's restrictions determine which demographics may
            # carry a nonzero weight. IndexError here means no parent was found.
            cause = [c for c in causes if c.sequelae and entity in c.sequelae][0]
            data = utilities.clear_disability_weight_outside_restrictions(
                data, cause, 0.0, utility_data.get_age_group_ids()
            )
            data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
        except (IndexError, DataDoesNotExistError):
            logger.warning(
                f"{entity.name.capitalize()} has no disability weight data. All values will be 0."
            )
            data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
    return data
def get_exposure_and_restriction_ages(exposure: pd.DataFrame, entity: RiskFactor) -> set:
    """Intersect the age groups present in exposure data with the entity's age restrictions.

    Exposure data is pre-filtered by age restrictions at extraction time, so
    other risk measures cannot be filtered by exposure ages alone; this
    intersection provides the correct filter.

    Parameters
    ----------
    exposure
        Exposure data for `entity`.
    entity
        Entity for which to find the intersecting exposure and restriction ages.

    Returns
    -------
    Set of age groups found in both the entity's exposure data and in the
    entity's age restrictions.
    """
    ages_in_exposure = set(exposure.age_group_id)
    age_start, age_end = get_age_group_ids_by_restriction(entity, "outer")
    ages_in_restrictions = set(
        get_restriction_age_ids(age_start, age_end, utility_data.get_age_group_ids())
    )
    return ages_in_exposure & ages_in_restrictions
def load_lbwsg_exposure(key: str, location: str):
    """Load LBWSG exposure from a pre-extracted HDF file and prepare it for simulation.

    Reads the cached exposure data, filters it to valid categories and
    restricted demographics, renormalizes category exposures to sum to 1,
    validates, and returns the data in simulation-ready hierarchical form.

    Parameters
    ----------
    key
        Artifact key for the data (unused beyond the loader signature).
    location
        Location name to load data for.

    Returns
    -------
    Sorted, hierarchical LBWSG exposure data.
    """
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data.drop('modelable_entity_id', 'columns')
    # LBWSG data has an extra residual category added by get_draws.
    data = data[data.parameter != 'cat124']
    data = utilities.filter_data_by_restrictions(
        data, risk_factors.low_birth_weight_and_short_gestation,
        'outer', utility_data.get_age_group_ids()
    )
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat(
        [utilities.normalize(exposed, fill_value=0),
         utilities.normalize(unexposed, fill_value=1)],
        ignore_index=True
    )
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    # Divide each category's draws by the per-demographic totals (index-aligned).
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(
        data, risk_factors.low_birth_weight_and_short_gestation, 'exposure', location
    )
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lbwsg_paf(key: str, location: str):
    """Load LBWSG population attributable fraction data from a pre-extracted HDF file.

    Filters to percent-metric, YLL-measure rows, applies cause-level age
    restrictions per affected cause, maps the data onto affected
    entity/measure columns, validates, and returns simulation-ready data.

    Parameters
    ----------
    key
        Artifact key for the data (unused beyond the loader signature).
    location
        Location name to load data for.

    Returns
    -------
    Sorted, hierarchical LBWSG PAF data.
    """
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]
    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)
    data = utilities.convert_affected_entity(data, 'cause_id')
    # YLL PAFs act on the excess mortality rate in the simulation.
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(
        data, risk_factors.low_birth_weight_and_short_gestation,
        'population_attributable_fraction', location
    )
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_deaths(entity: Cause, location_id: int) -> pd.DataFrame:
    """Pull death-count data for a cause, filtered by its YLL restrictions.

    Parameters
    ----------
    entity
        Cause to pull deaths data for.
    location_id
        Location to pull data for.

    Returns
    -------
    Normalized deaths data restricted to demographic and draw columns.
    """
    raw = extract.extract_data(entity, "deaths", location_id)
    restricted = utilities.filter_data_by_restrictions(
        raw, entity, "yll", utility_data.get_age_group_ids()
    )
    normalized = utilities.normalize(restricted, fill_value=0)
    return normalized.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
def write_ckd_data(artifact, location): load = get_load(location) # Metadata key = f'cause.chronic_kidney_disease.restrictions' artifact.write(key, load(key)) # Measures for Disease Model key = f'cause.chronic_kidney_disease.cause_specific_mortality_rate' csmr = load(key) artifact.write(key, csmr.copy()) # Measures for Disease States key = 'cause.chronic_kidney_disease.prevalence' prevalence = load(key) artifact.write(key, prevalence.copy()) key = 'cause.chronic_kidney_disease.disability_weight' df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id, utility_data.get_location_id(location)) ylds = df[df.measure_id == globals.MEASURES['YLDs']] ylds = utilities.filter_data_by_restrictions( ylds, causes.chronic_kidney_disease, 'yld', utility_data.get_age_group_ids()) ylds = utilities.normalize(ylds, fill_value=0) ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS) ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS) ylds = utilities.scrub_gbd_conventions(ylds, location) ylds = split_interval(ylds, interval_column='age', split_column_prefix='age') ylds = split_interval(ylds, interval_column='year', split_column_prefix='year') ylds = utilities.sort_hierarchical_data(ylds) dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0) artifact.write(key, dw) key = 'cause.chronic_kidney_disease.excess_mortality_rate' emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0) artifact.write(key, emr) # Measures for Transitions key = 'cause.chronic_kidney_disease.incidence_rate' data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location) data = utilities.scrub_gbd_conventions(data, location) data = utilities.split_interval(data, interval_column='age', split_column_prefix='age') data = utilities.split_interval(data, interval_column='year', split_column_prefix='year') data = utilities.sort_hierarchical_data(data) data[ data > 50] = 50 # Russia has absurdly high values in some of the data and it breaks 
validation. artifact.write(key, data)
def _load_diarrhea_sequela_disability_weight(sequela, location_id: int):
    """Load a diarrhea sequela's disability weight from the GBD 2016 auxiliary data.

    Parameters
    ----------
    sequela
        Sequela whose healthstate identifies the weight rows to keep.
    location_id
        Location to pull data for.

    Returns
    -------
    Reshaped disability weight data, zeroed outside the diarrheal-disease
    age/sex restrictions.
    """
    logger.info(f'Loading disability weight for {sequela.name} from GBD 2016.')
    weights = extract.get_auxiliary_data('disability_weight', 'sequela', 'all', location_id)
    weights = weights.loc[weights.healthstate_id == sequela.healthstate.gbd_id, :]
    weights = standardize.normalize(weights)
    weights = utilities.clear_disability_weight_outside_restrictions(
        weights, causes.diarrheal_diseases, 0.0, utility_data.get_age_group_ids()
    )
    trimmed = weights.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    return utilities.reshape(trimmed)
def _load_prevalence(entity, location_id: int, entity_type: str):
    """Load 2016 prevalence draws for an entity from the GBD COMO results.

    Parameters
    ----------
    entity
        Entity to pull prevalence for.
    location_id
        Location to pull data for.
    entity_type
        Kind of entity being pulled (passed through to the extractor).

    Returns
    -------
    Reshaped prevalence data filtered by diarrheal-disease YLD restrictions.
    """
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    draws = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    draws = draws[draws.measure_id == vi_globals.MEASURES['Prevalence']]
    draws = utilities.filter_data_by_restrictions(
        draws, causes.diarrheal_diseases, 'yld', utility_data.get_age_group_ids()
    )
    # Use latest GBD results for all data
    draws = draws[draws.year_id == 2016].drop(columns='year_id')
    draws = standardize.normalize(draws, fill_value=0)
    trimmed = draws.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    return utilities.reshape(trimmed)
def get_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    """Pull prevalence data for a cause or sequela.

    A sequela carries no restrictions of its own, so its parent cause's YLD
    restrictions are applied instead.

    Parameters
    ----------
    entity
        Cause or sequela to pull prevalence for.
    location_id
        Location to pull data for.

    Returns
    -------
    Normalized prevalence data restricted to demographic and draw columns.
    """
    data = extract.extract_data(entity, "prevalence", location_id)
    if entity.kind == "cause":
        owner = entity
    else:  # sequela: restrictions come from the parent cause
        owner = [c for c in causes if c.sequelae and entity in c.sequelae][0]
    data = utilities.filter_data_by_restrictions(
        data, owner, "yld", utility_data.get_age_group_ids()
    )
    data = utilities.normalize(data, fill_value=0)
    return data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
def _get_raw_demographic_dimensions(location: str):
    """Build the full location/sex/age/year demographic grid for ``location``.

    Parameters
    ----------
    location
        Location name to resolve to a GBD location id.

    Returns
    -------
    Normalized, reshaped frame with one row per demographic combination.
    """
    dimensions = [
        [extract.get_location_id(location)],
        [vi_globals.SEXES['Male'], vi_globals.SEXES['Female']],
        utility_data.get_age_group_ids(),
        range(project_globals.MIN_YEAR, project_globals.MAX_YEAR + 1),
    ]
    dimension_names = ['location_id', 'sex_id', 'age_group_id', 'year_id']
    grid = pd.MultiIndex.from_product(dimensions, names=dimension_names).to_frame(index=False)
    grid = standardize.normalize(grid)
    return utilities.reshape(grid)
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function population attributable fraction data.

    Extracts PAF and relative risk draws, drops YLL-only causes, filters PAFs
    by relative risk availability and cause-level age restrictions, maps rows
    onto affected entity/measure columns, validates, and returns
    simulation-ready hierarchical data.

    Parameters
    ----------
    key
        Artifact key string identifying the entity and measure.
    location
        Location name to load data for.

    Returns
    -------
    Sorted, hierarchical PAF data.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)
    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    # YLL-only causes are not supported; drop them from both data sets.
    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]
    # Keep only PAF rows backed by relative risk data for the same cause.
    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))
    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)
    data = utilities.convert_affected_entity(data, 'cause_id')
    # YLL PAFs affect mortality; YLD PAFs affect incidence.
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_exposure(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    """Pull exposure data for a risk factor or alternative risk factor.

    Drops any extra residual category, applies age/sex restrictions, and for
    categorical (dichotomous/polytomous) risks renormalizes category
    exposures so they sum to 1 within each demographic group.

    Parameters
    ----------
    entity
        Risk factor or alternative risk factor to pull exposure for.
    location_id
        Location to pull data for.

    Returns
    -------
    Exposure data restricted to demographic, draw, and parameter columns.
    """
    data = extract.extract_data(entity, "exposure", location_id)
    data = data.drop("modelable_entity_id", "columns")

    if entity.name in EXTRA_RESIDUAL_CATEGORY:
        # Remove the residual category, then clip so no exposure is exactly 0.
        cat = EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query("parameter == @cat").index)
        data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)

    if entity.kind in ["risk_factor", "alternative_risk_factor"]:
        data = utilities.filter_data_by_restrictions(
            data, entity, "outer", utility_data.get_age_group_ids()
        )

    if entity.distribution in ["dichotomous", "ordered_polytomous", "unordered_polytomous"]:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]
        # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
        data = pd.concat(
            [
                utilities.normalize(exposed, fill_value=0),
                utilities.normalize(unexposed, fill_value=1),
            ],
            ignore_index=True,
        )

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(DRAW_COLUMNS + ["parameter"]))
        sums = data.groupby(cols)[DRAW_COLUMNS].sum()
        # Divide each category's draws by the per-demographic totals (index-aligned).
        data = (
            data.groupby("parameter")
            .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
            .reset_index()
        )
    else:
        data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS + ["parameter"])
    return data
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function exposure data and prepare it for simulation.

    Pulls raw exposure draws, standardizes the category distribution, applies
    restrictions, renormalizes category exposures to sum to 1 per demographic
    group, validates, and returns simulation-ready hierarchical data.

    Parameters
    ----------
    key
        Artifact key string identifying the entity and measure.
    location
        Location name (or, presumably, an already-resolved location id —
        the isinstance check below accepts either; confirm against callers).

    Returns
    -------
    Sorted, hierarchical exposure data.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop('modelable_entity_id', 'columns')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    # Divide each category's draws by the per-demographic totals (index-aligned).
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_shigella_remission_rate(key: EntityKey, location: str):
    """Load the 2016 diarrheal-disease remission rate for use as the shigella remission rate.

    Parameters
    ----------
    key
        Artifact key for the data (unused beyond the loader signature).
    location
        Location name to load data for.

    Returns
    -------
    Sorted, hierarchical remission rate data.
    """
    location_id = extract.get_location_id(location)
    draws = extract.get_modelable_entity_draws(
        causes.diarrheal_diseases.dismod_id, location_id
    )
    draws = draws[draws.measure_id == vi_globals.MEASURES['Remission rate']]
    draws = utilities.filter_data_by_restrictions(
        draws, causes.diarrheal_diseases, 'yld', utility_data.get_age_group_ids()
    )
    # Use latest GBD results for all data
    draws = draws[draws.year_id == 2016].drop(columns='year_id')
    draws = standardize.normalize(draws, fill_value=0)
    draws = draws.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    shaped = utilities.reshape(draws)
    shaped = utilities.scrub_gbd_conventions(shaped, location)
    shaped = utilities.split_interval(shaped, interval_column='age', split_column_prefix='age')
    shaped = utilities.split_interval(shaped, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(shaped)
def get_population_attributable_fraction(
    entity: Union[RiskFactor, Etiology], location_id: int
) -> pd.DataFrame:
    """Pull population attributable fraction data for a risk factor or etiology.

    Risk factor PAFs are filtered against relative risk availability and
    per-cause age restrictions; etiology PAFs are filtered by the parent
    cause's restrictions and have negative values zeroed. Rows are then
    mapped onto affected entity/measure columns and normalized.

    Parameters
    ----------
    entity
        Risk factor or etiology to pull PAF data for.
    location_id
        Location to pull data for.

    Returns
    -------
    Normalized PAF data with affected_entity and affected_measure columns.
    """
    causes_map = {c.gbd_id: c for c in causes}
    if entity.kind == "risk_factor":
        data = extract.extract_data(entity, "population_attributable_fraction", location_id)
        relative_risk = extract.extract_data(entity, "relative_risk", location_id)

        # FIXME: we don't currently support yll-only causes so I'm dropping them because the data in some cases is
        #  very messed up, with mort = morb = 1 (e.g., aortic aneurysm in the RR data for high systolic bp) -
        #  2/8/19 K.W.
        yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
        data = data[~data.cause_id.isin(yll_only_causes)]
        relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

        # Keep only PAF rows backed by relative risk data for the same cause.
        data = (
            data.groupby("cause_id", as_index=False)
            .apply(filter_by_relative_risk, relative_risk)
            .reset_index(drop=True)
        )

        temp = []
        # We filter paf age groups by cause level restrictions.
        for (c_id, measure), df in data.groupby(["cause_id", "measure_id"]):
            cause = causes_map[c_id]
            measure = "yll" if measure == MEASURES["YLLs"] else "yld"
            df = utilities.filter_data_by_restrictions(
                df, cause, measure, utility_data.get_age_group_ids()
            )
            temp.append(df)
        data = pd.concat(temp, ignore_index=True)
    else:  # etiology
        data = extract.extract_data(
            entity, "etiology_population_attributable_fraction", location_id
        )
        # An etiology inherits restrictions from its parent cause.
        cause = [c for c in causes if entity in c.etiologies][0]
        data = utilities.filter_data_by_restrictions(
            data, cause, "inner", utility_data.get_age_group_ids()
        )
        if np.any(data[DRAW_COLUMNS] < 0):
            logger.warning(
                f"{entity.name.capitalize()} has negative values for paf. These will be replaced with 0."
            )
            # Move non-draw columns into the index so `where` operates on draws only.
            other_cols = [c for c in data.columns if c not in DRAW_COLUMNS]
            data.set_index(other_cols, inplace=True)
            data = data.where(data[DRAW_COLUMNS] > 0, 0).reset_index()
    data = utilities.convert_affected_entity(data, "cause_id")
    # YLL PAFs affect mortality; YLD PAFs affect incidence.
    data.loc[
        data["measure_id"] == MEASURES["YLLs"], "affected_measure"
    ] = "excess_mortality_rate"
    data.loc[data["measure_id"] == MEASURES["YLDs"], "affected_measure"] = "incidence_rate"
    data = (
        data.groupby(["affected_entity", "affected_measure"])
        .apply(utilities.normalize, fill_value=0)
        .reset_index(drop=True)
    )
    data = data.filter(
        DEMOGRAPHIC_COLUMNS + ["affected_entity", "affected_measure"] + DRAW_COLUMNS
    )
    return data