def load_lbwsg_exposure(key: str, location: str) -> pd.DataFrame:
    """Load cached LBWSG exposure data and prepare it for the simulation.

    Reads the pre-extracted HDF file for ``location``, filters to valid
    categories and restricted age groups, renormalizes category exposures to
    sum to one, validates, and returns the data in the hierarchical wide-draw
    format the artifact expects.

    Parameters
    ----------
    key
        Artifact key (unused beyond the standard loader signature).
    location
        Location name used to find the cached file and scrub GBD conventions.
    """
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    # FIX: pass the axis by keyword. `drop(labels, 'columns')` relies on a
    # positional axis argument that pandas deprecated and later removed.
    data = data.drop(columns='modelable_entity_id')
    # LBWSG data has an extra residual category added by get_draws.
    data = data[data.parameter != 'cat124']
    data = utilities.filter_data_by_restrictions(
        data, risk_factors.low_birth_weight_and_short_gestation,
        'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def validate_and_reshape_gbd_data(
        data: pd.DataFrame,
        entity: ModelableEntity,
        key: EntityKey,
        location: str,
        gbd_round_id: int,
        age_group_ids: List[int] = None) -> pd.DataFrame:
    """Reshape raw GBD draws into the simulation's hierarchical format.

    Mirrors the post-processing that ``vivarium_inputs`` applies in
    ``core.get_data`` and ``interface.get_measure``: reshape draws, scrub GBD
    ids into human-readable conventions, split age/year intervals, sort, and
    drop the location index level.

    ``age_group_ids`` may be None, in which case ``_scrub_gbd_conventions``
    decides the age bins (presumably its own default set -- confirm there).
    """
    # from vivarium_inputs.core.get_data
    data = vi_utils.reshape(data, value_cols=vi_globals.DRAW_COLUMNS)
    # from interface.get_measure
    data = _scrub_gbd_conventions(data, location, age_group_ids)
    estimation_years = get_gbd_estimation_years(gbd_round_id)
    # One-year bins spanning the full estimation range.
    validation_years = pd.DataFrame({
        'year_start': range(min(estimation_years), max(estimation_years) + 1)
    })
    validation_years['year_end'] = validation_years['year_start'] + 1
    # NOTE(review): simulation-level validation is intentionally disabled here;
    # validation_years is computed but only consumed by the commented-out call.
    # validate_for_simulation(data, entity, key.measure, location, years=validation_years,
    #                         age_bins=get_gbd_age_bins(age_group_ids))
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    # Location is constant per-artifact, so the level is dropped after sorting.
    data = vi_utils.sort_hierarchical_data(data).droplevel('location')
    return data
def load_forecast_data(key: EntityKey, location: str):
    """Load forecast data for ``key`` from its xarray file and standardize it.

    Filters to the project's forecasting scenario, expands the data to 1000
    draw columns (duplicating draws where the source provides fewer), and
    returns it in the hierarchical artifact format.
    """
    location_id = extract.get_location_id(location)
    path = paths.forecast_data_path(key)
    data = extract.load_forecast_from_xarray(path, location_id)
    data = data[data.scenario == project_globals.FORECASTING_SCENARIO].drop(
        columns='scenario')
    if key == EntityKey('etiology.shigellosis.incidence'):
        # Only one draw for incidence
        # Duplicate the single 'value' series NUM_DRAWS times side by side.
        data = pd.concat(project_globals.NUM_DRAWS * [
            data.set_index(
                ['location_id', 'age_group_id', 'sex_id', 'year_id']).value
        ], axis=1)
    else:
        data = data.set_index(
            ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
        if len(data.columns) == 100:
            # Not 1000 draws for everything
            data = pd.concat([data] * 10, axis=1)
    # Both branches end with exactly 1000 columns; relabel them draw_0..draw_999.
    data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    data = data.reset_index()
    data = standardize.normalize(data)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lbwsg_relative_risk(key: str, location: str):
    """Load cached LBWSG relative risks and prepare them for the simulation.

    Uses the diarrheal-disease RRs as a stand-in for all causes (they are
    identical per the comment below), targets excess mortality, normalizes,
    snaps TMREL draws to exactly 1.0, validates, and reshapes.
    """
    path = paths.lbwsg_data_path('relative_risk', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = utilities.convert_affected_entity(data, 'cause_id')
    # RRs for all causes are the same.
    data = data[data.affected_entity == 'diarrheal_diseases']
    data['affected_entity'] = 'all'
    # All lbwsg risk is about mortality.
    data.loc[:, 'affected_measure'] = 'excess_mortality_rate'
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)
    data = (
        data
        .groupby(['affected_entity', 'parameter'])
        .apply(utilities.normalize, fill_value=1)
        .reset_index(drop=True)
    )
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    tmrel_mask = data.parameter == tmrel_cat
    # Snap TMREL draws that are numerically ~1.0 to exactly 1.0 so downstream
    # validation sees a clean reference category.
    data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS] = (
        data
        .loc[tmrel_mask, vi_globals.DRAW_COLUMNS]
        .mask(np.isclose(data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS], 1.0), 1.0)
    )
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'relative_risk', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_shigella_disability_weight(key: EntityKey, location: str):
    """Compute the prevalence-weighted disability weight for diarrheal disease.

    Sums prevalence * disability-weight over every diarrheal-disease sequela,
    then divides by total diarrheal-disease prevalence to get the average
    disability among the prevalent population.
    """
    location_id = extract.get_location_id(location)
    data = _get_raw_demographic_dimensions(location)
    # Start from a zero frame over the full demographic index and accumulate.
    data = pd.DataFrame(0, columns=vi_globals.DRAW_COLUMNS, index=data.index)
    for sequela in causes.diarrheal_diseases.sequelae:
        prevalence = _load_prevalence(sequela, location_id, 'sequela')
        disability = _load_diarrhea_sequela_disability_weight(
            sequela, location_id)
        # FIX: pass `level` by keyword. MultiIndex.set_levels(levels, level)
        # with a positional level argument is deprecated and removed in
        # pandas 2.x. The disability data is location-agnostic, so its
        # location level is overwritten to align with the prevalence index.
        disability.index = disability.index.set_levels([location_id], level='location_id')
        data += prevalence * disability
    diarrhea_prevalence = _load_prevalence(causes.diarrheal_diseases, location_id, 'cause')
    # Cells with zero prevalence divide to NaN and are filled with 0.
    data = (data / diarrhea_prevalence).fillna(0).reset_index()
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lbwsg_paf(key: str, location: str):
    """Load cached LBWSG PAFs and prepare them for the simulation.

    Keeps percent-metric YLL PAFs only, filters each cause's rows by its age
    restrictions, maps causes to affected entities targeting excess
    mortality, normalizes, validates, and reshapes.
    """
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]
    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        # Only YLLs survive the filter above, so this always resolves to 'yll'.
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def write_utilization_rate(artifact, location):
    """Sample outpatient utilization-rate draws and write them to the artifact.

    Reads bundled external utilization data, fits a lognormal from the mean
    and 95% interval, samples 1000 draws, and writes the standardized result.
    """
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    # FIX: plain string literal -- the original carried an f-prefix with no
    # placeholders.
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    # NOTE(review): a 95% CI spans ~2*1.96 sd, so dividing the full width by
    # 1.96 (rather than 3.92) doubles the sd. load_healthcare_utilization uses
    # the same formula, so this may be a deliberate convention -- confirm
    # before changing.
    data['log_sd'] = (
        np.log(data['outpatient_visits_per_cap_95_upper'])
        - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    # NOTE(review): np.random is unseeded here, so draws are not reproducible
    # across runs.
    draws = np.exp(
        np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                         size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat(
        [data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
        axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)
def write_ckd_data(artifact, location): load = get_load(location) # Metadata key = f'cause.chronic_kidney_disease.restrictions' artifact.write(key, load(key)) # Measures for Disease Model key = f'cause.chronic_kidney_disease.cause_specific_mortality_rate' csmr = load(key) artifact.write(key, csmr.copy()) # Measures for Disease States key = 'cause.chronic_kidney_disease.prevalence' prevalence = load(key) artifact.write(key, prevalence.copy()) key = 'cause.chronic_kidney_disease.disability_weight' df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id, utility_data.get_location_id(location)) ylds = df[df.measure_id == globals.MEASURES['YLDs']] ylds = utilities.filter_data_by_restrictions( ylds, causes.chronic_kidney_disease, 'yld', utility_data.get_age_group_ids()) ylds = utilities.normalize(ylds, fill_value=0) ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS) ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS) ylds = utilities.scrub_gbd_conventions(ylds, location) ylds = split_interval(ylds, interval_column='age', split_column_prefix='age') ylds = split_interval(ylds, interval_column='year', split_column_prefix='year') ylds = utilities.sort_hierarchical_data(ylds) dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0) artifact.write(key, dw) key = 'cause.chronic_kidney_disease.excess_mortality_rate' emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0) artifact.write(key, emr) # Measures for Transitions key = 'cause.chronic_kidney_disease.incidence_rate' data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location) data = utilities.scrub_gbd_conventions(data, location) data = utilities.split_interval(data, interval_column='age', split_column_prefix='age') data = utilities.split_interval(data, interval_column='year', split_column_prefix='year') data = utilities.sort_hierarchical_data(data) data[ data > 50] = 50 # Russia has absurdly high values in some of the data and it breaks 
validation. artifact.write(key, data)
def write_hypertension_medication_data(artifact, location):
    """Generate draws for each hypertension-medication dataset and write them.

    Each entry in the spec maps an external dataset name to the columns used
    to seed its draw generation and the sampling distribution to use.
    """
    external_data_specification = {
        'adherence': {
            'seed_columns': ['location'],  # all adherence will use the same seeds
            'distribution': 'beta',
        },
        'medication_probabilities': {
            'seed_columns': [
                'location', 'measure', 'thiazide_type_diuretics',
                'beta_blockers', 'ace_inhibitors', 'angiotensin_ii_blockers',
                'calcium_channel_blockers'
            ],
            'distribution': 'beta',
        },
        'therapy_category': {
            'seed_columns': ['location', 'therapy_category'],
            'distribution': 'beta',
        },
        'treatment_coverage': {
            'seed_columns': ['location', 'measure'],
            'distribution': 'beta',
        },
        'drug_efficacy': {
            'seed_columns': [
                'location', 'drug'
            ],  # don't include dosage so all dosages of same drug will use same seeds
            'distribution': 'normal',
        },
    }
    for k, spec in external_data_specification.items():
        data = load_external_data(k, location)
        data = generate_draws(data, spec['seed_columns'], spec['distribution'])
        if set(data.location) == {'Global'}:
            # do this post draw generation so all locations use the same draws if data is global
            data.location = location
        if k == 'medication_probabilities':
            # drop ACE + ARB single pill because not used
            data = data.loc[~((data.ace_inhibitors == 1)
                              & (data.angiotensin_ii_blockers == 1))]
        data = utilities.sort_hierarchical_data(utilities.reshape(data))
        if k == 'therapy_category':
            # normalize so that sum of all categories = 1
            data = data.divide(data.sum(axis=0), axis=1)
        key = f'health_technology.hypertension_medication.{k}'
        data = split_interval(data, interval_column='age', split_column_prefix='age')
        data = split_interval(data, interval_column='year', split_column_prefix='year')
        artifact.write(key, data)
def _load_em_from_meid(location, meid, measure):
    """Pull draws for one modelable entity/measure and standardize them.

    Fetches the modelable-entity draws for ``meid`` at ``location``, keeps
    only rows for ``measure``, and runs the usual normalize/reshape/scrub/
    split/sort pipeline.
    """
    loc_id = utility_data.get_location_id(location)
    draws = gbd.get_modelable_entity_draws(meid, loc_id)
    measure_rows = draws.measure_id == vi_globals.MEASURES[measure]
    draws = vi_utils.normalize(draws[measure_rows], fill_value=0)
    keep_cols = vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS
    draws = vi_utils.reshape(draws.filter(keep_cols))
    draws = vi_utils.scrub_gbd_conventions(draws, location)
    for interval in ('age', 'year'):
        draws = vi_utils.split_interval(draws, interval_column=interval,
                                        split_column_prefix=interval)
    return vi_utils.sort_hierarchical_data(draws)
def _load_diarrhea_sequela_disability_weight(sequela, location_id: int):
    """Pull GBD 2016 disability-weight draws for one diarrheal sequela."""
    logger.info(f'Loading disability weight for {sequela.name} from GBD 2016.')
    weights = extract.get_auxiliary_data('disability_weight', 'sequela', 'all', location_id)
    matches_healthstate = weights.healthstate_id == sequela.healthstate.gbd_id
    weights = standardize.normalize(weights.loc[matches_healthstate, :])
    # Zero the weight wherever diarrheal-disease restrictions say the
    # sequela cannot occur.
    weights = utilities.clear_disability_weight_outside_restrictions(
        weights, causes.diarrheal_diseases, 0.0, utility_data.get_age_group_ids())
    keep_cols = vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS
    return utilities.reshape(weights.filter(keep_cols))
def _load_prevalence(entity, location_id: int, entity_type: str):
    """Pull GBD 2016 prevalence draws for a cause or sequela and reshape them."""
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    prev = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    prev = prev[prev.measure_id == vi_globals.MEASURES['Prevalence']]
    prev = utilities.filter_data_by_restrictions(
        prev, causes.diarrheal_diseases, 'yld', utility_data.get_age_group_ids())
    # Use latest GBD results for all data
    prev = prev[prev.year_id == 2016].drop(columns='year_id')
    prev = standardize.normalize(prev, fill_value=0)
    keep_cols = vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS
    return utilities.reshape(prev.filter(keep_cols))
def _get_raw_demographic_dimensions(location: str):
    """Build the full demographic index for ``location`` as an empty frame.

    The cartesian product of location, sex, age group, and project years is
    normalized and reshaped so callers get an index aligned with other data.
    """
    dimensions = {
        'location_id': [extract.get_location_id(location)],
        'sex_id': [vi_globals.SEXES['Male'], vi_globals.SEXES['Female']],
        'age_group_id': utility_data.get_age_group_ids(),
        'year_id': range(project_globals.MIN_YEAR, project_globals.MAX_YEAR + 1),
    }
    index = pd.MultiIndex.from_product(dimensions.values(), names=list(dimensions))
    frame = index.to_frame(index=False)
    frame = standardize.normalize(frame)
    return utilities.reshape(frame)
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function PAFs and prepare them for the simulation.

    Drops YLL-only causes, cross-filters PAFs against the corresponding
    relative risks, applies cause-level age restrictions, maps measures to
    affected measures, normalizes, validates, and reshapes.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)
    data = extract.extract_data(entity, 'population_attributable_fraction',
                                location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk',
                                         location_id, validate=False)
    # Causes affecting only YLLs are excluded from both PAFs and RRs.
    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]
    # Keep only PAF rows consistent with the available relative risks.
    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))
    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)
    data = utilities.convert_affected_entity(data, 'cause_id')
    # YLL PAFs act on excess mortality; YLD PAFs act on incidence.
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_location_specific_life_expectancy(key: EntityKey, location: str):
    """Load location-specific life expectancy and back-fill pre-2025 years.

    Years before 2025 are filled with the 2025 values, then the data is
    normalized across sex and year and reshaped for the artifact.
    """
    location_id = extract.get_location_id(location)
    data = extract.get_location_specific_life_expectancy(location_id)
    data = data.rename(columns={'age': 'age_start'})
    # Each age interval ends where the next starts; the final open-ended bin
    # is capped at 5.01 -- presumably this table covers early-childhood ages
    # only; TODO confirm against the extraction.
    data['age_end'] = data.age_start.shift(-1).fillna(5.01)
    # 2025 is treated as the earliest available year; copy it backwards to
    # cover the simulation's full year range.
    earliest_year = data[data.year_id == 2025]
    out = []
    for year in range(project_globals.MIN_YEAR, 2025):
        df = earliest_year.copy()
        df['year_id'] = year
        out.append(df)
    data = pd.concat(out + [data], ignore_index=True)
    data = utilities.normalize_sex(data, None, ['value'])
    data = standardize.normalize_year(data)
    data = utilities.reshape(data, value_cols=['value'])
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lri_birth_prevalence_from_meid(_, location):
    """Ignore the first argument to fit in to the get_data model.

    Pulls LRI birth prevalence draws for the project's configured modelable
    entity and standardizes them (age is fixed at birth, so the age column is
    dropped from the demographic index).
    """
    location_id = utility_data.get_location_id(location)
    prevalence_measure = vi_globals.MEASURES['Prevalence']
    draws = get_draws('modelable_entity_id',
                      project_globals.LRI_BIRTH_PREVALENCE_MEID,
                      source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                      age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                      measure_id=prevalence_measure,
                      gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                      location_id=location_id)
    draws = draws[draws.measure_id == prevalence_measure]
    draws = utilities.normalize(draws, fill_value=0)
    # Birth prevalence has a single (birth) age group, so age is not indexed.
    index_cols = [c for c in vi_globals.DEMOGRAPHIC_COLUMNS if c != 'age_group_id']
    draws = draws.filter(index_cols + vi_globals.DRAW_COLUMNS)
    draws = utilities.reshape(draws)
    draws = utilities.scrub_gbd_conventions(draws, location)
    draws = utilities.split_interval(draws, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(draws)
def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    """Sample outpatient utilization draws from the external utilization file.

    Fits a lognormal per row from the mean and 95% interval, samples 1000
    draws, and standardizes the result for the artifact.
    """
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION,
                       dtype={'location_id': np.int64, 'sex_id': np.int64,
                              'age_group_id': np.int64, 'year_id': np.int64,
                              'outpatient_visits_per_cap_mean': np.float64,
                              'outpatient_visits_per_cap_95_upper': np.float64,
                              'outpatient_visits_per_cap_95_lower': np.float64})
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    # NOTE(review): a 95% CI spans ~2*1.96 sd; dividing the full width by 1.96
    # (rather than 3.92) doubles the sd. write_utilization_rate uses the same
    # formula, so this may be a deliberate convention -- confirm.
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    # NOTE(review): np.random is unseeded; draws are not reproducible across runs.
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
                     axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_live_births_by_year(key: EntityKey, location: str):
    """Compute forecast live births per year as ASFR * female population.

    Loads age-specific fertility rate and population forecasts for the
    project scenario, multiplies them draw-wise, and sums over age and sex to
    get total live births per location-year.
    """
    location_id = extract.get_location_id(location)
    asfr_key = EntityKey('covariate.age_specific_fertility_rate.estimate')
    pop_key = EntityKey(project_globals.POPULATION_STRUCTURE)
    asfr_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(asfr_key), location_id)
    asfr_data = asfr_data[
        (asfr_data.scenario == project_globals.FORECASTING_SCENARIO)
        & (asfr_data.year_id >= project_globals.MIN_YEAR)].drop(
            columns='scenario')
    asfr_data = asfr_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    asfr_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    pop_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(pop_key), location_id)
    pop_data = pop_data[(
        pop_data.scenario == project_globals.FORECASTING_SCENARIO)].drop(
            columns='scenario')
    pop_data = pop_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    pop_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    # Align population to the (year-filtered) ASFR index before multiplying.
    pop_data = pop_data.loc[asfr_data.index]
    live_births = asfr_data * pop_data
    # Total births per location-year: collapse the age and sex dimensions.
    live_births = (live_births.reset_index().drop(
        columns=['sex_id', 'age_group_id']).groupby(['location_id', 'year_id'
                                                     ]).sum().reset_index())
    data = standardize.normalize(live_births)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_shigella_remission_rate(key: EntityKey, location: str):
    """Load the diarrheal-disease remission rate as the shigella remission rate.

    Pulls dismod remission draws for diarrheal disease, restricts to valid
    age groups and the 2016 results, and standardizes for the artifact.
    """
    location_id = extract.get_location_id(location)
    remission = extract.get_modelable_entity_draws(
        causes.diarrheal_diseases.dismod_id, location_id)
    remission = remission[remission.measure_id == vi_globals.MEASURES['Remission rate']]
    remission = utilities.filter_data_by_restrictions(
        remission, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    # Use latest GBD results for all data
    remission = remission[remission.year_id == 2016].drop(columns='year_id')
    remission = standardize.normalize(remission, fill_value=0)
    keep_cols = vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS
    remission = utilities.reshape(remission.filter(keep_cols))
    remission = utilities.scrub_gbd_conventions(remission, location)
    for interval in ('age', 'year'):
        remission = utilities.split_interval(remission, interval_column=interval,
                                             split_column_prefix=interval)
    return utilities.sort_hierarchical_data(remission)
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function exposure and prepare it for the simulation.

    Pulls raw exposure, normalizes its categorical distribution, applies
    restrictions, renormalizes categories to sum to one, validates, and
    reshapes into the hierarchical artifact format.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)
    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)
    # FIX: pass the axis by keyword. `drop(labels, 'columns')` relies on a
    # positional axis argument that pandas deprecated and later removed.
    data = data.drop(columns='modelable_entity_id')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function relative risks for the modeled diseases.

    Drops YLL-only causes, keeps only causes the project models, maps the
    morbidity/mortality flags to affected measures, normalizes, snaps TMREL
    draws to exactly 1.0, validates, and reshapes.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location)
    rr = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only = {c.gbd_id for c in causes if c.restrictions.yll_only}
    rr = rr[~rr.cause_id.isin(yll_only)]
    rr = utilities.convert_affected_entity(rr, 'cause_id')
    rr = rr[rr['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    is_morbidity = rr.morbidity == 1
    is_mortality = rr.mortality == 1
    # Any morbidity flag (with or without mortality) targets incidence;
    # mortality-only rows target excess mortality.
    rr.loc[is_morbidity, 'affected_measure'] = 'incidence_rate'
    rr.loc[~is_morbidity & is_mortality, 'affected_measure'] = 'excess_mortality_rate'
    rr = core.filter_relative_risk_to_cause_restrictions(rr)
    rr = (rr.groupby(['affected_entity', 'parameter'])
          .apply(utilities.normalize, fill_value=1)
          .reset_index(drop=True))
    keep_cols = (vi_globals.DEMOGRAPHIC_COLUMNS
                 + ['affected_entity', 'affected_measure', 'parameter']
                 + vi_globals.DRAW_COLUMNS)
    rr = rr.filter(keep_cols)
    draw_cols = vi_globals.DRAW_COLUMNS
    tmrel_cat = utility_data.get_tmrel_category(entity)
    in_tmrel = rr.parameter == tmrel_cat
    # Snap TMREL draws that are numerically ~1.0 to exactly 1.0.
    tmrel_draws = rr.loc[in_tmrel, draw_cols]
    rr.loc[in_tmrel, draw_cols] = tmrel_draws.mask(np.isclose(tmrel_draws, 1.0), 1.0)
    rr = utilities.reshape(rr, value_cols=draw_cols)
    rr = utilities.scrub_gbd_conventions(rr, location)
    sim_validation.validate_for_simulation(rr, entity, key.measure, location)
    rr = utilities.split_interval(rr, interval_column='age', split_column_prefix='age')
    rr = utilities.split_interval(rr, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(rr)
def get_data(entity, measure: str, location: Union[str, int]):
    """Dispatch a data pull for ``measure`` on ``entity`` at ``location``.

    Maps each supported measure name to its handler function plus the entity
    kinds it accepts, runs the handler, and reshapes the result with the
    appropriate value columns.

    Raises
    ------
    InvalidQueryError
        If the measure is unknown or not available for the entity's kind.
    """
    # measure name -> (handler, entity kinds the handler supports)
    measure_handlers = {
        # Cause-like measures
        "incidence_rate": (get_incidence_rate, ("cause", "sequela")),
        "raw_incidence_rate": (get_raw_incidence_rate, ("cause", "sequela")),
        "prevalence": (get_prevalence, ("cause", "sequela")),
        "birth_prevalence": (get_birth_prevalence, ("cause", "sequela")),
        "disability_weight": (get_disability_weight, ("cause", "sequela")),
        "remission_rate": (get_remission_rate, ("cause",)),
        "cause_specific_mortality_rate": (get_cause_specific_mortality_rate, ("cause",)),
        "excess_mortality_rate": (get_excess_mortality_rate, ("cause",)),
        "deaths": (get_deaths, ("cause",)),
        # Risk-like measures
        "exposure": (
            get_exposure,
            (
                "risk_factor",
                "alternative_risk_factor",
            ),
        ),
        "exposure_standard_deviation": (
            get_exposure_standard_deviation,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "exposure_distribution_weights": (
            get_exposure_distribution_weights,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "relative_risk": (get_relative_risk, ("risk_factor",)),
        "population_attributable_fraction": (
            get_population_attributable_fraction,
            ("risk_factor", "etiology"),
        ),
        # Covariate measures
        "estimate": (get_estimate, ("covariate",)),
        # Population measures
        "structure": (get_structure, ("population",)),
        "theoretical_minimum_risk_life_expectancy": (
            get_theoretical_minimum_risk_life_expectancy,
            ("population",),
        ),
        "age_bins": (get_age_bins, ("population",)),
        "demographic_dimensions": (get_demographic_dimensions, ("population",)),
    }
    if measure not in measure_handlers:
        raise InvalidQueryError(f"No functions available to pull data for measure {measure}.")
    handler, entity_types = measure_handlers[measure]
    if entity.kind not in entity_types:
        raise InvalidQueryError(f"{measure.capitalize()} not available for {entity.kind}.")
    location_id = (
        utility_data.get_location_id(location) if isinstance(location, str) else location
    )
    data = handler(entity, location_id)
    # These measures carry a single 'value' column; all others carry draws.
    if measure in [
        "structure",
        "theoretical_minimum_risk_life_expectancy",
        "estimate",
        "exposure_distribution_weights",
    ]:
        value_cols = ["value"]
    else:
        value_cols = DRAW_COLUMNS
    data = utilities.reshape(data, value_cols=value_cols)
    return data
def write_sbp_data(artifact, location):
    """Write all high-systolic-blood-pressure risk measures to the artifact.

    Writes the standard loader-provided measures, then builds PAFs and
    relative risks from raw GBD draws (remapping affected causes to the acute
    disease-state names used by the model) and appends an externally-computed
    CKD relative risk.
    """
    load = get_load(location)
    # GBD cause names -> the disease-model state names used in this project.
    affected_entity_map = {
        'ischemic_heart_disease': 'acute_myocardial_infarction',
        'ischemic_stroke': 'acute_ischemic_stroke',
        'intracerebral_hemorrhage': 'acute_intracerebral_hemorrhage',
        'subarachnoid_hemorrhage': 'acute_subarachnoid_hemorrhage',
        'chronic_kidney_disease': 'chronic_kidney_disease'
    }
    prefix = 'risk_factor.high_systolic_blood_pressure.'
    measures = [
        "restrictions", "distribution", "tmred", "exposure",
        "exposure_standard_deviation", "relative_risk_scalar",
        "exposure_distribution_weights"
    ]
    for m in measures:
        key = prefix + m
        artifact.write(key, load(key))
    sbp = risk_factors.high_systolic_blood_pressure
    # --- Population attributable fraction (YLD/incidence only) ---
    data = gbd.get_paf(sbp.gbd_id, utility_data.get_location_id(location))
    data = data[data.metric_id == globals.METRICS['Percent']]
    data = data[data.measure_id == globals.MEASURES['YLDs']]
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'
                          ]).apply(utilities.normalize,
                                   fill_value=0).reset_index(drop=True))
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    key = prefix + 'population_attributable_fraction'
    artifact.write(key, data)
    # --- Relative risk ---
    data = gbd.get_relative_risk(sbp.gbd_id, utility_data.get_location_id(location))
    data = utilities.convert_affected_entity(data, 'cause_id')
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    # NOTE(review): other loaders in this file use 'excess_mortality_rate';
    # 'excess_mortality' here may be a stale naming convention -- confirm
    # against the simulation components that consume this key.
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality'
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'
                          ]).apply(utilities.normalize,
                                   fill_value=1).reset_index(drop=True))
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    # NOTE(review): here sort happens before split_interval, unlike the PAF
    # branch above and every other loader in this file -- confirm ordering.
    data = utilities.sort_hierarchical_data(data)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    # Append the externally-computed CKD relative risk (continuous, per-unit).
    loc = location.lower().replace(' ', '_')
    ckd_rr = pd.read_hdf(
        f'/share/costeffectiveness/artifacts/vivarium_csu_hypertension_sdc/ckd_rr/{loc}.hdf'
    )
    ckd_rr = ckd_rr.reset_index()
    ckd_rr['parameter'] = 'per unit'
    ckd_rr['affected_entity'] = 'chronic_kidney_disease'
    ckd_rr['affected_measure'] = 'incidence_rate'
    ckd_rr = ckd_rr.set_index([
        'location', 'sex', 'age_start', 'year_start', 'affected_entity',
        'affected_measure', 'parameter', 'age_end', 'year_end'
    ])
    data = pd.concat([data, ckd_rr])
    key = prefix + 'relative_risk'
    artifact.write(key, data)