def get_data(key: EntityKey, entity: ModelableEntity, location: str, source: str,
             gbd_id_type: str, age_group_ids: Set[int], gbd_round_id: int,
             decomp_step: str = 'iterative') -> pd.DataFrame:
    """Pull draw-level data for ``entity`` from GBD via ``get_draws``.

    Collapses the usual call chain (interface.get_measure ->
    vivarium_inputs.core.get_data -> vivarium_inputs.extract.extract_data ->
    vivarium_gbd_access.gbd.get_{measure}) into a single direct pull.
    """
    # get_draws expects a list of age-group ids, not a set.
    age_ids = list(age_group_ids)
    # Location names are resolved to GBD location ids; ids pass through untouched.
    if isinstance(location, str):
        location_id = utility_data.get_location_id(location)
    else:
        location_id = location
    # Same metadata check that extract.extract_data performs.
    check_metadata(entity, key.measure)
    return get_draws(
        gbd_id_type=gbd_id_type,
        gbd_id=entity.gbd_id,
        source=source,
        location_id=location_id,
        sex_id=gbd_constants.SEX.MALE + gbd_constants.SEX.FEMALE,
        age_group_id=age_ids,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        status='best',
    )
def write_utilization_rate(artifact, location):
    """Build outpatient-visit utilization-rate draws from the bundled CSV and
    write them to the artifact.

    Draws are sampled lognormally: mean and 95% UI bounds are moved to log
    space, 1000 normal samples are drawn per row, and results exponentiated.
    """
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    # Fixed: was an f-string with no placeholders (F541); plain literal is equivalent.
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    # NOTE(review): this divides the FULL 95% UI width by 1.96 rather than
    # 2 * 1.96, which doubles the implied SD — confirm this is intentional
    # (the sibling loader load_healthcare_utilization does the same).
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
                     axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)
def test_core_causelike(entity, measure, location):
    """Run the success or failure tester appropriate to the entity/measure pair."""
    entity_name, entity_expected_measure_ids = entity
    measure_name, measure_id = measure
    # Success is expected only when the measure id intersects the entity's
    # expected measure ids.
    if entity_expected_measure_ids & measure_id:
        tester = success_expected
    else:
        tester = fail_expected
    df = tester(entity_name, measure_name, utility_data.get_location_id(location))
def write_ckd_data(artifact, location):
    """Load, derive, and write all chronic-kidney-disease measures to the artifact.

    Writes restrictions, CSMR, prevalence, a derived disability weight
    (YLDs / prevalence), a derived excess mortality rate (CSMR / prevalence),
    and a capped incidence rate.
    """
    load = get_load(location)

    # Metadata.  Fixed: keys were f-strings with no placeholders (F541).
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(ylds, causes.chronic_kidney_disease,
                                                 'yld', utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds, interval_column='age', split_column_prefix='age')
    ylds = split_interval(ylds, interval_column='year', split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    # Average disability weight = YLDs / prevalence; guard 0/0 (NaN) and x/0 (inf).
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)
def _load_em_from_meid(location, meid, measure):
    """Pull draws for a modelable entity id, filter to one measure, and
    standardize the result into the simulation's hierarchical format."""
    loc_id = utility_data.get_location_id(location)
    raw = gbd.get_modelable_entity_draws(meid, loc_id)
    raw = raw[raw.measure_id == vi_globals.MEASURES[measure]]
    shaped = vi_utils.normalize(raw, fill_value=0)
    shaped = shaped.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    shaped = vi_utils.reshape(shaped)
    shaped = vi_utils.scrub_gbd_conventions(shaped, location)
    # Expand the age and year interval columns into start/end pairs.
    for col in ('age', 'year'):
        shaped = vi_utils.split_interval(shaped, interval_column=col,
                                         split_column_prefix=col)
    return vi_utils.sort_hierarchical_data(shaped)
def get_raw_data(entity: ModelableEntity, measure: str, location: str) -> Union[pd.Series, pd.DataFrame]:
    """Pull raw data from GBD for the requested entity, measure, and location.

    Standard raw validation checks are skipped (``validate=False``) so the
    returned data can be inspected for oddities; the only filtering applied
    upstream is by applicable measure id, metric id, or to most detailed
    causes where relevant.

    Available measures by entity kind:
        sequela: incidence_rate, prevalence, birth_prevalence, disability_weight
        cause: incidence_rate, prevalence, birth_prevalence, disability_weight,
            remission_rate, deaths
        risk_factor: exposure, exposure_standard_deviation,
            exposure_distribution_weights, relative_risk,
            population_attributable_fraction, mediation_factors
        etiology: population_attributable_fraction
        alternative_risk_factor: exposure, exposure_standard_deviation,
            exposure_distribution_weights
        covariate: estimate
        population: structure, theoretical_minimum_risk_life_expectancy

    Parameters
    ----------
    entity
        Entity for which to extract data.
    measure
        Measure for which to extract data.
    location
        Location for which to extract data.

    Returns
    -------
    Union[pandas.Series, pandas.DataFrame]
        Data for the entity-measure pair and specific location requested,
        with no formatting or reshaping.
    """
    loc_id = utility_data.get_location_id(location)
    return extract.extract_data(entity, measure, loc_id, validate=False)
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function population-attributable-fraction data.

    Drops YLL-only causes, keeps only PAFs with relative-risk support,
    applies cause-level age restrictions per measure, maps measure ids to
    affected measures, and standardizes the result for simulation use.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)
    data = extract.extract_data(entity, 'population_attributable_fraction',
                                location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk',
                                         location_id, validate=False)

    # Fixed idiom: set comprehension instead of set([...]) (C403).
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    # Filter PAF age groups by cause-level restrictions.  Fixed: the loop
    # variable was named `measure` (a numeric measure id), shadowing the
    # string measure-name convention used elsewhere in this module.
    filtered = []
    for (c_id, measure_id), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        restriction_type = 'yll' if measure_id == vi_globals.MEASURES['YLLs'] else 'yld'
        filtered.append(utilities.filter_data_by_restrictions(
            df, cause, restriction_type, utility_data.get_age_group_ids()))
    data = pd.concat(filtered, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    # YLL PAFs affect excess mortality; YLD PAFs affect incidence.
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lri_birth_prevalence_from_meid(_, location):
    """Pull LRI birth-prevalence draws from its modelable entity.

    The first argument is ignored; it exists only so this function fits
    the get_data handler signature.
    """
    location_id = utility_data.get_location_id(location)
    prevalence_id = vi_globals.MEASURES['Prevalence']
    data = get_draws('modelable_entity_id',
                     project_globals.LRI_BIRTH_PREVALENCE_MEID,
                     source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                     age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                     measure_id=prevalence_id,
                     gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                     location_id=location_id)
    data = data[data.measure_id == prevalence_id]
    data = utilities.normalize(data, fill_value=0)
    # Birth prevalence has no age dimension, so age_group_id is dropped
    # from the demographic index columns.
    idx_columns = [c for c in vi_globals.DEMOGRAPHIC_COLUMNS if c != 'age_group_id']
    data = data.filter(idx_columns + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    """Build outpatient utilization-rate draws from the prepared CSV.

    Draws are sampled lognormally from the per-capita mean and 95% UI
    bounds, then standardized into the simulation's hierarchical format.
    """
    column_types = {
        'location_id': np.int64,
        'sex_id': np.int64,
        'age_group_id': np.int64,
        'year_id': np.int64,
        'outpatient_visits_per_cap_mean': np.float64,
        'outpatient_visits_per_cap_95_upper': np.float64,
        'outpatient_visits_per_cap_95_lower': np.float64,
    }
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION, dtype=column_types)
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    # Move to log space and sample 1000 draws per row.
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    samples = np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                               size=(1000, len(data)))
    draws = pd.DataFrame(data=np.exp(samples).T, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
                     axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    for col in ('age', 'year'):
        data = utilities.split_interval(data, interval_column=col,
                                        split_column_prefix=col)
    return utilities.sort_hierarchical_data(data)
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function categorical exposure and renormalize it
    so the category exposures sum to 1 within each demographic group.

    Parameters
    ----------
    key
        Entity key string for the exposure measure.
    location
        Location name (or id — ints pass through the location lookup).

    Returns
    -------
    pandas.DataFrame
        Validated, hierarchically sorted exposure data with interval
        columns split into start/end pairs.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    # Resolve a location name to its GBD id; non-strings pass through.
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)
    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)
    data = data.drop('modelable_entity_id', 'columns')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())
    # Split exposure into TMREL vs non-TMREL categories and normalize them
    # with different fill values.
    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1: divide each category's draws by
    # the per-demographic-group sum over all categories.
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function relative-risk data for the project's
    disease models, with TMREL values snapped to exactly 1.0.

    Parameters
    ----------
    key
        Entity key string for the relative-risk measure.
    location
        Location name to pull data for.

    Returns
    -------
    pandas.DataFrame
        Validated, hierarchically sorted relative-risk data.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)
    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    # Exclude causes that only contribute YLLs.
    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]
    data = utilities.convert_affected_entity(data, 'cause_id')
    # Keep only causes the simulation actually models.
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    # Map the morbidity/mortality flags to the affected measure.  Any row
    # flagged morbid (with or without mortality) affects incidence; rows
    # flagged mortality-only affect excess mortality.
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)
    # For the TMREL category, replace draw values that are numerically close
    # to 1.0 with exactly 1.0 so the reference category carries no effect.
    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def test_extract_population(measures):
    """Smoke-test extraction of population measures for India."""
    pop = ModelableEntity("ignored", "population", None)
    india_id = utility_data.get_location_id("India")
    df = extract.extract_data(pop, measures, india_id, validate=VALIDATE_FLAG)
def test_extract_covariatelike(entity, measure, location):
    """Smoke-test extraction for covariate-like entities."""
    loc_id = utility_data.get_location_id(location)
    df = extract.extract_data(entity, measure, loc_id, validate=VALIDATE_FLAG)
def test_get_measure_covariatelike(entity, measure, location):
    """Smoke-test get_measure for covariate-like entities."""
    loc_id = utility_data.get_location_id(location)
    df = get_measure(entity, measure, loc_id)
def test_core_healthsystem(entity, measure, location):
    """Smoke-test core.get_data for health-system entities."""
    loc_id = utility_data.get_location_id(location)
    df = core.get_data(entity, measure, loc_id)
def write_sbp_data(artifact, location):
    """Write all high-systolic-blood-pressure risk measures to the artifact:
    pass-through metadata/exposure measures, a derived PAF, and relative
    risks (GBD draws plus a project-specific CKD relative risk from disk).
    """
    load = get_load(location)
    # Map GBD cause names to the disease-state names the model uses.
    affected_entity_map = {
        'ischemic_heart_disease': 'acute_myocardial_infarction',
        'ischemic_stroke': 'acute_ischemic_stroke',
        'intracerebral_hemorrhage': 'acute_intracerebral_hemorrhage',
        'subarachnoid_hemorrhage': 'acute_subarachnoid_hemorrhage',
        'chronic_kidney_disease': 'chronic_kidney_disease'
    }
    prefix = 'risk_factor.high_systolic_blood_pressure.'
    # Measures written through unchanged from the standard loader.
    measures = [
        "restrictions", "distribution", "tmred", "exposure",
        "exposure_standard_deviation", "relative_risk_scalar",
        "exposure_distribution_weights"
    ]
    for m in measures:
        key = prefix + m
        artifact.write(key, load(key))

    sbp = risk_factors.high_systolic_blood_pressure

    # --- Population attributable fraction (percent, YLDs only) ---
    data = gbd.get_paf(sbp.gbd_id, utility_data.get_location_id(location))
    data = data[data.metric_id == globals.METRICS['Percent']]
    data = data[data.measure_id == globals.MEASURES['YLDs']]
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'
                          ]).apply(utilities.normalize,
                                   fill_value=0).reset_index(drop=True))
    # Keep only modeled causes and rename them to model state names.
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure'] +
                       globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    key = prefix + 'population_attributable_fraction'
    artifact.write(key, data)

    # --- Relative risk ---
    data = gbd.get_relative_risk(sbp.gbd_id, utility_data.get_location_id(location))
    data = utilities.convert_affected_entity(data, 'cause_id')
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    # Morbid rows (with or without mortality) affect incidence; mortality-only
    # rows affect excess mortality.
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    # NOTE(review): 'excess_mortality' here differs from the
    # 'excess_mortality_rate' label used elsewhere in this file — confirm
    # which name downstream consumers expect.
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality'
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure', 'parameter'] +
                       globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'
                          ]).apply(utilities.normalize,
                                   fill_value=1).reset_index(drop=True))
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    # NOTE(review): unlike the PAF branch above, sorting happens BEFORE the
    # interval splits here — confirm the ordering difference is intentional.
    data = utilities.sort_hierarchical_data(data)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')

    # Append the project-specific continuous CKD relative risk stored on disk.
    loc = location.lower().replace(' ', '_')
    ckd_rr = pd.read_hdf(
        f'/share/costeffectiveness/artifacts/vivarium_csu_hypertension_sdc/ckd_rr/{loc}.hdf'
    )
    ckd_rr = ckd_rr.reset_index()
    ckd_rr['parameter'] = 'per unit'
    ckd_rr['affected_entity'] = 'chronic_kidney_disease'
    ckd_rr['affected_measure'] = 'incidence_rate'
    ckd_rr = ckd_rr.set_index([
        'location', 'sex', 'age_start', 'year_start', 'affected_entity',
        'affected_measure', 'parameter', 'age_end', 'year_end'
    ])
    data = pd.concat([data, ckd_rr])
    key = prefix + 'relative_risk'
    artifact.write(key, data)
def get_data(entity, measure: str, location: Union[str, int]):
    """Dispatch a measure request to its handler and reshape the result.

    Validates that the measure is known and applicable to the entity's kind,
    resolves the location, pulls the data via the measure-specific handler,
    and reshapes it with the appropriate value columns.

    Raises
    ------
    InvalidQueryError
        If the measure is unknown or not available for the entity's kind.
    """
    # measure name -> (handler, entity kinds it applies to)
    dispatch = {
        # Cause-like measures
        "incidence_rate": (get_incidence_rate, ("cause", "sequela")),
        "raw_incidence_rate": (get_raw_incidence_rate, ("cause", "sequela")),
        "prevalence": (get_prevalence, ("cause", "sequela")),
        "birth_prevalence": (get_birth_prevalence, ("cause", "sequela")),
        "disability_weight": (get_disability_weight, ("cause", "sequela")),
        "remission_rate": (get_remission_rate, ("cause",)),
        "cause_specific_mortality_rate": (get_cause_specific_mortality_rate, ("cause",)),
        "excess_mortality_rate": (get_excess_mortality_rate, ("cause",)),
        "deaths": (get_deaths, ("cause",)),
        # Risk-like measures
        "exposure": (get_exposure, ("risk_factor", "alternative_risk_factor")),
        "exposure_standard_deviation": (
            get_exposure_standard_deviation,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "exposure_distribution_weights": (
            get_exposure_distribution_weights,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "relative_risk": (get_relative_risk, ("risk_factor",)),
        "population_attributable_fraction": (
            get_population_attributable_fraction,
            ("risk_factor", "etiology"),
        ),
        # Covariate measures
        "estimate": (get_estimate, ("covariate",)),
        # Population measures
        "structure": (get_structure, ("population",)),
        "theoretical_minimum_risk_life_expectancy": (
            get_theoretical_minimum_risk_life_expectancy,
            ("population",),
        ),
        "age_bins": (get_age_bins, ("population",)),
        "demographic_dimensions": (get_demographic_dimensions, ("population",)),
    }

    if measure not in dispatch:
        raise InvalidQueryError(f"No functions available to pull data for measure {measure}.")
    handler, valid_kinds = dispatch[measure]
    if entity.kind not in valid_kinds:
        raise InvalidQueryError(f"{measure.capitalize()} not available for {entity.kind}.")

    # Location names resolve to GBD ids; ids pass through.
    if isinstance(location, str):
        location_id = utility_data.get_location_id(location)
    else:
        location_id = location

    data = handler(entity, location_id)
    # These measures carry a single value column; everything else is draw-level.
    single_value_measures = {
        "structure",
        "theoretical_minimum_risk_life_expectancy",
        "estimate",
        "exposure_distribution_weights",
    }
    value_cols = ["value"] if measure in single_value_measures else DRAW_COLUMNS
    return utilities.reshape(data, value_cols=value_cols)
def test_core_population(measures):
    """Smoke-test core.get_data for population measures in India."""
    pop = ModelableEntity("ignored", "population", None)
    india_id = utility_data.get_location_id("India")
    df = core.get_data(pop, measures, india_id)