def load_metadata(key: str, location: str):
    key = EntityKey(key)
    entity = get_entity(key)
    entity_metadata = entity[key.measure]
    if hasattr(entity_metadata, 'to_dict'):
        entity_metadata = entity_metadata.to_dict()
    return entity_metadata


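# A usage sketch for load_metadata (the key and location are illustrative; any
# measure that exists on the mapped entity behaves the same way):
#
#     restrictions = load_metadata('cause.diarrheal_diseases.restrictions', 'Nigeria')
#     # -> a plain dict, since restrictions objects expose .to_dict()

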
def open_artifact(output_path: Path, location: str) -> Artifact:
    """Creates or opens an artifact at the output path.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        A new artifact.

    """
    if not output_path.exists():
        logger.debug(f"Creating artifact at {str(output_path)}.")
    else:
        logger.debug(f"Opening artifact at {str(output_path)} for appending.")

    artifact = Artifact(output_path, filter_terms=[get_location_term(location)])

    key = EntityKey(project_globals.METADATA_LOCATIONS)
    if str(key) not in artifact:
        artifact.write(key, [location])

    return artifact


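# Usage sketch (the path and location are illustrative). Opening the same path
# twice is safe, since the metadata locations key is only written when absent:
#
#     artifact = open_artifact(Path('/tmp/nigeria.hdf'), 'Nigeria')

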
def load_forecast_data(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    path = paths.forecast_data_path(key)
    data = extract.load_forecast_from_xarray(path, location_id)
    data = data[data.scenario == project_globals.FORECASTING_SCENARIO].drop(columns='scenario')

    if key == EntityKey('etiology.shigellosis.incidence'):
        # Only one draw for incidence; broadcast it across all draws.
        data = pd.concat(
            project_globals.NUM_DRAWS
            * [data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id']).value],
            axis=1,
        )
    else:
        data = data.set_index(
            ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
        if len(data.columns) == 100:  # Not 1000 draws for everything.
            data = pd.concat([data] * 10, axis=1)
    data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    data = data.reset_index()

    data = standardize.normalize(data)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)


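# Usage sketch (key and location are illustrative; the key must resolve to a
# forecast file on disk for this to run):
#
#     csmr = load_forecast_data(
#         EntityKey('etiology.shigellosis.cause_specific_mortality'), 'Nigeria')

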
def get_entity(key: str):
    # Map of entity types to their gbd mappings.
    type_map = {
        'cause': causes,
        'covariate': covariates,
        'risk_factor': risk_factors,
        'alternative_risk_factor': alternative_risk_factors,
    }
    key = EntityKey(key)
    return type_map[key.type][key.name]


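# Usage sketch (illustrative key): get_entity('risk_factor.child_wasting.exposure')
# looks up 'risk_factor' in the type map and returns the `child_wasting` entry
# from the gbd risk_factors mapping.

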
def load_live_births_by_year(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    asfr_key = EntityKey('covariate.age_specific_fertility_rate.estimate')
    pop_key = EntityKey(project_globals.POPULATION_STRUCTURE)

    asfr_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(asfr_key), location_id)
    asfr_data = asfr_data[
        (asfr_data.scenario == project_globals.FORECASTING_SCENARIO)
        & (asfr_data.year_id >= project_globals.MIN_YEAR)
    ].drop(columns='scenario')
    asfr_data = asfr_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    asfr_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])

    pop_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(pop_key), location_id)
    pop_data = pop_data[
        pop_data.scenario == project_globals.FORECASTING_SCENARIO
    ].drop(columns='scenario')
    pop_data = pop_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    pop_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    pop_data = pop_data.loc[asfr_data.index]

    # Live births are fertility rate times population, aggregated over age and
    # sex within each location-year.
    live_births = asfr_data * pop_data
    live_births = (live_births
                   .reset_index()
                   .drop(columns=['sex_id', 'age_group_id'])
                   .groupby(['location_id', 'year_id'])
                   .sum()
                   .reset_index())

    data = standardize.normalize(live_births)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)


def load_lbwsg_exposure(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.LBWSG.EXPOSURE:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity = utilities.get_entity(key)
    data = utilities.get_data(key, entity, location, gbd_constants.SOURCES.EXPOSURE, 'rei_id',
                              metadata.AGE_GROUP.GBD_2019_LBWSG_EXPOSURE,
                              metadata.GBD_2019_ROUND_ID, 'step4')
    data = data[data['year_id'] == 2019].drop(columns='year_id')
    data = utilities.process_exposure(data, key, entity, location, metadata.GBD_2019_ROUND_ID,
                                      metadata.AGE_GROUP.GBD_2019_LBWSG_EXPOSURE
                                      | metadata.AGE_GROUP.GBD_2020)
    data = data[data.index.get_level_values('year_start') == 2019]
    return data


def load_gbd_2020_rr(key: str, location: str) -> pd.DataFrame:
    entity_key = EntityKey(key)
    entity = utilities.get_gbd_2020_entity(entity_key)

    data = utilities.get_data(entity_key, entity, location, gbd_constants.SOURCES.RR, 'rei_id',
                              metadata.AGE_GROUP.GBD_2020, metadata.GBD_2020_ROUND_ID)
    data = utilities.process_relative_risk(data, entity_key, entity, location,
                                           metadata.GBD_2020_ROUND_ID, metadata.AGE_GROUP.GBD_2020)

    if key == data_keys.STUNTING.RELATIVE_RISK:
        # Remove neonatal relative risks.
        neonatal_age_ends = data.index.get_level_values('age_end').unique()[:2]
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)] = 1.0
    elif key == data_keys.WASTING.RELATIVE_RISK:
        # Remove relative risks for simulants under 6 months.
        data.loc[data.index.get_level_values('age_end') <= data_values.WASTING.START_AGE] = 1.0

        # Set the risk to affect diarrheal excess mortality rather than incidence.
        diarrhea_rr = data.query(f"affected_entity == '{data_keys.DIARRHEA.name}'")
        data = pd.concat([
            diarrhea_rr.rename(
                index={'incidence_rate': 'excess_mortality_rate'}, level='affected_measure'
            ),
            data.drop(diarrhea_rr.index)
        ]).sort_index()
    elif key == data_keys.DISCONTINUED_BREASTFEEDING.RELATIVE_RISK:
        # Remove relative risks outside of [6 months, 2 years).
        discontinued_tmrel_index = data.query(
            f'age_start < {data_values.DISCONTINUED_BREASTFEEDING_START_AGE}'
            f' or age_end > {data_values.DISCONTINUED_BREASTFEEDING_END_AGE}'
        ).index
        discontinued_tmrel_rr = pd.DataFrame(
            1.0, columns=metadata.ARTIFACT_COLUMNS, index=discontinued_tmrel_index
        )
        data.update(discontinued_tmrel_rr)
    elif key == data_keys.NON_EXCLUSIVE_BREASTFEEDING.RELATIVE_RISK:
        # Remove relative risks for the [6 months, 1 year) age group.
        non_exclusive_tmrel_index = data.query(
            f'age_start == {data_values.NON_EXCLUSIVE_BREASTFEEDING_END_AGE}'
        ).index
        non_exclusive_tmrel_rr = pd.DataFrame(
            1.0, columns=metadata.ARTIFACT_COLUMNS, index=non_exclusive_tmrel_index
        )
        data.update(non_exclusive_tmrel_rr)
    return data


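# Note on the override mechanics above: DataFrame.update aligns on the index
# and overwrites intersecting cells with non-NA values from the other frame,
# so building a frame of 1.0s on the restricted index and calling update
# rewrites exactly those rows while leaving the rest of the RR data untouched.

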
def load_sids_csmr(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.AFFECTED_UNMODELED_CAUSES.SIDS_CSMR:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity: Cause = utilities.get_entity(key)

    # Get around the validation rejecting YLL-only causes.
    entity.restrictions.yll_only = False
    entity.restrictions.yld_age_group_id_start = min(metadata.AGE_GROUP.GBD_2019_SIDS)
    entity.restrictions.yld_age_group_id_end = max(metadata.AGE_GROUP.GBD_2019_SIDS)

    return interface.get_measure(entity, key.measure, location).droplevel('location')


def load_gbd_2020_exposure(key: str, location: str) -> pd.DataFrame:
    entity_key = EntityKey(key)
    entity = utilities.get_gbd_2020_entity(entity_key)

    data = utilities.get_data(entity_key, entity, location, gbd_constants.SOURCES.EXPOSURE,
                              'rei_id', metadata.AGE_GROUP.GBD_2020, metadata.GBD_2020_ROUND_ID)
    data = utilities.process_exposure(data, entity_key, entity, location,
                                      metadata.GBD_2020_ROUND_ID, metadata.AGE_GROUP.GBD_2020)

    if entity_key == data_keys.STUNTING.EXPOSURE:
        # Remove neonatal exposure: zero out all categories in the two neonatal
        # age groups, then put the full exposure mass in the unexposed
        # category (cat4).
        neonatal_age_ends = data.index.get_level_values('age_end').unique()[:2]
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)] = 0.0
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)
                 & (data.index.get_level_values('parameter') == data_keys.STUNTING.CAT4)] = 1.0
    return data


def load_and_write_demographic_data(artifact: Artifact, location: str):
    keys = [
        EntityKey(project_globals.POPULATION_STRUCTURE),
        EntityKey(project_globals.POPULATION_AGE_BINS),
        EntityKey(project_globals.POPULATION_DEMOGRAPHY),
        EntityKey(project_globals.POPULATION_TMRLE),  # Theoretical minimum risk life expectancy
        EntityKey(project_globals.POPULATION_LSLE),   # Location-specific life expectancy
        EntityKey(project_globals.ALL_CAUSE_CSMR),
        EntityKey(project_globals.COVARIATE_LIVE_BIRTHS),
    ]
    for key in keys:
        load_and_write_data(artifact, key, location)


def load_lbwsg_rr(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.LBWSG.RELATIVE_RISK:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity = utilities.get_entity(key)
    data = utilities.get_data(key, entity, location, gbd_constants.SOURCES.RR, 'rei_id',
                              metadata.AGE_GROUP.GBD_2019_LBWSG_RELATIVE_RISK,
                              metadata.GBD_2019_ROUND_ID, 'step4')
    data = data[data['year_id'] == 2019].drop(columns='year_id')
    data = utilities.process_relative_risk(data, key, entity, location,
                                           metadata.GBD_2019_ROUND_ID,
                                           metadata.AGE_GROUP.GBD_2020, whitelist_sids=True)
    data = (
        data.query('year_start == 2019')
        .droplevel(['affected_entity', 'affected_measure'])
    )
    data = data[~data.index.duplicated()]
    return data


def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id,
                                validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = set(c.gbd_id for c in causes if c.restrictions.yll_only)
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter the PAF age groups by cause-level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)


def write_data_by_draw(artifact: Artifact, key: str, data: pd.DataFrame):
    """Writes data to the artifact on a per-draw basis.

    This is useful for large datasets like Low Birthweight Short Gestation
    (LBWSG).

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    data
        The data to write.

    """
    with pd.HDFStore(artifact.path, complevel=9, mode='a') as store:
        key = EntityKey(key)
        artifact._keys.append(key)
        store.put(f'{key.path}/index', data.index.to_frame(index=False))
        data = data.reset_index(drop=True)
        for c in data.columns:
            store.put(f'{key.path}/{c}', data[c])


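# Design note: splitting the index and each draw column into separate HDF nodes
# lets downstream readers pull a single draw without loading the whole table,
# which matters for very wide datasets like LBWSG. A usage sketch (the key is
# illustrative and assumes `lbwsg_data` is a wide, draw-per-column frame):
#
#     write_data_by_draw(artifact, 'risk_factor.lbwsg.exposure', lbwsg_data)

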
def load_and_write_cause_data(artifact: Artifact, location: str):
    key = EntityKey(project_globals.SHIGELLA_CSMR)
    csmr = load_and_write_data(artifact, key, location)
    key = EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT)
    load_and_write_data(artifact, key, location)
    key = EntityKey(project_globals.SHIGELLA_INCIDENCE_RATE)
    incidence = load_and_write_data(artifact, key, location)
    key = EntityKey(project_globals.SHIGELLA_REMISSION_RATE)
    remission = load_and_write_data(artifact, key, location)

    key = EntityKey(project_globals.SHIGELLA_PREVALENCE)
    prevalence = write_data(artifact, key, incidence / remission)
    key = EntityKey(project_globals.SHIGELLA_EMR)
    write_data(artifact, key, (csmr / prevalence).fillna(0))
    key = EntityKey(project_globals.SHIGELLA_RESTRICTIONS)
    write_data(artifact, key, causes.diarrheal_diseases.restrictions.to_dict())


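# The derived measures above appear to follow the usual steady-state identities
# for a short-duration condition: prevalence ~= incidence * duration, with
# duration taken as 1 / remission, so prevalence = incidence / remission; and
# excess mortality is cause-specific mortality divided by prevalence. A toy
# check with made-up rates (per person-year):
#
#     incidence, remission, csmr = 2.0, 40.0, 0.01
#     prevalence = incidence / remission    # 0.05
#     emr = csmr / prevalence               # 0.2

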
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location

    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)
    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as the exposure of the TMREL category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # Normalize so that within each demographic cell the categories sum to 1.
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)


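# A minimal, self-contained sketch of the renormalization step above (toy
# column names; the real code groups over all demographic columns at once):
#
#     df = pd.DataFrame({'age': [1, 1], 'parameter': ['cat1', 'cat2'],
#                        'draw_0': [0.2, 0.9]})
#     sums = df.groupby('age')['draw_0'].transform('sum')  # 1.1 for both rows
#     df['draw_0'] /= sums                                 # categories now sum to 1

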
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = set(c.gbd_id for c in causes if c.restrictions.yll_only)
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

    # Snap TMREL-category draws that are numerically close to 1 to exactly 1.
    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(
            np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)


def load_and_write_vaccine_data(artifact: Artifact, location: str):
    key = EntityKey(project_globals.COVARIATE_DTP3)
    logger.debug(f'Loading data for {key} for location {location}.')
    dtp3_coverage = loader.get_data(key, location)
    key = EntityKey(project_globals.COVARIATE_MEASLES1)
    logger.debug(f'Loading data for {key} for location {location}.')
    measles1_coverage = loader.get_data(key, location)
    key = EntityKey(project_globals.COVARIATE_MEASLES2)
    logger.debug(f'Loading data for {key} for location {location}.')
    measles2_coverage = loader.get_data(key, location)

    key = EntityKey(project_globals.COVARIATE_SHIGELLA_6MO)
    write_data(artifact, key, 0.5 * (dtp3_coverage + measles1_coverage))
    key = EntityKey(project_globals.COVARIATE_SHIGELLA_9MO)
    write_data(artifact, key, measles1_coverage)
    key = EntityKey(project_globals.COVARIATE_SHIGELLA_12MO)
    write_data(artifact, key, 0.5 * (measles1_coverage + measles2_coverage))
    key = EntityKey(project_globals.COVARIATE_SHIGELLA_15MO)
    write_data(artifact, key, measles2_coverage)


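# The schedule-specific coverages above are built as simple proxies from
# existing immunization covariates: the 9- and 15-month touchpoints reuse
# measles dose 1 and dose 2 coverage directly, while the 6- and 12-month
# touchpoints take the midpoint of the two nearest coverages, e.g.
#
#     coverage_6mo = 0.5 * (dtp3 + measles1)   # 0.5 * (0.8 + 0.7) = 0.75

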
def filter_relative_risk_to_cause_restrictions(data: pd.DataFrame) -> pd.DataFrame:
    """Filters relative risk data by the age restrictions of the affected
    causes and measures.

    If the affected measure is incidence_rate, the cause's YLD age
    restrictions are applied; if it is excess_mortality_rate, the YLL age
    restrictions are applied.

    """
    temp = []
    affected_entities = set(data.affected_entity)
    affected_measures = set(data.affected_measure)
    for cause, measure in product(affected_entities, affected_measures):
        df = data[(data.affected_entity == cause) & (data.affected_measure == measure)]
        cause = get_gbd_2020_entity(EntityKey(f'cause.{cause}.{measure}'))
        if measure == 'excess_mortality_rate':
            start, end = vi_utils.get_age_group_ids_by_restriction(cause, 'yll')
        else:  # incidence_rate
            start, end = vi_utils.get_age_group_ids_by_restriction(cause, 'yld')
        temp.append(df[df.age_group_id.isin(range(start, end + 1))])
    return pd.concat(temp)


def load_standard_data(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    return interface.get_measure(entity, key.measure, location).droplevel('location')


def get_data(lookup_key: EntityKey, location: str):
    # Map each artifact key to its loader and the key the loader should be
    # called with (which differs for the forecast-backed measures).
    mapping = {
        EntityKey(project_globals.POPULATION_STRUCTURE): (
            load_forecast_data, EntityKey(project_globals.POPULATION_STRUCTURE)),
        EntityKey(project_globals.POPULATION_AGE_BINS): (
            load_age_bins, EntityKey(project_globals.POPULATION_AGE_BINS)),
        EntityKey(project_globals.POPULATION_DEMOGRAPHY): (
            load_demographic_dimensions, EntityKey(project_globals.POPULATION_DEMOGRAPHY)),
        EntityKey(project_globals.POPULATION_TMRLE): (
            load_theoretical_minimum_risk_life_expectancy,
            EntityKey(project_globals.POPULATION_TMRLE)),
        EntityKey(project_globals.POPULATION_LSLE): (
            load_location_specific_life_expectancy,
            EntityKey(project_globals.POPULATION_LSLE)),
        EntityKey(project_globals.ALL_CAUSE_CSMR): (
            load_forecast_data, EntityKey('cause.all_causes.cause_specific_mortality')),
        EntityKey(project_globals.COVARIATE_LIVE_BIRTHS): (
            load_live_births_by_year, EntityKey(project_globals.COVARIATE_LIVE_BIRTHS)),
        EntityKey(project_globals.SHIGELLA_CSMR): (
            load_forecast_data, EntityKey('etiology.shigellosis.cause_specific_mortality')),
        EntityKey(project_globals.SHIGELLA_INCIDENCE_RATE): (
            load_forecast_data, EntityKey('etiology.shigellosis.incidence')),
        EntityKey(project_globals.SHIGELLA_REMISSION_RATE): (
            load_shigella_remission_rate, EntityKey(project_globals.SHIGELLA_REMISSION_RATE)),
        EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT): (
            load_shigella_disability_weight,
            EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT)),
        EntityKey(project_globals.COVARIATE_DTP3): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_DTP3)),
        EntityKey(project_globals.COVARIATE_MEASLES1): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_MEASLES1)),
        EntityKey(project_globals.COVARIATE_MEASLES2): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_MEASLES2)),
    }
    loader, access_key = mapping[lookup_key]
    return loader(access_key, location)


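# Usage sketch: the lookup key selects both the loader and the key actually
# passed to it, so callers only ever deal in artifact keys (the location is
# illustrative):
#
#     csmr = get_data(EntityKey(project_globals.SHIGELLA_CSMR), 'Nigeria')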