def get_population_structure(location: str) -> pd.DataFrame:
    """Pull GBD population data for the given location and standardize to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with meaningful values or ranges and expanding over all
    demographic dimensions.

    Parameters
    ----------
    location
        Location for which to pull population data.

    Returns
    -------
    pandas.DataFrame
        Dataframe of population data for `location`, standardized to the
        format expected by `vivarium` simulations.
    """
    pop = Population()
    data = core.get_data(pop, "structure", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "structure", location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
def get_theoretical_minimum_risk_life_expectancy() -> pd.DataFrame:
    """Pull GBD theoretical minimum risk life expectancy data and standardize
    to the expected simulation input format, including binning age parameters
    as expected by simulations.

    Returns
    -------
    pandas.DataFrame
        Dataframe of theoretical minimum risk life expectancy data,
        standardized to the format expected by `vivarium` simulations with
        binned age parameters.
    """
    pop = Population()
    data = core.get_data(pop, "theoretical_minimum_risk_life_expectancy", "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(
        data, pop, "theoretical_minimum_risk_life_expectancy", "Global"
    )
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
def get_demographic_dimensions(location: str) -> pd.DataFrame:
    """Pull the full demographic dimensions for GBD data, standardized to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with meaningful values or ranges.

    Parameters
    ----------
    location
        Location for which to pull demographic dimension data.

    Returns
    -------
    pandas.DataFrame
        Dataframe with age and year bins from GBD, sexes, and the given
        location.
    """
    pop = Population()
    data = core.get_data(pop, "demographic_dimensions", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "demographic_dimensions", location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
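# A minimal usage sketch for the demographic interface functions above. It is
# illustrative only: "India" is an arbitrary GBD location chosen for the
# example, and this helper is not called anywhere in the module.
def _example_demographic_pull() -> None:
    pop_structure = get_population_structure("India")
    life_expectancy = get_theoretical_minimum_risk_life_expectancy()
    dimensions = get_demographic_dimensions("India")
    # Each result is a hierarchical DataFrame with GBD IDs scrubbed to
    # meaningful values and age/year intervals split into *_start/*_end columns.
    print(pop_structure.head(), life_expectancy.head(), dimensions.head())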
def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    # Disability weight is computed as YLDs / prevalence rather than loaded directly.
    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld', utility_data.get_age_group_ids()
    )
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds, interval_column='age', split_column_prefix='age')
    ylds = split_interval(ylds, interval_column='year', split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    # Excess mortality rate is computed as CSMR / prevalence.
    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)
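# A sketch of how write_ckd_data might be driven when building an artifact.
# The Artifact class and the output path are assumptions about the calling
# code (vivarium's Artifact is the usual target); adjust to the project's
# artifact builder.
def _example_build_ckd_artifact() -> None:
    from vivarium.framework.artifact import Artifact  # assumed dependency
    artifact = Artifact("ckd_artifact.hdf")
    write_ckd_data(artifact, "Russian Federation")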
def get_measure(entity: ModelableEntity, measure: str, location: str) -> pd.DataFrame:
    """Pull GBD data for measure and entity and prep for simulation input,
    including scrubbing all GBD conventions to replace IDs with meaningful
    values or ranges and expanding over all demographic dimensions.

    To pull data using this function, please have at least 50GB of memory
    available.

    Available measures:

        For entity kind 'sequela':
            incidence_rate, prevalence, birth_prevalence, disability_weight
        For entity kind 'cause':
            incidence_rate, prevalence, birth_prevalence, disability_weight,
            remission_rate, cause_specific_mortality_rate, excess_mortality_rate
        For entity kind 'risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights,
            relative_risk, population_attributable_fraction, mediation_factors
        For entity kind 'etiology':
            population_attributable_fraction
        For entity kind 'alternative_risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights
        For entity kind 'covariate':
            estimate

    Parameters
    ----------
    entity
        Entity for which to pull `measure`.
    measure
        Measure for which to pull data, should be a measure available for the
        kind of entity which `entity` is.
    location
        Location for which to pull data.

    Returns
    -------
    pandas.DataFrame
        Dataframe standardized to the format expected by `vivarium` simulations.
    """
    data = core.get_data(entity, measure, location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, entity, measure, location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
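# A minimal sketch of pulling a single measure through get_measure. The entity
# comes from gbd_mapping (diarrheal_diseases is chosen only as an example);
# the import reflects how callers typically obtain entities and is an
# assumption, not a requirement of this module.
def _example_get_measure() -> None:
    from gbd_mapping import causes  # assumed available to callers
    data = get_measure(causes.diarrheal_diseases, "incidence_rate", "India")
    print(data.head())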
def get_age_bins() -> pd.DataFrame:
    """Pull GBD age bin data and standardize to the expected simulation input
    format.

    Returns
    -------
    pandas.DataFrame
        Dataframe of age bin data, with bin start and end values as well as
        bin names.
    """
    pop = Population()
    data = core.get_data(pop, "age_bins", "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(data, pop, "age_bins", "Global")
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
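# A tiny sketch: get_age_bins takes no arguments and is handy for mapping
# simulant ages onto GBD age groups. The post-split column names
# (age_start/age_end) mentioned above are the expected output format, noted
# here as an assumption rather than verified output.
def _example_age_bins() -> None:
    bins = get_age_bins()
    print(bins.head())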
def fail_expected(entity_name, measure_name, location):
    with pytest.raises(Exception):
        df = core.get_data(entity_name, measure_name, location)


def test_core_healthsystem(entity, measure, location):
    df = core.get_data(entity, measure, utility_data.get_location_id(location))


def test_core_population(measures):
    pop = ModelableEntity("ignored", "population", None)
    df = core.get_data(pop, measures, utility_data.get_location_id("India"))


def test_core_covariatelike(entity, measure, location):
    df = core.get_data(entity, measure, utility_data.get_location_id(location))


def success_expected(entity_name, measure_name, location):
    df = core.get_data(entity_name, measure_name, location)
    return df
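# A sketch of how the helpers above are typically driven from parametrized
# tests. The entity/measure pair below is an illustrative assumption, not the
# project's actual test matrix, so the example is left as a comment.
#
#     @pytest.mark.parametrize("measure", ["incidence_rate", "prevalence"])
#     def test_cause_measures(measure):
#         success_expected(causes.diarrheal_diseases, measure,
#                          utility_data.get_location_id("India"))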