예제 #1
0
def get_population_structure(location: str) -> pd.DataFrame:
    """Pull GBD population data for the given location and standardize to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with meaningful values or ranges and expanding over all
    demographic dimensions.

    Parameters
    ----------
    location
        Location for which to pull population data.

    Returns
    -------
    pandas.DataFrame
        Dataframe of population data for `location`, standardized to the format
        expected by `vivarium` simulations.

    """
    pop = Population()
    data = core.get_data(pop, "structure", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "structure", location)
    data = utilities.split_interval(data,
                                    interval_column="age",
                                    split_column_prefix="age")
    data = utilities.split_interval(data,
                                    interval_column="year",
                                    split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
예제 #2
0
def get_theoretical_minimum_risk_life_expectancy() -> pd.DataFrame:
    """Pull GBD theoretical minimum risk life expectancy data and standardize
    to the expected simulation input format, including binning age parameters
    as expected by simulations.

    Returns
    -------
    pandas.DataFrame
        Dataframe of theoretical minimum risk life expectancy data, standardized
        to the format expected by `vivarium` simulations with binned age parameters.

    """
    pop = Population()
    data = core.get_data(pop, "theoretical_minimum_risk_life_expectancy",
                         "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(
        data, pop, "theoretical_minimum_risk_life_expectancy", "Global")
    data = utilities.split_interval(data,
                                    interval_column="age",
                                    split_column_prefix="age")
    data = utilities.split_interval(data,
                                    interval_column="year",
                                    split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
예제 #3
0
def load_lbwsg_paf(key: str, location: str):
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
예제 #4
0
def get_demographic_dimensions(location: str) -> pd.DataFrame:
    """Pull the full demographic dimensions for GBD data, standardized to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with with meaningful values or ranges.

    Parameters
    ----------
    location
        Location for which to pull demographic dimension data.

    Returns
    -------
    pandas.DataFrame
        Dataframe with age and year bins from GBD, sexes, and the given location.

    """
    pop = Population()
    data = core.get_data(pop, "demographic_dimensions", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "demographic_dimensions",
                                       location)
    data = utilities.split_interval(data,
                                    interval_column="age",
                                    split_column_prefix="age")
    data = utilities.split_interval(data,
                                    interval_column="year",
                                    split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
예제 #5
0
def load_lbwsg_relative_risk(key: str, location: str):
    path = paths.lbwsg_data_path('relative_risk', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = utilities.convert_affected_entity(data, 'cause_id')
    # RRs for all causes are the same.
    data = data[data.affected_entity == 'diarrheal_diseases']
    data['affected_entity'] = 'all'
    # All lbwsg risk is about mortality.
    data.loc[:, 'affected_measure'] = 'excess_mortality_rate'
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)
    data = (
        data
            .groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True)
    )

    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    tmrel_mask = data.parameter == tmrel_cat
    data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS] = (
        data
            .loc[tmrel_mask, vi_globals.DRAW_COLUMNS]
            .mask(np.isclose(data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS], 1.0), 1.0)
    )

    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'relative_risk', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
예제 #6
0
def load_lbwsg_exposure(key: str, location: str):
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id

    data = data.drop('modelable_entity_id', 'columns')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
예제 #7
0
def get_measure(entity: ModelableEntity, measure: str,
                location: str) -> pd.DataFrame:
    """Pull GBD data for measure and entity and prep for simulation input,
    including scrubbing all GBD conventions to replace IDs with meaningful
    values or ranges and expanding over all demographic dimensions. To pull data
    using this function, please have at least 50GB of memory available.

    Available measures:

        For entity kind 'sequela':
            incidence_rate, prevalence, birth_prevalence, disability_weight

        For entity kind 'cause':
            incidence_rate, prevalence, birth_prevalence, disability_weight,
            remission_rate, cause_specific_mortality_rate, excess_mortality_rate

        For entity kind 'risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights,
            relative_risk, population_attributable_fraction, mediation_factors

        For entity kind 'etiology':
            population_attributable_fraction

        For entity kind 'alternative_risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights

        For entity kind 'covariate':
            estimate

    Parameters
    ----------
    entity
        Entity for which to pull `measure`.
    measure
        Measure for which to pull data, should be a measure available for the
        kind of entity which `entity` is.
    location
        Location for which to pull data.

    Returns
    -------
    pandas.DataFrame
        Dataframe standardized to the format expected by `vivarium` simulations.

    """
    data = core.get_data(entity, measure, location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, entity, measure, location)
    data = utilities.split_interval(data,
                                    interval_column="age",
                                    split_column_prefix="age")
    data = utilities.split_interval(data,
                                    interval_column="year",
                                    split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
예제 #8
0
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
예제 #9
0
def get_age_bins() -> pd.DataFrame:
    """Pull GBD age bin data and standardize to the expected simulation input
    format.

    Returns
    -------
    pandas.DataFrame
        Dataframe of age bin data, with bin start and end values as well as bin
        names.

    """
    pop = Population()
    data = core.get_data(pop, "age_bins", "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(data, pop, "age_bins", "Global")
    data = utilities.split_interval(data,
                                    interval_column="age",
                                    split_column_prefix="age")
    data = utilities.split_interval(data,
                                    interval_column="year",
                                    split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)
예제 #10
0
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop('modelable_entity_id', 'columns')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
예제 #11
0
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)