Exemplo n.º 1
0
def normalize_age(data: pd.DataFrame, fill_value: Real,
                  cols_to_fill: List[str]) -> pd.DataFrame:
    """Expand or fill ``data`` so it covers the full set of GBD age groups.

    Four cases, based on the age groups present in the data:
    no age column (returned unchanged), all-ages (replicated per age group),
    a proper subset of GBD ages (missing ages filled with ``fill_value``),
    or already the full GBD set (returned unchanged).
    """
    has_age_column = "age_group_id" in data.columns
    data_ages = set(data.age_group_id.unique()) if has_age_column else set()
    gbd_ages = set(utility_data.get_age_group_ids())

    if not data_ages:
        # Data does not correspond to individuals, so no age column necessary.
        return data
    if data_ages == {SPECIAL_AGES["all_ages"]}:
        # Data applies to all ages: replicate one copy per GBD age group.
        replicas = []
        for age_group in gbd_ages:
            replica = data.copy()
            replica.loc[:, "age_group_id"] = age_group
            replicas.append(replica)
        return pd.concat(replicas, ignore_index=True)
    if data_ages < gbd_ages:
        # Data applies to a subset: fill the other ages with the fill value.
        key_columns = [c for c in data.columns.difference(cols_to_fill)
                       if c != "age_group_id"]
        expected_index = pd.MultiIndex.from_product(
            [data[c].unique() for c in key_columns] + [gbd_ages],
            names=key_columns + ["age_group_id"],
        )
        indexed = data.set_index(key_columns + ["age_group_id"])
        return indexed.reindex(expected_index, fill_value=fill_value).reset_index()
    # data_ages == gbd_ages: already normalized.
    return data
Exemplo n.º 2
0
def get_disability_weight(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    """Compute disability weight draws for a cause or sequela.

    For a cause, the weight is the prevalence-weighted average of its
    sequelae's disability weights (sum of sequela prevalence * weight,
    divided by cause prevalence; NaNs from 0/0 become 0). For a sequela,
    the weight is extracted directly, falling back to all zeros when no
    data exists.
    """
    if entity.kind == "cause":
        # Start from an all-zero frame over the full demographic index so the
        # += accumulation below aligns on location/sex/age/year.
        data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
        data = data.set_index(
            utilities.get_ordered_index_cols(data.columns.difference(DRAW_COLUMNS))
        )
        if entity.sequelae:
            for sequela in entity.sequelae:
                try:
                    prevalence = get_data(sequela, "prevalence", location_id)
                except DataDoesNotExistError:
                    # sequela prevalence does not exist so no point continuing with this sequela
                    continue
                disability = get_data(sequela, "disability_weight", location_id)
                # Relabel the location level so the index-aligned multiply
                # below does not produce all-NaN rows on a location mismatch.
                disability.index = disability.index.set_levels(
                    [location_id], level="location_id"
                )
                data += prevalence * disability
        cause_prevalence = get_data(entity, "prevalence", location_id)
        # 0/0 divisions (no prevalence) become NaN; treat them as zero weight.
        data = (data / cause_prevalence).fillna(0).reset_index()
    else:  # entity.kind == 'sequela'
        try:
            data = extract.extract_data(entity, "disability_weight", location_id)
            data = utilities.normalize(data)
            # Find the parent cause; IndexError here (no parent) is handled
            # below together with missing data.
            cause = [c for c in causes if c.sequelae and entity in c.sequelae][0]
            data = utilities.clear_disability_weight_outside_restrictions(
                data, cause, 0.0, utility_data.get_age_group_ids()
            )
            data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
        except (IndexError, DataDoesNotExistError):
            logger.warning(
                f"{entity.name.capitalize()} has no disability weight data. All values will be 0."
            )
            data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
    return data
Exemplo n.º 3
0
def get_exposure_and_restriction_ages(exposure: pd.DataFrame,
                                      entity: RiskFactor) -> set:
    """Get the intersection of age groups found in exposure data and entity
    restriction age range. Used to filter other risk data where
    using just exposure age groups isn't sufficient because exposure at the
    point of extraction is pre-filtering by age restrictions.

    Parameters
    ----------
    exposure
        Exposure data for `entity`.
    entity
        Entity for which to find the intersecting exposure and restriction ages.

    Returns
    -------
    Set of age groups found in both the entity's exposure data and in the
    entity's age restrictions.
    """
    ages_in_exposure = set(exposure.age_group_id)
    age_start, age_end = get_age_group_ids_by_restriction(entity, "outer")
    ages_in_restrictions = get_restriction_age_ids(
        age_start, age_end, utility_data.get_age_group_ids())
    return ages_in_exposure.intersection(ages_in_restrictions)
Exemplo n.º 4
0
def load_lbwsg_exposure(key: str, location: str):
    """Load, rescale, and validate LBWSG exposure data for ``location``.

    Reads pre-extracted exposure draws from disk, drops the extra residual
    category, fills the TMREL category, renormalizes categories to sum to 1,
    and returns simulation-ready hierarchical data.
    """
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id

    # Use the columns= keyword: passing the axis positionally
    # (drop('modelable_entity_id', 'columns')) was removed in pandas 2.0.
    data = data.drop(columns='modelable_entity_id')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Exemplo n.º 5
0
def load_lbwsg_paf(key: str, location: str):
    """Load and shape LBWSG population attributable fraction data.

    Keeps only percent-metric, YLL-measure rows (all LBWSG risk is about
    mortality), applies per-cause age restrictions, normalizes per affected
    entity/measure, validates, and returns hierarchical simulation data.
    """
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    cause_lookup = {c.gbd_id: c for c in causes}
    filtered_groups = []
    # We filter paf age groups by cause level restrictions.
    for (cause_id, measure_id), group in data.groupby(['cause_id', 'measure_id']):
        measure_name = 'yll' if measure_id == vi_globals.MEASURES['YLLs'] else 'yld'
        filtered_groups.append(utilities.filter_data_by_restrictions(
            group, cause_lookup[cause_id], measure_name,
            utility_data.get_age_group_ids()))
    data = pd.concat(filtered_groups, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    yll_rows = data['measure_id'] == vi_globals.MEASURES['YLLs']
    data.loc[yll_rows, 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    keep_columns = (vi_globals.DEMOGRAPHIC_COLUMNS
                    + ['affected_entity', 'affected_measure']
                    + vi_globals.DRAW_COLUMNS)
    data = data.filter(keep_columns)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    for interval in ('age', 'year'):
        data = utilities.split_interval(
            data, interval_column=interval, split_column_prefix=interval)
    return utilities.sort_hierarchical_data(data)
Exemplo n.º 6
0
def get_deaths(entity: Cause, location_id: int) -> pd.DataFrame:
    """Pull death-count draws for ``entity``, restricted to YLL age groups
    and normalized over the full demographic index (missing rows -> 0)."""
    raw = extract.extract_data(entity, "deaths", location_id)
    restricted = utilities.filter_data_by_restrictions(
        raw, entity, "yll", utility_data.get_age_group_ids()
    )
    normalized = utilities.normalize(restricted, fill_value=0)
    return normalized.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
def write_ckd_data(artifact, location):
    """Write all chronic-kidney-disease measures for ``location`` to ``artifact``.

    Writes restrictions metadata, CSMR, prevalence, a YLD-derived disability
    weight, an excess mortality rate derived from CSMR / prevalence, and a
    capped incidence rate.
    """
    load = get_load(location)

    # Metadata.  (f-prefixes removed from these literals: no placeholders.)
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    # Disability weight = YLDs / prevalence, with 0/0 and x/0 mapped to 0.
    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld',
        utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds,
                          interval_column='age',
                          split_column_prefix='age')
    ylds = split_interval(ylds,
                          interval_column='year',
                          split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate',
                         location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks
    # validation; cap everything at 50 (equivalent to data[data > 50] = 50).
    data = data.clip(upper=50)
    artifact.write(key, data)
Exemplo n.º 8
0
def _load_diarrhea_sequela_disability_weight(sequela, location_id: int):
    """Load GBD 2016 disability-weight draws for one diarrhea sequela,
    zeroed outside the diarrheal-diseases age restrictions."""
    logger.info(f'Loading disability weight for {sequela.name} from GBD 2016.')
    all_weights = extract.get_auxiliary_data('disability_weight', 'sequela', 'all',
                                             location_id)
    weights = all_weights.loc[
        all_weights.healthstate_id == sequela.healthstate.gbd_id, :]
    weights = standardize.normalize(weights)
    weights = utilities.clear_disability_weight_outside_restrictions(
        weights, causes.diarrheal_diseases, 0.0, utility_data.get_age_group_ids())
    weights = weights.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                             vi_globals.DRAW_COLUMNS)
    return utilities.reshape(weights)
Exemplo n.º 9
0
def _load_prevalence(entity, location_id: int, entity_type: str):
    """Load GBD 2016 prevalence draws for ``entity``, restricted to the
    diarrheal-diseases YLD age groups and normalized with zero fill."""
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    draws = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    prevalence = draws[draws.measure_id == vi_globals.MEASURES['Prevalence']]
    prevalence = utilities.filter_data_by_restrictions(
        prevalence, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    # Use latest GBD results for all data.
    prevalence = prevalence[prevalence.year_id == 2016].drop(columns='year_id')
    prevalence = standardize.normalize(prevalence, fill_value=0)
    prevalence = prevalence.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                                   vi_globals.DRAW_COLUMNS)
    return utilities.reshape(prevalence)
Exemplo n.º 10
0
def get_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    """Pull prevalence draws for a cause or sequela, applying the YLD age
    restrictions of the entity (or, for a sequela, of its parent cause)."""
    data = extract.extract_data(entity, "prevalence", location_id)

    if entity.kind == "cause":
        restrictions_entity = entity
    else:
        # Sequela: restrictions live on the parent cause.  Keeping the
        # list-index form preserves the IndexError contract if none matches.
        parents = [c for c in causes if c.sequelae and entity in c.sequelae]
        restrictions_entity = parents[0]

    restricted = utilities.filter_data_by_restrictions(
        data, restrictions_entity, "yld", utility_data.get_age_group_ids()
    )
    normalized = utilities.normalize(restricted, fill_value=0)
    return normalized.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
Exemplo n.º 11
0
def _get_raw_demographic_dimensions(location: str):
    """Build the full location/sex/age/year demographic grid for ``location``
    as a normalized, reshaped frame."""
    loc_id = extract.get_location_id(location)
    dimension_names = ['location_id', 'sex_id', 'age_group_id', 'year_id']
    dimension_values = [
        [loc_id],
        [vi_globals.SEXES['Male'], vi_globals.SEXES['Female']],
        utility_data.get_age_group_ids(),
        range(project_globals.MIN_YEAR, project_globals.MAX_YEAR + 1),
    ]

    grid = pd.MultiIndex.from_product(
        dimension_values, names=dimension_names).to_frame(index=False)
    grid = standardize.normalize(grid)
    return utilities.reshape(grid)
Exemplo n.º 12
0
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function population attributable fraction data.

    Drops YLL-only causes, filters PAFs by matching relative-risk data and
    per-cause age restrictions, normalizes per affected entity/measure,
    validates, and returns simulation-ready hierarchical data.
    """
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    # Set comprehension instead of set([...]) (flake8-comprehensions C403).
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    # YLL PAFs affect excess mortality; YLD PAFs affect incidence.
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Exemplo n.º 13
0
def get_exposure(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    """Pull and normalize exposure draws for a (alternative) risk factor.

    For categorical distributions the TMREL category is filled separately
    and all categories are rescaled to sum to 1; continuous distributions
    are simply normalized with zero fill.
    """
    data = extract.extract_data(entity, "exposure", location_id)
    # Use the columns= keyword: passing the axis positionally
    # (drop("modelable_entity_id", "columns")) was removed in pandas 2.0.
    data = data.drop(columns="modelable_entity_id")

    if entity.name in EXTRA_RESIDUAL_CATEGORY:
        # Drop the extra residual category and keep exposures strictly positive.
        cat = EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query("parameter == @cat").index)
        data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)

    if entity.kind in ["risk_factor", "alternative_risk_factor"]:
        data = utilities.filter_data_by_restrictions(
            data, entity, "outer", utility_data.get_age_group_ids()
        )

    if entity.distribution in ["dichotomous", "ordered_polytomous", "unordered_polytomous"]:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]

        #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
        data = pd.concat(
            [
                utilities.normalize(exposed, fill_value=0),
                utilities.normalize(unexposed, fill_value=1),
            ],
            ignore_index=True,
        )

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(DRAW_COLUMNS + ["parameter"]))
        sums = data.groupby(cols)[DRAW_COLUMNS].sum()
        data = (
            data.groupby("parameter")
            .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
            .reset_index()
        )
    else:
        data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS + ["parameter"])
    return data
Exemplo n.º 14
0
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function exposure data for ``location``.

    Extracts and validates raw exposure, fills the TMREL category,
    renormalizes categories to sum to 1, and returns simulation-ready
    hierarchical data.
    """
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    # Use the columns= keyword: passing the axis positionally
    # (drop('modelable_entity_id', 'columns')) was removed in pandas 2.0.
    data = data.drop(columns='modelable_entity_id')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Exemplo n.º 15
0
def load_shigella_remission_rate(key: EntityKey, location: str):
    """Load diarrheal-disease remission-rate draws (GBD 2016) for ``location``
    and return simulation-ready hierarchical data."""
    location_id = extract.get_location_id(location)
    raw = extract.get_modelable_entity_draws(
        causes.diarrheal_diseases.dismod_id, location_id)
    remission = raw[raw.measure_id == vi_globals.MEASURES['Remission rate']]
    remission = utilities.filter_data_by_restrictions(
        remission, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    # Use latest GBD results for all data.
    remission = remission[remission.year_id == 2016].drop(columns='year_id')
    remission = standardize.normalize(remission, fill_value=0)
    remission = remission.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                                 vi_globals.DRAW_COLUMNS)
    remission = utilities.reshape(remission)
    remission = utilities.scrub_gbd_conventions(remission, location)
    for interval in ('age', 'year'):
        remission = utilities.split_interval(
            remission, interval_column=interval, split_column_prefix=interval)
    return utilities.sort_hierarchical_data(remission)
Exemplo n.º 16
0
def get_population_attributable_fraction(
    entity: Union[RiskFactor, Etiology], location_id: int
) -> pd.DataFrame:
    """Pull population attributable fraction draws for a risk or etiology.

    Risk factors: drop YLL-only causes, filter PAFs by matching relative-risk
    data, and apply per-cause age restrictions. Etiologies: apply the parent
    cause's restrictions and zero out negative draws (with a warning). In
    both cases the result is tagged with the affected entity/measure and
    normalized over the demographic index.
    """
    causes_map = {c.gbd_id: c for c in causes}
    if entity.kind == "risk_factor":
        data = extract.extract_data(entity, "population_attributable_fraction", location_id)
        relative_risk = extract.extract_data(entity, "relative_risk", location_id)

        # FIXME: we don't currently support yll-only causes so I'm dropping them because the data in some cases is
        #  very messed up, with mort = morb = 1 (e.g., aortic aneurysm in the RR data for high systolic bp) -
        #  2/8/19 K.W.
        # Set comprehension instead of set([...]) (flake8-comprehensions C403).
        yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
        data = data[~data.cause_id.isin(yll_only_causes)]
        relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

        data = (
            data.groupby("cause_id", as_index=False)
            .apply(filter_by_relative_risk, relative_risk)
            .reset_index(drop=True)
        )

        temp = []
        # We filter paf age groups by cause level restrictions.
        for (c_id, measure), df in data.groupby(["cause_id", "measure_id"]):
            cause = causes_map[c_id]
            measure = "yll" if measure == MEASURES["YLLs"] else "yld"
            df = utilities.filter_data_by_restrictions(
                df, cause, measure, utility_data.get_age_group_ids()
            )
            temp.append(df)
        data = pd.concat(temp, ignore_index=True)

    else:  # etiology
        data = extract.extract_data(
            entity, "etiology_population_attributable_fraction", location_id
        )
        cause = [c for c in causes if entity in c.etiologies][0]
        data = utilities.filter_data_by_restrictions(
            data, cause, "inner", utility_data.get_age_group_ids()
        )
        if np.any(data[DRAW_COLUMNS] < 0):
            logger.warning(
                f"{entity.name.capitalize()} has negative values for paf. These will be replaced with 0."
            )
            other_cols = [c for c in data.columns if c not in DRAW_COLUMNS]
            data.set_index(other_cols, inplace=True)
            data = data.where(data[DRAW_COLUMNS] > 0, 0).reset_index()

    data = utilities.convert_affected_entity(data, "cause_id")
    # YLL PAFs affect excess mortality; YLD PAFs affect incidence.
    data.loc[
        data["measure_id"] == MEASURES["YLLs"], "affected_measure"
    ] = "excess_mortality_rate"
    data.loc[data["measure_id"] == MEASURES["YLDs"], "affected_measure"] = "incidence_rate"
    data = (
        data.groupby(["affected_entity", "affected_measure"])
        .apply(utilities.normalize, fill_value=0)
        .reset_index(drop=True)
    )
    data = data.filter(
        DEMOGRAPHIC_COLUMNS + ["affected_entity", "affected_measure"] + DRAW_COLUMNS
    )
    return data