def load_metadata(key: str, location: str):
    key = EntityKey(key)
    entity = get_entity(key)
    entity_metadata = entity[key.measure]
    if hasattr(entity_metadata, 'to_dict'):
        entity_metadata = entity_metadata.to_dict()
    return entity_metadata
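A usage sketch for the loader above. The key and location strings are hypothetical placeholders; the 'type.name.measure' key convention matches the other loaders in this collection.

# Hypothetical call: pull a cause's restrictions metadata as a dict.
restrictions = load_metadata('cause.diarrheal_diseases.restrictions', 'India')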
def open_artifact(output_path: Path, location: str) -> Artifact:
    """Creates or opens an artifact at the output path.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        The artifact at the output path, created if it did not already exist.

    """
    if not output_path.exists():
        logger.debug(f"Creating artifact at {output_path}.")
    else:
        logger.debug(f"Opening artifact at {output_path} for appending.")

    artifact = Artifact(output_path,
                        filter_terms=[get_location_term(location)])

    key = EntityKey(project_globals.METADATA_LOCATIONS)
    if str(key) not in artifact:
        artifact.write(key, [location])

    return artifact
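A minimal usage sketch for open_artifact. The path and location are placeholders; the final check relies only on the Artifact behavior shown above.

from pathlib import Path

artifact = open_artifact(Path('/tmp/nigeria.hdf'), 'Nigeria')  # placeholder path/location
# The locations metadata key written above is now guaranteed to exist.
assert str(EntityKey(project_globals.METADATA_LOCATIONS)) in artifact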
Example #3
def load_forecast_data(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    path = paths.forecast_data_path(key)
    data = extract.load_forecast_from_xarray(path, location_id)
    data = data[data.scenario == project_globals.FORECASTING_SCENARIO].drop(
        columns='scenario')
    if key == EntityKey('etiology.shigellosis.incidence'):
        # Only one draw available for incidence; replicate it across all draws.
        draw = data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id']).value
        data = pd.concat(project_globals.NUM_DRAWS * [draw], axis=1)
    else:
        data = data.set_index(
            ['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    if len(data.columns) == 100:  # Some measures have 100 draws rather than 1000
        data = pd.concat([data] * 10, axis=1)
    data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    data = data.reset_index()
    data = standardize.normalize(data)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
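A toy, self-contained illustration of the draw handling above: a frame with 100 draw columns is tiled ten times and relabeled to the full 1000 draws. The values are made up.

import pandas as pd

df = pd.DataFrame({f'draw_{i}': [0.1, 0.2] for i in range(100)})
df = pd.concat([df] * 10, axis=1)  # 100 -> 1000 columns by tiling
df.columns = pd.Index([f'draw_{i}' for i in range(1000)])
assert df.shape == (2, 1000)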
def get_entity(key: str):
    # Map of entity types to their gbd mappings.
    type_map = {
        'cause': causes,
        'covariate': covariates,
        'risk_factor': risk_factors,
        'alternative_risk_factor': alternative_risk_factors
    }
    key = EntityKey(key)
    return type_map[key.type][key.name]
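For reference, a sketch of the key convention get_entity assumes: an EntityKey is a 'type.name.measure' string whose pieces are exposed as attributes, used here to index the gbd mappings.

key = EntityKey('cause.diarrheal_diseases.cause_specific_mortality_rate')
# key.type == 'cause', key.name == 'diarrheal_diseases',
# key.measure == 'cause_specific_mortality_rate'
entity = get_entity(key)  # looks up causes.diarrheal_diseases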
Example #5
def load_live_births_by_year(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    asfr_key = EntityKey('covariate.age_specific_fertility_rate.estimate')
    pop_key = EntityKey(project_globals.POPULATION_STRUCTURE)

    asfr_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(asfr_key), location_id)
    asfr_data = asfr_data[
        (asfr_data.scenario == project_globals.FORECASTING_SCENARIO)
        & (asfr_data.year_id >= project_globals.MIN_YEAR)
    ].drop(columns='scenario')
    asfr_data = asfr_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id',
         'draw']).unstack()
    asfr_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])

    pop_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(pop_key), location_id)
    pop_data = pop_data[
        pop_data.scenario == project_globals.FORECASTING_SCENARIO
    ].drop(columns='scenario')
    pop_data = pop_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id',
         'draw']).unstack()
    pop_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    pop_data = pop_data.loc[asfr_data.index]  # align population rows to the fertility index

    # Live births = age-specific fertility rate * population, summed over age and sex.
    live_births = asfr_data * pop_data
    live_births = (live_births
                   .reset_index()
                   .drop(columns=['sex_id', 'age_group_id'])
                   .groupby(['location_id', 'year_id'])
                   .sum()
                   .reset_index())

    data = standardize.normalize(live_births)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #6
def load_lbwsg_exposure(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.LBWSG.EXPOSURE:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity = utilities.get_entity(key)
    data = utilities.get_data(key, entity, location, gbd_constants.SOURCES.EXPOSURE, 'rei_id',
                              metadata.AGE_GROUP.GBD_2019_LBWSG_EXPOSURE, metadata.GBD_2019_ROUND_ID, 'step4')
    data = data[data['year_id'] == 2019].drop(columns='year_id')
    data = utilities.process_exposure(data, key, entity, location, metadata.GBD_2019_ROUND_ID,
                                      metadata.AGE_GROUP.GBD_2019_LBWSG_EXPOSURE | metadata.AGE_GROUP.GBD_2020)
    data = data[data.index.get_level_values('year_start') == 2019]
    return data
Example #7
def load_gbd_2020_rr(key: str, location: str) -> pd.DataFrame:
    entity_key = EntityKey(key)
    entity = utilities.get_gbd_2020_entity(entity_key)

    data = utilities.get_data(
        entity_key,
        entity,
        location,
        gbd_constants.SOURCES.RR,
        'rei_id',
        metadata.AGE_GROUP.GBD_2020,
        metadata.GBD_2020_ROUND_ID
    )
    data = utilities.process_relative_risk(data, entity_key, entity, location, metadata.GBD_2020_ROUND_ID,
                                           metadata.AGE_GROUP.GBD_2020)

    if key == data_keys.STUNTING.RELATIVE_RISK:
        # Neutralize neonatal relative risks by setting them to 1
        neonatal_age_ends = data.index.get_level_values('age_end').unique()[:2]
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)] = 1.0
    elif key == data_keys.WASTING.RELATIVE_RISK:
        # Neutralize relative risks for simulants under 6 months by setting them to 1
        data.loc[data.index.get_level_values('age_end') <= data_values.WASTING.START_AGE] = 1.0

        # Move the diarrheal disease relative risk from incidence_rate to excess_mortality_rate
        diarrhea_rr = data.query(f"affected_entity == '{data_keys.DIARRHEA.name}'")
        data = pd.concat([
            diarrhea_rr.rename(
                index={'incidence_rate': 'excess_mortality_rate'}, level='affected_measure'
            ), data.drop(diarrhea_rr.index)
        ]).sort_index()
    elif key == data_keys.DISCONTINUED_BREASTFEEDING.RELATIVE_RISK:
        # Neutralize RRs outside of [6 months, 2 years) by setting them to 1
        discontinued_tmrel_index = data.query(
            f'age_start < {data_values.DISCONTINUED_BREASTFEEDING_START_AGE}'
            f' or age_end > {data_values.DISCONTINUED_BREASTFEEDING_END_AGE}'
        ).index
        discontinued_tmrel_rr = pd.DataFrame(
            1.0, columns=metadata.ARTIFACT_COLUMNS, index=discontinued_tmrel_index
        )
        data.update(discontinued_tmrel_rr)
    elif key == data_keys.NON_EXCLUSIVE_BREASTFEEDING.RELATIVE_RISK:
        # Neutralize the RR for the [6 months, 1 year) age group by setting it to 1
        non_exclusive_tmrel_index = data.query(
            f'age_start == {data_values.NON_EXCLUSIVE_BREASTFEEDING_END_AGE}'
        ).index
        non_exclusive_tmrel_rr = pd.DataFrame(
            1.0, columns=metadata.ARTIFACT_COLUMNS, index=non_exclusive_tmrel_index
        )
        data.update(non_exclusive_tmrel_rr)
    return data
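A toy check of the index-rename trick used for wasting above, with made-up values: renaming a MultiIndex level relabels affected_measure without touching the draw values.

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [('diarrheal_diseases', 'incidence_rate')],
    names=['affected_entity', 'affected_measure'])
rr = pd.DataFrame({'draw_0': [2.0]}, index=idx)
emr_rr = rr.rename(index={'incidence_rate': 'excess_mortality_rate'},
                   level='affected_measure')
assert emr_rr.index[0] == ('diarrheal_diseases', 'excess_mortality_rate')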
Example #8
def load_sids_csmr(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.AFFECTED_UNMODELED_CAUSES.SIDS_CSMR:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity: Cause = utilities.get_entity(key)

    # get around the validation rejecting yll only causes
    entity.restrictions.yll_only = False
    entity.restrictions.yld_age_group_id_start = min(metadata.AGE_GROUP.GBD_2019_SIDS)
    entity.restrictions.yld_age_group_id_end = max(metadata.AGE_GROUP.GBD_2019_SIDS)

    return interface.get_measure(entity, key.measure, location).droplevel('location')
Example #9
def load_gbd_2020_exposure(key: str, location: str) -> pd.DataFrame:
    entity_key = EntityKey(key)
    entity = utilities.get_gbd_2020_entity(entity_key)

    data = utilities.get_data(entity_key, entity, location, gbd_constants.SOURCES.EXPOSURE, 'rei_id',
                              metadata.AGE_GROUP.GBD_2020, metadata.GBD_2020_ROUND_ID)
    data = utilities.process_exposure(data, entity_key, entity, location, metadata.GBD_2020_ROUND_ID,
                                      metadata.AGE_GROUP.GBD_2020)

    if entity_key == data_keys.STUNTING.EXPOSURE:
        # Zero out neonatal exposure, assigning it all to the unexposed category (cat4)
        neonatal_age_ends = data.index.get_level_values('age_end').unique()[:2]
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)] = 0.0
        data.loc[data.index.get_level_values('age_end').isin(neonatal_age_ends)
                 & (data.index.get_level_values('parameter') == data_keys.STUNTING.CAT4)] = 1.0
    return data
def load_and_write_demographic_data(artifact: Artifact, location: str):
    keys = [
        EntityKey(project_globals.POPULATION_STRUCTURE),
        EntityKey(project_globals.POPULATION_AGE_BINS),
        EntityKey(project_globals.POPULATION_DEMOGRAPHY),
        EntityKey(project_globals.POPULATION_TMRLE),  # Theoretical minimum risk life expectancy
        EntityKey(project_globals.POPULATION_LSLE),  # Location-specific life expectancy
        EntityKey(project_globals.ALL_CAUSE_CSMR),
        EntityKey(project_globals.COVARIATE_LIVE_BIRTHS),
    ]

    for key in keys:
        load_and_write_data(artifact, key, location)
Example #11
def load_lbwsg_rr(key: str, location: str) -> pd.DataFrame:
    if key != data_keys.LBWSG.RELATIVE_RISK:
        raise ValueError(f'Unrecognized key {key}')

    key = EntityKey(key)
    entity = utilities.get_entity(key)
    data = utilities.get_data(key, entity, location, gbd_constants.SOURCES.RR, 'rei_id',
                              metadata.AGE_GROUP.GBD_2019_LBWSG_RELATIVE_RISK, metadata.GBD_2019_ROUND_ID, 'step4')
    data = data[data['year_id'] == 2019].drop(columns='year_id')
    data = utilities.process_relative_risk(data, key, entity, location, metadata.GBD_2019_ROUND_ID,
                                           metadata.AGE_GROUP.GBD_2020, whitelist_sids=True)
    data = (
        data.query('year_start == 2019')
        .droplevel(['affected_entity', 'affected_measure'])
    )
    data = data[~data.index.duplicated()]
    return data
Example #12
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #13
def write_data_by_draw(artifact: Artifact, key: str, data: pd.DataFrame):
    """Writes data to the artifact on a per-draw basis. This is useful
    for large datasets like Low Birthweight Short Gestation (LBWSG).

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    data
        The data to write.

    """
    with pd.HDFStore(artifact.path, complevel=9, mode='a') as store:
        key = EntityKey(key)
        artifact._keys.append(key)
        store.put(f'{key.path}/index', data.index.to_frame(index=False))
        data = data.reset_index(drop=True)
        for c in data.columns:
            store.put(f'{key.path}/{c}', data[c])
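A minimal read-back sketch for data written this way. The helper is hypothetical; it assumes only the HDF layout produced above (an index frame plus one series per draw column).

def read_data_by_draw(artifact_path: str, key: str, draws: range) -> pd.DataFrame:
    # Hypothetical inverse of write_data_by_draw.
    key = EntityKey(key)
    with pd.HDFStore(artifact_path, mode='r') as store:
        index = pd.MultiIndex.from_frame(store.get(f'{key.path}/index'))
        draw_columns = [store.get(f'{key.path}/draw_{i}') for i in draws]
    data = pd.concat(draw_columns, axis=1)
    data.index = index
    return data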
def load_and_write_cause_data(artifact: Artifact, location: str):
    key = EntityKey(project_globals.SHIGELLA_CSMR)
    csmr = load_and_write_data(artifact, key, location)
    key = EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT)
    load_and_write_data(artifact, key, location)

    key = EntityKey(project_globals.SHIGELLA_INCIDENCE_RATE)
    incidence = load_and_write_data(artifact, key, location)
    key = EntityKey(project_globals.SHIGELLA_REMISSION_RATE)
    remission = load_and_write_data(artifact, key, location)

    key = EntityKey(project_globals.SHIGELLA_PREVALENCE)
    prevalence = write_data(artifact, key, incidence / remission)

    key = EntityKey(project_globals.SHIGELLA_EMR)
    write_data(artifact, key, (csmr / prevalence).fillna(0))

    key = EntityKey(project_globals.SHIGELLA_RESTRICTIONS)
    write_data(artifact, key, causes.diarrheal_diseases.restrictions.to_dict())
Example #15
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    #  FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
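A toy check of the renormalization step above, with made-up numbers: after the fill, each category's draws are divided by the per-stratum sum so the categories total 1.

import pandas as pd

df = pd.DataFrame({'age_group_id': [2, 2], 'parameter': ['cat1', 'cat2'],
                   'draw_0': [0.4, 0.8]})
sums = df.groupby(['age_group_id'])[['draw_0']].sum()
df = (df.groupby('parameter')
        .apply(lambda g: g.set_index(['age_group_id'])[['draw_0']].divide(sums))
        .reset_index())
assert abs(df.draw_0.sum() - 1.0) < 1e-12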
Example #16
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    # Morbidity-affecting RRs (with or without mortality) act on incidence;
    # mortality-only RRs act on excess mortality.
    data.loc[morbidity, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    # Snap TMREL relative risks that are only numerically close to 1 to exactly 1.0.
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_and_write_vaccine_data(artifact: Artifact, location: str):
    key = EntityKey(project_globals.COVARIATE_DTP3)
    logger.debug(f'Loading data for {key} for location {location}.')
    dtp3_coverage = loader.get_data(key, location)
    key = EntityKey(project_globals.COVARIATE_MEASLES1)
    logger.debug(f'Loading data for {key} for location {location}.')
    measles1_coverage = loader.get_data(key, location)
    key = EntityKey(project_globals.COVARIATE_MEASLES2)
    logger.debug(f'Loading data for {key} for location {location}.')
    measles2_coverage = loader.get_data(key, location)

    # 6-month coverage: midpoint of DTP3 and first-dose measles coverage.
    key = EntityKey(project_globals.COVARIATE_SHIGELLA_6MO)
    write_data(artifact, key, 0.5 * (dtp3_coverage + measles1_coverage))

    key = EntityKey(project_globals.COVARIATE_SHIGELLA_9MO)
    write_data(artifact, key, measles1_coverage)

    # 12-month coverage: midpoint of first- and second-dose measles coverage.
    key = EntityKey(project_globals.COVARIATE_SHIGELLA_12MO)
    write_data(artifact, key, 0.5 * (measles1_coverage + measles2_coverage))

    key = EntityKey(project_globals.COVARIATE_SHIGELLA_15MO)
    write_data(artifact, key, measles2_coverage)
Example #18
def filter_relative_risk_to_cause_restrictions(
        data: pd.DataFrame) -> pd.DataFrame:
    """Filter relative risk data by the age restrictions of the affected
    causes and measures: the yld age restrictions apply when the affected
    measure is incidence_rate, and the yll age restrictions apply when it
    is excess_mortality_rate."""

    temp = []
    affected_entities = set(data.affected_entity)
    affected_measures = set(data.affected_measure)
    for cause_name, measure in product(affected_entities, affected_measures):
        df = data[(data.affected_entity == cause_name)
                  & (data.affected_measure == measure)]
        cause = get_gbd_2020_entity(EntityKey(f'cause.{cause_name}.{measure}'))
        if measure == 'excess_mortality_rate':
            start, end = vi_utils.get_age_group_ids_by_restriction(cause, 'yll')
        else:  # incidence_rate
            start, end = vi_utils.get_age_group_ids_by_restriction(cause, 'yld')
        temp.append(df[df.age_group_id.isin(range(start, end + 1))])
    data = pd.concat(temp)
    return data
def load_standard_data(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    return interface.get_measure(entity, key.measure, location).droplevel('location')
Example #20
def get_data(lookup_key: EntityKey, location: str):
    # Map each lookup key to a (loader, access key) pair; the access key may
    # differ from the lookup key when the loader reads from another source.
    mapping = {
        EntityKey(project_globals.POPULATION_STRUCTURE): (
            load_forecast_data, EntityKey(project_globals.POPULATION_STRUCTURE)),
        EntityKey(project_globals.POPULATION_AGE_BINS): (
            load_age_bins, EntityKey(project_globals.POPULATION_AGE_BINS)),
        EntityKey(project_globals.POPULATION_DEMOGRAPHY): (
            load_demographic_dimensions, EntityKey(project_globals.POPULATION_DEMOGRAPHY)),
        EntityKey(project_globals.POPULATION_TMRLE): (
            load_theoretical_minimum_risk_life_expectancy, EntityKey(project_globals.POPULATION_TMRLE)),
        EntityKey(project_globals.POPULATION_LSLE): (
            load_location_specific_life_expectancy, EntityKey(project_globals.POPULATION_LSLE)),
        EntityKey(project_globals.ALL_CAUSE_CSMR): (
            load_forecast_data, EntityKey('cause.all_causes.cause_specific_mortality')),
        EntityKey(project_globals.COVARIATE_LIVE_BIRTHS): (
            load_live_births_by_year, EntityKey(project_globals.COVARIATE_LIVE_BIRTHS)),
        EntityKey(project_globals.SHIGELLA_CSMR): (
            load_forecast_data, EntityKey('etiology.shigellosis.cause_specific_mortality')),
        EntityKey(project_globals.SHIGELLA_INCIDENCE_RATE): (
            load_forecast_data, EntityKey('etiology.shigellosis.incidence')),
        EntityKey(project_globals.SHIGELLA_REMISSION_RATE): (
            load_shigella_remission_rate, EntityKey(project_globals.SHIGELLA_REMISSION_RATE)),
        EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT): (
            load_shigella_disability_weight, EntityKey(project_globals.SHIGELLA_DISABILITY_WEIGHT)),
        EntityKey(project_globals.COVARIATE_DTP3): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_DTP3)),
        EntityKey(project_globals.COVARIATE_MEASLES1): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_MEASLES1)),
        EntityKey(project_globals.COVARIATE_MEASLES2): (
            load_forecast_data, EntityKey(project_globals.COVARIATE_MEASLES2)),
    }
    loader, access_key = mapping[lookup_key]
    return loader(access_key, location)
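Dispatch usage sketch (the location string is a placeholder): every artifact key routes through the mapping to its loader and, where they differ, a separate access key.

csmr = get_data(EntityKey(project_globals.SHIGELLA_CSMR), 'Nigeria')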