Example #1
def load_lbwsg_exposure(key: str, location: str):
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id

    data = data.drop(columns='modelable_entity_id')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as the exposure of the TMREL category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
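The groupby-and-divide renormalization above is the core of this loader: after the TMREL category is filled with 1 and the exposed categories with 0, the category exposures no longer sum to 1 within each demographic stratum. A minimal, self-contained sketch of the same pattern on synthetic data (the column names are illustrative, not the vivarium_inputs schema):

import pandas as pd

# Synthetic exposure table: two strata (age groups), two categories, two draws.
data = pd.DataFrame({
    'age_group_id': [2, 2, 3, 3],
    'parameter': ['cat1', 'cat2', 'cat1', 'cat2'],
    'draw_0': [0.2, 0.9, 0.1, 0.7],
    'draw_1': [0.3, 0.8, 0.4, 0.5],
})
draw_cols = ['draw_0', 'draw_1']

# Sum the draws across categories within each stratum, then divide each
# category's draws by that sum so the categories sum to 1.
index_cols = [c for c in data.columns if c not in draw_cols + ['parameter']]
sums = data.groupby(index_cols)[draw_cols].sum()
normalized = (data.groupby('parameter')
              .apply(lambda df: df.set_index(index_cols)[draw_cols].divide(sums))
              .reset_index())

# Every stratum's categories now sum to 1 in each draw column.
assert (normalized.groupby(index_cols)[draw_cols].sum().round(9) == 1).all().all()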
Example #2
def validate_and_reshape_gbd_data(
        data: pd.DataFrame,
        entity: ModelableEntity,
        key: EntityKey,
        location: str,
        gbd_round_id: int,
        age_group_ids: List[int] = None) -> pd.DataFrame:

    # from vivarium_inputs.core.get_data
    data = vi_utils.reshape(data, value_cols=vi_globals.DRAW_COLUMNS)

    # from interface.get_measure
    data = _scrub_gbd_conventions(data, location, age_group_ids)

    estimation_years = get_gbd_estimation_years(gbd_round_id)
    validation_years = pd.DataFrame({
        'year_start': range(min(estimation_years), max(estimation_years) + 1),
    })
    validation_years['year_end'] = validation_years['year_start'] + 1

    # validate_for_simulation(data, entity, key.measure, location, years=validation_years,
    #                         age_bins=get_gbd_age_bins(age_group_ids))
    data = vi_utils.split_interval(data,
                                   interval_column='age',
                                   split_column_prefix='age')
    data = vi_utils.split_interval(data,
                                   interval_column='year',
                                   split_column_prefix='year')
    data = vi_utils.sort_hierarchical_data(data).droplevel('location')
    return data
Example #3
def load_forecast_data(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    path = paths.forecast_data_path(key)
    data = extract.load_forecast_from_xarray(path, location_id)
    data = data[data.scenario == project_globals.FORECASTING_SCENARIO].drop(
        columns='scenario')
    if key == EntityKey('etiology.shigellosis.incidence'):
        # Only one draw for incidence
        data = data.set_index(
            ['location_id', 'age_group_id', 'sex_id', 'year_id']).value
        data = pd.concat(project_globals.NUM_DRAWS * [data], axis=1)
    else:
        data = data.set_index(
            ['location_id', 'age_group_id', 'sex_id', 'year_id',
             'draw']).unstack()
    if len(data.columns) == 100:  # Not 1000 draws for everything
        data = pd.concat([data] * 10, axis=1)
    data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    data = data.reset_index()
    data = standardize.normalize(data)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #4
def load_lbwsg_relative_risk(key: str, location: str):
    path = paths.lbwsg_data_path('relative_risk', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = utilities.convert_affected_entity(data, 'cause_id')
    # RRs for all causes are the same.
    data = data[data.affected_entity == 'diarrheal_diseases']
    data['affected_entity'] = 'all'
    # All lbwsg risk is about mortality.
    data.loc[:, 'affected_measure'] = 'excess_mortality_rate'
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))

    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    tmrel_mask = data.parameter == tmrel_cat
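    # Snap TMREL-category relative risks within floating-point noise of 1.0 to exactly 1.0.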
    data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS] = (
        data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS]
        .mask(np.isclose(data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS], 1.0), 1.0)
    )

    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'relative_risk', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #5
def load_shigella_disability_weight(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)

    data = _get_raw_demographic_dimensions(location)
    data = pd.DataFrame(0, columns=vi_globals.DRAW_COLUMNS, index=data.index)

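    # Cause-level disability weight: prevalence-weighted sum of sequela
    # disability weights, divided by the cause prevalence below.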
    for sequela in causes.diarrheal_diseases.sequelae:
        prevalence = _load_prevalence(sequela, location_id, 'sequela')
        disability = _load_diarrhea_sequela_disability_weight(
            sequela, location_id)
        disability.index = disability.index.set_levels(
            [location_id], level='location_id')
        data += prevalence * disability

    diarrhea_prevalence = _load_prevalence(causes.diarrheal_diseases,
                                           location_id, 'cause')
    data = (data / diarrhea_prevalence).fillna(0).reset_index()
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #6
def load_lbwsg_paf(key: str, location: str):
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #7
def write_utilization_rate(artifact, location):
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (
        np.log(data['outpatient_visits_per_cap_95_upper']) -
        np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
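    # A symmetric 95% interval spans 2 * 1.96 standard deviations, so dividing
    # the full log-width by 1.96 gives roughly twice the usual sd estimate.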
    draws = np.exp(
        np.random.normal(loc=data['log_mean'],
                         scale=data['log_sd'],
                         size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat(
        [data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
        axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)
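The draw generation above (log-transform the mean and the 95% bounds, sample on the log scale, exponentiate) also appears in Example #17 below. A self-contained sketch with made-up numbers, using the conventional 2 * 1.96 divisor for the full interval width (the function above divides by 1.96):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Made-up utilization mean and 95% CI bounds.
mean, lower, upper = 1.8, 1.2, 2.6
log_mean = np.log(mean)
log_sd = (np.log(upper) - np.log(lower)) / (2 * 1.96)  # 95% CI spans 2 * 1.96 sd

# Sample on the log scale, then exponentiate back to rates.
draws = np.exp(rng.normal(loc=log_mean, scale=log_sd, size=1000))
draws = pd.DataFrame([draws], columns=[f'draw_{i}' for i in range(1000)])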
Example #8
def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld',
        utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds,
                          interval_column='age',
                          split_column_prefix='age')
    ylds = split_interval(ylds,
                          interval_column='year',
                          split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
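    # Disability weight = YLD rate / prevalence; non-finite results are zeroed.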
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
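    # Excess mortality rate = cause-specific mortality rate / prevalence.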
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate',
                         location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)
Example #9
def write_hypertension_medication_data(artifact, location):
    external_data_specification = {
        'adherence': {
            # all adherence will use the same seeds
            'seed_columns': ['location'],
            'distribution': 'beta',
        },
        'medication_probabilities': {
            'seed_columns': ['location', 'measure', 'thiazide_type_diuretics',
                             'beta_blockers', 'ace_inhibitors',
                             'angiotensin_ii_blockers', 'calcium_channel_blockers'],
            'distribution': 'beta',
        },
        'therapy_category': {
            'seed_columns': ['location', 'therapy_category'],
            'distribution': 'beta',
        },
        'treatment_coverage': {
            'seed_columns': ['location', 'measure'],
            'distribution': 'beta',
        },
        'drug_efficacy': {
            # don't include dosage so all dosages of the same drug will use the same seeds
            'seed_columns': ['location', 'drug'],
            'distribution': 'normal',
        },
    }

    for k, spec in external_data_specification.items():
        data = load_external_data(k, location)
        data = generate_draws(data, spec['seed_columns'], spec['distribution'])

        if set(data.location) == {'Global'}:
            # do this post draw generation so all locations use the same draws if data is global
            data.location = location

        if k == 'medication_probabilities':  # drop ACE + ARB single pill because not used
            data = data.loc[~((data.ace_inhibitors == 1) &
                              (data.angiotensin_ii_blockers == 1))]

        data = utilities.sort_hierarchical_data(utilities.reshape(data))

        if k == 'therapy_category':  # normalize so that sum of all categories = 1
            data = data.divide(data.sum(axis=0), axis=1)

        key = f'health_technology.hypertension_medication.{k}'
        data = split_interval(data,
                              interval_column='age',
                              split_column_prefix='age')
        data = split_interval(data,
                              interval_column='year',
                              split_column_prefix='year')
        artifact.write(key, data)
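generate_draws is project code not shown in these examples. The seed_columns comments imply that draws are seeded from the listed columns' values, so rows sharing those values receive identical draws. A hypothetical sketch of that idea (the hashing scheme and the 'mean', 'sd', and 'sample_size' columns are assumptions, not the project's actual implementation):

import hashlib

import numpy as np
import pandas as pd

def generate_draws(data, seed_columns, distribution, n_draws=1000):
    # Hypothetical: derive one deterministic RNG seed per row from its
    # seed-column values, so rows sharing those values get identical draws.
    draw_cols = [f'draw_{i}' for i in range(n_draws)]
    all_draws = []
    for _, row in data.iterrows():
        seed_key = '_'.join(str(row[c]) for c in seed_columns)
        seed = int(hashlib.md5(seed_key.encode()).hexdigest(), 16) % 2**32
        rng = np.random.default_rng(seed)
        if distribution == 'normal':
            draws = rng.normal(row['mean'], row['sd'], n_draws)
        else:  # 'beta', parameterized by mean and a pseudo sample size (an assumption)
            a = row['mean'] * row['sample_size']
            b = (1.0 - row['mean']) * row['sample_size']
            draws = rng.beta(a, b, n_draws)
        all_draws.append(draws)
    draws = pd.DataFrame(all_draws, columns=draw_cols, index=data.index)
    return pd.concat([data, draws], axis=1)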
Example #10
def _load_em_from_meid(location, meid, measure):
    location_id = utility_data.get_location_id(location)
    data = gbd.get_modelable_entity_draws(meid, location_id)
    data = data[data.measure_id == vi_globals.MEASURES[measure]]
    data = vi_utils.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = vi_utils.reshape(data)
    data = vi_utils.scrub_gbd_conventions(data, location)
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    return vi_utils.sort_hierarchical_data(data)
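Example #10 is the minimal form of the pipeline most loaders in this section follow: normalize, filter, reshape, scrub GBD conventions, split intervals, sort. The split_interval calls turn interval-valued age and year index levels into explicit start/end levels; a hypothetical plain-pandas equivalent (assumes draw-named value columns and pd.Interval index levels; not the actual vivarium_inputs implementation):

import pandas as pd

def split_interval(data, interval_column, split_column_prefix):
    # Replace an interval-valued index level with <prefix>_start / <prefix>_end levels.
    data = data.reset_index()
    intervals = data.pop(interval_column)
    data[f'{split_column_prefix}_start'] = [iv.left for iv in intervals]
    data[f'{split_column_prefix}_end'] = [iv.right for iv in intervals]
    index_cols = [c for c in data.columns if not c.startswith('draw_')]
    return data.set_index(index_cols)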
Example #11
def _load_diarrhea_sequela_disability_weight(sequela, location_id: int):
    logger.info(f'Loading disability weight for {sequela.name} from GBD 2016.')
    data = extract.get_auxiliary_data('disability_weight', 'sequela', 'all',
                                      location_id)
    data = data.loc[data.healthstate_id == sequela.healthstate.gbd_id, :]
    data = standardize.normalize(data)
    data = utilities.clear_disability_weight_outside_restrictions(
        data, causes.diarrheal_diseases, 0.0, utility_data.get_age_group_ids())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                       vi_globals.DRAW_COLUMNS)
    return utilities.reshape(data)
Example #12
def _load_prevalence(entity, location_id: int, entity_type: str):
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    data = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.filter_data_by_restrictions(
        data, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    data = data[data.year_id == 2016].drop(
        columns='year_id')  # Use latest GBD results for all data
    data = standardize.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                       vi_globals.DRAW_COLUMNS)
    return utilities.reshape(data)
Example #13
def _get_raw_demographic_dimensions(location: str):
    location_id = extract.get_location_id(location)
    ages = utility_data.get_age_group_ids()
    years = range(project_globals.MIN_YEAR, project_globals.MAX_YEAR + 1)
    sexes = [vi_globals.SEXES['Male'], vi_globals.SEXES['Female']]
    location_id = [location_id]
    values = [location_id, sexes, ages, years]
    names = ['location_id', 'sex_id', 'age_group_id', 'year_id']

    data = (pd.MultiIndex.from_product(values,
                                       names=names).to_frame(index=False))
    data = standardize.normalize(data)
    data = utilities.reshape(data)
    return data
Example #14
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #15
def load_location_specific_life_expectancy(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = extract.get_location_specific_life_expectancy(location_id)
    data = data.rename(columns={'age': 'age_start'})
    data['age_end'] = data.age_start.shift(-1).fillna(5.01)
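    # Back-fill years before 2025 by copying the earliest available year's values.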
    earliest_year = data[data.year_id == 2025]
    out = []
    for year in range(project_globals.MIN_YEAR, 2025):
        df = earliest_year.copy()
        df['year_id'] = year
        out.append(df)
    data = pd.concat(out + [data], ignore_index=True)
    data = utilities.normalize_sex(data, None, ['value'])
    data = standardize.normalize_year(data)
    data = utilities.reshape(data, value_cols=['value'])
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #16
def load_lri_birth_prevalence_from_meid(_, location):
    """Ignore the first argument to fit in to the get_data model. """
    location_id = utility_data.get_location_id(location)
    data = get_draws('modelable_entity_id', project_globals.LRI_BIRTH_PREVALENCE_MEID,
                     source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                     age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                     measure_id=vi_globals.MEASURES['Prevalence'],
                     gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                     location_id=location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.normalize(data, fill_value=0)

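    # Birth prevalence applies to a single (birth) age group, so age is dropped from the index.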
    idx_columns = list(vi_globals.DEMOGRAPHIC_COLUMNS)
    idx_columns.remove('age_group_id')
    data = data.filter(idx_columns + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #17
def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION,
                       dtype={'location_id': np.int64, 'sex_id': np.int64, 'age_group_id': np.int64,
                              'year_id': np.int64, 'outpatient_visits_per_cap_mean': np.float64,
                              'outpatient_visits_per_cap_95_upper': np.float64,
                              'outpatient_visits_per_cap_95_lower': np.float64})
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
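    # As in Example #7, the full log-width of the 95% CI is divided by 1.96
    # rather than the conventional 2 * 1.96, doubling the implied sd.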
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'], size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #18
def load_live_births_by_year(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    asfr_key = EntityKey('covariate.age_specific_fertility_rate.estimate')
    pop_key = EntityKey(project_globals.POPULATION_STRUCTURE)

    asfr_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(asfr_key), location_id)
    asfr_data = asfr_data[
        (asfr_data.scenario == project_globals.FORECASTING_SCENARIO)
        & (asfr_data.year_id >= project_globals.MIN_YEAR)].drop(
            columns='scenario')
    asfr_data = asfr_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id',
         'draw']).unstack()
    asfr_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])

    pop_data = extract.load_forecast_from_xarray(
        paths.forecast_data_path(pop_key), location_id)
    pop_data = pop_data[(
        pop_data.scenario == project_globals.FORECASTING_SCENARIO)].drop(
            columns='scenario')
    pop_data = pop_data.set_index(
        ['location_id', 'age_group_id', 'sex_id', 'year_id',
         'draw']).unstack()
    pop_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    pop_data = pop_data.loc[asfr_data.index]

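    # Live births = age-specific fertility rate x matching population cells,
    # summed over age and sex below.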
    live_births = asfr_data * pop_data
    live_births = (live_births
                   .reset_index()
                   .drop(columns=['sex_id', 'age_group_id'])
                   .groupby(['location_id', 'year_id'])
                   .sum()
                   .reset_index())

    data = standardize.normalize(live_births)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #19
def load_shigella_remission_rate(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = extract.get_modelable_entity_draws(
        causes.diarrheal_diseases.dismod_id, location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Remission rate']]
    data = utilities.filter_data_by_restrictions(
        data, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    data = data[data.year_id == 2016].drop(
        columns='year_id')  # Use latest GBD results for all data
    data = standardize.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                       vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #20
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    # FIXME: We fill 1 as the exposure of the TMREL category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #21
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
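    # Snap TMREL-category relative risks within floating-point noise of 1.0 to exactly 1.0.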
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #22
def get_data(entity, measure: str, location: Union[str, int]):
    measure_handlers = {
        # Cause-like measures
        "incidence_rate": (get_incidence_rate, ("cause", "sequela")),
        "raw_incidence_rate": (get_raw_incidence_rate, ("cause", "sequela")),
        "prevalence": (get_prevalence, ("cause", "sequela")),
        "birth_prevalence": (get_birth_prevalence, ("cause", "sequela")),
        "disability_weight": (get_disability_weight, ("cause", "sequela")),
        "remission_rate": (get_remission_rate, ("cause",)),
        "cause_specific_mortality_rate": (get_cause_specific_mortality_rate, ("cause",)),
        "excess_mortality_rate": (get_excess_mortality_rate, ("cause",)),
        "deaths": (get_deaths, ("cause",)),
        # Risk-like measures
        "exposure": (
            get_exposure,
            (
                "risk_factor",
                "alternative_risk_factor",
            ),
        ),
        "exposure_standard_deviation": (
            get_exposure_standard_deviation,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "exposure_distribution_weights": (
            get_exposure_distribution_weights,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "relative_risk": (get_relative_risk, ("risk_factor",)),
        "population_attributable_fraction": (
            get_population_attributable_fraction,
            ("risk_factor", "etiology"),
        ),
        # Covariate measures
        "estimate": (get_estimate, ("covariate",)),
        # Population measures
        "structure": (get_structure, ("population",)),
        "theoretical_minimum_risk_life_expectancy": (
            get_theoretical_minimum_risk_life_expectancy,
            ("population",),
        ),
        "age_bins": (get_age_bins, ("population",)),
        "demographic_dimensions": (get_demographic_dimensions, ("population",)),
    }

    if measure not in measure_handlers:
        raise InvalidQueryError(f"No functions available to pull data for measure {measure}.")

    handler, entity_types = measure_handlers[measure]

    if entity.kind not in entity_types:
        raise InvalidQueryError(f"{measure.capitalize()} not available for {entity.kind}.")

    location_id = (
        utility_data.get_location_id(location) if isinstance(location, str) else location
    )
    data = handler(entity, location_id)

    if measure in [
        "structure",
        "theoretical_minimum_risk_life_expectancy",
        "estimate",
        "exposure_distribution_weights",
    ]:
        value_cols = ["value"]
    else:
        value_cols = DRAW_COLUMNS

    data = utilities.reshape(data, value_cols=value_cols)

    return data
Example #23
def write_sbp_data(artifact, location):
    load = get_load(location)
    affected_entity_map = {
        'ischemic_heart_disease': 'acute_myocardial_infarction',
        'ischemic_stroke': 'acute_ischemic_stroke',
        'intracerebral_hemorrhage': 'acute_intracerebral_hemorrhage',
        'subarachnoid_hemorrhage': 'acute_subarachnoid_hemorrhage',
        'chronic_kidney_disease': 'chronic_kidney_disease'
    }

    prefix = 'risk_factor.high_systolic_blood_pressure.'
    measures = [
        "restrictions", "distribution", "tmred", "exposure",
        "exposure_standard_deviation", "relative_risk_scalar",
        "exposure_distribution_weights"
    ]
    for m in measures:
        key = prefix + m
        artifact.write(key, load(key))

    sbp = risk_factors.high_systolic_blood_pressure

    data = gbd.get_paf(sbp.gbd_id, utility_data.get_location_id(location))
    data = data[data.metric_id == globals.METRICS['Percent']]
    data = data[data.measure_id == globals.MEASURES['YLDs']]
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure'] +
                       globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)

    key = prefix + 'population_attributable_fraction'
    artifact.write(key, data)

    data = gbd.get_relative_risk(sbp.gbd_id,
                                 utility_data.get_location_id(location))
    data = utilities.convert_affected_entity(data, 'cause_id')
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality'

    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure', 'parameter'] +
                       globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.sort_hierarchical_data(data)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
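    # Append a continuous ('per unit') CKD relative risk from a separate artifact.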
    loc = location.lower().replace(' ', '_')
    ckd_rr = pd.read_hdf(
        f'/share/costeffectiveness/artifacts/vivarium_csu_hypertension_sdc/ckd_rr/{loc}.hdf'
    )
    ckd_rr = ckd_rr.reset_index()
    ckd_rr['parameter'] = 'per unit'
    ckd_rr['affected_entity'] = 'chronic_kidney_disease'
    ckd_rr['affected_measure'] = 'incidence_rate'
    ckd_rr = ckd_rr.set_index([
        'location', 'sex', 'age_start', 'year_start', 'affected_entity',
        'affected_measure', 'parameter', 'age_end', 'year_end'
    ])
    data = pd.concat([data, ckd_rr])
    key = prefix + 'relative_risk'
    artifact.write(key, data)