def custom_age_weights(age_group_ids, gbdrid=4):
    """Get age weights for the given age group IDs, rescaled to sum to 1."""
    t = qry.get_age_weights(gbdrid)
    t = t.query('age_group_id in {}'.format(age_group_ids))
    # scale weights to 1
    t['age_group_weight_value'] = \
        t['age_group_weight_value'] / \
        t['age_group_weight_value'].sum()
    return t[['age_group_id', 'age_group_weight_value']]

def get_custom_age_weights(age_group_years_start, age_group_years_end):
    """Get age weights scaled to age group start and end"""
    t = qry.get_age_weights()
    t = t.query(
        'age_group_years_start >= {start} & age_group_years_end <= {end}'.format(
            start=age_group_years_start, end=age_group_years_end))
    # scale weights to 1
    t['age_group_weight_value'] = t['age_group_weight_value'] / \
        t['age_group_weight_value'].sum()
    return t[['age_group_id', 'age_group_weight_value']]

def custom_age_weights(age_group_id_start, age_group_id_end):
    """Get age weights for the given age group ID range, rescaled to sum to 1."""
    t = qry.get_age_weights(3)  # default is gbd 2010? Why?
    t = t.query('age_group_id >= {start} & age_group_id <= {end}'.format(
        start=age_group_id_start, end=age_group_id_end))
    # scale weights to 1
    t['age_group_weight_value'] = \
        t['age_group_weight_value'] / \
        t['age_group_weight_value'].sum()
    return t

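# A quick sanity check (not from the source) for the weight helpers above:
# age group IDs 7 and 8 (ages 10-14 and 15-19) are illustrative choices.
# The rescaled weights must sum to 1, which is what makes the weighted sums
# in the age-standardization steps below proper standardized rates.
w = custom_age_weights(7, 8)
assert abs(w['age_group_weight_value'].sum() - 1.0) < 1e-10
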
def age_standardize(df):
    """Age-standardize draw columns using GBD age weights."""
    age_weights = qry.get_age_weights(4)
    age_weights = age_weights[['age_group_id', 'age_group_weight_value']]
    df = df.merge(age_weights, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'age weights merge failed'
    # make weighted product and sum
    df = pd.concat([
        df[dw.EPI_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ], axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.EPI_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()
    return df

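# Toy illustration (not from the pipeline) of the weighted product-and-sum
# pattern age_standardize uses; the frame and values here are invented.
import pandas as pd

toy = pd.DataFrame({
    'location_id': [1, 1],
    'age_group_id': [7, 8],
    'age_group_weight_value': [0.5, 0.5],
    'draw_0': [0.10, 0.20],
})
# weight each age-specific rate, then sum over age groups
toy['draw_0'] = toy['draw_0'] * toy['age_group_weight_value']
toy['age_group_id'] = 27  # 27 marks the age-standardized aggregate
out = toy.groupby(['location_id', 'age_group_id'],
                  as_index=False)['draw_0'].sum()
# out['draw_0'] == 0.5 * 0.10 + 0.5 * 0.20 == 0.15
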
def process_location_risk_burden_draws(location_id, test=False):
    '''For each rei_id in dw.RISK_BURDEN_REI_IDS and
    dw.RISK_BURDEN_DALY_REI_IDS, pull attributable burden draws for the
    given location and save them to the out directory.
    '''
    dfs = []
    for rei_id in dw.RISK_BURDEN_REI_IDS + dw.RISK_BURDEN_DALY_REI_IDS:
        print(rei_id)
        if rei_id in dw.RISK_BURDEN_REI_IDS:
            measure_id = 1
        elif rei_id in dw.RISK_BURDEN_DALY_REI_IDS:
            measure_id = 2
        else:
            raise ValueError("no measure found")
        print('Getting draws')
        df = get_draws(
            gbd_id_field=['cause_id', 'rei_id'],
            gbd_id=[294, rei_id],
            source='burdenator',
            version=dw.BURDENATOR_VERS,
            location_ids=location_id,
            year_ids=[],
            age_group_ids=[],
            sex_ids=[],
            num_workers=3,
            n_draws=1000,
            resample=True)
        # keep the measure, metric, ages, sexes, and years we want
        df = df.query('measure_id == {}'.format(measure_id))
        df = df.query('metric_id == 1')
        df = df.query('age_group_id in {} and sex_id in [1, 2]'.format(
            list(range(2, 21)) + list(range(30, 33)) + [235]))
        df = df.query('year_id in {}'.format(
            list(range(1990, 2011, 5)) + [2016]))
        # aggregate to both sexes
        df['sex_id'] = 3
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        # convert counts to rates using both-sex populations
        pops = qry.get_pops(both_sexes=True)
        df = df.merge(pops, how='left',
                      on=['location_id', 'age_group_id', 'sex_id', 'year_id'])
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
        ], axis=1)
        df['metric_id'] = 3
        # keep the right columns
        df = df[dw.RISK_BURDEN_GROUP_COLS + dw.DRAW_COLS]
        # interpolate years
        print('Interpolating')
        df = custom_interpolate(df)
        # age-standardize
        age_weights = qry.get_age_weights(4)
        df = df.merge(age_weights)
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
        ], axis=1)
        df['age_group_id'] = 27
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        dfs.append(df)

    df = pd.concat(dfs)
    write_output(df, 'risk_burden', location_id)
    return df

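# Hypothetical driver (not in the original file): fan the function out over
# the SDG reporting locations via qry.get_sdg_reporting_locations(), which
# this repo uses elsewhere for the same purpose.
if __name__ == '__main__':
    locsdf = qry.get_sdg_reporting_locations()
    for loc_id in locsdf['location_id'].unique():
        process_location_risk_burden_draws(loc_id)
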
def load_location_file(iso):
    df = pd.read_csv(os.path.join(dw.HIV_DIR, iso + '.csv'))
    df = df.query(
        'year_id >= 1990 and year_id <= 2030 and variable == "Incidence"')
    df['run_num'] = 'draw_' + (df['run_num'] - 1).astype(str)
    df = pd.pivot_table(df, values='value',
                        index=['location_id', 'year_id',
                               'age_group_id', 'sex_id'],
                        columns='run_num')
    df = df.reset_index()
    return df


if __name__ == '__main__':
    # Supplementary datasets
    print('Collecting supplementary datasets')
    age_weights = qry.get_age_weights(4)
    age_weights.loc[age_weights.age_group_id.isin([30, 31, 32, 235]),
                    'age_group_id'] = 21
    age_weights = age_weights.groupby(
        ['age_group_id'], as_index=False)['age_group_weight_value'].sum()
    gbd_popdf = qry.get_pops()
    gbd_popdf.loc[gbd_popdf.age_group_id.isin([30, 31, 32, 235]),
                  'age_group_id'] = 21
    gbd_popdf = gbd_popdf.groupby(
        ['location_id', 'year_id', 'age_group_id', 'sex_id'],
        as_index=False)['population'].sum()
    wpp_popdf = pd.read_csv('FILEPATH/wpp2015_to2063.csv')
    wpp_popdf = wpp_popdf.loc[wpp_popdf.year_id >= 2016]
    wpp_popdf = wpp_popdf.rename(index=str, columns={'pop': 'population'})
    locsdf = qry.get_sdg_reporting_locations()
    # level-3 (country) ancestor from the comma-separated path to top parent
    locsdf['L3_loc'] = [
        loc[3] for loc in locsdf.path_to_top_parent.str.split(',').tolist()]

    # Compile all countries
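    # A sketch of the compile step flagged above, assuming locsdf carries
    # country ISO3 codes in an `ihme_loc_id` column (hypothetical name here)
    # and that load_location_file returns identically-shaped frames.
    dfs = [load_location_file(iso) for iso in locsdf['ihme_loc_id']]
    df = pd.concat(dfs, ignore_index=True)
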
import pandas as pd

import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# read asfr file
print('reading input file...')
df = pd.read_csv("{d}/asfr_10_19.csv".format(d=dw.ASFR_DIR))

# DRAW NAME STANDARDS
# rename asfr_draw_X to draw_X like others
print('cleaning...')
df = df.rename(columns=lambda x: x.replace('asfr_draw', 'draw'))
# shift draw numbering from 1-1000 to 0-999 (only draw_1000 needs renaming)
df = df.rename(columns={'draw_1000': 'draw_0'})

# AGE STANDARDIZE
weights = qry.get_age_weights(ref_pop=3)
weights = weights.loc[weights['age_group_id'].isin([7, 8])]
weights['age_group_weight_value'] = weights['age_group_weight_value'] / \
    weights.age_group_weight_value.sum()
df = df.merge(weights, how='left')
assert df.age_group_weight_value.notnull().values.all(), \
    'merge failed'
id_cols = [
    'location_id', 'year_id', 'sex_id',
    'age_group_id', 'measure_id', 'metric_id'
]
# treat this as a continuous rate
df['measure_id'] = 18
df['metric_id'] = 3
df['age_group_id'] = 27
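
# The weighted collapse itself falls outside this excerpt; a sketch of the
# step that would follow, using the id_cols defined above and assuming draw
# columns named draw_0..draw_999 after the renames:
draw_cols = ['draw_{}'.format(i) for i in range(1000)]
for col in draw_cols:
    df[col] = df[col] * df['age_group_weight_value']
df = df.groupby(id_cols, as_index=False)[draw_cols].sum()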