def get_data(self, model_version_id, id_template_df):
    """Pull measure-15 'epi' results and return them indexed by ``self._data_key``.

    The data used to be read from a csv; it is now fetched with
    ``get_model_results`` and reduced to the columns
    location_id, year_id, sex_id, age_group_id, mean, lower, upper.

    Args:
        model_version_id: model version passed to ``get_model_results``.
        id_template_df: frame inner-merged onto the results on
            location_id / year_id / age_group_id / sex_id.

    Returns:
        The frame produced by ``self.calc_aggregate_se``, with
        ``self._data_key`` set as its index.

    Raises:
        NoNonZeroValues: if nothing is left after ``self.drop_zeros_nulls``
            and the merge with ``id_template_df``.
    """
    print("Loading CSMR csv...")

    merge_keys = ["location_id", "year_id", "age_group_id", "sex_id"]
    estimate_cols = ("mean", "lower", "upper")
    kept_columns = [
        'location_id', 'year_id', 'sex_id', 'age_group_id',
        'mean', 'lower', 'upper',
    ]
    # Age group ids 2 through 20, plus 30-33.
    requested_ages = list(range(2, 21)) + [30, 31, 32, 33]

    raw = get_model_results('epi',
                            model_version_id=model_version_id,
                            age_group_id=requested_ages,
                            measure_id=15)
    estimates = raw[kept_columns].copy()
    print(estimates.head(5))

    estimates = self.drop_zeros_nulls(estimates, *estimate_cols)
    estimates = estimates.merge(id_template_df, on=merge_keys)
    if estimates.empty:
        raise NoNonZeroValues

    estimates = self.calc_se_from_ui(estimates, *estimate_cols)
    estimates = self.calc_aggregate_se(estimates, self._data_key,
                                       "mean", "se")
    return estimates.set_index(self._data_key)
def __get_model__(self, pass_id):
    """Load 'epi' results for model version ``pass_id`` into ``self.results``.

    A standard-error column ``se`` is added as ``(mean - lower) / 1.96``
    and the stored frame is then reduced to the identifier columns plus
    ``mean`` and ``se``.
    """
    wanted_columns = [
        'location_id', 'age_group_id', 'sex_id', 'mean', 'se', 'year_id',
    ]

    # Store the raw results first, then derive ``se`` on that same object.
    fetched = self.results = get_model_results('epi',
                                               model_version_id=pass_id)
    lower_gap = fetched['mean'] - fetched['lower']
    fetched['se'] = lower_gap / 1.96

    # Keep only the columns that are used afterwards.
    self.results = fetched[wanted_columns]
def _get_results(self):
    """Fetch model results for this object's team, id, round and version.

    The query is restricted to ``self.metadata.location_ids`` and
    ``self.metadata.year_id``.

    Returns:
        The frame returned by ``get_model_results``.

    Raises:
        AssertionError: if the returned frame is empty.
    """
    df = get_model_results(self.gbd_team,
                           gbd_id=self.gbd_id,
                           gbd_round_id=self.metadata.gbd_round_id,
                           model_version_id=self.model_version_id,
                           location_id=self.metadata.location_ids,
                           year_id=self.metadata.year_id)
    if df.empty:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O; type and message are unchanged.
        # NOTE(review): the message says "round 5" although the round comes
        # from self.metadata.gbd_round_id -- confirm whether it should vary.
        raise AssertionError("No round 5 data found for this model.")
    return df
def enhanced_get_model_results(model_version_id, measure_id=5):
    """Return sex-specific model results with both-sex rows appended.

    Args:
        model_version_id: 'epi' model version to pull.
        measure_id: measure passed to ``get_model_results`` (default 5).

    Returns:
        The concatenation of the sex-specific results and the output of
        ``both_sex_model_results`` computed from them.
    """
    # Age group ids 2 through 20, plus 22, 30-33 and 235.
    requested_ages = list(range(2, 21)) + [22, 30, 31, 32, 33, 235]

    by_sex = get_model_results('epi',
                               model_version_id=model_version_id,
                               measure_id=measure_id,
                               age_group_id=requested_ages,
                               location_set_id=22)
    both_sex = both_sex_model_results(by_sex)
    return pd.concat([by_sex, both_sex])
def main():
    """Graph GBD round 4 (year 2016) results for one model version.

    Command-line values come from ``parse_arguments``. For team 'cod' the
    results go through ``generate_asr`` and a single graph is produced; for
    any other team one graph per sex (males, females) is produced. Each
    graph is passed to ``graph_data`` and ``output_pdf``, then
    ``generate_legend`` is called and the returned file names are printed.

    Raises:
        AssertionError: if no results come back, or if a frame that is about
            to be graphed does not have exactly 195 rows.
        ValueError: for a non-'cod' team when ``measure_id`` is not 5, 6
            or 18.
    """
    expected_rows = 195

    gbd_team, mvid, measure_id, age_group_id, path = parse_arguments()
    if path[-1] != '/':
        path += '/'

    location_df, location_id_list = generate_location_data()
    covariate_df = generate_covariate_data()

    # Default age group when none was supplied on the command line.
    if not age_group_id:
        age_group_id = -1 if gbd_team == 'cod' else 27

    model_results = get_model_results(gbd_team,
                                      model_version_id=mvid,
                                      age_group_id=age_group_id,
                                      measure_id=measure_id,
                                      gbd_round_id=4,
                                      year_id=2016,
                                      location_id=location_id_list)
    # Explicit raise so the check is not stripped under ``python -O``.
    if model_results.iloc[:, 0].empty:
        raise AssertionError('No gbd round 4 data found for this model '
                             'version or age group id')

    label_df = query_cause_data(mvid, gbd_team)
    loc_cov_df = merge_dataframes(location_df, covariate_df)
    cause = label_df.iloc[0, 1]

    # Defined once for both branches. The original built this only in the
    # 'cod' branch and then ran ``assert len_error``, which is always true
    # for a non-empty string, so the row-count check never fired.
    len_error = ('Model version id {} does not return results for all 195 '
                 'countries'.format(mvid))

    saved_files = []
    if gbd_team == 'cod':
        id_columns = ['location_id', 'year_id', 'age_group_id', 'sex_id']
        model_results = generate_asr(
            model_results[id_columns + ['mean_death_rate']],
            id_columns,
            ['mean_death_rate'],
            get_age_weights()
        )
        sex_id = get_unique_values(model_results, 'sex_id')[0]
        sex = 'males' if sex_id == 1 else 'females'
        measure = 'deaths'

        cod_df = merge_dataframes(loc_cov_df, model_results)
        cod_df = cod_df.iloc[:, [1, 2, 3, 7]].copy(deep=True)
        # Plain column assignment: creating a new column through a
        # list-of-labels key is not accepted by every pandas version.
        cod_df['death_rate_x_100000'] = cod_df['mean_death_rate'] * 100000
        cod_df = cod_df.drop(labels='mean_death_rate', axis=1)

        if len(cod_df) != expected_rows:
            raise AssertionError(len_error)

        graph_data(cod_df, cause, sex, measure)
        saved_files.append(
            output_pdf(mvid, cause, sex, measure, age_group_id, path))
    else:
        measure_names = {5: 'prevalence', 6: 'incidence', 18: 'proportion'}
        if measure_id not in measure_names:
            # Previously this surfaced later as an unbound-variable error.
            raise ValueError(
                'Unsupported measure_id {!r}; expected one of 5, 6, 18'
                .format(measure_id))
        measure = measure_names[measure_id]

        # Build and validate both sex-specific frames before graphing.
        frames = []
        for sex_id, sex in ((1, 'males'), (2, 'females')):
            sex_results = model_results[model_results.sex_id == sex_id]
            sex_df = merge_dataframes(loc_cov_df, sex_results)
            frames.append((sex, prune_epi_dataframe(sex_df)))

        if any(len(frame) != expected_rows for _, frame in frames):
            raise AssertionError(len_error)

        for sex, frame in frames:
            graph_data(frame, cause, sex, measure)
            saved_files.append(
                output_pdf(mvid, cause, sex, measure, age_group_id, path))

    plt.clf()
    generate_legend(path)

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print('Success!\n-------------')
    for file_name in saved_files:
        print('File name: {}'.format(file_name))
    print('File location: {}'.format(path))
    print('-------------')
    print('legend.pdf also saved to {}'.format(path))
def hiv_adjust(unadjusted_df, ind_grouping, identifiers_list,
               hiv_rr=hiv_adjustment_rr):
    """Add an HIV-prevalence-based adjustment to mean/lower/upper.

    For every ``group_id`` among the rows of ``ind_grouping`` with
    ``ind_bin_high < .1``, a population-weighted HIV prevalence is computed
    over the GBD age groups that overlap the row's age_start/age_end range.
    The adjustment ``cases_proportional * prevalence * hiv_rr`` is then
    added to ``mean``, ``lower`` and ``upper`` of the matching rows of
    ``unadjusted_df``. Groups without a match get an adjustment of 0.

    Neither input frame is modified; both are copied before any column is
    written.

    Args:
        unadjusted_df: frame with ``group_id``, ``mean``, ``lower``, ``upper``.
        ind_grouping: frame with location_id, year_start, year_end,
            ind_bin_high, sex, age_start, age_end, group_id and
            cases_proportional columns.
        identifiers_list: not used by this function; kept for callers.
        hiv_rr: multiplier applied to the prevalence-based adjustment.

    Returns:
        A new frame: ``unadjusted_df`` with adjusted mean/lower/upper and
        ``group_id`` converted to int.

    Raises:
        KeyError: if ``ind_grouping['sex']`` holds a value other than
            Male/male/Female/female/Both/both.
    """
    # Work on a copy so the caller's frame is left untouched.
    grouping = ind_grouping.copy()

    # Get all unique locations that we have data for
    unique_locs = grouping.location_id.unique().tolist()
    age_groups = [
        1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        30, 31, 32, 33
    ]
    # This program only considers individuals with a CD4 count < 200
    hiv_meid = 9322
    join_on = ['year_id', 'age_group_id', 'location_id', 'sex_id']

    # Get relevant HIV prevalence for all age groups
    hiv_prev = get_model_results('epi',
                                 gbd_id=hiv_meid,
                                 measure_id=5,
                                 location_id=unique_locs,
                                 age_group_id=age_groups,
                                 sex_id=[1, 2],
                                 status='best',
                                 year_id=-1)
    hiv_prev = hiv_prev[join_on + ['mean']].rename(
        columns={'mean': 'hiv_prev'})

    # Get populations for all listed locations and the years returned above
    gbd_years = hiv_prev.year_id.unique().tolist()
    pops = get_population(age_group_id=age_groups,
                          location_id=unique_locs,
                          year_id=gbd_years,
                          sex_id=[1, 2])
    pops = pops[join_on + ['population']]

    hiv_prev_pops = pd.merge(left=hiv_prev, right=pops,
                             how='inner', on=join_on)

    # Build sex_id = 3 rows as the population-weighted mean of both sexes
    male_only = hiv_prev_pops[hiv_prev_pops['sex_id'] == 1].copy()
    female_only = hiv_prev_pops[hiv_prev_pops['sex_id'] == 2].copy()
    both_sexes = pd.merge(left=male_only, right=female_only,
                          on=['age_group_id', 'location_id', 'year_id'],
                          suffixes=('_male', '_female'),
                          how='inner')
    both_sexes['population'] = (both_sexes['population_male'] +
                                both_sexes['population_female'])
    both_sexes['hiv_prev'] = (
        (both_sexes['hiv_prev_male'] * both_sexes['population_male'] +
         both_sexes['hiv_prev_female'] * both_sexes['population_female'])
        / both_sexes['population'])
    # Reassign rather than ``inplace=1``: pandas requires a real bool there.
    both_sexes = both_sexes.drop(labels=[
        'hiv_prev_male', 'hiv_prev_female', 'population_male',
        'population_female', 'sex_id_male', 'sex_id_female'
    ], axis=1)
    both_sexes['sex_id'] = 3

    hiv_prev_pops = pd.concat([hiv_prev_pops, both_sexes])
    hiv_prev_pops['sex_id'] = hiv_prev_pops['sex_id'].apply(float)

    # Get the upper and lower age groups from the GBD database.
    # The id list is joined explicitly so a one-element list would still
    # render as valid SQL (a 1-tuple repr has a trailing comma).
    id_list = ', '.join(str(age_group) for age_group in age_groups)
    q = """SELECT age_group_id,age_group_years_start,age_group_years_end
           FROM shared.age_group
           WHERE age_group_id IN ({})""".format(id_list)
    # For the next line, you'll need an 'epi' definition in your .ODBC file
    age_groups_df = query(q, conn_def='epi')

    # Merge back onto the HIV/population df
    hiv_prev_pops = pd.merge(left=hiv_prev_pops, right=age_groups_df,
                             on='age_group_id')

    # Get the middle year to join on, never earlier than 1980
    grouping['year_start'] = grouping['year_start'].apply(float)
    grouping['year_end'] = grouping['year_end'].apply(float)
    mid_year = np.round(
        (grouping['year_start'] + grouping['year_end']) / 2, 0)
    grouping['year_id'] = mid_year.apply(int).clip(lower=1980)

    # The HIV adjustment calculation should be done only for people with
    # 0mm indurations. Copy so later column writes do not hit a slice.
    grouping = grouping.loc[grouping['ind_bin_high'] < .1].copy()

    # Get the sex_id from the sex (unknown labels raise KeyError)
    sex_id_dict = {
        'Male': 1, 'male': 1,
        'Female': 2, 'female': 2,
        'Both': 3, 'both': 3
    }
    grouping['sex_id'] = grouping['sex'].apply(lambda x: sex_id_dict[x])

    # Merge with the HIV population data on sex, location, and year (NOT age)
    joined = pd.merge(left=grouping, right=hiv_prev_pops,
                      on=['sex_id', 'location_id', 'year_id'],
                      how='inner')

    # Keep only rows where the GBD age group range and the data age range
    # intersect. First, set age_start and age_end back to floats.
    joined['age_start'] = joined['age_start'].apply(float)
    joined['age_end'] = joined['age_end'].apply(float)
    overlaps = ((joined['age_group_years_start'] <= joined['age_end']) &
                (joined['age_group_years_end'] > joined['age_start']))
    joined = joined[overlaps].copy()

    # Clip each GBD age group to the actual age_start / age_end of the data.
    # Element-wise max/min replaces the row-wise apply; unlike apply(axis=1)
    # it also works when ``joined`` has no rows.
    joined['age_group_start_adj'] = np.maximum(
        joined['age_group_years_start'], joined['age_start'])
    # Subtract 1 from age_group_years_end to reflect our use of demographer
    # notation (age 4 represents 4 years, 0 days to 4 years, 364.99.. days)
    joined['age_group_end_adj'] = np.minimum(
        joined['age_group_years_end'] - 1, joined['age_end'])

    # Population scaled by the fraction of the age group inside the range.
    # Again, the +1 reflects demographer notation.
    joined['pop_adj'] = (
        joined['population'] *
        (joined['age_group_end_adj'] + 1 - joined['age_group_start_adj']) /
        (joined['age_group_years_end'] - joined['age_group_years_start']))
    joined['hiv_prev_count'] = joined['hiv_prev'] * joined['pop_adj']

    # Group by identifiers and sum hiv_prev_count (num.) and pop_adj (denom.)
    # NOTE(review): cases_proportional is summed across every overlapping
    # age-group row of a group as well -- confirm that is intended.
    group_identifiers = ['group_id']
    to_group = joined[
        group_identifiers +
        ['hiv_prev_count', 'pop_adj', 'cases_proportional']].copy()
    summed = to_group.groupby(by=group_identifiers).sum().reset_index(
        drop=False)

    # Divide combined numerator by combined denominator for total prevalence
    summed['hiv_prev_weighted_avg'] = (summed['hiv_prev_count'] /
                                       summed['pop_adj'])
    summed = summed.drop(labels=['hiv_prev_count', 'pop_adj'], axis=1)

    # Adjustment = HIV prevalence in this population
    #   * proportion of 0mm in study
    #   * proportion of HIV patients who return 0mm results when they
    #     actually have latent TB
    summed['hiv_adjustment'] = (summed['cases_proportional'] *
                                summed['hiv_prev_weighted_avg'] * hiv_rr)
    summed = summed[['group_id', 'hiv_adjustment']].copy()

    def _as_int(value):
        return int(float(value))

    summed['group_id'] = summed['group_id'].apply(_as_int)
    # ``assign`` yields a new frame, so the caller's frame is not modified.
    results = unadjusted_df.assign(
        group_id=unadjusted_df['group_id'].apply(_as_int))

    # Merge onto the results df, using group_id as the unique identifier
    adjusted_df = pd.merge(left=results, right=summed,
                           on=['group_id'], how='left')
    # Groups without a computed adjustment are left unchanged
    adjusted_df['hiv_adjustment'] = adjusted_df['hiv_adjustment'].fillna(0)

    # Add the adjustment to the mean, lower, and upper
    for column in ['mean', 'lower', 'upper']:
        adjusted_df[column] = adjusted_df[column] + adjusted_df['hiv_adjustment']

    # Drop the adjustment column and return
    return adjusted_df.drop(labels=['hiv_adjustment'], axis=1)
def get_dismod_model(self):
    """Store the output of ``get_model_results(QUERY)`` on ``self.dismod_model``."""
    fetched = get_model_results(QUERY)
    self.dismod_model = fetched
def get_dismod_model(self):
    """Fetch 'epi' results for ``self.dismod_model_num`` into ``self.dismod_model``.

    The query is limited to the locations present in ``self.df`` and to
    sex ids 1 and 2; ``year_id`` and ``age_group_id`` are passed as -1.
    """
    locations = self.df.location_id.unique().tolist()
    query_args = dict(
        model_version_id=self.dismod_model_num,
        location_id=locations,
        year_id=-1,
        sex_id=[1, 2],
        age_group_id=-1,
    )
    self.dismod_model = get_model_results('epi', **query_args)
def _load_epi_measure(label, filepath, gbd_id, measure_id, location_id,
                      year_ids, age_ids, sex_ids):
    """Read one measure from an Excel file if given, else query best results."""
    if filepath:
        print("Using file for {}".format(label))
        return pd.read_excel(filepath)
    print("querying get_model_results for {}...".format(label))
    return get_model_results('epi',
                             gbd_id=gbd_id,
                             measure_id=measure_id,
                             location_id=location_id,
                             year_id=year_ids,
                             age_group_id=age_ids,
                             sex_id=sex_ids,
                             status='best',
                             gbd_round_id=4)


def _prefix_estimate_columns(df, prefix,
                             columns=('mean', 'lower', 'upper',
                                      'standard_error')):
    """Rename estimate columns to ``<prefix>_mean/_lower/_upper/_se``."""
    suffixes = {'mean': 'mean', 'lower': 'lower', 'upper': 'upper',
                'standard_error': 'se'}
    return df.rename(columns={
        column: '{}_{}'.format(prefix, suffixes[column])
        for column in columns
    })


def combined_get_model_results(gbd_id=None, location_id='all',
                               prev_filepath=None, inc_filepath=None,
                               model_version_id=263738):
    """Assemble prevalence- and incidence-based input frames.

    Prevalence (measure 5) and incidence (measure 6) are read from the given
    Excel files when a path is supplied, otherwise queried with
    ``get_model_results``. Custom CSMR is read from csv, all-cause mortality
    comes from ``get_envelope`` and predicted EMR from ``get_emr_pred``.

    Args:
        gbd_id: id passed to ``get_model_results``.
        location_id: locations to query (default 'all'). It is now forwarded
            to ``get_model_results`` and ``get_envelope``; previously it was
            accepted but 'all' was always used.
        prev_filepath: optional Excel file used instead of querying
            prevalence.
        inc_filepath: optional Excel file used instead of querying incidence.
        model_version_id: model version passed to ``get_emr_pred``.

    Returns:
        Tuple ``(merge_prev, merge_inc)``: prevalence left-merged with CSMR,
        and incidence left-merged with CSMR, ACMR and predicted EMR plus
        constant remission columns.
    """
    # Age group ids 2 through 20, plus 30, 31, 32 and 235.
    age_ids = list(range(2, 21)) + [30, 31, 32, 235]
    year_ids = [1990, 1995, 2000, 2005, 2010, 2017]
    sex_ids = [1, 2]
    demographic_keys = ['age_group_id', 'sex_id', 'year_id', 'location_id']

    # Get incidence and prevalence data; the standard error is taken from
    # the source's ``standard_error`` column.
    prev = _load_epi_measure('prev', prev_filepath, gbd_id, 5, location_id,
                             year_ids, age_ids, sex_ids)
    inc = _load_epi_measure('inc', inc_filepath, gbd_id, 6, location_id,
                            year_ids, age_ids, sex_ids)
    prev = _prefix_estimate_columns(prev, 'prev')
    inc = _prefix_estimate_columns(inc, 'inc')

    # Load custom (HIV-neg + HIV-pos) csmr
    print("loading custom csmr data...")
    csmr = _prefix_estimate_columns(pd.read_csv("FILEPATH"), 'csmr')
    csmr = csmr[['age_group_id', 'location_id', 'year_id', 'sex_id',
                 'csmr_mean', 'csmr_se', 'csmr_lower', 'csmr_upper']].copy()

    # Get acmr data
    # NOTE(review): the envelope uses gbd_round_id=5 while the epi queries
    # use gbd_round_id=4 -- confirm the mix of rounds is intended.
    print("querying get_envelope for acmr...")
    acmr = get_envelope(age_group_id=age_ids,
                        location_id=location_id,
                        year_id=year_ids,
                        sex_id=sex_ids,
                        gbd_round_id=5,
                        with_shock=1,
                        with_hiv=1,
                        rates=1)
    # The envelope SE is derived from the width of the uncertainty interval.
    acmr['acmr_se'] = (acmr["upper"] - acmr["lower"]) / (2 * 1.96)
    acmr = _prefix_estimate_columns(acmr, 'acmr',
                                    columns=('mean', 'lower', 'upper'))

    # Get emr-predicted data
    emrpred = get_emr_pred(model_version_id)

    # Incidence frame: incidence + csmr + acmr + predicted emr
    merge_inc = pd.merge(left=inc, right=csmr,
                         on=demographic_keys, how='left')
    merge_inc = pd.merge(left=merge_inc, right=acmr,
                         on=demographic_keys, how='left')
    # Predicted EMR is joined without location; an overlapping location_id
    # column then gets an ``_x`` suffix, which is renamed back below.
    merge_inc = pd.merge(left=merge_inc, right=emrpred,
                         on=['age_group_id', 'sex_id', 'year_id'],
                         how='left')
    # Remission should equal 2, with upper and lower bounds 1.8-2.2,
    # i.e. se = (2.2 - 1.8) / (2 * 1.96).
    merge_inc['rem_mean'] = 2
    merge_inc['rem_se'] = .1020408
    merge_inc = merge_inc.rename(columns={'location_id_x': 'location_id'})

    # Prevalence frame: prevalence + csmr
    merge_prev = pd.merge(left=prev, right=csmr,
                          on=demographic_keys, how='left')

    return (merge_prev, merge_inc)