def _load_data_frame(self):
    """Pull population for this object's demographics.

    Which location argument is sent to ``get_population`` depends on the
    attributes that are set: ``location_set_id`` takes precedence, then
    ``location_set_version_id``. When neither is set the query is made
    without a location set and ``decomp_step`` is passed instead.

    Returns:
        The frame returned by ``get_population`` with any ``run_id`` column
        removed and ``population`` renamed to ``pop_scaled``.
    """
    # Arguments common to all three query variants.
    query_args = dict(
        age_group_id=self.age_group_id,
        location_id=self.location_id,
        year_id=self.year_id,
        sex_id=self.sex_id,
        gbd_round_id=self.gbd_round_id,
    )
    if self.location_set_id is not None:
        query_args['location_set_id'] = self.location_set_id
    elif self.location_set_version_id is not None:
        query_args['location_set_version_id'] = self.location_set_version_id
    else:
        # Only this fallback variant passes a decomp step.
        query_args['decomp_step'] = self.decomp_step

    df_out = get_population(**query_args)

    if 'run_id' in df_out.columns.tolist():
        del df_out['run_id']
    df_out.rename(columns={'population': 'pop_scaled'}, inplace=True)
    return df_out
def get_sample_size(df, fix_group237=False):
    """
    Attach population to hospital data to serve as its sample size.

    Meant for sources that should have fully covered populations, so the
    sample size is simply the population. If `df` has no age_group_id
    column, one is merged on from the hospital age-group map using
    age_start/age_end.

    Parameters:
        df: Pandas DataFrame
            The data to add sample size to. Population is added to every
            row.
        fix_group237: bool
            When True, a second population pull is relabelled as
            age_group_id 237, its population summed over the remaining
            columns, and the result appended to the main population before
            merging.

    Returns:
        `df` with the population columns merged on.
    """
    if 'age_group_id' not in df.columns:
        # map age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()
        n_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == n_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), ("age_group_id is missing "
                                                 "for some rows")

    pop = get_population(QUERY)

    if fix_group237:
        fix_pop = get_population(QUERY)
        n_fix_rows = fix_pop.shape[0]
        fix_pop['age_group_id'] = 237
        group_cols = fix_pop.columns.drop('population').tolist()
        fix_pop = fix_pop.groupby(group_cols).agg({'population': 'sum'})
        fix_pop = fix_pop.reset_index()
        # collapsing onto one age group is expected to halve the row count
        assert n_fix_rows/2 == fix_pop.shape[0]
        pop = pd.concat([pop, fix_pop], ignore_index=True)

    # make the population columns line up with the hospital data columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("run_id", axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end',
                  'age_group_id', 'sex_id']

    # a left merge must neither add nor drop hospital rows
    n_rows = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert n_rows == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(), \
        "population is missing for some rows. look at this df! \n {}".format(
            df.loc[df.population.isnull(), demography].drop_duplicates())

    return df
def transform_to_rate_space(summary_df):
    '''
    Convert the summary values to rates to allow for easier validation.

    The 'val', 'lower' and 'upper' columns are divided by the matching
    population and then limited to the range [0, 1]. Rows without a matching
    population are dropped by the inner merge. Population is pulled for GBD
    round 5, run_id 104, years 1950-2017.

    Returns the merged summary frame without the population column.
    '''
    print("**TRANSFORMING TO RATE SPACE**")
    merge_keys = ['age_group_id', 'sex_id', 'year_id', 'location_id']

    demos = get_demographics(gbd_round_id=5, gbd_team='cod')
    year_ids = list(range(1950, 2018))
    full_pops = get_population(gbd_round_id=5,
                               age_group_id=demos['age_group_id'],
                               sex_id=demos['sex_id'],
                               year_id=year_ids,
                               run_id=104,
                               location_id=demos['location_id'])
    full_pops = full_pops.loc[:, merge_keys + ['population']]

    summary_df = pd.merge(left=summary_df, right=full_pops,
                          on=merge_keys, how='inner')
    for col in ['val', 'lower', 'upper']:
        rate = summary_df[col] / summary_df['population']
        # values above 1 become 1, values below 0 become 0
        summary_df[col] = rate.clip(lower=0, upper=1)

    summary_df = summary_df.drop(labels=['population'], axis=1)
    print(" ...Successfully transformed summary values into rate space.\n")
    return summary_df
def prep_norway_pop_weights(code_dir):
    """Compute each Norway location's share of the national population.

    Writes the location metadata (location set 35, GBD round 5) to
    ``location_metadata.csv`` inside `code_dir`, then pulls age_group_id 164
    population for location 90 and for the locations whose parent is 90
    (plus 90 itself), and divides the two.

    NOTE(review): the visible code ends after computing ``pop_weight``;
    nothing is returned or saved, so the function returns None as written.
    """
    loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
    metadata_path = os.path.join(code_dir, 'location_metadata.csv')
    loc_meta.to_csv(metadata_path, index=False, encoding='utf8')

    norway_id = 90
    is_child = loc_meta.parent_id == norway_id
    norway_subs = loc_meta.loc[is_child, 'location_id'].tolist() + [norway_id]

    # Both pulls differ only in the locations requested.
    shared_args = dict(year_id='all', sex_id='all', age_group_id=164,
                       gbd_round_id=5, status='best')
    country_pop = get_population(location_id=norway_id, **shared_args)
    country_pop.drop('location_id', axis=1, inplace=True)
    subs_pop = get_population(location_id=norway_subs, **shared_args)

    # Join on every column except the location and the value itself.
    join_cols = [c for c in subs_pop.columns
                 if c not in ['location_id', 'population']]
    population = subs_pop.merge(country_pop, on=join_cols,
                                suffixes=('_subs', '_national'))
    population.loc[:, 'pop_weight'] = (
        population['population_subs'] / population['population_national'])
def get_hosp_pops(df):
    """
    Write one population csv per hospital source.

    The Shiny/Rmarkdown tool which this data feeds into was running
    get_population() live but it was taking too long, so this function pulls
    the population once for all sources and writes each source's subset to a
    csv.

    Parameters:
        df: Pandas DataFrame
            Must have source, location_id, year_start and age_group_id
            columns.

    Returns:
        None
    """
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    # The first ten characters of the result are the date part.
    today = re.sub(r"\W", "_", str(datetime.datetime.now()))[0:10]

    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         year_id=list(df.year_start.unique()),
                         sex_id=[1, 2])

    for source in df.source.unique():
        dat = df[df.source == source]
        locs = dat.location_id.unique()
        # Every row of `dat` already belongs to one of `locs`, so no extra
        # location filter is needed before collecting the years.
        years = dat.year_start.unique()
        ages = dat.age_group_id.unique()

        src_pop = pop[pop.location_id.isin(locs) &
                      pop.year_id.isin(years) &
                      pop.age_group_id.isin(ages)].copy()
        src_pop['source'] = source
        # NOTE(review): the path literal looks like a redacted placeholder;
        # as written it has no {} fields, so `source` and `today` are not
        # substituted into it.
        src_pop.to_csv("FILENAME"
                       r"FILEPATH".format(source, today), index=False)
    return
def get_sy_populations(dems):
    """Pull single-year-age populations for the given demographics.

    Requests age_group_ids 49 through 142 with ``single_year_age=True`` for
    the years, sexes and locations in `dems`. Per the original author's
    note, the under-1 age groups and 95+ are left out here because they come
    from the regular population call and are appended in scripts if
    necessary.
    """
    print("Getting single-year populations")
    single_year_ages = list(range(49, 143))
    sy_pop = db.get_population(year_id=dems['year_id'],
                               sex_id=dems['sex_id'],
                               location_id=dems["location_id"],
                               age_group_id=single_year_ages,
                               single_year_age=True,
                               gbd_round_id=help.GBD_ROUND)
    return sy_pop
def get_data(self, id_template_df):
    """Build mortality rates keyed by ``self._data_key``.

    Pulls the envelope (with_hiv=1, with_shock=0) and population for the
    demographics in `id_template_df`, converts mean/lower/upper to rates,
    drops zero/null rows, merges the template back on and aggregates the
    standard error.

    Raises:
        NoNonZeroValues: if nothing is left after merging with the template.
    """
    demo_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    value_cols = ["mean", "lower", "upper"]

    demo_args = {
        "location_id": id_template_df.location_id.tolist(),
        "age_group_id": id_template_df.age_group_id.tolist(),
        "sex_id": id_template_df.sex_id.tolist(),
        "year_id": id_template_df.year_id.tolist(),
    }

    # deaths and population for the same demographics
    env_df = get_envelope(with_hiv=1, with_shock=0, **demo_args)
    pop_df = get_population(**demo_args)
    df = env_df.merge(pop_df, on=demo_cols)

    # deaths -> rates
    for col in value_cols:
        df[col] = df[col] / df["population"]
    df = self.drop_zeros_nulls(df, *value_cols)

    # bring in the template columns (input data key)
    df = df.merge(id_template_df, on=demo_cols)
    if df.empty:
        raise NoNonZeroValues

    df = self.calc_se_from_ui(df, *value_cols)
    df = self.calc_aggregate_se(df, self._data_key, "mean", "se")
    return df.set_index(self._data_key)
def both_sex_model_results_figures(by_sex_df):
    """Collapse sex-specific rates into both-sex counts.

    Population for sexes 1 and 2 is merged onto `by_sex_df`, the 'mean',
    'lower' and 'upper' rates are multiplied by population to get counts,
    and counts and population are summed over sex. The result carries
    sex_id 3 and the columns mean_count, lower_count, upper_count and
    population alongside the remaining identifier columns.
    """
    merge_on_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    rate_cols = ['mean', 'lower', 'upper']

    # Population for every age group, year and location present in the input
    pops = get_population(
        age_group_id=by_sex_df.age_group_id.unique().tolist(),
        location_id=by_sex_df.location_id.unique().tolist(),
        year_id=by_sex_df.year_id.unique().tolist(),
        sex_id=[1, 2])
    pops = pops.drop(labels=['process_version_map_id'], axis=1)

    merged = by_sex_df.merge(pops, on=merge_on_cols, how='inner')

    # rate * population = count for each location-year-age-sex row
    for rate_col in rate_cols:
        merged[rate_col + '_count'] = merged[rate_col] * merged['population']
    merged = merged.drop(labels=rate_cols + ['sex_id'], axis=1)

    # Everything that is not a summed value is a grouping key
    summed_cols = ['mean_count', 'lower_count', 'upper_count', 'population']
    key_cols = [c for c in merged.columns.tolist() if c not in summed_cols]

    both_sex = merged.groupby(key_cols).sum().reset_index()
    both_sex['sex_id'] = 3
    return both_sex
def pull_pop(self):
    """Return population for location set 35, limited to index columns.

    Queries every location in the set (location_id=-1) for this object's
    age groups, years and sexes, and keeps ``self.index_cols`` plus
    'population'.
    """
    query_args = dict(age_group_id=self.age_group_ids,
                      location_id=-1,
                      location_set_id=35,
                      year_id=self.year_id,
                      sex_id=self.sex_id)
    pop_df = get_population(**query_args)
    keep_cols = self.index_cols + ['population']
    return pop_df[keep_cols]
def mortQuery(sex, start_year, start_age, end_age, location_set_version_id,
              gbd_round, db_connection):
    '''
    Model parameters -> Pandas Data Frame

    Pulls the mortality envelope and population for the requested sex, the
    years start_year..gbd_round, the ages between start_age and end_age and
    the locations of the given location set version, and merges them. The
    result holds the base variables used in the CODEm process, with columns
    renamed to age, year, sex, envelope and pop.
    '''
    loc_list = locQuery(location_set_version_id,
                        db_connection).location_id.values.tolist()

    age_df = createAgeDF(db_connection)
    age_restrict = "all_ages >= {0} & all_ages <= {1}".format(start_age,
                                                              end_age)
    age_list = age_df.query(age_restrict).all_ages.values.tolist()

    # The envelope and population pulls take identical arguments.
    shared_args = dict(age_group_id=age_list,
                       sex_id=sex,
                       year_id=range(start_year, gbd_round+1),
                       location_set=35,
                       location_id=loc_list)
    env = get_envelope(**shared_args)
    pop = get_population(**shared_args)

    df = pd.merge(env, pop,
                  on=['age_group_id', 'location_id', 'year_id', 'sex_id'])
    df.drop(['upper', 'lower', 'run_id_x', 'run_id_y'], axis=1, inplace=True)
    new_names = {'age_group_id': 'age', 'year_id': 'year', 'sex_id': 'sex',
                 'mean': 'envelope', 'population': 'pop'}
    df.rename(columns=new_names, inplace=True)
    return df
def rate_count_switcher(df, gbd_round_id, decomp_step,
                        rate_cols=['mean', 'lower', 'upper']):
    """Add count columns by multiplying rate columns by population.

    Missing identifier columns are filled in first: age_group_id is merged
    on from the hospital age groups, year_start/year_end are copied from
    year_id, and sex_id is derived from a 'sex' column holding 'Male' and
    'Female'. Population is pulled for 1990-2017. If every row satisfies
    year_start + 4 == year_end, the year-binned mean population is merged
    on; otherwise the single-year population is used.

    Parameters:
        df: Pandas DataFrame of rates
        gbd_round_id, decomp_step: passed through to get_population
        rate_cols: columns to convert; each gets a "count_<col>" partner

    Returns:
        `df` with population and the count columns added.
    """
    merge_keys = ['age_group_id', 'sex_id', 'location_id',
                  'year_start', 'year_end']

    # Columns present on entry decide which identifiers get filled in.
    pre_cols = df.columns
    if 'age_group_id' not in pre_cols:
        good_ages = hosp_prep.get_hospital_age_groups()
        df = df.merge(good_ages[['age_start', 'age_group_id']],
                      how='left', on='age_start', validate='m:1')
    if 'year_id' in pre_cols:
        df['year_start'] = df['year_id']
        df['year_end'] = df['year_id']
    if 'sex_id' not in pre_cols:
        assert set(df['sex'].unique()) == set(['Male', 'Female'])
        df['sex_id'] = 2
        df.loc[df['sex'] == 'Male', 'sex_id'] = 1

    years = list(np.arange(1990, 2018, 1))
    pop = db_queries.get_population(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        year_id=years,
        age_group_id=df.age_group_id.unique().tolist(),
        sex_id=[1, 2],
        location_id=df.location_id.unique().tolist())
    pop['year_start'] = pop['year_id']
    pop['year_end'] = pop['year_id']
    pop.drop(['year_id', 'run_id'], axis=1, inplace=True)

    # Mean population within each year bin, for binned data.
    agg_pop = hosp_prep.year_binner(pop.copy())
    agg_pop = agg_pop.groupby(
        ['age_group_id', 'year_start', 'year_end', 'location_id', 'sex_id']
    ).agg({'population': 'mean'}).reset_index()

    is_binned = (df['year_start'] + 4 == df['year_end']).all()
    pop_to_merge = agg_pop if is_binned else pop
    df = df.merge(pop_to_merge, how='left', on=merge_keys, validate='m:1')
    assert df['population'].isnull().sum() == 0

    for col in rate_cols:
        df["count_{}".format(col)] = df[col] * df['population']
    return df
def pull_pop(self):
    """Log and return population for location set 35.

    Queries every location in the set (location_id=-1) for this object's
    age groups, years and sexes, and keeps ``self.index_cols`` plus
    'population'.
    """
    logger.info("Pulling populations...")
    query_args = dict(age_group_id=self.age_group_ids,
                      location_id=-1,
                      location_set_id=35,
                      year_id=self.year_id,
                      sex_id=self.sex_id)
    pop_df = get_population(**query_args)
    keep_cols = self.index_cols + ['population']
    return pop_df[keep_cols]
def __init__(self,
             gbd_round_id: int = gbd.GBD_ROUND_ID,
             decomp_step: str = gbd.decomp_step.ONE,
             location_set_id: int = LocationSetId.OUTPUTS):
    """Store the query settings and look up the population run id.

    ``get_population`` is called once with the stored round, decomp step
    and location set. ``.item()`` is applied to its ``run_id`` column, so
    that column must hold exactly one value.
    """
    self.gbd_round_id: int = gbd_round_id
    self.decomp_step: str = decomp_step
    self.location_set_id: int = location_set_id

    population_df = get_population(
        gbd_round_id=self.gbd_round_id,
        decomp_step=self.decomp_step,
        location_set_id=self.location_set_id)
    self._run_id: int = population_df.run_id.item()
def get_population(
        gbd_round_id: int,
        decomp_step: str,
        location_set_id: int,
        year_ids: List[int],
        year_end: int,
        age_group_ids: List[int],
        sex_ids: List[int],
        square_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Pulls population estimates for given demographics.

    Forecasted population is requested when the decomp step is iterative
    and year_end lies beyond demographics.FORECASTING_YEAR. A null decomp
    step is passed for GBD rounds below 6.

    Args:
        gbd_round_id: the GBD round for which to pull population
        decomp_step: the decomp step for which to pull population
        location_set_id: the location set for which to pull population
        year_ids: year IDs for which to pull population
        year_end: the last year for which to pull population
        age_group_ids: age group IDs for which to pull population
        sex_ids: sex IDs for which to pull population
        square_df: square dataframe to use to validate that population
            contains required demographics

    Returns:
        DataFrame of population estimates. Has demographics columns and
        population column
    """
    logging.info('Pulling population')

    is_forecasting_model = (
        decomp_step == gbd.constants.decomp_step.ITERATIVE
        and year_end > demographics.FORECASTING_YEAR
    )
    if gbd_round_id < 6:
        query_decomp_step = None
    else:
        query_decomp_step = decomp_step

    raw_population = db_queries.get_population(
        location_set_id=location_set_id,
        location_id='all',
        year_id=year_ids,
        age_group_id=age_group_ids,
        sex_id=sex_ids,
        gbd_round_id=gbd_round_id,
        decomp_step=query_decomp_step,
        forecasted_pop=is_forecasting_model
    )
    population_df = raw_population[
        columns.DEMOGRAPHICS + [columns.POPULATION]]

    data_validation.validate_population_matches_data(
        population_df, square_df
    )
    return population_df
def get_pop(locset_id=8):
    ''' Returns population estimates for years 1980-2029.

    All ages, sexes and locations (-1) of location set `locset_id` are
    requested, using the current decomp step and GBD round read from
    ``utils.get_gbd_parameter``.
    '''
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    yr_list = list(range(1980, 2030))
    return get_population(age_group_id=-1,
                          location_id=-1,
                          location_set_id=locset_id,
                          year_id=yr_list,
                          sex_id=-1,
                          decomp_step=d_step,
                          gbd_round_id=gbd_id)
def merge_population(df, gbd_round_id, decomp_step, use_draws):
    """
    Attach population to the DataFrame and report any rows left without it.

    This has to be run after the data has been made square; a warning is
    issued if the checked column ('draw_3' when `use_draws`, else 'mean')
    has no zeros at all.

    Parameters:
        df: Pandas DataFrame
        gbd_round_id, decomp_step: passed through to get_population
        use_draws: bool, selects which column is checked for zeros

    Returns:
        `df` with a population column merged on.
    """
    # draw_3 rather than draw_0 to make sure zeros are getting propagated
    chkcol = 'draw_3' if use_draws else 'mean'
    zero_msg = ("There are no rows with zeros, implying that the data has "
                "not been made square. This function should be ran after "
                "the data is square")
    has_zero_rows = (df[chkcol] == 0).any()
    if not has_zero_rows:
        warnings.warn(zero_msg)

    # population for exactly the ages/locations/years present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_id.unique()),
                         gbd_round_id=gbd_round_id,
                         decomp_step=decomp_step)

    demography = ['location_id', 'year_id', 'sex_id', 'age_group_id']
    pop = pop[demography + ['population']]

    # a left merge must keep the row count unchanged
    n_rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert n_rows_before == df.shape[0], \
        "number of rows don't match after merge"

    # report rows where population came back null
    hosp_prep.report_if_merge_fail(df, check_col="population",
                                   id_cols=demography, store=True,
                                   filename="population_merge_failure")
    return df
def get_data_frame(self, desired_index):
    """Pull population and normalise it for the requested index.

    ``location_set_id`` is passed to ``get_population`` when set; otherwise
    ``location_set_version_id`` when set; otherwise no location-set
    argument is sent. The result goes through
    ``DataSource.normalize_columns`` and its ``population`` column is
    renamed to ``pop_scaled``.
    """
    # Arguments common to all three query variants.
    query_args = dict(
        age_group_id=self.age_group_id,
        location_id=self.location_id,
        year_id=self.year_id,
        sex_id=self.sex_id,
        gbd_round_id=self.gbd_round_id,
    )
    if self.location_set_id is not None:
        query_args['location_set_id'] = self.location_set_id
    elif self.location_set_version_id is not None:
        query_args['location_set_version_id'] = self.location_set_version_id

    df_out = get_population(**query_args)
    df_out = DataSource.normalize_columns(df_out, self.name, desired_index)
    df_out.rename(columns={'population': 'pop_scaled'}, inplace=True)
    return df_out
def prep_dismod_results(self):
    """Add all-sex and combined-age rows to ``self.dismod_model``.

    Steps, all population-weighted on 'mean' and 'upper':
      1. Drop age groups 1, 33, 27 and 164 and merge population on.
      2. Sum over sex within location/year/age and append the result.
      3. Replace age groups 2, 3 and 4 by one combined row with
         age_group_id 28.
      4. Add 'se_age' = (upper - mean) / 1.96.

    Sets ``self.population`` and replaces ``self.dismod_model``; returns
    None.
    """
    keep_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id',
                 'mean', 'population', 'upper']

    self.dismod_model = self.dismod_model.loc[
        ~self.dismod_model.age_group_id.isin([1, 33, 27, 164])]
    print(self.dismod_model)
    print('Getting population')
    locs = self.dismod_model.location_id.unique()
    years = self.dismod_model.year_id.unique().tolist()
    ages = self.dismod_model.age_group_id.unique().tolist()
    print(locs)
    print(years)
    print(ages)
    self.population = get_population(location_id=locs.tolist(),
                                     year_id=years,
                                     age_group_id=ages,
                                     sex_id=[1, 2])
    self.dismod_model = self.dismod_model.merge(
        self.population,
        on=['location_id', 'age_group_id', 'sex_id', 'year_id'],
        how='left')

    ## aggregate to all sex
    # rate -> count, sum over sex, count -> rate
    self.all_sex = self.dismod_model.copy()
    self.all_sex['mean'] *= self.all_sex['population']
    self.all_sex['upper'] *= self.all_sex['population']
    # sex_id is not a grouping key, so it is summed with everything else;
    # a group holding one sex_id 1 row and one sex_id 2 row ends up with
    # sex_id 3 (presumably the both-sex id -- confirm).
    self.all_sex = self.all_sex.groupby(
        ['location_id', 'year_id', 'age_group_id']).sum().reset_index()
    self.all_sex['mean'] /= self.all_sex.population
    self.all_sex.upper /= self.all_sex.population

    ## add sex
    self.dismod_model = pd.concat((self.dismod_model[keep_cols],
                                   self.all_sex[keep_cols]))
    del self.all_sex

    ## combine age groups 2, 3 and 4
    is_young = self.dismod_model.age_group_id.isin([2, 3, 4])
    # copy so the in-place arithmetic below does not act on a view
    under_5 = self.dismod_model.loc[is_young].copy()
    self.dismod_model = self.dismod_model.loc[~is_young]
    under_5['mean'] *= under_5['population']
    under_5['upper'] *= under_5['population']
    under_5 = under_5.groupby(
        ['location_id', 'year_id', 'sex_id']).sum().reset_index()
    under_5['mean'] /= under_5.population
    under_5.upper /= under_5.population
    # the summed age_group_id is meaningless; overwrite it
    under_5['age_group_id'] = 28

    ## add age
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    self.dismod_model = pd.concat([self.dismod_model, under_5])
    self.dismod_model['se_age'] = (
        self.dismod_model.upper - self.dismod_model['mean']) / 1.96
def prep_pop(years, upload_dir, vers):
    """Write population for the given years to ``population.csv``.

    Population is pulled for all locations, ages and sexes (-1), reduced to
    the index columns plus 'population', passed through ``set_sort_index``
    and written under ``<upload_dir>/v<vers>/temps``, which is created if it
    does not exist.
    """
    index_cols = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    population = get_population(year_id=years, location_id=-1,
                                age_group_id=-1, sex_id=-1)
    population = set_sort_index(population[index_cols + ['population']],
                                index_cols)

    output_dir = '%s/v%s/temps' % (upload_dir, vers)
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)
        # NOTE(review): the pause is assumed to follow directory creation
        # only; the original nesting is not certain -- confirm.
        time.sleep(1)
    population.to_csv('%s/population.csv' % output_dir)
def prep_dismod_results(self):
    """Add all-sex and combined-age rows to ``self.dismod_model``.

    Steps, all population-weighted on 'mean' and 'upper':
      1. Drop age groups 1, 33, 27 and 164 and merge population on.
      2. Sum over sex within location/year/age and append the result.
      3. Replace age groups 2, 3 and 4 by one combined row with
         age_group_id 28.
      4. Add 'se_age' = (upper - mean) / 1.96.

    Sets ``self.population`` and replaces ``self.dismod_model``; returns
    None.
    """
    keep_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id',
                 'mean', 'population', 'upper']

    self.dismod_model = self.dismod_model.loc[
        ~self.dismod_model.age_group_id.isin([1, 33, 27, 164])]
    self.population = get_population(QUERY)
    self.dismod_model = self.dismod_model.merge(self.population,
                                                on=[cols],
                                                how='left')

    ## aggregate to all sex
    # rate -> count, sum over sex, count -> rate
    self.all_sex = self.dismod_model.copy()
    self.all_sex['mean'] *= self.all_sex['population']
    self.all_sex['upper'] *= self.all_sex['population']
    # sex_id is not a grouping key, so it is summed with everything else
    # and is still available for the column selection below.
    self.all_sex = self.all_sex.groupby(
        ['location_id', 'year_id', 'age_group_id']).sum().reset_index()
    self.all_sex['mean'] /= self.all_sex.population
    self.all_sex.upper /= self.all_sex.population

    ## add sex
    self.dismod_model = pd.concat((self.dismod_model[keep_cols],
                                   self.all_sex[keep_cols]))
    del self.all_sex

    ## combine age groups 2, 3 and 4
    is_young = self.dismod_model.age_group_id.isin([2, 3, 4])
    # copy so the in-place arithmetic below does not act on a view
    under_5 = self.dismod_model.loc[is_young].copy()
    self.dismod_model = self.dismod_model.loc[~is_young]
    under_5['mean'] *= under_5['population']
    under_5['upper'] *= under_5['population']
    under_5 = under_5.groupby(
        ['location_id', 'year_id', 'sex_id']).sum().reset_index()
    under_5['mean'] /= under_5.population
    under_5.upper /= under_5.population
    # the summed age_group_id is meaningless; overwrite it
    under_5['age_group_id'] = 28

    ## add age
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    self.dismod_model = pd.concat([self.dismod_model, under_5])
    self.dismod_model['se_age'] = (
        self.dismod_model.upper - self.dismod_model['mean']) / 1.96
def merge_population(df):
    """
    Attach population to the DataFrame and report any rows left without it.

    This has to be run after the data has been made square; the function
    asserts that `mean_raw` contains at least one zero.

    Parameters:
        df: Pandas DataFrame

    Returns:
        `df` with the population columns merged on.
    """
    has_zero_rows = (df.mean_raw == 0).any()
    assert has_zero_rows, (
        "There are no rows with zeros, implying that the data has not "
        "been made square. This function should be run after the data is "
        "square")

    # population for exactly the ages/locations/years present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()))

    # make the population columns line up with the hospital data columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    demography = ['location_id', 'year_start', 'year_end',
                  'sex_id', 'age_group_id']

    # a left merge must keep the row count unchanged
    n_rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert n_rows_before == df.shape[0], \
        "number of rows don't match after merge"

    # report rows where population came back null
    hosp_prep.report_if_merge_fail(df, check_col="population",
                                   id_cols=demography, store=True,
                                   filename="population_merge_failure")
    return df
def run_cod_age_sex_splitting(db):
    """Age/sex split the 'best' estimates and rescale low/high to match.

    Rows are limited to locations in location set 21 with best > 0. The
    high/best and low/best ratios are stored per row, 'best' is split with
    ``AgeSexSplitter``, and low/high are rebuilt from the split 'best' using
    the stored ratios.

    Raises:
        AssertionError: if a cause_id in `db` is not in cause set version
            269.
    """
    # CHECK COMPLETENESS
    cause_set_version = 269
    cm = get_cause_metadata(cause_set_version_id=cause_set_version)
    possible_causes = cm['cause_id'].unique().tolist()
    for cause_id in db['cause_id'].unique().tolist():
        assert cause_id in possible_causes, \
            "Cause ID {} not in hierarchy".format(cause_id)

    loc_meta = get_location_metadata(gbd_round_id=5, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    db = db.loc[db['location_id'].isin(possible_locs), :]
    db = db.loc[db['best'] > 0, :]

    # Remember each row's bounds relative to 'best', keyed by a row id.
    ratio_cols = ['hi_best_ratio', 'lo_best_ratio']
    db['hi_best_ratio'] = db['high'] / db['best']
    db['lo_best_ratio'] = db['low'] / db['best']
    db = db.reset_index(drop=True)
    db['unique_join'] = db.index
    db_merge_later = db.loc[:, ['unique_join'] + ratio_cols]
    db = db.drop(labels=['high', 'low'] + ratio_cols, axis=1)

    # NOTE(review): id_cols, cause_set_version_id and pop_run_id below are
    # computed but not used in the visible code; the splitter is given the
    # literals 269 (via cause_set_version) and 104 instead.
    id_cols = [
        i for i in db.columns if i not in ['best', 'age_group_id', 'sex_id']
    ]
    cause_set_version_id = query(
        "SELECT cause_set_version_id FROM ADDRESS "
        "WHERE gbd_round_id=5 AND cause_set_id=4;",
        conn_def='epi').iloc[0, 0]
    pop_run_id = get_population(gbd_round_id=5,
                                status="recent")['run_id'].iloc[0]

    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version,
                              pop_run_id=104,
                              distribution_set_version_id=29,
                              id_cols=['unique_join'],
                              value_column='best')
    split_db = splitter.get_computed_dataframe(df=db,
                                               location_meta_df=loc_meta)

    # Rebuild the bounds from the split 'best' values.
    split_db = pd.merge(left=split_db, right=db_merge_later,
                        on=['unique_join'], how='left')
    split_db['low'] = split_db['best'] * split_db['lo_best_ratio']
    split_db['high'] = split_db['best'] * split_db['hi_best_ratio']
    return split_db.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
def split_by_pop(full_df, cause_id):
    """Redistribute one cause's estimates across age/sex by population.

    For rows of `cause_id`, best/high/low are summed within each
    location-year and re-allocated to every row in proportion to that row's
    share of the location-year population (run_id 104). For cause 387, age
    groups 2 and 3 are left untouched. All other causes pass through
    unchanged.

    Returns:
        DataFrame holding the untouched rows followed by the re-allocated
        rows.

    Raises:
        AssertionError: if the rounded totals of best/high/low change.
    """
    total_b = round(full_df['best'].sum())
    total_h = round(full_df['high'].sum())
    total_l = round(full_df['low'].sum())

    final = full_df[full_df['cause_id'] != cause_id]
    df = full_df[full_df['cause_id'] == cause_id]
    if cause_id == 387:
        untouched = df.query("age_group_id == 2 | age_group_id == 3")
        # The original called final.append(...) and discarded the result,
        # which dropped these rows from the output.
        final = pd.concat([final, untouched])
        df = df.query("age_group_id != 2 & age_group_id != 3")

    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         year_id=list(df.year_id.unique()),
                         sex_id=[1, 2],
                         run_id=104)
    df = pd.merge(df, pop, how='left',
                  on=['age_group_id', 'location_id', 'year_id', 'sex_id'])

    # Location-year totals of population and of each estimate.
    loc_year = ['location_id', 'year_id']
    totals = [('tpop', 'population'), ('tbest', 'best'),
              ('thigh', 'high'), ('tlow', 'low')]
    for total_col, value_col in totals:
        df[total_col] = df.groupby(loc_year)[value_col].transform('sum')

    # Each row receives its population share of the location-year total.
    df['rate'] = df['population'] / df['tpop']
    df['best'] = df['rate'] * df['tbest']
    df['high'] = df['rate'] * df['thigh']
    df['low'] = df['rate'] * df['tlow']
    # NOTE(review): 'thigh' and 'tlow' are not dropped here, so they stay in
    # the output; kept as in the original -- confirm whether intended.
    df.drop(['population', 'run_id', 'tpop', 'rate', "tbest"],
            axis=1, inplace=True)

    final = pd.concat([final, df])
    assert round(final['best'].sum()) == total_b
    assert round(final['high'].sum()) == total_h
    assert round(final['low'].sum()) == total_l
    return final
def get_sample_size(df):
    """
    Attach population to hospital data to serve as its sample size.

    Meant for sources that should have fully covered populations, so the
    sample size is simply the population. If `df` has no age_group_id
    column, one is merged on from the hospital age-group map using
    age_start/age_end.

    Parameters:
        df: Pandas DataFrame

    Returns:
        `df` with the population columns merged on.
    """
    if 'age_group_id' not in df.columns:
        # map age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()
        n_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == n_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), ("age_group_id is missing "
                                                 "for some rows")

    # population for exactly the ages/locations/years present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()))

    # make the population columns line up with the hospital data columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("process_version_map_id", axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end',
                  'age_group_id', 'sex_id']

    # a left merge must keep the row count unchanged
    n_rows = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert n_rows == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(), ("population is missing for some"
                                           " rows")
    return df
def create_age_weight(df):
    """Create mean age weights for wide age groups.

    Every row gets ``age_mean`` = midpoint of age_start and age_end. For
    rows spanning more than 10 years, ``age_mean`` is replaced by the
    population-weighted mean age over the row's location, year, age range
    and sex. If that computation raises, a fixed fallback is used (37 for
    location 6, 41 otherwise) and the location id is printed.

    `df` gains ``age_diff`` and ``age_mean`` columns in place.

    Returns:
        Rows with age_diff <= 10 followed by the corrected wide rows. The
        wide rows carry an extra 'index' column from ``reset_index``.
    """
    df['age_diff'] = df['age_end'] - df['age_start']
    df['age_mean'] = (df['age_end'] + df['age_start']) / 2
    df_sub = df.loc[df.age_diff > 10]
    df_final = df.loc[df.age_diff <= 10]

    pops = get_population(QUERY)
    # age_group_id minus 49 is treated as an age comparable to
    # age_start/age_end -- presumably single-year age groups; confirm
    # against QUERY.
    pops.rename(columns={'age_group_id': 'age_start'}, inplace=True)
    pops['age_start'] -= 49

    ## correct these age_groups
    # After reset_index the labels are 0..n-1, so .loc[i] addresses row i.
    df_sub = df_sub.reset_index()
    for i in range(len(df_sub)):
        temp = df_sub.iloc[i].to_dict()
        try:
            pop_sub = pops.loc[(pops.location_id == temp['location_id']) &
                               (pops.year_id == temp['year_start'])]
            pop_sub = pop_sub.loc[
                (pop_sub.age_start >= temp['age_start']) &
                (pop_sub.age_start <= temp['age_end'])]
            ## get sex
            # The original built a list of characters here, so none of the
            # comparisons below could match and no sex filter was applied.
            sex = temp['sex'].lower()
            if sex == 'male':
                pop_sub = pop_sub.loc[pop_sub.sex_id == 1]
            elif sex == 'female':
                pop_sub = pop_sub.loc[pop_sub.sex_id == 2]
            elif sex == 'both':
                pop_sub = pop_sub.loc[pop_sub.sex_id.isin([1, 2])]
            weighted_age = (pop_sub['age_start'] * pop_sub['population']).sum()
            df_sub.loc[i, 'age_mean'] = (
                weighted_age / pop_sub['population'].sum())
        except Exception:
            # Best-effort fallback kept from the original, which used a
            # bare except.
            if temp['location_id'] == 6:
                df_sub.loc[i, 'age_mean'] = 37.  ## china is weird
            else:
                df_sub.loc[i, 'age_mean'] = 41
            # NOTE(review): assumed to print for every fallback row; the
            # original nesting of this print is not certain.
            print(temp['location_id'])
    return pd.concat((df_final, df_sub))
def run_subnational_splitting(df):
    """Split estimates at non-detailed locations down to detailed ones.

    Loops while `df` still contains a location that location set 21 marks
    as not most-detailed, calling ``parallelize`` with
    ``iterate_through_df_and_split_nationals_by_population``. Afterwards the
    total of 'best' must be within 10 of the original, and rows are summed
    over the identifying columns.

    `df` is modified in place before the loop (dtype casts).

    Returns:
        The grouped DataFrame; zeros in 'high'/'low' are turned into NaN.

    Raises:
        AssertionError: if the total of 'best' changed by more than 10.
    """
    original_death_count = df.copy()['best'].sum()
    df['high'] = df['high'].astype("float")
    df['low'] = df['low'].astype("float")
    df['location_id'] = df['location_id'].apply(lambda x: int(x))

    all_locations = get_location_metadata(location_set_id=21)
    not_detailed_locations = all_locations.query("most_detailed == 0")
    not_detailed_locations = set(not_detailed_locations['location_id'])
    df_locations = set(df['location_id'])
    has_nationals = bool(
        len(not_detailed_locations.intersection(df_locations)))

    locs = get_location_metadata(location_set_id=21)
    pop = get_population(location_id=-1, year_id=-1, decomp_step="step1",
                         location_set_id=21)

    count = 0
    while has_nationals:
        count += 1
        print("iteration {}".format(count))
        # NOTE(review): as written, splitting only runs from the 7th
        # iteration on; this guard may originally have had a different
        # body (e.g. an abort) -- confirm against the original file.
        if count >= 7:
            df = parallelize(
                df, locs, pop,
                iterate_through_df_and_split_nationals_by_population)
        df_locations = set(df['location_id'])
        has_nationals = bool(
            len(not_detailed_locations.intersection(df_locations)))

    split_death_count = df['best'].sum()
    if "not_detailed" not in df.columns:
        df['not_detailed'] = 0

    difference = split_death_count - original_death_count
    assert np.isclose(difference, 0, atol=10), (
        "deaths before split does not equal deaths after split: Difference {}".format(difference))

    # Assign back rather than calling fillna(inplace=True) on a column
    # selection, which is not guaranteed to update the parent frame.
    df['high'] = df['high'].fillna(0)
    df['low'] = df['low'].fillna(0)
    df['best'] = df['best'].apply(lambda x: float(x))

    group_cols = ["source_event_id", "location_id", "cause_id", "year_id",
                  "nid", "source_id", "raw_data_id", "year_split_data_id",
                  "split_status", "not_detailed", "event_name", "sex_id",
                  "age_group_id", "notes"]
    # Select the value columns with a list; selecting several GroupBy
    # columns with a bare tuple is rejected by current pandas.
    df = df.groupby(group_cols, as_index=False)[['low', 'best', 'high']].sum()

    # Zeros in the bounds stand for "missing" after the fillna above.
    df['high'] = df['high'].replace(0, float('nan'))
    df['low'] = df['low'].replace(0, float('nan'))
    return df
def load_mortality_envelope(location_id_list, age_group_list, year_list):
    ''' Returns all-cause death rates from the current mortality envelope.

    The envelope mean is divided by population for sexes 1 and 2 and the
    requested locations, years and age groups. Only the demographic columns
    and 'death_rate' are returned.
    '''
    demo_cols = ['location_id', 'year_id', 'sex_id', 'age_group_id']
    # The envelope and population pulls take identical arguments.
    shared_args = dict(sex_id=[1, 2],
                       location_id=location_id_list,
                       year_id=year_list,
                       age_group_id=age_group_list)

    env = get_envelope(**shared_args)
    env.rename(columns={'mean': 'envelope'}, inplace=True)
    pop = get_population(**shared_args)

    env = env.merge(pop, on=demo_cols)
    env['death_rate'] = env['envelope'] / env['population']
    return env[demo_cols + ['death_rate']]
def assemble_most_detailed_map(not_detailed_locs, loc_meta):
    '''
    Map each non-detailed location to the most detailed locations under it.

    Inputs:
        not_detailed_locs: A list of all locations to create the dataframe
            for (duplicates are ignored)
        loc_meta: Dataframe of location metadata for a given location set
    Outputs:
        detailed_map: A dataframe with three fields: "map_from_loc", the
            non-detailed location_id; "map_to_loc", the most detailed
            location_ids; and "split_fraction", the proportion of the
            total population in the non-detailed loc that can be found
            within the most detailed loc.
    '''
    # Most detailed locations together with their population
    detailed = loc_meta.loc[loc_meta['most_detailed'] == 1, :]
    detailed_ids = detailed['location_id'].unique().tolist()
    pops = get_population(location_id=detailed_ids)
    pops = pops.loc[:, ['location_id', 'population']]
    detailed = pd.merge(left=detailed, right=pops,
                        on=['location_id'], how='inner')

    # One small frame per non-detailed location
    sub_map_dfs = []
    for parent_loc in list(set(not_detailed_locs)):
        # A location is under the parent when ",<parent id>," occurs in its
        # path_to_top_parent string
        is_under_parent = detailed['path_to_top_parent'].apply(
            lambda x: ",{},".format(int(parent_loc)) in x)
        children = detailed.loc[is_under_parent, :]

        # Each child's share of the population summed over all children
        share = children['population'] / children['population'].sum()
        sub_df = children.assign(split_fraction=share)
        sub_df = sub_df.loc[:, ['location_id', 'split_fraction']]
        sub_df = sub_df.assign(map_from_loc=parent_loc)
        sub_map_dfs.append(sub_df)

    detailed_map = pd.concat(sub_map_dfs)
    detailed_map = detailed_map.rename(columns={"location_id": "map_to_loc"})
    return detailed_map
def run_shared_funcs(mat):
    """
    Get all the central inputs we'll need: population plus the asfr and ifd
    covariates.

    Population (sex_id 2 only) and covariate 13 are pulled for the
    locations and age groups found in `mat` and the years 1988-2017.
    Covariate 51 is pulled without any filters.

    Returns:
        (pop, asfr, ifd)
    """
    years = list(np.arange(1988, 2018, 1))
    locs = mat.location_id.unique().tolist()
    ages = mat.age_group_id.unique().tolist()

    pop = get_population(age_group_id=ages, location_id=locs,
                         year_id=years, sex_id=[2])

    # asfr is requested by age/location/year; ifd takes no filters
    asfr = get_covariate_estimates(covariate_id=13, location_id=locs,
                                   age_group_id=ages, year_id=years)
    ifd = get_covariate_estimates(covariate_id=51)

    return pop, asfr, ifd
def get_sample(df):
    """Merge population for location 69 onto `df` as ``sample_size``.

    Age groups 31, 32 and 235 are always pulled in addition to those in
    `df`; they are relabelled as age_group_id 160 and their populations
    summed before the merge.

    Raises:
        AssertionError: if any row ends up without a sample size.
    """
    merged_ages = [31, 32, 235]
    ages = df.age_group_id.unique().tolist() + merged_ages

    pop = get_population(year_id=df.year_id.unique().tolist(),
                         location_id=69,
                         age_group_id=ages,
                         sex_id=[1, 2])
    pop.drop('run_id', axis=1, inplace=True)
    pop.rename(columns={'population': 'sample_size'}, inplace=True)

    # Fold the three age groups into 160 and add their populations up
    pop.loc[pop.age_group_id.isin(merged_ages), 'age_group_id'] = 160
    group_cols = pop.columns.drop('sample_size').tolist()
    pop = pop.groupby(group_cols).agg({'sample_size': 'sum'}).reset_index()

    df = df.merge(pop, how='left',
                  on=['age_group_id', 'location_id', 'sex_id', 'year_id'])
    assert df.sample_size.isnull().sum() == 0
    return df