def rate_count_switcher(df, gbd_round_id, decomp_step, rate_cols=None):
    """Convert rate-space columns to count space using GBD population.

    Attaches population to ``df`` (matching on age/sex/location/year) and
    creates a ``count_{col}`` column for each rate column. When the data is
    in 5-year bands (year_start + 4 == year_end everywhere) a 5-year binned
    population (mean over the bin) is used; otherwise annual population.

    Parameters
    ----------
    df : pandas.DataFrame
        Rate-space data. Must contain ``location_id`` plus either
        ``age_group_id`` or ``age_start``, either ``sex_id`` or a string
        ``sex`` column, and either ``year_id`` or year_start/year_end.
    gbd_round_id : int
        GBD round passed through to ``db_queries.get_population``.
    decomp_step : str
        Decomp step passed through to ``db_queries.get_population``.
    rate_cols : list of str, optional
        Columns to convert. Defaults to ``['mean', 'lower', 'upper']``.

    Returns
    -------
    pandas.DataFrame
        Input data with ``population`` and ``count_*`` columns attached.

    Raises
    ------
    AssertionError
        If the sex column contains values other than Male/Female, or if any
        row fails to match a population.
    """
    # FIX: the default was a mutable list literal, which is created once and
    # shared across every call; use the None-sentinel idiom instead.
    if rate_cols is None:
        rate_cols = ['mean', 'lower', 'upper']

    pre_cols = df.columns

    # attach age_group_id when the data only carries age_start
    if 'age_group_id' not in pre_cols:
        good_ages = hosp_prep.get_hospital_age_groups()
        df = df.merge(good_ages[['age_start', 'age_group_id']],
                      how='left', on='age_start', validate='m:1')
    # normalize a single year_id column into start/end form
    if 'year_id' in pre_cols:
        df['year_start'], df['year_end'] = df['year_id'], df['year_id']
    # derive numeric sex_id from the string sex column
    if 'sex_id' not in pre_cols:
        assert set(df['sex'].unique()) == set(['Male', 'Female'])
        df['sex_id'] = 2
        df.loc[df['sex'] == 'Male', 'sex_id'] = 1

    # NOTE: the year range 1990-2017 is hard-coded here; population outside
    # this window will not be pulled.
    years = list(np.arange(1990, 2018, 1))
    ages = df.age_group_id.unique().tolist()
    locs = df.location_id.unique().tolist()

    pop = db_queries.get_population(gbd_round_id=gbd_round_id,
                                    decomp_step=decomp_step,
                                    year_id=years, age_group_id=ages,
                                    sex_id=[1, 2], location_id=locs)
    pop['year_start'], pop['year_end'] = pop['year_id'], pop['year_id']
    pop.drop(['year_id', 'run_id'], axis=1, inplace=True)

    # build the 5-year binned population: mean population within each bin
    agg_pop = pop.copy()
    agg_pop = hosp_prep.year_binner(agg_pop)
    agg_pop = agg_pop.groupby(
        ['age_group_id', 'year_start', 'year_end',
         'location_id', 'sex_id']).agg({'population': 'mean'}).reset_index()

    merge_cols = ['age_group_id', 'sex_id', 'location_id',
                  'year_start', 'year_end']
    if (df['year_start'] + 4 == df['year_end']).all():
        # every row is a 5-year band -> binned population
        df = df.merge(agg_pop, how='left', on=merge_cols, validate='m:1')
    else:
        # single-year rows -> annual population
        df = df.merge(pop, how='left', on=merge_cols, validate='m:1')
    assert df['population'].isnull().sum() == 0

    for col in rate_cols:
        df["count_{}".format(col)] = df[col] * df['population']
    return df
def get_sample_size(df, fix_group237=False): """ This function attaches sample size to hospital data. It's for sources that should have fully covered populations, so sample size is just population. Checks if age_group_id is a column that exists and if not, it attaches it. Parameters df: Pandas DataFrame contains the data that you want to add sample_size to. Will add pop to every row. """ # process ## attach age group id to data ## get pop with those age group ids in the data ## attach pop by age group id if 'age_group_id' not in df.columns: # pull age_group to age_start/age_end map age_group = hosp_prep.get_hospital_age_groups() # merge age group id on pre = df.shape[0] df = df.merge(age_group, how='left', on=['age_start', 'age_end']) assert df.shape[0] == pre, "number of rows changed during merge" assert df.age_group_id.notnull().all(), ("age_group_id is missing " "for some rows") # get population pop = get_population(QUERY) if fix_group237: fix_pop = get_population(QUERY) pre = fix_pop.shape[0] fix_pop['age_group_id'] = 237 fix_pop = fix_pop.groupby(fix_pop.columns.drop('population').tolist()).agg({'population': 'sum'}).reset_index() assert pre/2 == fix_pop.shape[0] pop = pd.concat([pop, fix_pop], ignore_index=True) # rename pop columns to match hospital data columns pop.rename(columns={'year_id': 'year_start'}, inplace=True) pop['year_end'] = pop['year_start'] pop.drop("run_id", axis=1, inplace=True) demography = ['location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'] # merge on population pre_shape = df.shape[0] df = df.merge(pop, how='left', on=demography) # attach pop info to hosp assert pre_shape == df.shape[0], "number of rows don't match after merge" assert df.population.notnull().all(),\ "population is missing for some rows. look at this df! \n {}".\ format(df.loc[df.population.isnull(), demography].drop_duplicates()) return(df)
def get_sample_size(df):
    """
    Attach sample size (population) to hospital data.

    Intended for sources with fully covered populations, where sample size
    is simply the population for each demographic group. If the data lacks
    an age_group_id column, one is merged on from the hospital age map
    before population is pulled and attached.
    """
    # Step 1: make sure age_group_id is present so population can be merged
    # on by demographic group.
    if 'age_group_id' not in df.columns:
        # hospital age_group_id <-> age_start/age_end lookup
        age_map = hosp_prep.get_hospital_age_groups()
        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), ("age_group_id is missing "
                                                 "for some rows")

    # Step 2: pull population for exactly the demographics present in df.
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()))

    # Step 3: reshape population columns to match the hospital data.
    pop = pop.rename(columns={'year_id': 'year_start'})
    pop['year_end'] = pop['year_start']
    pop = pop.drop("process_version_map_id", axis=1)

    demo_cols = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    # Step 4: attach population and verify the merge was 1:1 and complete.
    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demo_cols)
    assert rows_before == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(), ("population is missing for some"
                                           " rows")
    return (df)
def all_group_id_start_end_switcher(df, remove_cols=True, ignore_nulls=False):
    """
    Takes a dataframe with age start/end OR age group ID and switches from
    one to the other

    Args:
        df: (Pandas DataFrame) data to swich age labelling
        remove_cols: (bool)
            If True, will drop the column that was switched from
        ignore_nulls: (bool)
            If True, assertions about missing ages will be ignored. Not a
            good idea to use in production but is useful for when you just
            need to quickly see what ages you have.
    """
    # Decide the merge direction from which age columns are present.
    if sum([w in ['age_start', 'age_end', 'age_group_id']
            for w in df.columns]) == 3:
        assert False,\
            "All age columns are present, unclear which output is desired. "\
            "Simply drop the columns you don't want"
    elif sum([w in ['age_start', 'age_end'] for w in df.columns]) == 2:
        merge_on = ['age_start', 'age_end']
        switch_to = ['age_group_id']
    elif 'age_group_id' in df.columns:
        merge_on = ['age_group_id']
        switch_to = ['age_start', 'age_end']
    else:
        assert False, "Age columns not present or named incorrectly"

    # Default to the internal hospital age set; fall back to the shared DB
    # age table when df's ages don't look like the hospital ages.
    ages = hosp_prep.get_hospital_age_groups()
    age_set = "hospital"
    for m in merge_on:
        ages_unique = ages[merge_on].drop_duplicates()
        df_unique = df[merge_on].drop_duplicates()
        # if the number of unique ages differs, it's not the hospital set
        if ages_unique.shape[0] != df_unique.shape[0]:
            age_set = 'non_hospital'
        # NOTE(review): `.all()` only flags non-hospital when EVERY sorted
        # value differs; a partial mismatch slips through — confirm whether
        # `.any()` was intended.
        elif (ages_unique[m].sort_values().reset_index(drop=True) !=
              df_unique[m].sort_values().reset_index(drop=True)).all():
            age_set = 'non_hospital'

    if age_set == 'non_hospital':
        # pull the full shared age table
        # NOTE(review): """SQL""" is a sanitized placeholder for the real
        # query text — confirm against the unsanitized source.
        ages = query("""SQL""", conn_def="shared")
        ages.rename(columns={"age_group_years_start": "age_start",
                             "age_group_years_end": "age_end"},
                    inplace=True)

        # the shared table uses a terminal age_end of 125, not 100
        if 'age_end' in merge_on:
            df.loc[df['age_end'] == 100, 'age_end'] = 125

        # drop age group ids that duplicate the same start/end spans
        duped_ages = [294, 308, 27, 161, 38, 301]
        ages = ages[~ages.age_group_id.isin(duped_ages)]

    # any remaining start/end duplicates, kept only for the error message
    dupes = ages[ages.duplicated(['age_start', 'age_end'],
                                 keep=False)].sort_values('age_start')

    pre = df.shape[0]
    df = df.merge(ages, how='left', on=merge_on)
    assert pre == df.shape[0],\
        "Rows were duplicated, probably from these ages \n{}".format(dupes)

    if not ignore_nulls:
        # every row must have matched an age on the other labelling
        for s in switch_to:
            assert df[s].isnull().sum() == 0, (
                "{} contains missing values from "
                "the merge. The values with Nulls are {}".format(
                    s, df.loc[df[s].isnull(),
                              merge_on].drop_duplicates().sort_values(
                                  by=merge_on)))
    if remove_cols:
        # drop the labelling we switched away from
        df.drop(merge_on, axis=1, inplace=True)
    return (df)
def get_sample_size(df, gbd_round_id, decomp_step, fix_group237=False):
    """
    This function attaches sample size to hospital data. It's for sources
    that should have fully covered populations, so sample size is just
    population. Checks if age_group_id is a column that exists and if not,
    it attaches it.

    Parameters
        df: Pandas DataFrame
            contains the data that you want to add sample_size to. Will add
            pop to every row.
        gbd_round_id: int
            GBD round passed through to get_population.
        decomp_step: str
            decomp step passed through to get_population.
        fix_group237: bool
            when True, also pulls population for age groups 235 and 32,
            collapses them into a single age_group_id 237, and appends the
            result to the pulled population.
    """
    # normalize a single year_id column into year_start/year_end
    if 'year_id' in df.columns:
        df.rename(columns={'year_id': 'year_start'}, inplace=True)
        df['year_end'] = df['year_start']

    # attach age_group_id via the hospital age map when it's absent
    if 'age_group_id' not in df.columns:
        age_group = hosp_prep.get_hospital_age_groups()
        pre = df.shape[0]
        df = df.merge(age_group, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == pre, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), ("age_group_id is missing "
                                                 "for some rows")

    # pull population for exactly the demographics present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()),
                         gbd_round_id=gbd_round_id,
                         decomp_step=decomp_step)

    if fix_group237:
        # sum age groups 235 and 32 into the combined group 237
        fix_pop = get_population(age_group_id=[235, 32],
                                 location_id=list(df.location_id.unique()),
                                 sex_id=[1, 2],
                                 year_id=list(df.year_start.unique()),
                                 gbd_round_id=gbd_round_id,
                                 decomp_step=decomp_step)
        pre = fix_pop.shape[0]
        fix_pop['age_group_id'] = 237
        # collapse the two source groups into one row per demographic
        fix_pop = fix_pop.groupby(
            fix_pop.columns.drop('population').tolist()).agg({
                'population': 'sum'
            }).reset_index()
        # two component groups collapse 2:1, so row count must halve
        assert pre / 2 == fix_pop.shape[0]
        pop = pd.concat([pop, fix_pop], sort=False, ignore_index=True)

    # rename population columns to match the hospital data
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("run_id", axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    # merge population on and verify nothing was duplicated or left unmatched
    pre_shape = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert pre_shape == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(),\
        "population is missing for some rows. look at this df! \n {}".\
        format(df.loc[df.population.isnull(), demography].drop_duplicates())
    return (df)
# ** parent inj ** df = get_parent_injuries(df) hosp_prep.check_parent_injuries(df, col_to_sum='val') # ** correction factors ** df = apply_outpatient_correction(df) # ** age sex restrictions ** df = outpatient_restrictions(df) # # test making square df = df[df.val != 0] ages = hosp_prep.get_hospital_age_groups() df = df.merge(ages, how='left', on=['age_start', 'age_end']) df_square = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id', cols_to_square=['val', 'val_corrected'], icd_len=5) np.abs(df.val.sum() - df_square.val.sum()) pre_check_val = df['val'].sort_values().reset_index(drop=True) post_check_val = df_square.loc[df_square['val'] > 0, 'val'].sort_values().reset_index(drop=True) merged = df_square.merge(df, how='left', on=['age_start', 'age_end', 'sex_id', 'location_id', 'year_start', 'source', 'nid', 'age_group_id', 'bundle_id', 'year_end', 'facility_id', 'representative_id', ])
def prepare_envelope(df, fix_maternal=False, return_only_inj=False,
                     apply_age_sex_restrictions=True, want_to_drop_data=True):
    """
    Apply the hospital utilization envelope to mapped hospital data.

    Accepts a hospital dataframe, converts it to cause fractions, merges the
    envelope onto it, and multiplies the two to produce rate-space
    ('product') estimates. Sources listed as fully covered skip the envelope
    and are divided by population (sample size) instead, then re-attached.

    Parameters:
        df: DataFrame
            contains hospital inpatient primary diagnosis that has been
            mapped to Baby Sequelae
        fix_maternal: Boolean
            switch that runs fix_maternal_denominators (transforms hospital
            data from 'population' to 'live births' using Age Specific
            Fertility Rates). Should always be on; it's an option only to
            save time while testing.
            NOTE(review): this flag is never read in the body below —
            confirm whether the maternal step was removed intentionally.
        return_only_inj: Boolean
            switch that will make it so this function only returns injuries
            data. should always be off, it's just there for testing.
        apply_age_sex_restrictions: Boolean
            switch that if True, will apply age and sex restrictions as
            determined by cause_set_id=9. Defaults to true. Useful if you
            want to look at all the data that was present in the source.
        want_to_drop_data: Boolean
            if True, runs hosp_prep.drop_data on the input first.
    """
    ###############################################
    # DROP DATA
    ###############################################

    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    ###############################################
    # MAP AGE_GROUP_ID ONTO DATA
    ###############################################
    # These are our weird hospital age_group_id: we use 235 for 95-99, and we
    # don't use age_group_ids 2,3,4, we use 28 in their place
    ages = hosp_prep.get_hospital_age_groups()  # df that has age_group_id,
    # age_start and age_end
    df = df.merge(ages, how='left', on=['age_start', 'age_end'])

    # comment out to test the new sources in the process
    if not df.age_group_id.notnull().all():
        warnings.warn("""Shouldn't be any null age_group_id, there could be unsplit ages in the data, or maybe an age_start/age_end changed""")

    # maps = pd.read_csv("FILEPATH")
    # maps = maps[['nonfatal_cause_name', 'cause_id', 'level']].copy()
    # maps = maps[maps.cause_id.notnull()]
    # maps = maps[maps.level == 1]
    # maps.drop("level", axis=1, inplace=True)# no need to merge level onto data
    # maps.drop_duplicates(inplace=True)
    #
    # pre = df.shape[0]
    # df = df.merge(maps, how='left', on='nonfatal_cause_name')
    # assert df.shape[0] == pre, "df shape changed during merge"

    #########################################################
    # Get denominator for sources where we don't want to use envelope
    #########################################################
    # Make list of sources that we don't want to use envelope
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make dataframe with sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)
        # NOTE now there's going be a column "sample_size" that
        # is null for every source except the fully covered ones

        # make product: rate = admissions / population
        full_coverage_df[
            'product'] = full_coverage_df.val / full_coverage_df.sample_size
    else:
        # make empty dataframe so that it exists and wont break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    df = hosp_prep.create_cause_fraction(df)

    ###############################################
    # APPLY CAUSE RESTRICTIONS
    ###############################################
    # NOTE it is okay to apply corrections after cause fractions, because
    # restrictions were also applied before the cause fractions were made
    # (right before splitting)
    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')
        full_coverage_df = hosp_prep.apply_restrictions(
            full_coverage_df, 'product')
        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df = full_coverage_df[
            full_coverage_df['product'].notnull()]

    #########################################
    # APPLY ENVELOPE
    #########################################
    # read envelope
    env_df = pd.read_csv("FILEPATH")

    locs = df.location_id.unique()  # all locations in our data
    # find locations that are missing age end 99
    missing_99 = list(set(locs) -\
        set(env_df.loc[(env_df.location_id.isin(locs))&(env_df.age_end == 99), 'location_id'].unique()))
    # change their age_ends to be 99
    env_df.loc[(env_df.location_id.isin(missing_99)) &
               (env_df.age_start == 95), 'age_end'] = 99

    # since we do NOT want to merge on by age_group_id
    env_df.drop('age_group_id', axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_start', 'age_end',
        'sex_id'
    ]

    # merge envelope onto data
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute nfc hospitalization rate, aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size instead of envelope
    ############################################
    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)
    # NOTE now there's going be a column "sample_size" and "cases" that
    # is null for every source except the fully covered ones

    # if we just want injuries:
    if return_only_inj == True:
        inj_me = pd.read_csv("FILEPATH")
        inj_me = list(inj_me['level1_meid'])
        df = df[df['modelable_entity_id'].isin(inj_me)]

    # drop columns we don't need anymore
    # cause fraction: finished using it, was used to make product
    # mean: finished using it, was used to make product
    # upper: finished using, it was used ot make product_upper
    # lower: finished using, it was used ot make product_lower
    # val, numerator, denomantor: no longer in count space
    # NON FATAL_CAUSE_NAME IS LEFT TO SIGNIFY THAT THIS DATA IS STILL AT THE
    # NONFATAL_CAUSE_NAME LEVEL
    df.drop([
        'cause_fraction', 'mean', 'upper', 'lower', 'val', 'numerator',
        'denominator'
    ],
            axis=1,
            inplace=True)

    # pick the level the data is currently aggregated at; bundle_id wins
    # unless nonfatal_cause_name is also present (checked second)
    if "bundle_id" in df.columns:
        level_of_analysis = "bundle_id"
    if "nonfatal_cause_name" in df.columns:
        level_of_analysis = "nonfatal_cause_name"

    # write some summary stats of the env being applied
    # this is to review against our aggregated data to find large differences
    today = datetime.datetime.today().strftime("%Y_%m_%d").replace(":", "_")
    # Zeros can really skew the summary stats so drop them
    env_stats = df[df['product'] != 0].\
        groupby(['location_id', level_of_analysis])['product'].\
        describe().reset_index()
    env_stats.rename(columns={'level_2': 'stats'}, inplace=True)
    env_stats.to_csv("FILEPATH", index=False)

    return (df)