def read_clean_map():
    """
    Read in the clean map file and do some light prepping to get it ready
    to merge onto the data.

    Returns:
        Tuple of (Pandas DataFrame containing the map, int map version)
    """
    # read in clean map
    maps_path = r"FILEPATH"
    maps = pd.read_csv(maps_path, dtype={'cause_code': object})
    assert hosp_prep.verify_current_map(maps)

    # the clean map should carry exactly one version
    assert maps['map_version'].nunique() == 1
    map_version = int(maps['map_version'].unique()[0])

    # cast to string and standardize casing
    maps['cause_code'] = maps['cause_code'].astype(str)
    maps['cause_code'] = maps['cause_code'].str.upper()

    # keep relevant columns
    maps = maps[['cause_code', 'nonfatal_cause_name', 'code_system_id']].copy()

    # drop duplicate values
    maps = maps.drop_duplicates()

    return maps, map_version
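# A minimal usage sketch for read_clean_map, assuming FILEPATH has been
# filled in with the real clean map location:
maps, map_version = read_clean_map()
print("clean map version {} with {} rows".format(map_version, maps.shape[0]))
# the prepped map is ready to merge onto data keyed by cause_code, e.g.
# df = df.merge(maps, how='left', on='cause_code')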
def add_measure(df):
    """
    Adds bundle measure onto the df and renames the cols to fit with the
    remaining process. Data must already have bundle_id attached.

    Parameters:
        df: Pandas DataFrame
            Must have a 'bundle_id' column
    """
    assert "measure" not in df.columns, "'measure' already exists."
    assert "bundle_id" in df.columns, "'bundle_id' must exist."

    # read in clean map to merge measure onto data; needed for ELMO reqs
    clean_maps = pd.read_csv(root + r"{FILEPATH}" r"clean_map.csv")
    assert hosp_prep.verify_current_map(clean_maps)
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)

    # remove null bundle ids in map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    # store the row count to verify the merge doesn't duplicate rows
    pre_shape = df.shape[0]

    # merge measure onto data
    df = df.merge(clean_maps, how='left', on='bundle_id')

    # get injury bundle ids so we can check for missing measures
    pc_injuries = pd.read_csv(root + r"{FILEPATH}"
                              r"parent_child_injuries_gbd2017.csv")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injury bundle ids didn't get measures
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)), (
        "We expect that all null measures belong to injuries, but that is "
        "not the case. Something went wrong!")

    # fix any injuries that are missing measure; all injuries are incidence
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)),
           'measure'] = 'inc'

    # assert that all rows had a measure merged on
    hosp_prep.report_if_merge_fail(df, check_col='measure',
                                   id_cols='bundle_id', store=True,
                                   filename="measure_merge_failure")

    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    return df
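# A hypothetical call, assuming `root` and the clean map CSV are available;
# the bundle ids and means below are illustrative only:
demo = pd.DataFrame({'bundle_id': [3.0, 269.0], 'mean': [0.01, 0.002]})
demo = add_measure(demo)
# every row should now carry a measure (e.g. 'inc' for injuries)
assert demo.measure.notnull().all()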
def verify_missing(bundle, locs, age, sex, years, run_id, map_path=None):
    """
    Pass a bundle and a set of demographic info; the function returns the
    data if it exists and prints a message if not.
    """
    if map_path is None:
        map_path = "FILEPATH".format(run_id)

    # accept scalars as well as lists
    if type(locs) == int:
        locs = [locs]
    if type(years) == int:
        years = [years]

    # read in location/source map and subset to requested locations
    loc_source = pd.read_csv(r"FILENAME" r"FILEPATH")
    loc_source = loc_source[loc_source.location_id.isin(locs)]

    df_list = []
    for source in loc_source.source.unique():
        for year in years:
            try:
                df = pd.read_hdf(r"FILENAME" r"FILENAME" + source + "_" +
                                 str(year) + "FILEPATH", key='df')
                df_list.append(df)
                del df
            except Exception:
                print("couldn't read in " + source + str(year))

    data = pd.concat(df_list, sort=False)
    del df_list

    amap = pd.read_csv(map_path)
    amap.rename({'icg_name': 'icg_id'}, axis=1, inplace=True)
    assert hosp_prep.verify_current_map(amap)

    # get the cause codes associated with the bundle
    amap = amap.query("bundle_id == @bundle")

    # subset on cause codes and demographics
    data = data[data.cause_code.isin(amap.cause_code)]
    data = data.query("age_start == @age & sex_id == @sex")

    if data.shape[0] == 0:
        print("No data found for this combination:")
        print("age {} sex {} bundle {} years {} locations {}".format(
            age, sex, bundle, years, locs))
    else:
        return data
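# A hypothetical call under run 5; the bundle, location, and demographic
# values are illustrative only:
data = verify_missing(bundle=3, locs=102, age=10, sex=1,
                      years=2015, run_id=5)
if data is not None:
    print(data.shape)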
def expand_bundles(df, drop_null_bundles=True):
    """
    Maps groups of ICD codes to bundles by merging bundle_id onto
    nonfatal_cause_name. Because a nonfatal_cause_name can map to more than
    one bundle, the merge may duplicate rows.
    """
    assert "bundle_id" not in df.columns, "bundle_id has already been attached"
    assert "nonfatal_cause_name" in df.columns,\
        "'nonfatal_cause_name' must be a column"

    # merge bundle_id on via nonfatal_cause_name
    maps = pd.read_csv(root + r"{FILEPATH}clean_map.csv")
    assert hosp_prep.verify_current_map(maps)
    maps = maps[['nonfatal_cause_name', 'bundle_id']].copy()
    maps = maps.drop_duplicates()

    df = df.merge(maps, how='left', on='nonfatal_cause_name')

    if drop_null_bundles:
        # drop rows without a bundle id
        df = df[df.bundle_id.notnull()]

    return df
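# A toy illustration (hypothetical map values) of why the merge can expand
# row counts: one nonfatal_cause_name feeding two bundles duplicates the row.
mini_map = pd.DataFrame({
    'nonfatal_cause_name': ['digest, other', 'digest, other'],
    'bundle_id': [500.0, 501.0]})
mini_df = pd.DataFrame({'nonfatal_cause_name': ['digest, other'],
                        'val': [10]})
out = mini_df.merge(mini_map, how='left', on='nonfatal_cause_name')
assert out.shape[0] == 2  # one input row became two, one per bundle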
def get_bundle_paths(cfpath):
    # gather all the correction factor (CF) bundle files
    cf_bundles = glob.glob(cfpath + "*.csv")

    cm = pd.read_csv(r"FILEPATH".format(root))
    assert hosp_prep.verify_current_map(cm), "The map version is not correct"

    # create a list of bundle id ints to check against the map
    bundles = [
        int(re.sub("[^0-9]", "", os.path.basename(f))) for f in cf_bundles
    ]
    map_bundles = cm[cm.bundle_id.notnull()].bundle_id.unique()

    bdiff = set(bundles).symmetric_difference(set(map_bundles))
    if bdiff:
        msg = """
        {} bundles are different between the current map and the CF folder.
        Bundles {} are present in the CFs but not the map and bundles {}
        are present in the map but not the CF folder.
        """.format(len(bdiff),
                   set(bundles) - set(map_bundles),
                   set(map_bundles) - set(bundles))
        warnings.warn(msg)

    return cf_bundles
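# A hypothetical call, assuming the CF files are named with their bundle id
# (e.g. "3.csv") inside the folder at FILEPATH:
cf_paths = get_bundle_paths("FILEPATH")
for path in cf_paths:
    bid = int(re.sub("[^0-9]", "", os.path.basename(path)))
    # ... read and process each bundle's correction factors here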
def verify_missing(bundle, locs, age, sex, years, map_path="FILEPATH"):
    """
    Pass a bundle and a set of demographic info; the function returns the
    data if it exists and prints a message if not.
    """
    # accept scalars as well as lists
    if type(locs) == int:
        locs = [locs]
    if type(years) == int:
        years = [years]

    # read in location/source map and subset to requested locations
    loc_source = pd.read_csv(r"FILEPATH")
    loc_source = loc_source[loc_source.location_id.isin(locs)]

    df_list = []
    for source in loc_source.source.unique():
        for year in years:
            try:
                df = pd.read_hdf(FILEPATH)
                df_list.append(df)
                del df
            except Exception:
                print("couldn't read in " + source + str(year))

    data = pd.concat(df_list)
    del df_list

    amap = pd.read_csv(map_path)
    assert hosp_prep.verify_current_map(amap)

    # get the icd codes associated with the bundle
    amap = amap.query("bundle_id == @bundle")

    # subset on icd codes and demographics
    data = data[data.cause_code.isin(amap.cause_code)]
    data = data.query("age_start == @age & sex_id == @sex")

    if data.shape[0] == 0:
        print("No data found for this combination:")
        print("age {} sex {} bundle {} years {} locations {}".format(
            age, sex, bundle, years, locs))
    else:
        return data
import sys

import pandas as pd

import hosp_prep

# print warning message about the latest version of the map
print("=====================================")
print("=== PLEASE MAKE SURE YOU ARE USING===")
print("==== LATEST VERSION OF CLEAN MAP ====")
print("=====================================")

# read in clean map
# TODO: change this to the real clean map (clean_map_9) when the time comes
df = pd.read_csv(root + r"FILEPATH")
map_vers = int(df.map_version.unique()[0])
assert hosp_prep.verify_current_map(df)

df = df[['cause_code', 'bundle_id', 'code_system_id', 'bid_measure', 'level']]

# select only the ICD9 portion of the map
df = df[df['code_system_id'] == 1]

# get rid of null values
df.dropna(axis=0, inplace=True)

# make icd codes strings
df['cause_code'] = df['cause_code'].astype(str)

# drop code system id and level columns
df.drop(['code_system_id', 'level'], axis=1, inplace=True)
def prep_weights(df, level):
    """
    Function that computes weights for use in age sex splitting.

    Parameters:
        df: pandas DataFrame
            Input data to inform weights. Should already have the
            appropriate age groups. Should have a column "product", which
            is the product of cause_fraction and the hospital utilization
            envelope. age_group_id should be present. df is used only as
            an input to make weights.
        level: string
            Must be "nonfatal_cause_name"; bundle-level weights are no
            longer supported.

    Returns:
        DataFrame with weights computed at the level of the 'level'
        parameter.
    """
    print("Getting {} weights...".format(level))

    # remove the all-sexes sex id
    df = df[df.sex_id != 3].copy()

    # code is set up to use both age_start/age_end and age_group_id
    df = gbd_hosp_prep.all_group_id_start_end_switcher(df, remove_cols=False)

    for source in df.source.unique():
        age_min = df[df.source == source].age_start.min()
        age_max = df[df.source == source].age_start.max()
        assert age_max == 95,\
            "source {} doesn't have max age_start == 95".format(source)

    # keep relevant columns
    df = df[[
        'age_group_id', 'age_start', 'age_end', 'location_id', 'sex_id',
        'year_end', 'year_start', 'product', 'nonfatal_cause_name'
    ]].copy()

    # make square, aka cartesian. We want population to be contributed by
    # all age-countries, regardless of whether there are cases for every
    # age-country pair. For every location-year that we already have, we
    # want all age-sex (and bundle/cause) combinations. This introduces
    # nulls where there wasn't any data, which are then filled with zeros.
    template = hosp_prep.make_square(df)
    df = template.merge(df, how='left', on=[
        'age_group_id', 'sex_id', 'location_id', 'year_start',
        'nonfatal_cause_name', 'year_end', 'age_start', 'age_end'
    ])

    # fill zeros
    df.update(df['product'].fillna(0))

    # merge population on so we can convert to count space and do addition
    # get info for pop
    age_list = list(df.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # get pop
    pop = get_population(QUERY)

    # format pop
    pop.drop("run_id", axis=1, inplace=True)
    pop['year_start'] = pop['year_id']
    pop['year_end'] = pop['year_id']
    pop.drop('year_id', axis=1, inplace=True)

    # merge pop
    df = df.merge(
        pop,
        how='left',
        on=['location_id', 'year_start', 'year_end', 'age_group_id',
            'sex_id'])

    # multiply by pop to get into count space so we can aggregate to
    # age / sex / bundle groups
    df['counts'] = df['product'] * df['population']

    if level == 'bundle_id':
        assert False, "Level can no longer be bundle id"
        # retired: merge bundle_id on via nonfatal_cause_name, then drop
        # nonfatal_cause_name
        maps = pd.read_csv(r"FILEPATH/clean_map.csv")
        assert hosp_prep.verify_current_map(maps)
        maps = maps[['nonfatal_cause_name', 'bundle_id', 'level']].copy()
        maps = maps[maps.level == 1].copy()
        maps = maps.drop('level', axis=1)
        maps = maps.dropna(subset=['bundle_id'])
        maps = maps.drop_duplicates()
        df = df.merge(maps, how='left', on='nonfatal_cause_name')
        df.drop("nonfatal_cause_name", axis=1, inplace=True)

    group_cols = ['age_end', 'age_start', 'age_group_id', 'sex_id', level]
    df = df.groupby(by=group_cols).agg({
        'counts': 'sum',
        'population': 'sum'
    }).reset_index()

    # divide by pop to get back into rate space, and we have the start of
    # our weights
    df['weight'] = df['counts'] / df['population']

    return df
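# A toy numeric sketch (all values hypothetical) of the core weight
# arithmetic above: rates go to counts via population, counts and
# population are summed across locations, then divided to land back in
# rate space as a population-weighted rate.
toy = pd.DataFrame({
    'age_group_id': [10, 10], 'sex_id': [1, 1],
    'nonfatal_cause_name': ['msk, gout', 'msk, gout'],
    'product': [0.002, 0.004], 'population': [1000.0, 3000.0]})
toy['counts'] = toy['product'] * toy['population']  # 2.0 and 12.0 cases
grouped = toy.groupby(
    ['age_group_id', 'sex_id', 'nonfatal_cause_name']).agg(
    {'counts': 'sum', 'population': 'sum'}).reset_index()
grouped['weight'] = grouped['counts'] / grouped['population']  # 14/4000 = 0.0035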