def get_icd_to_bundle_map(map_version): """ read in a specific version of the map from our mapping table """ cc = clinical_mapping.get_clinical_process_data("cause_code_icg", map_version=map_version) cc = cc[cc.code_system_id < 3].copy() dur = clinical_mapping.create_bundle_durations(map_version=map_version) bun = clinical_mapping.get_clinical_process_data("icg_bundle", map_version=map_version) df = bun.merge(cc, how='left', on=['icg_id', 'icg_name', 'map_version']) df = df.merge(dur, how='left', on=['bundle_id', 'map_version']) keeps = [ 'bundle_id', 'cause_code', 'icg_measure', 'bundle_duration', 'code_system_id', 'map_version' ] df = df[keeps].drop_duplicates() df['cause_code'] = df['cause_code'].str.replace("\W", "") return df
def create_bundle_file(self): """Create a bundle file at: `FILEPATH` Parameters ---------- None Returns ------- None """ df = clinical_mapping.get_clinical_process_data('icg_bundle', prod=True) mdf = clinical_mapping.get_clinical_process_data('icg_durations', prod=True) mdf = mdf.drop(['icg_name', 'icg_duration', 'map_version'], axis=1).drop_duplicates() df = df.merge(mdf, how='left', on='icg_id') prev_bundles = [545, 543, 544] warnings.warn("We're forcing the bundles {} to have measure prevalence for the modeler".format(prev_bundles)) df.loc[df.bundle_id.isin(prev_bundles), 'icg_measure'] = 'prev' df = df[['bundle_id', 'icg_measure']].drop_duplicates() df = df.drop_duplicates() assert df[df.bundle_id.duplicated(keep=False)].shape[0] == 0,\ "Bundles have duped icg measures" df.rename(columns={'icg_measure': 'measure'}, inplace=True) df['measure'] = df['measure'].astype(str) stata_path = "FILEPATH"\ "FILEPATH".format(self.run_id) df.to_stata(stata_path, write_index=False) return
def get_map_data(map_version, append_unmapped_bundles): """ For a given map version, return the bundles which are present in it, along with more detailed info like which code systems a bundle codes to, and the min length of ICD codes it requires Params: map_version: (int) use in sql where clause to get specific map version append_unmapped_bundles: (bool) should the parent injuries and maternal ratios be appended on? Returns: df : a pd.dataframe of bundle IDs present in our mapping table bundle_icg_detail: a more detailed pd.dataframe containing icg_id and code system. We can use this to determine when bundles aren't expected by code system and the minimum cause_code (just icd9/10) that an icg maps to """ bundle_df = clinical_mapping.get_clinical_process_data( 'icg_bundle', map_version=map_version) cc_icg_df = clinical_mapping.get_clinical_process_data( 'cause_code_icg', map_version=map_version) min_cc_len = cc_icg_df.copy() min_cc_len['min_cc_length'] = min_cc_len['cause_code'].apply(len) min_cc_len = min_cc_len.query("code_system_id in (1, 2)").groupby( 'icg_id').agg({ 'min_cc_length': 'min' }).reset_index() pre_cc = cc_icg_df.shape cc_icg_df = cc_icg_df.merge(min_cc_len, how='left', on=['icg_id']) assert pre_cc[0] == cc_icg_df.shape[0] cc_icg_df = cc_icg_df[[ 'code_system_id', 'icg_id', 'min_cc_length', 'map_version' ]].drop_duplicates() df = bundle_df.merge(cc_icg_df, how='left', on=['icg_id', 'map_version']) if append_unmapped_bundles: unmapped = [ 264, 269, 270, 272, 275, 276, 362, 6113, 6116, 6119, 6122, 6125 ] tmp = pd.DataFrame({'bundle_id': unmapped}) df = pd.concat([df, tmp], sort=False, ignore_index=True) bundle_icg_detail = df.copy() df = df[['bundle_id', 'map_version']].drop_duplicates() if append_unmapped_bundles: assert df.shape[0] == bundle_df.bundle_id.unique().size + len(unmapped) else: assert df.shape[0] == bundle_df.bundle_id.unique().size df['present_in_map'] = True confirm_bundles_exist(df) confirm_bundles_exist(bundle_icg_detail) return df, bundle_icg_detail
def update_map_version(dev=True, tables=None): ''' Pulls the lastest version of desired table, updates the version id and then appends to the desired tables. Looks for row with map_value of -1 (a dummy value signalling either a new or updated row) updates to next map version, by a value of 1, then removes any instance of rows with map_vaule of -1. If there are no new / updated rows then the map version is incremented by 1 and appended to table(s). Args: table(list str): Identfies which table(s) should be updated. Keyword 'all' can be invoked to update all map related tables in the database. Defaults to None Raises: TypeError: When the tables is not in list format. This exludes the usage of the term 'all' ''' engine = create_engine() tables = [ 'cause_code_icg', 'icg_bundle', 'icg_durations', 'age_sex_restrictions' ] if table == 'all' and dev: tables = [e + '_dev' for e in tables] if type(tables) is not list: raise TypeError('tables must be in list format') for table in tables: tbl = cm.get_clinical_process_data(table) try: tbl_new = cm.get_clinical_process_data(table, map_version=-1) except AssertionError: tbl_new = None if type(tbl_new) == pd.core.frame.DataFrame: cols = list(tbl.columns) cols.remove('map_version') df = pd.concat([tbl, tbl_new]) df.sort_values(by=['map_version'], inplace=True) tbl = df.drop_duplicates(subset=col, keep='first') q = ''' DELETE FROM {} WHERE map_version = -1 '''.format(table) engine.execute(q) tbl['map_version'] = max(tbl.map_version.unique()) + 1 tbl = remove_nulls(tbl) tbl.to_sql(table, con=engine, if_exists='append', index=False)
def write_acause_rei_to_bundle_map(run_id): old_file = pd.read_csv("FILEPATH") bun = clinical_mapping.get_clinical_process_data('icg_bundle', prod=True, map_version='current') bun = bun[['bundle_id', 'map_version']].drop_duplicates() bun_map = query("SQL") acause_df = query("SQL") rei_df = query("SQL") new_map = bun_map[['bundle_id', 'cause_id', 'rei_id']].copy() new_map = new_map.merge(acause_df, how='outer', on='cause_id') new_map = new_map.merge(rei_df, how='outer', on='rei_id') new_map['bundle_acause_rei'] = "Missing" mask = "new_map['cause_id'].notnull()" new_map.loc[eval(mask), 'bundle_acause_rei'] = new_map.loc[eval(mask), 'acause'] mask = "new_map['rei_id'].notnull()" new_map.loc[eval(mask), 'bundle_acause_rei'] = new_map.loc[eval(mask), 'rei'] print("beginning a few tests") for acause in acause_df.acause.unique(): cid = new_map.query("acause == @acause").cause_id.drop_duplicates() assert cid.size == 1 cid = cid.iloc[0] assert acause == new_map.query("cause_id == @cid").acause.iloc[0] for rei in rei_df.rei.unique(): rid = new_map.query("rei == @rei").rei_id.drop_duplicates() assert rid.size == 1 rid = rid.iloc[0] assert rei == new_map.query("rei_id == @rid").rei.iloc[0] new_map.drop(['cause_id', 'rei_id', 'acause', 'rei'], axis=1, inplace=True) assert new_map.shape[0] == new_map.bundle_id.unique( ).size, "duplicated bundles" assert (new_map['bundle_acause_rei'] == "Missing").sum() == 0 print("tests passed, writing to run {}".format(run_id)) write_path = "FILEPATH".format(run_id) new_map.to_csv(write_path, index=False) return
def convert_obsolete_special_map_icgs(self, df): """Converts the out of use ICGs from special maps to ones currently in ICD 9/10 mapping Parameters ---------- df : pd.DataFrame A DataFrame of inpatient data Returns ------- pd.DataFrame The `df` after it has had old icgs removed """ fix_dict = { 'e-code, (inj_poisoning_other)': 'poisoning_other', 'lower respiratory infection(all)': 'lower respiratory infection(unspecified)', 'neoplasm non invasive other': 'other benign and in situ neoplasms', 'e-code, (inj_trans)': 'z-code, (inj_trans)', 'digest, gastritis and duodenitis': '_none', 'resp, copd, bronchiectasis': 'resp, other' } chk_df = clinical_mapping.get_clinical_process_data("icg_durations", map_version=self.map) chk_df = chk_df[['icg_name', 'icg_id']] chk_df = chk_df[chk_df['icg_name'].isin(list(fix_dict.values()))] exp_id_dict = {1267: 864, 1266: 1116, 1265: 159, 1264: 1241, 1263: 1, 1262: 434} sdiff = set(chk_df['icg_id']).symmetric_difference(set(exp_id_dict.values())) assert not sdiff, "The icg_ids don't match our hardcoding. This is unexpected" for key, value in list(fix_dict.items()): an_id = chk_df.query("icg_name == @value") assert len(an_id) == 1 an_id = an_id.icg_id.iloc[0] df.loc[df['icg_name'] == key, ['icg_name', 'icg_id']] = (value, an_id) return df
def simple_sex_split(df, map_version='current'): """ The age-sex restricted weights should have a value of zero perform a super simple sex splitting on values where sex is unknown/both but the icg maps to only 1 sex Params: df : (pandas.dataframe) icg level hospital data that's being fed into age-sex splitting map_version : (str or int) Can be a specific int, but defaults to 'current' which matches our process """ pre = df.shape cases = df['val'].sum() if 'icg_id' and 'icg_name' not in df.columns: raise ValueError("We need icg id and name to merge on restricts") sex_diff = set(df.sex_id.unique()) - set([1, 2, 3]) if sex_diff: raise ValueError( f"There are some unexpected sex values here {sex_diff}") restrict = clinical_mapping.get_clinical_process_data( 'age_sex_restrictions', map_version=map_version, prod=True) restrict = restrict.query("male == 0 or female == 0").copy() assert len( restrict.query("male == 1 and female == 1")) == 0, "logic was wrong" restrict = restrict[['icg_id', 'icg_name', 'male', 'female']] df = df.merge(restrict, how='left', on=['icg_id', 'icg_name'], validate='m:1') assert len(df) == pre[0] df.loc[(df['male'] == 0) & (df['sex_id'] == 3), 'sex_id'] = 2 df.loc[(df['female'] == 0) & (df['sex_id'] == 3), 'sex_id'] = 1 df.drop(['male', 'female'], axis=1, inplace=True) post = df.shape if pre[0] != post[0] or pre[1] != post[1]: raise ValueError(f"This df changed shape!! from {pre} to {post}") assert cases == df['val'].sum(), "CASES changed!!!" return df
def add_icg_ids(df): ''' The nfc maps are inconsistent for bs_ids, that is different ids are assigned to the same icg across different versions of the maps. We enforce consistent bs_ids as well as adding new ids for new icgs Args: df(pd.df): Must have Return: pd.DataFrame ''' maps = df.copy() dur = cm.get_clinical_process_data('icg_durations', prod=prod) dur.drop(['map_version', 'icg_measure'], axis=1, inplace=True) df = maps.merge(dur, on="icg_name", how='left') msg = 'Some rows went missing' assert temp.shape[0] == df.shape[0], msg m_icgs = maps.icg_name.unique().tolist() d_icgs = d_cp.icg_name.unique().tolist() m = set(m_icgs) - set(d_icgs) print(len(m)) msg = 'Mismatch of new ICGs' assert len(m) == df.loc[df.icg_id.isnull(), 'icg_name'].unique().size, msg start = dur.icg_id.max() + 1 stop = start + len(new_icg) new_icg = list(m) new_icg_id = np.arange(start, stop, 1) r = list(zip(new_icg, new_icg_id)) for e in r: df.loc[df.icg_name == e[0], 'icg_id'] = e[1] msg = 'There are still ICGs without an ID' assert df.icg_id.isnull().sum() == 0, msg return df
def insert_icg_helper(tbl_name, df): ''' Helper function which assists in adding an icg to the database. Args: tbl_name(str) : mirrors the name of the table that is to be updated. df(pd.df) : DataFrame of the new values. Saved in a long format Return: result: Either a string or a list based on the results of the testing df_sub(pd.sub) ''' tbl = cm.get_clinical_process_data(tbl_name) tbl_cols = tbl.columns.tolist() tbl_cols.remove('icg_id') tbl_v = tbl['map_version'].unique().item() df_sub = df[df.col_name.isin(tbl_cols)] df_sub = pd.pivot_table(df_sub, index='icg_id', values='value', columns='col_name', aggfunc=np.sum).reset_index() tbl = pd.concat([tbl, df_sub], sort=False).reset_index(drop=True) tbl['map_version'] = tbl_v tbl = cast_to_int(tbl, tbl_name) if tbl_name.startswith('cause_code'): result = cm.test_cc_icg_map(tbl) elif tbl_name.startswith('icg_bundle'): result = cm.test_icg_bundle(tbl) elif tbl_name.startswith('icg_durations'): result = cm.test_icg_vals(tbl, tbl_v) df_sub = cast_to_int(df_sub, tbl_name) return result, df_sub
def test_df(df, tbl_name): ''' Generates a dataframe that is used to tested for data quality and ensure mapping rules. In addition, a second dataframe is produced and contains only the updated / new table rows. Args: df(pd.df): Dataframe that contains the column to be updated as well as the old and new values tbl_name(str): Name of the table that is being updated Returns: tbl: pd.DataFrame update_df: pd.DataFrame ''' tbl = cm.get_clinical_process_data(tbl_name) ids = df.icg_id.unique().tolist() assert tbl[tbl.icg_ids.isin( ids)], 'ICG IDs are not present in the database table' df = df.replace(None, -1) tbl['update'] = 0 for i, r in df.iterrows(): col = r['col_name'] tbl.loc[(tbl.icg_id == r['icg_id']) & (tbl[col] == r['old_val']), [col, 'update']] = [r['new_val'], 1] tbl = cast_to_int(tbl, tbl_name) tbl = tbl.replace(-1, np.nan) update_df = tbl[tbl.update == 1] update_df.drop(['update'], axis=1, inplace=True) tbl.drop(['update'], axis=1, inplace=True) tbl = remove_nulls(tbl) return tbl, update_df
# this is what we need our data to look like. each file is a cf type example_df = pd.read_csv(FILEPATH) # cols = ['cf_location_id', 'sex_id', 'age_start', 'mean_incidence', 'icg_name', 'icg_id'] files = glob.glob(FILEPATH) # get the draw data back = pd.concat([pd.read_csv(f) for f in files], sort=False, ignore_index=True) df = add_UI(back.copy()) icg_df = clinical_mapping.get_clinical_process_data( "icg_durations", map_version=20)[['icg_id', 'icg_name']] df = df.merge(icg_df, how='left', on='icg_id') df.drop(['upper', 'lower', 'age_group_id'], axis=1, inplace=True) for col in ['age_start', 'sex_id']: df[col] = pd.to_numeric(df[col], downcast='integer') for cft in df.cf_type.unique(): if cft == 'cf1': cfname = 'indvcf' elif cft == 'cf2': cfname = 'incidence' elif cft == 'cf3': cfname = 'prevalence' else: assert False, 'the hell?!'
def insert_icg(df, dev=True, update_version=False): ''' Inserts a new ICG to the map. By default the map version is updated for any new insert. There are, however, instances where several inerts may be applied to the map so the user has the opportunity not to update the version. The ICG is used as the key to bind the new mappings. Args: df(pd.df): Contains the complete mapping of the icd, that is there at least one associated cause_code and bundle_id and measure / duration update_version(bol): Updates the entire map version to the next iteration for all tables. Defaults to False ''' count = 0 cols = [ 'cause_code', 'code_system_id', 'icg_name', 'icg_measure', 'icg_duration', 'bundle_id', 'male', 'female', 'yld_age_start', ' yld_age_end' ] ids = df.icg_id.unique().tolist() for id in ids: r = df.loc[df.icg_id == id, 'col_name'].unique().tolist() assert cols == r, 'Missing columns for icg_id {}'.format(id) tbl_cur = cm.get_clinical_process_data('icg_durations_dev') try: tbl_new = cm.get_clinical_process_data('icg_durations_dev', map_version=-1) tbl = tbl = pd.concat([tbl_cur, tbl_new]) except AssertionError: print('There are no pending icg inserts') tbl = tbl_cur icg_new = df.icg_id.unique().tolist() assert tbl[tbl.icg_id.isin( icg_new)].empty, 'You are adding ICGs that already exisit' tbls = [ 'cause_code_icg', 'icg_bundle', 'icg_durations', 'age_sex_restrictions' ] if dev: tbls = [e + '_dev' for e in tbls] insert_dict = {} for e in tbls: result, insert_df = insert_icg_helper(e, df) if type(result) == str: insert_dict[e] = insert_df count += 1 else: return result if count == 3: for k, v in list(insert_dict.items()): update_table(df=v, tbl_name=k, map_version=-1) if update_version and count == 3: update_map_version(update_version)