def get_computed_dataframe(self, df): """Replace acauses with those in the bridge map.""" # VA sources are the only ones where this may not work # might need to split dataframe by data_type_id for bridge map df = add_nid_metadata(df, ['data_type_id'], **self.cache_options) has_verbal_autopsy = self.VA in df['data_type_id'].unique() df.drop(columns='data_type_id', inplace=True) if self.needs_bridging(has_verbal_autopsy): file_name = self.get_file_name(has_verbal_autopsy) map_df = pd.read_csv(self.bridge_map_path / file_name) map_df = map_df[['acause', 'bridge_code']] # add acause column to deaths data bridge_mapped = add_cause_metadata( df, ['acause'], merge_col='cause_id', cause_meta_df=self.cause_meta_df ) # hack, this cause_id snuck in somehow... bridge_mapped.loc[ bridge_mapped['cause_id'] == 606, 'acause' ] = 'gyne_femaleinfert' report_if_merge_fail(bridge_mapped, 'acause', 'cause_id') bridge_mapped.drop(['cause_id'], axis=1, inplace=True) # perform zz bridge code redistribution before other bridge mapping bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped, map_df) bridge_mapped = bridge_mapped.merge( map_df, how='left', on='acause' ) bridge_mapped = self.acause_to_bridge_code(bridge_mapped) # bring cause_id back bridge_mapped = add_cause_metadata( bridge_mapped, ['cause_id'], merge_col='acause', cause_meta_df=self.cause_meta_df ) # hack, this cause_id snuck in bridge_mapped.loc[ bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id' ] = 606 report_if_merge_fail(bridge_mapped, 'cause_id', 'acause') # output diagnostic dataframe self.diag_df = bridge_mapped # drop unnecessary columns bridge_mapped = self.clean_up(bridge_mapped) return bridge_mapped else: self.diag_df = df df = self.clean_up(df) return df
def get_computed_dataframe(self, df):
    """Replace acauses with those in the bridge map.

    Uses NID metadata to decide whether the data contains verbal autopsy
    sources, since VA data reads a different sheet of the bridge map
    workbook.

    Args:
        df (DataFrame): deaths data keyed by cause_id; must carry 'nid'
            so NID metadata can be attached.

    Returns:
        DataFrame: with cause_ids replaced per the bridge map, cleaned by
            self.clean_up. The pre-clean_up frame is stored on
            self.diag_df for diagnostics.
    """
    df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
    has_verbal_autopsy = self.VA in df['data_type_id'].unique()
    # NOTE(review): unlike the newer CSV-based variant, data_type_id is
    # never dropped here and may linger through the merges — confirm
    # clean_up removes it.
    if self.needs_bridging(has_verbal_autopsy):
        sheet_name = self.get_sheet_name(has_verbal_autopsy)
        # BUG FIX: pandas renamed the read_excel keyword 'sheetname' to
        # 'sheet_name' in 0.21 and removed the old spelling in 1.0; the
        # old keyword raises TypeError on modern pandas.
        map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
        map_df = map_df[['acause', 'bridge_code']]
        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df, ['acause'], merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow... fill its acause by hand
        # so the merge-failure check below does not trip on it
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)
        # left merge: rows without a bridge code keep their acause
        bridge_mapped = bridge_mapped.merge(
            map_df, how='left', on='acause'
        )
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped, ['cause_id'], merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        # reverse of the manual acause fill above
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
        # output diagnostic dataframe
        self.diag_df = bridge_mapped
        # drop unnecessary columns
        bridge_mapped = self.clean_up(bridge_mapped)
        return bridge_mapped
    else:
        # nothing to bridge; still expose diagnostics and clean up
        self.diag_df = df
        df = self.clean_up(df)
        return df
def add_metadata(df):
    """Attach NID-level metadata, extract type, and site identifiers."""
    print_log_message("Adding key metadata")
    meta_cols = ['source', 'code_system_id', 'parent_nid']
    df = add_nid_metadata(df, meta_cols,
                          force_rerun=False, cache_dir='standard')
    # this column is not yet comprehensive in nid metadata file
    df['representative_id'] = 1
    report_if_merge_fail(df, 'source', 'nid')
    # map extract type, then site
    for mapper in (map_extract_type_id, map_site_id):
        df = mapper(df)
    return df
def generate_splits(self, df):
    """Flag rows whose maternal deaths must be split, and set the
    percentage columns for rows that need no split."""
    df = add_nid_metadata(
        df,
        add_cols='data_type_id',
        block_rerun=True,
        cache_dir=self.cache_dir,
        force_rerun=False,
    )
    # data types 7 and 5 require maternal splitting; everything else not
    needs_split = df['data_type_id'].isin([7, 5])
    df.loc[needs_split, 'split_maternal'] = 1
    df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
    # unsplit rows keep their VR-based maternal/hiv percentages
    unsplit = df['split_maternal'] == 0
    df.loc[unsplit, 'pct_maternal'] = 1
    df.loc[unsplit, 'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
    df.loc[unsplit, 'pct_hiv'] = 0
    df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
    return df
def generate_splits(self, df):
    """Create a column to indicate how the data should be split.

    (depends on source type)
    """
    df = add_nid_metadata(
        df, add_cols='data_type_id', block_rerun=True,
        cache_dir=self.cache_dir, force_rerun=False,
    )
    # only data type ids 7 and 5 get maternal splitting
    split_data_types = [7, 5]
    df.loc[df['data_type_id'].isin(split_data_types), 'split_maternal'] = 1
    df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
    # rows that are not split take VR-derived percentages directly
    no_split_mask = df['split_maternal'] == 0
    df.loc[no_split_mask, 'pct_maternal'] = 1
    df.loc[no_split_mask, 'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
    df.loc[no_split_mask, 'pct_hiv'] = 0
    # the VR column has served its purpose
    df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
    return df
def special_cause_reassignment(self, df, code_system_id):
    """Replace the actual data cause under certain conditions.

    There are instances where a PI has good reason to believe that a
    certain group of deaths were assigned to the wrong cause, and it
    is known what cause to re-assign those deaths to. Implement here.

    This essentially allows mapping based on not just the cause
    and code system but based on other information like the location,
    NID, year, etc. It can also be used (sparingly) for hotfixes like
    changing all codes with values 'acause_digest_gastrititis' to be
    named 'acause_digest_gastritis'.

    Args:
        df (DataFrame): data with cause
        code_system_id (int): code system of the data, used to look up
            code values and ids

    Returns:
        DataFrame: with any modifications
    """
    # read cached metadata without re-writing the cache
    cache_args = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': 'standard',
        'cache_results': False
    }
    # Some SRS codes get redistributed differently than
    # other ICD10 datasets
    df = add_nid_metadata(
        df, 'source', **cache_args
    )
    if (df['source'] == "India_SRS_states_report").any():
        print_log_message("Changing SRS codes to custom garbage groups")
        # this branch only makes sense on a pure SRS dataframe
        assert (df['source'] == "India_SRS_states_report").all()
        df = add_code_metadata(
            df, 'value',
            code_system_id=code_system_id,
            **cache_args
        )
        custom_grbg = pd.read_csv(
            self.cg.get_resource("srs_custom_garbage_groups")
        )
        # only active rows of the resource define a remap
        custom_grbg = custom_grbg.query('active == 1')
        custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
        # look up the code_id each custom garbage value maps to
        custom_grbg = add_code_metadata(
            custom_grbg, 'code_id',
            code_system_id=code_system_id,
            merge_col='value',
            **cache_args
        )
        custom_grbg = custom_grbg.rename(
            columns={'code_id': 'new_code_id'})
        custom_grbg = custom_grbg[['package_id', 'new_code_id']]
        gp_dfs = []
        for package_id in custom_grbg.package_id.unique():
            # THIS QUERIES THE DATABASE - BUT THERE SHOULD NEVER BE A TON
            # OF SRS JOBS HAPPENING AT ONCE SO IT SHOULD BE OK
            gp_df = get_garbage_from_package(
                code_system_id, package_id, package_arg_type="package_id"
            )
            assert len(gp_df) != 0, \
                "Found 0 codes for package {}".format(package_id)
            gp_dfs.append(gp_df)
        gp_df = pd.concat(gp_dfs, ignore_index=True)
        # bring the replacement code onto every garbage code in package
        gp_df = gp_df.merge(custom_grbg, how='left')
        report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
        gp_df = gp_df[['value', 'new_code_id']]
        # strip whitespace so the value merge below does not miss
        gp_df['value'] = gp_df['value'].str.strip()
        df = df.merge(gp_df, how='left', on='value')
        # only rows that matched a garbage package get a new code_id
        df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
        df['code_id'] = df['code_id'].astype(int)
        df = df.drop(['new_code_id', 'value'], axis=1)
    # source column was only needed for the SRS check
    df = df.drop('source', axis=1)
    china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
    # J96.00 - move five to four digit J96.0 (this should be a rule in
    # formatting, only keep 4 digit detail)
    five_dig_code = df['code_id'] == 13243
    df.loc[
        china_cdc_2008 & five_dig_code, 'code_id'
    ] = 13242
    return df
def main(years, qsub_out_dir, run_id, try_again=False):
    """Submit per-year worker jobs, wait for their outputs, and collect
    the results into one cleaned dataframe.

    Args:
        years: iterable of years to submit workers for.
        qsub_out_dir: directory the workers write their outputs to.
        run_id: run identifier passed through to the workers.
        try_again (bool): if True and some outputs are missing after the
            first watch, delete the stale jobs and resubmit them once.

    Returns:
        DataFrame: filtered, de-duplicated job outputs with integer id
            columns and bookkeeping columns dropped.

    Raises:
        AssertionError: if outputs are still missing after the retry, or
            if all-cause VR (extract_type_id 167) is present.
    """
    print("Submitting jobs...")
    for year in years:
        run_workers(year=year, run_id=run_id)
    print("Done submitting.")
    # wait for 20 minutes while checking for files
    print("Checking for files...")
    not_found = watch_jobs(years=years, data_dir=qsub_out_dir)
    # wait for all files to appear and maybe relaunch if they do not
    # if not all the files were found, and we want to try again,
    # delete the remaining jobs, and resubmit them.
    # deleting and THEN resubmitting is important, because
    # it prevents having two jobs altering the same file.
    if len(not_found) > 0 and try_again:
        print("Didn't find all files on the first try; Trying again...")
        # delete the remaining jobs
        print("Deleting remaining obsolete jobs...")
        qdel_obsolete_jobs()
        time.sleep(30)
        print("Re-submitting jobs that haven't yet completed...")
        # grab years from list of unfinished jobs (first 4 characters)
        for year in [x[:4] for x in not_found]:
            run_workers(year=year, run_id=run_id)
        print("Done re-submitting.")
        print("Checking for files...")
        not_found = watch_jobs(years=years, data_dir=qsub_out_dir)
    # This will end the code from running. Within the context of the
    # empirical deaths run_all system, which will be checking for the
    # output of this master script, this means that the run-all script
    # won't find the output file and will raise its own assertion error.
    assert len(
        not_found) == 0, "Not all files present, still missing {}".format(
        not_found)
    # still want to sleep some more in case some files are still writing
    time.sleep(30)
    # delete the remaining jobs
    print("Deleting remaining obsolete jobs...")
    qdel_obsolete_jobs()
    print("Collecting job outputs...")
    data = collect_qsub_results(data_dir=qsub_out_dir)
    # filter down to just the location_years we want
    location_years = get_location_years()
    data = filter_by_location_and_year(data, location_years)
    # add nid metadata
    data = add_nid_metadata(df=data, add_cols=['source', 'is_active'],
                            force_rerun=False, cache_dir='standard')
    # filter out duplicates
    data = filter_duplicates(data.copy())
    # aggregate under one for certain loc-years
    data = aggregate_under_one(data)
    # check that there isn't any All Cause VR in the data
    assert_msg = ("There is all cause VR in the data; "
                  "This will lead to duplicates later in the process")
    assert (data.extract_type_id != 167).all(), assert_msg
    print("Done!")
    # id columns must be ints for downstream consumers
    for col in ['sex_id', 'age_group_id', 'year_id', 'location_id']:
        data[col] = data[col].astype(int)
    data = data.drop(['extract_type_id', 'site_id', 'is_active'], axis=1)
    return data
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data.

    Translates a model group string (e.g. "VR-USA", "VA-SRS-IND",
    "MATERNAL-{source}-{iso3}", "malaria...", "CHAMPS") into a set of
    claude-data filters, pulls the matching data, and attaches the NID
    metadata the model needs.

    Args:
        model_group (str): encodes data type / source / location
            restrictions for one noise-reduction model.
        location_hierarchy (DataFrame): location metadata with columns
            'level', 'ihme_loc_id', 'location_id', 'parent_id',
            'region_id'.
        location_set_version_id (int): passed through to get_claude_data.
        cause_meta_df (DataFrame): passed through to drop_source_data.

    Returns:
        DataFrame: data for the model group.

    Raises:
        AssertionError: if the model group is not recognized.
    """
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    # need to be string for later test that what comes after "VA-" is a
    # super region (otherwise, would have to compare ints, and whats after
    # "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')
    # location id here is the region id, and parent id is the super region
    # id; becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict()
    )
    regions_to_ids = location_hierarchy.query(
        'level == 2').set_index('ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()
    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangledesh, doesn't get India data
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
        model_group_filters['iso3'] = model_group[-3:]
    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        # BUG FIX: previously formatted the boolean flag into the message
        # ("Unrecognized model group: True"); report the actual group.
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group)
        )
    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters
    )
    add_cols = ['code_system_id']
    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)
    # add on code_system_id
    model_df = add_nid_metadata(
        model_df, add_cols, force_rerun=False, block_rerun=True,
        cache_dir='standard', cache_results=False
    )
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace)
        )
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid
    report_if_merge_fail(
        model_df, 'code_system_id', ['nid', 'extract_type_id']
    )
    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df
def special_cause_reassignment(self, df, code_system_id):
    """Replace the actual data cause under certain conditions.

    This essentially allows mapping based on not just the cause
    and code system but based on other information like the location,
    NID, year, etc.

    Args:
        df (DataFrame): data with cause
        code_system_id (int): code system of the data, used to look up
            code values and ids

    Returns:
        DataFrame: with any modifications
    """
    # read cached metadata without re-writing the cache
    cache_args = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': 'standard',
        'cache_results': False
    }
    # Some SRS codes get redistributed differently than
    # other ICD10 datasets
    df = add_nid_metadata(df, 'source', **cache_args)
    if (df['source'] == "India_SRS_states_report").any():
        print_log_message("Changing SRS codes to custom garbage groups")
        # this branch only makes sense on a pure SRS dataframe
        assert (df['source'] == "India_SRS_states_report").all()
        df = add_code_metadata(df, 'value',
                               code_system_id=code_system_id,
                               **cache_args)
        custom_grbg = pd.read_csv(
            self.cg.get_resource("srs_custom_garbage_groups"))
        # only active rows of the resource define a remap
        custom_grbg = custom_grbg.query('active == 1')
        custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
        # look up the code_id each custom garbage value maps to
        custom_grbg = add_code_metadata(custom_grbg, 'code_id',
                                        code_system_id=code_system_id,
                                        merge_col='value',
                                        **cache_args)
        custom_grbg = custom_grbg.rename(
            columns={'code_id': 'new_code_id'})
        custom_grbg = custom_grbg[['package_id', 'new_code_id']]
        gp_dfs = []
        for package_id in custom_grbg.package_id.unique():
            # NOTE: queries the database once per package
            gp_df = get_garbage_from_package(code_system_id, package_id,
                                             package_arg_type="package_id")
            assert len(gp_df) != 0, \
                "Found 0 codes for package {}".format(package_id)
            gp_dfs.append(gp_df)
        gp_df = pd.concat(gp_dfs, ignore_index=True)
        # bring the replacement code onto every garbage code in package
        gp_df = gp_df.merge(custom_grbg, how='left')
        report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
        gp_df = gp_df[['value', 'new_code_id']]
        # strip whitespace so the value merge below does not miss
        gp_df['value'] = gp_df['value'].str.strip()
        df = df.merge(gp_df, how='left', on='value')
        # only rows that matched a garbage package get a new code_id
        df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
        df['code_id'] = df['code_id'].astype(int)
        df = df.drop(['new_code_id', 'value'], axis=1)
    # source column was only needed for the SRS check
    df = df.drop('source', axis=1)
    china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
    # J96.00 - collapse five-digit detail to the four-digit J96.0 code
    five_dig_code = df['code_id'] == 13243
    df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242
    return df
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data.

    Translates a model group string (e.g. "VR-USA", "VA-SRS-IND",
    "MATERNAL-{source}-{iso3}", "malaria...") into claude-data filters,
    pulls the matching data, and attaches needed NID metadata.

    Args:
        model_group (str): encodes data type / source / location
            restrictions for one noise-reduction model.
        location_hierarchy (DataFrame): location metadata with columns
            'level', 'ihme_loc_id', 'location_id', 'parent_id',
            'region_id'.
        location_set_version_id (int): passed through to get_claude_data.
        cause_meta_df (DataFrame): passed through to drop_source_data.

    Returns:
        DataFrame: data for the model group.

    Raises:
        AssertionError: if the model group is not recognized.
    """
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    # stringify so they compare against the text after "VA-", which may
    # not be convertible to an int
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')
    # becomes a dict from super region id to list of its region ids
    super_region_to_region_ids = (super_region_to_region_ids[[
        'location_id', 'parent_id'
    ]].groupby('parent_id').apply(
        lambda df: list(set(df['location_id']))).to_dict())
    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()
    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    # MATERNAL groups follow MATERNAL-{source}-{iso3}
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
        model_group_filters['iso3'] = model_group[-3:]
    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        # BUG FIX: the message used to interpolate the boolean flag
        # (always "True") instead of the offending model group name.
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))
    model_df = get_claude_data(phase="aggregation",
                               is_active=True,
                               is_dropped=False,
                               location_set_id=35,
                               year_id=range(1980, 2050),
                               assert_all_available=True,
                               location_set_version_id=location_set_version_id,
                               **model_group_filters)
    add_cols = ['code_system_id']
    if model_group.startswith("VA") or model_group.startswith("MATERNAL") or \
            model_group in ["VR-RUS", "VR-R9"] or \
            model_group.startswith('malaria'):
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)
    # add on code_system_id
    model_df = add_nid_metadata(model_df, add_cols, force_rerun=False,
                                block_rerun=True, cache_dir='standard',
                                cache_results=False)
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat like Russia_FMD_ICD9 for the purpose of the cause list
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(cs=replace_csid,
                                                            s=replace_source,
                                                            n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid
    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])
    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df