def get_package_code_ids(regression_specification, code_system_id):
    """Return code_ids for garbage codes in the package for a code system.

    Looks up the package description named in the regression specification
    for the given code system, loads that package's garbage codes from
    disk, and returns the matching code_ids from the cause map.
    """
    package_description = regression_specification[
        'package_descriptions'
    ][code_system_id]
    packages = get_package_list(code_system_id)
    package_id = packages.loc[
        packages['package_description'] == package_description,
        'package_id'
    ]
    # exactly one package should match the configured description
    assert len(package_id) == 1
    package_id = package_id.iloc[0]
    # context manager so the JSON file handle is closed promptly
    # (the original json.load(open(...)) leaked the handle)
    with open("FILEPATH".format(code_system_id, package_id)) as pkg_file:
        pkg = json.load(pkg_file)
    garbage_codes = list(pkg['garbage_codes'])
    code_map = get_cause_map(code_system_id=code_system_id,
                             force_rerun=False)
    # strip decimal points literally; regex=False is required because "."
    # as a regex matches every character and would blank out every value
    code_map['value'] = code_map['value'].str.replace(".", "", regex=False)
    is_package_garbage = code_map['value'].isin(garbage_codes)
    garbage_code_ids = list(code_map.loc[
        is_package_garbage, 'code_id'
    ].unique())
    return garbage_code_ids
def get_code_ids_from_map_ids(self, map_id):
    """Return ({map_id: code_id}, {map_id: cause_id}) for one map_id.

    map_id is either a package identifier prefixed with '_p_' (mapped to
    the garbage cause_id 743) or a stringified cause_id. When a cause has
    no codes in this code system, falls back to self.cc_code.
    """
    cs_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                           **self.block_rerun)
    pkg_map = get_clean_package_map_for_misdc(
        self.code_system_id, remove_decimal=self.remove_decimal)
    # isinstance is the idiomatic type check (type(x) == str rejects
    # str subclasses and is flagged by linters)
    assert isinstance(map_id, str)
    if map_id.startswith('_p_'):
        # package map_id: collect its raw code values, then code_ids
        values = pkg_map.loc[pkg_map['map_id'] == map_id, 'value'].values
        codes = cs_map.loc[cs_map.value.isin(values), 'code_id'].values
        cause_id = 743
        assert len(
            codes) > 0, "No code_ids matching {} in the cause map".format(
                map_id)
    else:
        codes = cs_map.loc[cs_map.cause_id == int(map_id),
                           'code_id'].values
        cause_id = int(map_id)
        # fall back to cc_code when this cause has no codes here
        if len(codes) == 0:
            codes = cs_map.loc[cs_map.cause_id == self.cc_code,
                               'code_id'].values
            cause_id = self.cc_code
    # arbitrarily take the first matching code_id as representative
    code_id = codes[0]
    code_dict = {map_id: code_id}
    cause_dict = {map_id: cause_id}
    return code_dict, cause_dict
def merge_on_scaled(df, move_df, adjust_id, code_system_id):
    """Outer-merge scaled misdiagnosis deaths onto df and fill new rows.

    Rows present only in move_df arrive with null cause_id/code_id; they
    are assigned adjust_id (or cause 919 when the code system has no codes
    for adjust_id), and null identifier columns are filled afterwards.
    """
    df = df.merge(move_df[[
        'location_id', 'year_id', 'site_id', 'age_group_id', 'sex_id',
        'map_id', 'misdiagnosed_scaled'
    ]], how='outer')
    if len(df.loc[df.cause_id.isnull()]) > 0:
        # only rows for the adjustment cause should be unmatched
        assert all(
            df.loc[df.cause_id.isnull(),
                   'map_id'].values == str(adjust_id)), \
            'Other missing map_ids'
        cs_map = get_cause_map(code_system_id=code_system_id,
                               force_rerun=False, block_rerun=True)
        possible_codes = cs_map.loc[cs_map.cause_id == adjust_id,
                                    'code_id'].values
        use_target = True
        if len(possible_codes) == 0:
            # fall back to cause 919 when the system lacks adjust_id codes
            possible_codes = cs_map.loc[cs_map.cause_id == 919,
                                        'code_id'].values
            use_target = False
        target_code = possible_codes[0]
        df.loc[df.code_id.isnull(), 'code_id'] = target_code
        if use_target:
            df.loc[df.cause_id.isnull(), 'cause_id'] = adjust_id
        else:
            df.loc[df.cause_id.isnull(), 'cause_id'] = 919
    # assign back instead of chained inplace fillna: fillna(inplace=True)
    # on a column selection can silently no-op under pandas copy-on-write
    df['deaths'] = df['deaths'].fillna(0)
    for exravar in ['nid', 'extract_type_id']:
        # forward-fill ids for rows introduced by the outer merge
        df[exravar] = df[exravar].ffill()
    for idvar in [i for i in list(df) if i.endswith('_id')] + ['nid']:
        # the outer merge upcasts id columns to float64; restore int
        if df[idvar].dtype == 'float64':
            df[idvar] = df[idvar].astype(int)
    return df
def format_source(release_date):
    """Run the full WHO source formatting pipeline for one release date.

    Reads the raw data, restricts to new location/years, maps locations,
    ages, and code ids, attaches NIDs, finalizes formatting, and — when
    WRITE is set — updates NID metadata status for the written extracts.
    """
    # read the raw data and the WHO provided country/year map
    df = read_data(release_date)
    country_map = get_country_map(release_date)
    # subset to just the new loc/years
    # also apply location/year restrictions
    df = subset_location_years(df, country_map)
    # map location information
    loc_meta = get_current_location_hierarchy(
        location_set_id=CONF.get_id('location_set'),
        location_set_version_id=CONF.get_id('location_set_version'),
        force_rerun=False, block_rerun=True)
    df = get_gbd_locations(df, country_map, loc_meta)
    # replicating age adjustments for WHO data from
    df = adjust_WHO_ages(df)
    # Limit the dataframe to the columns needed and melt ages wide to long
    df = melt_df(df)
    # assign age group ids
    df = get_age_group_ids(df)
    # map code ids and apply special remaps
    # NOTE(review): code_system_id 1 is hard-coded here — presumably ICD10;
    # confirm against the code system table
    cause_map = get_cause_map(1, force_rerun=False, block_rerun=True)
    df = map_code_id(df, cause_map)
    # add manual cols and cleanup
    df = cleanup(df)
    # apply nids
    df = map_nids(df, release_date)
    # apply any final special adjustments
    df = apply_special_adjustments(df)
    # final grouping and finalize formatting
    df = df[FINAL_FORMATTED_COLS]
    assert df.notnull().values.all()
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()
    # run finalize formatting
    locals_present = finalize_formatting(df, SYSTEM_SOURCE, write=WRITE)
    nid_meta_df = locals_present['nid_meta_df']
    # update nid metadata status
    if WRITE:
        nid_extracts = nid_meta_df[[
            'nid', 'extract_type_id'
        ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            # records come back as numpy scalars; cast for the db call
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            update_nid_metadata_status(nid, extract_type_id,
                                       is_active=IS_ACTIVE,
                                       is_mort_active=IS_MORT_ACTIVE)
def format_greenland():
    """Format Greenland Board of Health ICD10 deaths data (2014-2015)."""
    # load both annual extracts and align columns before stacking
    df_2014 = pd.read_excel(GRL_PATH_2014)
    df_2015 = pd.read_excel(GRL_PATH_2015)
    df_2014 = df_2014[['Year', 'Sex', 'ICD-10', 'Age', 'Deaths']]
    assert (df_2014.columns.values == df_2015.columns.values).all()
    df = pd.concat([df_2014, df_2015])

    # standard cleaning / id-mapping steps, applied in order
    for step in (clean_df, get_sex_id, get_nid, get_age_group_id,
                 fix_codes):
        df = step(df)

    # attach code_ids from the ICD10 cause map
    df = map_code_id(df, get_cause_map(code_system_id=1))

    # collapse to the final format and write out
    df = df[FINAL_FORMATTED_COLS]
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()
    finalize_formatting(df, 'Greenland_BoH_ICD10', write=True)
def prep_code_metadata(self):
    """Return the cause map trimmed to code_id / raw_cause / cause_id."""
    code_map = get_cause_map(
        self.code_system_id,
        **self.standard_cache_options
    )
    # keep only the columns needed downstream, exposing the raw code
    # value under the conventional 'raw_cause' name
    return (
        code_map[['code_id', 'value', 'cause_id']]
        .rename(columns={'value': 'raw_cause'})
    )
def format_sri_lanka():
    """Format Sri Lanka 2013 tabulated ICD10 VR data."""
    # raw data from excel, then initial row/column cleanup
    df = pd.read_excel(path)
    df = clean_df(df)

    # sexes arrive wide: split_sexes reshapes by sex and manually sets
    # age groups, returning the pre-reshape death total so we can verify
    # no deaths are lost along the way
    df, initial_total = split_sexes(df)

    # ages wide -> long, checking death conservation
    df = pd.melt(df, id_vars=['cause_name', 'sex_id', 'value'],
                 var_name='age', value_name='deaths')
    assert np.allclose(initial_total, df.deaths.sum())
    df = get_age_ids(df)

    # disaggregate tabulated icd10 using redistribution fractions
    rdp = pd.read_stata(rdp_path)
    rdp = format_rdp_frac(rdp)
    df = disaggregate(df, rdp)

    # map code_ids using the ICD10_tabulated cause map
    cause_map = get_cause_map(code_system_id=9)
    df = map_code_id(df, cause_map)

    # manually assigned identifiers for this source
    df['location_id'] = 17      # Sri Lanka
    df['nid'] = 327524
    df['data_type_id'] = 9      # VR
    df['code_system_id'] = 9    # ICD10_tabulated
    df['year_id'] = 2013
    df['site'] = ""
    df['representative_id'] = 1

    # collapse to the final format and write out
    df = df[FINAL_FORMATTED_COLS]
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()
    finalize_formatting(df, "ICD10_tabulated", write=True)
def format_sri_lanka(): df = read_and_clean_data() # incoming data has sex data in wide format, following function splits df by # sex and manually sets age groups # function returns initial_total (a float to compare against deaths later to # ensure no deaths were lost in process) df, initial_total = split_sexes(df) # reshaping df age groups wide to long and assuring no deaths were lost df = pd.melt(df, id_vars=['cause_name', 'sex_id', 'value', 'year_id'], var_name='age', value_name='deaths') assert np.allclose(initial_total, df.deaths.sum()) df = get_age_ids(df) # importing and formatting rdp_frac dataframe to disaggregate tabulated icd10 rdp = pd.read_stata(rdp_path) rdp = format_rdp_frac(rdp) # disaggregating tabulated icd10 codes df = disaggregate(df, rdp) # mapping code_ids using cause map from engine room cause_map = get_cause_map(code_system_id=9) df = map_code_id(df, cause_map) # addition of manually added columns # Sri Lanka location id 17 df['location_id'] = 17 # nid 327524 df['nid'] = df.year_id.map({2007: 272959, 2013: 327524}) # data_type_id 9 (VR) df['data_type_id'] = 9 # code_system_id 9 (ICD10_tabulated) df['code_system_id'] = 9 # site: blank, representative_id: 1 df['site'] = "" df['representative_id'] = 1 # grouping by ID_COLS and assigning system source df = df[FINAL_FORMATTED_COLS] assert df.notnull().values.all() df[INT_COLS] = df[INT_COLS].astype(int) df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum() system_source = "ICD10_tabulated" # run finalize formatting finalize_formatting(df, system_source, write=WRITE) return df
def get_computed_dataframe(self, df): """Return mapped dataframe.""" # list of all cause columns raw_cause_cols = MCoDMapper.get_code_columns(df) df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id) print_log_message("Mapping underlying cause/primary diagnosis") cause_map = get_cause_map(code_map_version_id=self.code_map_version_id, **self.cache_options) code_map = MCoDMapper.prep_cause_map(cause_map) df['cause_mapped'] = df['cause'].map(code_map) print_log_message( "Trimming ICD codes and remapping underlying cause/primary diagnosis" ) df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map, self.code_system_id) report_if_merge_fail(df, 'cause_mapped', 'cause') # merge on the cause_id for the underlying cause df = df.rename(columns={'cause_mapped': 'code_id'}) df['code_id'] = df['code_id'].astype(int) df = add_code_metadata(df, 'cause_id', code_map_version_id=self.code_map_version_id, **self.cache_options) report_if_merge_fail(df, 'cause_id', 'code_id') print_log_message("Mapping chain causes") # get the special intermediate cause map int_cause_map = self.prep_int_cause_map() df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause) print_log_message("Trimming ICD codes and remapping chain causes") int_cause_cols = [x for x in df.columns if self.int_cause in x] int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary( raw_cause_cols, int_cause_cols) df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map, self.code_system_id) print_log_message( "Identifying rows with intermediate cause of interest") df = self.capture_int_cause(df, int_cause_cols) if not self.drop_p2: df = self.set_part2_flag(df) return df
def run_pipeline(nid, extract_type_id, launch_set_id, df, code_system_id,
                 cause_set_version_id, location_set_version_id, pop_run_id,
                 env_run_id, distribution_set_version_id, diagnostic=False):
    """Run the full pipeline.

    Applies overrides and scope drops, then (if any data remains) maps
    causes, age/sex splits, applies restriction corrections, and runs
    source-specific adjustments (Iran cc_code, NZ Maori/non-Maori).
    """
    # caching behavior shared by all metadata reads below
    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)
    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)
    source = get_value_from_nid(nid, "source", extract_type_id)
    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)
    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)
        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df, 'mapping', nid, extract_type_id,
                               launch_set_id, sub_dirs='diagnostic')
        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id, pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True,
                                    collect_diagnostics=False)
        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df, 'agesexsplit', nid,
                               extract_type_id, launch_set_id,
                               sub_dirs='diagnostic')
        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)
        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(
                df.deaths.sum()))
        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))
    print("\nDeaths at END: {}".format(df.deaths.sum()))
    return df
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """Redistribute garbage-coded deaths for one nid/extract_type.

    Optionally HIV-corrects garbage first, formats the data for
    redistribution, runs each split group (in parallel when there is more
    than one), reassembles the results, and checks that total deaths are
    conserved within a tolerance.
    """
    # caching behavior for all metadata reads in this phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    # the iso3 of this data
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    # the code system id
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    # the data type
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)
    orig_deaths_sum = int(df['deaths'].sum())
    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))
    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        # cause_id 743 is the garbage cause
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        # move garbage to HIV before general redistribution
        hiv_corrector = HIVCorrector(df, iso3, code_system_id,
                                     pop_meta_df, cause_meta_df,
                                     loc_meta_df, age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum, gca=after_gc_sum,
                   ta=after_deaths_sum))
    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # code_system_id comes from both the nid and the code map; make sure
    # they agree
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])
    print_log_message("Formatting data for redistribution")
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)
    split_groups = list(df.split_group.unique())
    # only parallelize when there is more than one split group
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id,
                                split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)
    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # fail if deaths changed by more than 2% or 5 deaths, whichever is
    # greater
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """String together processes for redistribution.

    Optionally HIV-corrects garbage first, formats the data for
    redistribution, runs each split group (in parallel when there is more
    than one), reassembles the results, and verifies total deaths are
    conserved within a tolerance.
    """
    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    # the iso3 of this data
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    # the code system id
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    # the data type
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)
    orig_deaths_sum = int(df['deaths'].sum())
    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))
    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        # cause_id 743 is the garbage cause
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df, iso3, code_system_id,
                                     pop_meta_df, cause_meta_df,
                                     loc_meta_df, age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum, gca=after_gc_sum,
                   ta=after_deaths_sum))
    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])
    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)
    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id,
                                split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)
    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df