def get_parentmapped_garbage(self, df):
    """Collapse garbage codes up to the parent of their cause package.

    Maps each code value to a garbage package and replaces cause_id with
    the package's parent_id. Rows whose value is in no package get a
    parent_id of -1 so non-garbage deaths remain countable (e.g. for
    sample size). Returns deaths aggregated by AGGREGATION_IDX_COLS.
    """
    assert 'code_id' in df.columns, \
        "Need a code_id to map to packages, but columns " \
        "were: {}".format(df.columns)
    package_id_to_parent_id = self.cause_package_hierarchy.set_index(
        'package_id', verify_integrity=True)['parent_id']
    value_to_package_id = self.package_map.set_index(
        'value', verify_integrity=True)['package_id']
    df = add_code_metadata(df, 'value', code_map=self.code_map)
    if self.remove_decimal:
        # BUG FIX: "." is a regex wildcard and str.replace historically
        # defaulted to regex=True, which stripped EVERY character from the
        # value instead of just the decimal point; regex=False does a
        # literal replacement
        df['value'] = df['value'].str.replace(".", "", regex=False)
    df['package_id'] = df['value'].map(value_to_package_id)
    df['parent_id'] = df['package_id'].map(package_id_to_parent_id)
    # still need to know number of deaths in non-garbage in some applications,
    # to get the sample size for example
    df.loc[df['package_id'].isnull(), 'parent_id'] = -1
    # keep only the parent as the working cause
    df['cause_id'] = df['parent_id']
    df = df.groupby(AGGREGATION_IDX_COLS, as_index=False)['deaths'].sum()
    return df
def check_vr_raw_causes(df):
    """Check for common mistakes in cause formatting for VR data.

    Issues a warning for each problem found; returns nothing.
    """
    if len(df.loc[df['data_type_id'].isin([9, 10])]) > 0:
        # only have checks for ICD10 and ICD9 detail data at the moment
        # to add more checks, remove this if block
        if len(df.loc[df['code_system_id'].isin([1, 6])]) > 0:
            # BUG FIX: previously a single `message` string was overwritten
            # by each successive check, so only the last detected problem
            # was reported; now all problems are collected and warned
            messages = []
            code_system_ids = df['code_system_id'].unique()
            for code_system_id in code_system_ids:
                cs_df = df.query(
                    "code_system_id == {}".format(code_system_id))
                cs_df = add_code_metadata(cs_df, 'value',
                                          code_system_id=code_system_id)
                if code_system_id == 6:
                    # check for N codes
                    ncode_df = cs_df.loc[cs_df['value'].str.contains('^[89]')]
                    if len(ncode_df) > 0:
                        messages.append(
                            "!!CONFIRM OR CHANGE TO E CODES!! \nNature of injury"
                            " codes will be mapped to garbage \n{}".format(
                                ncode_df.head()))
                if code_system_id == 1:
                    ucode_df = cs_df.loc[cs_df['value'].str.startswith('U0')]
                    if len(ucode_df) > 0:
                        messages.append(
                            "These codes should only be in US data"
                            " \n{}".format(ucode_df))
                    stcode_df = cs_df.loc[
                        cs_df['value'].str.contains('^[ST]')]
                    if len(stcode_df) > 0:
                        messages.append(
                            "Data contain S/T codes that will"
                            " mostly be mapped to garbage \n{}".format(
                                stcode_df))
            for message in messages:
                warnings.warn(message)
def calculate_cc_code(df, env_meta_df, code_map):
    """Append cc_code rows holding the remaining-deaths denominator.

    For each demographic group, cc_code deaths = envelope deaths minus
    observed deaths; the combined frame then sums to the envelope.
    """
    df_cc = df.copy()
    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()
    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    # BUG FIX: .any() only asserted that at least one cell was non-null,
    # which is vacuously true; .all() verifies there are no nulls at all
    assert df_cc.notnull().values.all()
    # append together
    df = pd.concat([df, df_cc], ignore_index=True)
    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)
    return df
def calculate_cc_code(df, env_meta_df, code_map):
    """Calculate total deaths denominator.

    Note: This step is usually done in formatting. Moving this calculation
    after age/sex splitting should return more accurate results for data
    that has a mix of known, detailed age groups and unknown ages.
    """
    df_cc = df.copy()
    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()
    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    # BUG FIX: .any() only asserted that at least one cell was non-null,
    # which is vacuously true; .all() verifies there are no nulls at all
    assert df_cc.notnull().values.all()
    # append together
    df = pd.concat([df, df_cc], ignore_index=True)
    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)
    return df
def assert_valid_mappings(self, df, code_system_id):
    """Test that the mapping worked.

    Runs a suite of assertions to make sure that mapping was successful.

    Args:
        df (DataFrame): with at least code_id and cause_id

    Returns:
        None

    Raises:
        AssertionError: Any condition fails
    """
    # add code value from cached code map
    print("Adding value")
    df = add_code_metadata(df, ['value'], code_system_id,
                           force_rerun=False,
                           block_rerun=True,
                           cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'value', 'code_id')
    # get acause from cached cause hierarchy
    print("Adding acause")
    df = add_cause_metadata(df, ['acause'],
                            cause_set_version_id=self.cause_set_version_id,
                            force_rerun=False,
                            block_rerun=True,
                            cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'acause', 'cause_id')

    # Test that all causes starting with 'acause_' are mapped correctly.
    # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
    # 'acause__gc_X59' should be mapped to '_gc', etc.
    print("Checking implied acauses")
    # .copy() so the assignments below don't hit a view of df
    # (SettingWithCopyWarning)
    check_df = df.loc[df['value'].str.startswith('acause_')].copy()
    check_df['implied_acause'] = \
        check_df['value'].str.replace('acause_', '', 1)
    check_df.loc[check_df['value'].str.contains("acause__gc"),
                 'implied_acause'] = "_gc"
    bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
    if len(bad_df) > 0:
        bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
        raise AssertionError(
            "These code values do not match their acause: "
            "\n{}".format(bad_stuff))

    print("Checking for bad values")
    # assert incorrect acauses are gone
    bad_acauses = [
        'acause_digest_gastrititis', 'acause_hiv_tb', 'acause_tb_drug'
    ]
    bad_values = df.loc[df['value'].isin(bad_acauses)].value.unique()
    if len(bad_values) > 0:
        # BUG FIX: this branch previously formatted `bad_stuff`, a name only
        # bound inside the unrelated branch above (NameError / stale value)
        raise AssertionError(
            "Found these bad code values in the data: {}".format(
                bad_values))
def get_computed_dataframe(self, df):
    """Return mapped dataframe."""
    # columns in the raw data that hold cause codes
    code_cols = MCoDMapper.get_code_columns(df)
    df = MCoDMapper.fix_icd_codes(df, code_cols, self.code_system_id)

    print_log_message("Mapping underlying cause/primary diagnosis")
    full_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                             **self.cache_options)
    ucod_map = MCoDMapper.prep_cause_map(full_map)
    df['cause_mapped'] = df['cause'].map(ucod_map)

    print_log_message(
        "Trimming ICD codes and remapping underlying cause/primary diagnosis"
    )
    df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, ucod_map,
                                   self.code_system_id)
    report_if_merge_fail(df, 'cause_mapped', 'cause')

    # merge on the cause_id for the underlying cause
    df = df.rename(columns={'cause_mapped': 'code_id'})
    df['code_id'] = df['code_id'].astype(int)
    df = add_code_metadata(df, 'cause_id',
                           code_map_version_id=self.code_map_version_id,
                           **self.cache_options)
    report_if_merge_fail(df, 'cause_id', 'code_id')

    print_log_message("Mapping chain causes")
    # get the special intermediate cause map
    chain_map = self.prep_int_cause_map()
    df = MCoDMapper.map_cause_codes(df, chain_map, self.int_cause)

    print_log_message("Trimming ICD codes and remapping chain causes")
    chain_cols = [col for col in df.columns if self.int_cause in col]
    chain_col_pairs = MCoDMapper.prep_raw_mapped_cause_dictionary(
        code_cols, chain_cols)
    df = MCoDMapper.trim_and_remap(df, chain_col_pairs, chain_map,
                                   self.code_system_id)

    print_log_message(
        "Identifying rows with intermediate cause of interest")
    df = self.capture_int_cause(df, chain_cols)
    if not self.drop_p2:
        df = self.set_part2_flag(df)
    return df
def get_computed_dataframe(self, df, code_system_id):
    """Map code id to cause id."""
    # FIX: the docstring above was previously placed after the first
    # statement, where Python treats it as a discarded expression rather
    # than the function's docstring
    # make special cause adjustments
    df = self.special_cause_reassignment(df, code_system_id)
    print_log_message("Merging with cause map")
    # get code metadata from a file already cached
    df = add_code_metadata(df, ['cause_id'], code_system_id,
                           code_map=self.code_map)
    report_if_merge_fail(df, 'cause_id', 'code_id')
    print("Asserting it's all good")
    self.assert_valid_mappings(df, code_system_id)
    df = self.drop_unnecessary_causes(df, self.unnecessary_causes)
    print("Collapsing")
    df = self.collapse_and_sum_by_deaths(df)
    return df
def assign_code_to_created_target_deaths(df, code_system_id, cause_meta_df):
    """Assign a code_id to target rows created by a merge.

    Rows flagged '_merge' == 'right_only' were created (not in the
    original data); they get an 'acause_<acause>' code value and its
    code_id, then are recombined with the original rows.
    """
    created = df[df['_merge'] == 'right_only']
    original = df[df['_merge'] != 'right_only']
    created = add_cause_metadata(created, 'acause',
                                 cause_meta_df=cause_meta_df)
    created['value'] = created['acause'].apply(lambda x: 'acause_' + x)
    created.drop(['code_id', 'acause'], axis=1, inplace=True)
    created = add_code_metadata(created, 'code_id',
                                code_system_id=code_system_id,
                                merge_col='value',
                                cache_dir=CONF.get_directory('db_cache'),
                                force_rerun=False,
                                block_rerun=True)
    report_if_merge_fail(created, 'code_id', ['value'])
    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent
    df = pd.concat([original, created])
    df.drop(['_merge', 'value'], axis=1, inplace=True)
    return df
def add_packages(df, code_system_id, remove_decimal, package_dir):
    '''Assign map value to garbage based on package.'''
    df = add_code_metadata(df, ['value'],
                           code_system_id=code_system_id,
                           force_rerun=False,
                           block_rerun=True)
    df['value'] = clean_icd_codes(df['value'], remove_decimal)
    df = assign_packages(df, code_system_id, remove_decimal, package_dir)
    df.drop('value', axis=1, inplace=True)
    # na=False so null map_ids form a valid boolean mask (a null map_id is
    # "no package"); matches the equivalent checks in add_map_ids
    assert len(df.loc[(df.cause_id != 743) &
                      (df.map_id.str.contains('_p_', na=False))]) == 0, \
        'Code(s) mapped to both a cause and a package'
    bad_garbage = df.loc[(df.cause_id == 743) &
                         ~(df.map_id.str.contains('_p_', na=False))]
    assert len(bad_garbage) == 0, \
        'Code(s) mapped to garbage but not a package: {}'.format(bad_garbage)
    return df
def add_map_ids(self, df):
    '''Assign map value to garbage based on package id.'''
    df = add_code_metadata(df, ['value'],
                           code_map_version_id=self.code_map_version_id,
                           **self.block_rerun)
    df['value'] = clean_icd_codes(df['value'], self.remove_decimal)
    # we do this extra step in downloading packages for ICD10, ICD9_detail
    if self.code_system_id in [1, 6]:
        df = remove_five_plus_digit_icd_codes(
            df, code_system_id=self.code_system_id, trim=True)
    df = self.assign_packages(df)

    # sanity checks: a row should carry a package map id exactly when it
    # is garbage (cause_id 743)
    is_garbage = df.cause_id == 743
    has_package = df.map_id.str.contains('_p_', na=False)
    double_mapped = df.loc[
        has_package & ~is_garbage,
        ['value', 'map_id', 'cause_id']].drop_duplicates()
    assert len(double_mapped) == 0, \
        'Code(s) mapped to both a cause and a package: {}'.format(
            double_mapped)
    unpackaged = df.loc[
        is_garbage & ~has_package, ['value', 'map_id']].drop_duplicates()
    assert len(unpackaged) == 0, \
        'Code(s) mapped to garbage but not a package: {}'.format(unpackaged)
    df.drop('value', axis=1, inplace=True)
    return df
def get_computed_dataframe(self):
    """Run the HIV correction and return the corrected dataframe.

    Skips countries that need no correction. When self.correct_garbage is
    set, works at code level (flagging sepsis/injury/HIV-related garbage)
    and collapses to cause level for the excess calculations; otherwise
    operates directly on self.df. Stores the full computed frame on
    self.diag_df for diagnostics and returns only the original columns.
    """
    keep_cols = self.df.columns
    if not self.country_needs_correction():
        print_log_message("Country doesn't need hiv correction")
        self.diag_df = None
        return self.df
    print_log_message("Getting rates df")
    rates_df = self.get_rates_df(self.cause_meta_df)
    if self.correct_garbage:
        # need code values to identify specific garbage groups
        df = add_code_metadata(self.df,
                               add_cols=['value'],
                               code_system_id=self.code_system_id,
                               force_rerun=False,
                               block_rerun=True,
                               cache_dir=self.cache_dir)
        df = self.identify_sepsis_gc(df, self.code_system_id)
        df = self.identify_injury_gc(df, self.code_system_id)
        df = self.identify_hivrd_gc(df, self.code_system_id)
        # collapse to cause level; keep the code-level copy for later
        group_cols = [
            x for x in keep_cols if x not in ['code_id', 'deaths']
        ]
        df_by_code = df.copy()
        df_by_cause = df.groupby(group_cols,
                                 as_index=False)['deaths'].sum()
    else:
        df_by_cause = self.df
    df = add_population(df_by_cause, pop_df=self.pop_df)
    print_log_message("Flagging correct dem groups for "
                      "{0} rows of data".format(len(df)))
    df = flag_correct_dem_groups(df, self.code_system_id,
                                 self.cause_meta_df, self.loc_meta_df,
                                 self.age_meta_df, rates_df,
                                 self.reference_ages,
                                 self.move_gc_age_restrictions,
                                 self.value_cols, self.pop_col,
                                 self.cause_selections_path,
                                 correct_garbage=self.correct_garbage)
    cause_to_targets_map = self.get_cause_to_targets_map(
        self.cause_meta_df)
    print_log_message("Identifying positive excess")
    df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                  self.reference_ages, self.loc_meta_df,
                                  self.cause_meta_df, self.value_cols,
                                  self.pop_col, self.correct_garbage)
    if self.correct_garbage:
        # push cause-level excess back down to the code level before moving
        df = self.calculate_garbage_positive_excess(
            df, df_by_code, group_cols)
        print_log_message("Moving excess to target")
        df = move_excess_to_target(df, self.value_cols,
                                   cause_to_targets_map,
                                   self.correct_garbage)
        # newly created target rows need a real code_id
        computed_df = assign_code_to_created_target_deaths(
            df, self.code_system_id, self.cause_meta_df)
    else:
        print_log_message("Moving excess to target")
        computed_df = move_excess_to_target(df, self.value_cols,
                                            cause_to_targets_map,
                                            self.correct_garbage)
    self.diag_df = computed_df
    return computed_df[keep_cols]
def set_restricted_cause(self, df):
    """Run a set of manual replacements, according to expert opinion."""
    # based on first letter of icd code, certain values should be filled in
    mapping_icd10 = {
        'A': 'B99.9',
        'B': 'B99.9',
        'C': 'D49.9',
        'D': 'D49.9',
        'I': 'I99.9',
        'J': 'J98.9',
        'K': 'K92.9',
        'V': 'Y89',
        'Y': 'Y89'
    }
    # add value field
    df = add_code_metadata(df, ['value'], self.code_system_id,
                           **self.standard_cache_options)
    report_if_merge_fail(df, 'value', 'code_id')
    df = df.rename(columns={'value': 'raw_cause'})

    # generate new column called "restricted_cause"
    # ZZZ is the default for all code systems
    raw_causes = self.prep_code_metadata()
    assert "ZZZ" in raw_causes.raw_cause.unique(), \
        "ZZZ must be in the map"
    df['restricted_cause'] = "ZZZ"
    df['restricted_code_id'] = raw_causes.query(
        "raw_cause == 'ZZZ'")["code_id"].values[0]
    df['restricted_cause_id'] = raw_causes.query(
        "raw_cause == 'ZZZ'")["cause_id"].values[0]

    # restrictions if code system is ICD10
    if self.code_system_id == 1:
        for key in mapping_icd10.keys():
            raw_cause = mapping_icd10[key]
            code_list = raw_causes.query(
                "raw_cause == '{}'".format(raw_cause))
            assert len(code_list) == 1, \
                "Found more than one code with value {} in code " \
                "system {}".format(raw_cause, self.code_system_id)
            new_code_id = code_list['code_id'].iloc[0]
            new_cause_id = code_list['cause_id'].iloc[0]
            df.loc[df['raw_cause'].str.startswith(key), [
                'restricted_cause', 'restricted_code_id',
                'restricted_cause_id'
            ]] = [raw_cause, new_code_id, new_cause_id]

        # replace restricted_cause = "acause_diarrhea"
        # if inlist(yll_cause,"digest_ibd","digest_vascular")
        # BUG FIX: raw_cause previously held the stale value left over from
        # the loop above ('Y89'), so restricted_cause disagreed with the
        # diarrhea code/cause ids assigned below; rebind it explicitly
        raw_cause = "acause_diarrhea"
        code_list = raw_causes.query('raw_cause == "acause_diarrhea"')
        assert len(code_list) == 1, \
            "Found more than one code with value {} in code " \
            "system {}".format(raw_cause, self.code_system_id)
        new_code_id = code_list['code_id'].iloc[0]
        new_cause_id = code_list['cause_id'].iloc[0]
        # changes for digest_ibd
        df.loc[df['cause_id'] == 532, ['restricted_cause']] = raw_cause
        df.loc[df['cause_id'] == 532, ['restricted_code_id']] = new_code_id
        df.loc[df['cause_id'] == 532,
               ['restricted_cause_id']] = new_cause_id
        # changes for digest_vascular
        df.loc[df['cause_id'] == 533, ['restricted_cause']] = raw_cause
        df.loc[df['cause_id'] == 533, ['restricted_code_id']] = new_code_id
        df.loc[df['cause_id'] == 533,
               ['restricted_cause_id']] = new_cause_id

    # restrictions if code system is ICD9
    if self.code_system_id == 6:
        df['numeric_cause'] = pd.to_numeric(df['raw_cause'],
                                            errors='coerce')
        # 0-140 to 139.8
        new_code_id = raw_causes.query(
            "raw_cause == '139.8'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == '139.8'")["cause_id"].values[0]
        df.loc[(df.numeric_cause >= 1) & (df.numeric_cause < 140), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "139.8", new_code_id, new_cause_id

        # replace restricted_cause = "239.9" if numeric_cause >= 140
        # & numeric_cause < 240
        new_code_id = raw_causes.query(
            "raw_cause == '239.9'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == '239.9'")["cause_id"].values[0]
        df.loc[(df.numeric_cause >= 140) & (df.numeric_cause < 240), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "239.9", new_code_id, new_cause_id

        # replace restricted_cause = "459.9" if numeric_cause >= 390
        # & numeric_cause < 460
        new_code_id = raw_causes.query(
            "raw_cause == '459.9'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == '459.9'")["cause_id"].values[0]
        df.loc[(df.numeric_cause >= 390) & (df.numeric_cause < 460), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "459.9", new_code_id, new_cause_id

        # replace restricted_cause = "5199" if numeric_cause >= 460
        # & numeric_cause < 520
        new_code_id = raw_causes.query(
            "raw_cause == '519.9'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == '519.9'")["cause_id"].values[0]
        df.loc[(df.numeric_cause >= 460) & (df.numeric_cause < 520), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "519.9", new_code_id, new_cause_id

        # replace restricted_cause = "578" if numeric_cause >= 520
        # & numeric_cause < 580
        new_code_id = raw_causes.query(
            "raw_cause == '578'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == '578'")["cause_id"].values[0]
        df.loc[(df.numeric_cause >= 520) & (df.numeric_cause < 580), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "578", new_code_id, new_cause_id

        # replace restricted_cause = "E989" if substr(cause,1,1) == "E"
        new_code_id = raw_causes.query(
            "raw_cause == 'E989'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == 'E989'")["cause_id"].values[0]
        df.loc[df['raw_cause'].str.startswith("E"), [
            'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
        ]] = "E989", new_code_id, new_cause_id

    assert pd.notnull(df.restricted_code_id).all()
    assert pd.notnull(df.restricted_cause_id).all()
    return df
def get_computed_dataframe(self):
    """Main method to execute computations and return result.

    Notes:
        UNDECIDED HOW TO DO THIS WITHOUT ALL YEARS IN MEMORY LIKE STATA HAD

        Potential solutions:
        1. Don't do this at all, just correct ANY cause-age-sex-location-year
           that exceeds the global reference rate
           - this would potentially change results slightly, but does not
             seem unreasonable, and in fact seems more correct
        2. Prime HIV correction by assembling the list ahead of time
           - might take a long time and need to be rerun every time, which
             would essentially double the required time for this step
           - advantage is that it mimics last years results without needing
             any additional years of data
           - could eliminate some of the problems with this method by
             running it very infrequently instead of every time the data
             changes
        3. Take a 'source' argument in the class and pull the other data
           that we pulled last year to pool years necessary to generate
           this list
        4. Run HIV correction with all the data for a 'source' altogether,
           like the stata code did, but still update versions based on
           nid-year

        FOR NOW: Follow method 1 and expect to test the similarity later
    """
    keep_cols = self.df.columns
    if not self.country_needs_correction():
        print_log_message("Country doesn't need hiv correction")
        self.diag_df = None
        return self.df
    print_log_message("Getting rates df")
    rates_df = self.get_rates_df(self.cause_meta_df)
    if self.correct_garbage:
        # need code values to identify specific garbage groups below
        df = add_code_metadata(self.df,
                               add_cols=['value'],
                               code_system_id=self.code_system_id,
                               force_rerun=False,
                               block_rerun=True,
                               cache_dir=self.cache_dir)
        df = self.identify_sepsis_gc(df, self.code_system_id)
        df = self.identify_injury_gc(df, self.code_system_id)
        df = self.identify_hivrd_gc(df, self.code_system_id)
        # do a groupby to collapse down to cause_id level for next steps
        group_cols = [
            x for x in keep_cols if x not in ['code_id', 'deaths']
        ]
        df_by_code = df.copy()
        df_by_cause = df.groupby(group_cols,
                                 as_index=False)['deaths'].sum()
    else:
        df_by_cause = self.df
    df = add_population(df_by_cause, pop_df=self.pop_df)
    print_log_message("Flagging correct dem groups for "
                      "{0} rows of data".format(len(df)))
    df = flag_correct_dem_groups(df, self.code_system_id,
                                 self.cause_meta_df, self.loc_meta_df,
                                 self.age_meta_df, rates_df,
                                 self.reference_ages,
                                 self.move_gc_age_restrictions,
                                 self.value_cols, self.pop_col,
                                 self.cause_selections_path,
                                 correct_garbage=self.correct_garbage)
    cause_to_targets_map = self.get_cause_to_targets_map(
        self.cause_meta_df)
    print_log_message("Identifying positive excess")
    df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                  self.reference_ages, self.loc_meta_df,
                                  self.cause_meta_df, self.value_cols,
                                  self.pop_col, self.correct_garbage)
    if self.correct_garbage:
        # push cause-level excess back down to code level before moving
        df = self.calculate_garbage_positive_excess(
            df, df_by_code, group_cols)
        print_log_message("Moving excess to target")
        df = move_excess_to_target(df, self.value_cols,
                                   cause_to_targets_map,
                                   self.correct_garbage)
        # newly created target rows need a real code_id
        computed_df = assign_code_to_created_target_deaths(
            df, self.code_system_id, self.cause_meta_df)
    else:
        print_log_message("Moving excess to target")
        computed_df = move_excess_to_target(df, self.value_cols,
                                            cause_to_targets_map,
                                            self.correct_garbage)
    self.diag_df = computed_df
    return computed_df[keep_cols]
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """Run the garbage-redistribution phase for one nid/extract.

    Optionally runs HIV garbage correction first, then splits the data
    into groups, redistributes each (in parallel when there is more than
    one group), reassembles the result, and checks that total deaths are
    unchanged within a tolerance. Raises AssertionError otherwise.
    """
    # caching behavior for all metadata reads in this phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)
    # baseline for the before/after deaths check at the end
    orig_deaths_sum = int(df['deaths'].sum())
    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))
    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        # cause_id 743 is the garbage cause
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        # move garbage to HIV before general redistribution
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df,
                                     age_meta_df, correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum, gca=after_gc_sum,
                   ta=after_deaths_sum))
    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # the nid-level code_system_id and the per-row one must agree
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])
    print_log_message("Formatting data for redistribution")
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)
    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)
    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # fail when totals drift by more than 2% or 5 deaths, whichever is larger
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df
def special_cause_reassignment(self, df, code_system_id):
    """Replace the actual data cause under certain conditions.

    This essentially allows mapping based on not just the cause
    and code system but based on other information like the
    location, NID, year, etc.

    Args:
        df (DataFrame): data with cause

    Returns:
        DataFrame: with any modifications
    """
    cache_args = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': 'standard',
        'cache_results': False
    }
    # Some SRS codes get redistributed differently than
    # other ICD10 datasets
    df = add_nid_metadata(df, 'source', **cache_args)
    if (df['source'] == "India_SRS_states_report").any():
        print_log_message("Changing SRS codes to custom garbage groups")
        # this branch assumes the frame is entirely SRS data
        assert (df['source'] == "India_SRS_states_report").all()
        df = add_code_metadata(df, 'value',
                               code_system_id=code_system_id,
                               **cache_args)
        # active custom garbage groups, each named by its group value
        custom_grbg = pd.read_csv(
            self.cg.get_resource("srs_custom_garbage_groups"))
        custom_grbg = custom_grbg.query('active == 1')
        custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
        custom_grbg = add_code_metadata(custom_grbg, 'code_id',
                                        code_system_id=code_system_id,
                                        merge_col='value',
                                        **cache_args)
        custom_grbg = custom_grbg.rename(
            columns={'code_id': 'new_code_id'})
        custom_grbg = custom_grbg[['package_id', 'new_code_id']]
        # gather every code belonging to each custom garbage package
        gp_dfs = []
        for package_id in custom_grbg.package_id.unique():
            gp_df = get_garbage_from_package(code_system_id, package_id,
                                             package_arg_type="package_id")
            assert len(gp_df) != 0, \
                "Found 0 codes for package {}".format(package_id)
            gp_dfs.append(gp_df)
        gp_df = pd.concat(gp_dfs, ignore_index=True)
        gp_df = gp_df.merge(custom_grbg, how='left')
        report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
        gp_df = gp_df[['value', 'new_code_id']]
        gp_df['value'] = gp_df['value'].str.strip()
        # reassign matching rows to the custom garbage group's code
        df = df.merge(gp_df, how='left', on='value')
        df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
        df['code_id'] = df['code_id'].astype(int)
        df = df.drop(['new_code_id', 'value'], axis=1)
    df = df.drop('source', axis=1)
    # NOTE(review): hard-coded remap for one China CDC 2008 extract —
    # presumably collapsing a 5-digit code (13243) to its 4-digit parent
    # (13242); confirm against the code table
    china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
    five_dig_code = df['code_id'] == 13243
    df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242
    return df
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """String together processes for redistribution.

    Optionally runs HIV garbage correction, splits the data into groups,
    redistributes each group (in parallel when there is more than one),
    reassembles the result, and checks total deaths are preserved within
    a tolerance; raises AssertionError otherwise.
    """
    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    # the iso3 of this data
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    # the code system id
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    # the data type
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)
    # baseline for the before/after deaths check at the end
    orig_deaths_sum = int(df['deaths'].sum())
    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))
    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        # cause_id 743 is the garbage cause
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df,
                                     age_meta_df, correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum, gca=after_gc_sum,
                   ta=after_deaths_sum))
    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])
    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)
    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)
    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df
def set_restricted_cause(self, df):
    """Attach fallback ("restricted") cause columns to each row.

    Adds three columns — 'restricted_cause', 'restricted_code_id',
    'restricted_cause_id' — giving the residual code a row should fall
    back to if its mapped cause is restricted. Defaults to the 'ZZZ'
    code; for ICD10 (code_system_id == 1) and ICD9 (code_system_id == 6)
    more specific residual codes are chosen from the first character /
    numeric range of the raw code.

    Args:
        df (DataFrame): data with 'code_id' (and, for the diarrhea
            special case, 'cause_id') columns.

    Returns:
        DataFrame: df with raw_cause and the three restricted_* columns
        added ('numeric_cause' is also added for ICD9).
    """
    # ICD10: first letter of the raw code -> residual code to fall back to
    mapping_icd10 = {'A': 'B99.9', 'B': 'B99.9', 'C': 'D49.9',
                     'D': 'D49.9', 'I': 'I99.9', 'J': 'J98.9',
                     'K': 'K92.9', 'V': 'Y89', 'Y': 'Y89'}
    df = add_code_metadata(
        df, ['value'], self.code_system_id,
        **self.standard_cache_options
    )
    report_if_merge_fail(df, 'value', 'code_id')
    df = df.rename(columns={'value': 'raw_cause'})

    raw_causes = self.prep_code_metadata()
    assert "ZZZ" in raw_causes.raw_cause.unique(), \
        "ZZZ must be in the map"
    # default fallback for everything: the ZZZ code
    df['restricted_cause'] = "ZZZ"
    df['restricted_code_id'] = raw_causes.query(
        "raw_cause == 'ZZZ'")["code_id"].values[0]
    df['restricted_cause_id'] = raw_causes.query(
        "raw_cause == 'ZZZ'")["cause_id"].values[0]

    if self.code_system_id == 1:
        for key, raw_cause in mapping_icd10.items():
            code_list = raw_causes.query(
                "raw_cause == '{}'".format(raw_cause))
            assert len(code_list) == 1, \
                "Found more than one code with value {} in code " \
                "system {}".format(raw_cause, self.code_system_id)
            new_code_id = code_list['code_id'].iloc[0]
            new_cause_id = code_list['cause_id'].iloc[0]
            df.loc[df['raw_cause'].str.startswith(key),
                   ['restricted_cause', 'restricted_code_id',
                    'restricted_cause_id']] = \
                [raw_cause, new_code_id, new_cause_id]

        # Special case: cause_ids 532 and 533 fall back to acause_diarrhea.
        # BUG FIX: this previously used the leftover loop variable
        # `raw_cause` (the last ICD10 mapping value, 'Y89') in both the
        # assert message and the 'restricted_cause' assignment, so the
        # label column disagreed with the acause_diarrhea ids set below.
        diarrhea_cause = "acause_diarrhea"
        code_list = raw_causes.query(
            "raw_cause == '{}'".format(diarrhea_cause))
        assert len(code_list) == 1, \
            "Found more than one code with value {} in code " \
            "system {}".format(diarrhea_cause, self.code_system_id)
        new_code_id = code_list['code_id'].iloc[0]
        new_cause_id = code_list['cause_id'].iloc[0]
        for diarrhea_cause_id in (532, 533):
            is_target = df['cause_id'] == diarrhea_cause_id
            df.loc[is_target, 'restricted_cause'] = diarrhea_cause
            df.loc[is_target, 'restricted_code_id'] = new_code_id
            df.loc[is_target, 'restricted_cause_id'] = new_cause_id

    if self.code_system_id == 6:
        # ICD9 codes are (mostly) numeric; non-numeric raw codes become
        # NaN and keep the ZZZ default
        df['numeric_cause'] = pd.to_numeric(
            df['raw_cause'], errors='coerce')
        # (lower inclusive, upper exclusive, residual code) chapter ranges
        icd9_ranges = [
            (1, 140, '139.8'),
            (140, 240, '239.9'),
            (390, 460, '459.9'),
            (460, 520, '519.9'),
            (520, 580, '578'),
        ]
        for lower, upper, target in icd9_ranges:
            new_code_id = raw_causes.query(
                "raw_cause == '{}'".format(target))["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '{}'".format(target))["cause_id"].values[0]
            df.loc[
                (df.numeric_cause >= lower) & (df.numeric_cause < upper),
                ['restricted_cause', 'restricted_code_id',
                 'restricted_cause_id']
            ] = target, new_code_id, new_cause_id
        # external-cause (E) codes fall back to E989
        new_code_id = raw_causes.query(
            "raw_cause == 'E989'")["code_id"].values[0]
        new_cause_id = raw_causes.query(
            "raw_cause == 'E989'")["cause_id"].values[0]
        df.loc[df['raw_cause'].str.startswith("E"),
               ['restricted_cause', 'restricted_code_id',
                'restricted_cause_id']] = "E989", new_code_id, new_cause_id

    # every row must have ended up with a usable fallback
    assert pd.notnull(df.restricted_code_id).all()
    assert pd.notnull(df.restricted_cause_id).all()
    return df
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS',
                        is_active=False, refresh_cache=True,
                        check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already
    a column in the dataset.

    Needs the following information from either the df values or from
    the nid_meta_vals dict:

        data_type_id
        representative_id

    All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload
    any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df is (should be the whole source and
            nothing but the source). Will break if there is no source in
            FILEPATH with this name, and you should pass the source
            without a leading underscore even if it is that way in J
        write, bool: whether to write the outputs
        code_system_id: NOTE(review): appears unused — the per-code-system
            loop below rebinds this name from df. Confirm before removing.
        extract_type, str: The manner in which the nid was extracted. If
            left as None, will be induced by the location_type_id of the
            location_id with the maximum level in the dataset. This should
            be over-ridden in cases like China DSP, where the same
            locations are used in two extraction types - "DSP + VR" and
            "DSP"; China DSP then gets two extraction types: "admin1" and
            "admin1: DSP sites only" (in the particular instance of DSP,
            extract type is built into this code. Feel free to add other
            source-extract type mappings here to force consistency.)
        conn_def, str: connection definition used for site/extract-type
            mapping and the nid-table writes
        is_active, bool: deprecated — triggers a warning below; activity
            status now comes from pull_nid_metadata()
        refresh_cache, bool: whether to refresh claude nid cache files
            after writing
        check_ages, bool: Whether or not to enforce age group checks such
            as ensuring no overlaps or gaps. This can be turned off
            because sometimes raw data reports overlapping age groups
            (e.g. Palestine data has Gaza Strip and West Bank data with
            different age groupings).

    Returns:
        Every local value to the function
        Why? There are multiple df outputs, and formatting is a very
        engaged process so its helpful to just see everything sometimes
    """
    # set column groups, and verify that we have everything we need
    # columns that uniquely describe an nid's metadata row
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    # columns for the nid location-year map table
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    # identifier columns of the final formatted-phase output
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id',
        'year_id', 'age_group_id', 'location_id'
    ]
    # detect whether codes still need to be mapped to code_ids
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id',
        code_col, 'site', 'data_type_id', 'representative_id',
        'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(
            missing_cols)

    # SET FORMATTING TIMESTAMP
    # this timestamp doubles as the launch_set_id for write_phase_output
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        # map 'cause' values to code_ids one code system at a time
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, "Found code ids in data that can't exist in code "\
            "systems {}: {}".format(code_system_ids, bad_codes)

    # warn-only sanity checks on raw cause values for VR data
    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)

    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # sorry for putting this here
    # drop these loc/years b/c env < deaths creating negative cc_code
    # maybe re run w/ another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError(
            "duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    # collapse rows that share all id columns but differ in deaths
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?
    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            """is_active is deprecated: use the update_nid_metadata_status
            function to change the status of finalized datasets"""
        )

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map, on=[
        'nid', 'parent_nid', 'extract_type_id', 'source',
        'data_type_id', 'code_system_id'
    ], how='left')

    # nid/extract_type pairs with no existing metadata row
    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()
    if df_na.shape[0] > 0:
        print("""New rows for the following NID/extract_type_id will be added
        with is_active and is_mort_active = 0:\n {}""".format(df_na))
    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        # write to cod.source for new sources
        insert_source_id(source)

        # write one formatted-phase file per nid/extract_type pair
        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                               launch_set_id)
        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()

    return locals()
def special_cause_reassignment(self, df, code_system_id):
    """Replace the actual data cause under certain conditions.

    There are instances where a PI has good reason to believe
    that a certain group of deaths were assigned to the wrong cause,
    and it is known what cause to re-assign those deaths to. Implement
    here.

    This essentially allows mapping based on not just the cause
    and code system but based on other information like the location,
    NID, year, etc.

    It can also be used (sparingly) for hotfixes like changing
    all codes with values 'acause_digest_gastrititis' to be named
    'acause_digest_gastritis'.

    Args:
        df (DataFrame): data with cause
        code_system_id (int): code system of the data, used to look up
            code values and custom garbage packages

    Returns:
        DataFrame: with any modifications
    """
    # read from cache without re-running or re-caching
    cache_args = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': 'standard',
        'cache_results': False
    }
    # Some SRS codes get redistributed differently than
    # other ICD10 datasets
    df = add_nid_metadata(
        df, 'source', **cache_args
    )
    if (df['source'] == "India_SRS_states_report").any():
        print_log_message("Changing SRS codes to custom garbage groups")
        # SRS handling assumes the whole df is SRS, not a mix of sources
        assert (df['source'] == "India_SRS_states_report").all()
        df = add_code_metadata(
            df, 'value', code_system_id=code_system_id,
            **cache_args
        )
        # mapping of active SRS custom garbage groups to package_ids
        custom_grbg = pd.read_csv(
            self.cg.get_resource("srs_custom_garbage_groups")
        )
        custom_grbg = custom_grbg.query('active == 1')
        custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
        # look up the code_id of each custom garbage group's value
        custom_grbg = add_code_metadata(
            custom_grbg, 'code_id',
            code_system_id=code_system_id,
            merge_col='value', **cache_args
        )
        custom_grbg = custom_grbg.rename(
            columns={'code_id': 'new_code_id'})
        custom_grbg = custom_grbg[['package_id', 'new_code_id']]
        # collect all garbage code values for each package
        gp_dfs = []
        for package_id in custom_grbg.package_id.unique():
            # THIS QUERIES THE DATABASE - BUT THERE SHOULD NEVER BE A TON
            # OF SRS JOBS HAPPENING AT ONCE SO IT SHOULD BE OK
            gp_df = get_garbage_from_package(
                code_system_id, package_id, package_arg_type="package_id"
            )
            assert len(gp_df) != 0, \
                "Found 0 codes for package {}".format(package_id)
            gp_dfs.append(gp_df)
        gp_df = pd.concat(gp_dfs, ignore_index=True)

        # map each garbage code value to the custom group's code_id
        gp_df = gp_df.merge(custom_grbg, how='left')
        report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
        gp_df = gp_df[['value', 'new_code_id']]
        gp_df['value'] = gp_df['value'].str.strip()
        df = df.merge(gp_df, how='left', on='value')
        # only rows whose value matched a garbage package get reassigned
        df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
        df['code_id'] = df['code_id'].astype(int)
        df = df.drop(['new_code_id', 'value'], axis=1)
    # 'source' was only needed for the SRS check above
    df = df.drop('source', axis=1)

    # hotfix scoped to one China CDC 2008 extraction
    china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
    # J96.00 - move five to four digit J96.0 (this should be a rule in
    # formatting, only keep 4 digit detail)
    five_dig_code = df['code_id'] == 13243
    df.loc[
        china_cdc_2008 & five_dig_code,
        'code_id'
    ] = 13242
    return df
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS',
                        is_active=True):
    """Finalize the formatting of the source and optionally write it out.

    NOTE(review): this appears to be an older variant of
    finalize_formatting (no age-group checks, no is_mort_active handling,
    is_active is set directly from the argument rather than pulled from
    nid metadata, and timestamps use 'last_updated_timestamp'). Confirm
    which definition is canonical before consolidating.

    Returns:
        Every local value to the function (via locals()).
    """
    # columns that uniquely describe an nid's metadata row
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    # columns for the nid location-year map table
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    # identifier columns of the final formatted-phase output
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id',
        'year_id', 'age_group_id', 'location_id'
    ]
    # detect whether codes still need to be mapped to code_ids
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id',
        code_col, 'site', 'data_type_id', 'representative_id',
        'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            """These columns are needed for formatting but not found in df:
            {}
            """.format(missing_cols))

    # SET FORMATTING TIMESTAMP
    # this timestamp doubles as the launch_set_id for write_phase_output
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        # map 'cause' values to code_ids one code system at a time
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            # show which values failed to map before the hard failure below
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE
        # CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        # NOTE(review): this only warns, unlike the stricter variant that
        # asserts — confirm whether a hard failure is intended here
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)

    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # drop problem nid/year observations
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    # checks for formatted phase output
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError(
            "duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    # collapse rows that share all id columns but differ in deaths
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan
    # coerce the is_active flag to 0/1
    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        insert_source_id(source)

        # write one formatted-phase file per nid/extract_type pair
        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                               launch_set_id)

        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)

    return locals()