def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """String together processes for redistribution."""
    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    # the iso3 of this data
    iso3 = get_value_from_nid(nid, 'iso3', extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    # the code system id
    code_system_id = int(get_value_from_nid(
        nid, 'code_system_id', extract_type_id=extract_type_id))
    # the data type
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)

        # move garbage to hiv first
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df,
                                     age_meta_df, correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()

        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage                   [gc deaths / total deaths]
            Before GC correction    [{gco} / {to}]
            After GC correction     [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum,
                   gca=after_gc_sum, ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map, **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two
    # places, make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), \
        "Variable code system id {} did not agree with all values of df " \
        "code system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)

    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # this seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")

    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message(
        "Done appending files - {} rows assembled".format(len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change of 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
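
# The closing tolerance check above fails the phase when redistribution
# changes the death total by more than 2% of the original or 5 deaths,
# whichever is greater. A minimal standalone sketch of that rule
# (hypothetical helper, not part of the pipeline):
def _deaths_close(orig_deaths_sum, after_deaths_sum, rel_tol=0.02,
                  abs_tol=5):
    """Return True when the post-redistribution total is within tolerance."""
    diff = abs(orig_deaths_sum - after_deaths_sum)
    return diff < max(rel_tol * orig_deaths_sum, abs_tol)

# e.g. _deaths_close(1000, 1019) -> True (19 < 20)
#      _deaths_close(100, 94)    -> False (6 >= max(2, 5))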
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    iso3 = get_value_from_nid(
        nid, 'iso3', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    # rake if appropriate based on this logic
    if ((data_type_id in [8, 9, 10] and source != 'Other_Maternal')
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(
            nid, extract_type_id, block_rerun=True,
            force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message("Raking sub national estimates, applying "
                                  "double raking for India Maternal")
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)

    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    # data with zero sample size is almost certainly some anomalous result
    # of a program generating data it shouldn't have, and it shouldn't be
    # included in codem models. Was probably already dropped, anyway,
    # before running noise reduction.
    df = df.query('sample_size != 0')
    # uploading data before 1980 is a waste of space because neither codem
    # nor codviz use it
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    # use draws to make metrics for uncertainty to be used by CODEm and
    # CoDViz; also creates cf_final from mean of draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    # we did this in the old code -- no cfs over 1 nor below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]
    return df
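
# The per-column capping loop above bounds cause fractions to [0, 1]. A
# behavior-equivalent sketch using pandas' clip, assuming the same cf
# columns (illustrative only, not the pipeline's implementation):
import pandas as pd

def _cap_cause_fractions(df, cf_cols=('cf_final', 'cf_rd', 'cf_raw',
                                      'cf_corr')):
    """Bound cause-fraction columns to the [0, 1] interval."""
    out = df.copy()
    out[list(cf_cols)] = out[list(cf_cols)].clip(lower=0, upper=1)
    return out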
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    cache_dir = CONF.get_directory('FILEPATH')
    orig_deaths = df['deaths'].sum()
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    code_system_id = get_value_from_nid(nid, 'code_system_id',
                                        extract_type_id=extract_type_id)
    # this queries the database, maybe should be passed in directly
    code_system = get_code_system_from_id(code_system_id)
    source = get_value_from_nid(nid, 'source',
                                extract_type_id=extract_type_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **standard_cache_options)
    is_vr = data_type_id in [9, 10]

    if not skip_hiv_correction(source) and is_vr:
        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)
        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **standard_cache_options)
        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)
        # get the country
        iso3 = get_value_from_nid(
            nid, 'iso3', extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), \
            "Could not find iso3 for nid {}, extract_type_id {}".format(
                nid, extract_type_id)
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df,
                                     age_meta_df, correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    df = combine_with_rd_raw(df, nid, extract_type_id,
                             location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()
    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError(
            "Change of {} deaths [{}] to [{}]".format(diff, old, new))

    return df
def run_redistribution(input_data, signature_ids, proportion_ids, cause_map,
                       package_folder, residual_cause='cc_code',
                       diagnostic_output=False, first_and_last_only=False,
                       rerun_cause_map=True):
    """Most granular method of whole redistribution process."""
    data, signature_metadata, proportion_metadata = prep_data(
        input_data, signature_ids, proportion_ids,
        residual_cause=residual_cause)
    print_log_message("Importing packages")
    packages = get_packages(package_folder, cause_map)

    print_log_message("Evaluating cause map restrictions")
    nid = int(input_data.nid.unique().item())
    extract_type_id = int(input_data.extract_type_id.unique().item())
    cm_file = "FILEPATH".format(nid, extract_type_id)
    if not os.path.isfile(cm_file):
        rerun_cause_map = True
    if rerun_cause_map:
        cause_map_evaluated = evaluate_cause_restrictions(
            cause_map, proportion_metadata)
    else:
        cause_map_evaluated = pd.read_csv(cm_file)

    print_log_message("Run redistribution!")
    diagnostics_all = []
    seq = 0
    if first_and_last_only:
        first = packages[0]
        last = packages[-1]
        packages = [first, last]
    for package in packages:
        if not data_has_any_package_garbage(data, package):
            continue
        print_log_message(" package: {}".format(package['package_name']))
        print_log_message(" package_description: {}".format(
            package['package_description']))
        print_log_message(" Deaths before = " + str(data.freq.sum()))
        print_log_message(" Rows before = " + str(len(data)))
        print_log_message(" ... calculating proportions")
        proportions = get_proportions(data, proportion_metadata, package,
                                      cause_map_evaluated,
                                      residual_cause=residual_cause)
        print_log_message(" ... redistributing data")
        data, diagnostics = redistribute_garbage(data, proportions, package)
        data = data.loc[(data['freq'] > 0) |
                        (data['cause'] == residual_cause)]
        data = data.groupby(
            ['proportion_id', 'signature_id', 'cause']).sum().reset_index()
        if diagnostic_output:
            diagnostics['seq'] = seq
            add_cols = [
                'shared_package_version_id', 'package_version_id',
                'package_name', 'package_id'
            ]
            for add_col in add_cols:
                diagnostics[add_col] = package[add_col]
            seq += 1
            diagnostics_all.append(diagnostics)
        print_log_message(" Deaths after = " + str(data.freq.sum()))
        print_log_message(" Rows after = " + str(len(data)))
    print_log_message("Done!")
    data = pd.merge(data, signature_metadata, on='signature_id')
    if diagnostic_output:
        diagnostics = pd.concat(diagnostics_all).reset_index(drop=True)
    return data.loc[data.freq > 0], diagnostics, \
        signature_metadata, proportion_metadata
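
# run_redistribution skips a package when the data contain none of its
# garbage codes. data_has_any_package_garbage is defined elsewhere in the
# codebase; a plausible minimal sketch, assuming packages expose a
# 'garbage_codes' list as they do in redistribute_garbage below:
def _data_has_any_package_garbage(data, package):
    """True if any row's cause is one of the package's garbage codes."""
    return data['cause'].isin(package['garbage_codes']).any()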
def correct_misdiagnosis(df, nid, extract_type_id, code_system_id,
                         adjust_id, remove_decimal):
    conf = Configurator('standard')
    mc_process_dir = conf.get_directory('mc_process_data')
    package_dir = conf.get_directory('rd_process_inputs') + "FILEPATH"
    misdiagnosis_path = conf.get_resource('misdiagnosis_prob_path')
    if adjust_id == 543:
        misdiagnosis_version_id = 4
    elif adjust_id == 544:
        misdiagnosis_version_id = 3
    elif adjust_id == 500:
        misdiagnosis_version_id = 3
    misdiagnosis_path = misdiagnosis_path.format(
        adjust_id=adjust_id, version_id=misdiagnosis_version_id,
        code_system_id=code_system_id)

    start_deaths = df['deaths'].sum()
    start_deaths_target = df.loc[df.cause_id == adjust_id, 'deaths'].sum()
    start_deaths_cc = df.loc[df.cause_id == 919, 'deaths'].sum()

    df = df.loc[df.deaths > 0]

    print_log_message("Adding packages")
    df = add_packages(df, code_system_id, remove_decimal, package_dir)
    print_log_message("Getting deaths to move")
    move_df = get_deaths_to_move(df, adjust_id, misdiagnosis_path,
                                 mc_process_dir, nid, extract_type_id,
                                 code_system_id)
    print_log_message("Jumbling up deaths")
    df = death_jumble(df, move_df, adjust_id, code_system_id)

    print_log_message("Checking deaths jumbled well")
    end_deaths = df['deaths'].sum()
    end_deaths_target = df.loc[df.cause_id == adjust_id, 'deaths'].sum()
    end_deaths_cc = df.loc[df.cause_id == 919, 'deaths'].sum()

    assert abs(int(start_deaths) - int(end_deaths)) <= 5, \
        'Bad jumble - added/lost deaths ' \
        '(started: {}, ended: {})'.format(str(int(start_deaths)),
                                          str(int(end_deaths)))

    print_log_message("Storing intermediate data")
    store_intermediate_data(df, move_df, mc_process_dir, adjust_id, nid,
                            extract_type_id)

    print_log_message('Deaths moved: ' + str(
        int((end_deaths_target + end_deaths_cc) -
            (start_deaths_target + start_deaths_cc))))

    return df
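
# correct_misdiagnosis only moves deaths between the adjust cause, the
# cc_code cause (919), and package codes, so the grand total must be
# conserved to within 5 deaths. A toy illustration of that accounting,
# with made-up numbers:
start = {'adjust': 120.0, 'cc_code': 30.0, 'other': 850.0}
end = {'adjust': 150.0, 'cc_code': 25.0, 'other': 825.0}
assert abs(sum(start.values()) - sum(end.values())) <= 5
deaths_moved = (end['adjust'] + end['cc_code']) - \
    (start['adjust'] + start['cc_code'])  # 25.0 moved, total conserved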
def get_proportions(data, proportion_metadata, package, cause_map_evaluated,
                    residual_cause='cc_code'):
    weight_groups = find_weight_groups(package, proportion_metadata,
                                       filter_impossible=True,
                                       verify_integrity=False)

    print_log_message(" -Identifying targets")
    targets = []
    for tg in package['target_groups']:
        temp = pd.DataFrame(
            {'cause': package['target_groups'][tg]['target_codes']})
        temp['target_group'] = tg
        targets.append(temp)
    targets = pd.concat(targets).reset_index(drop=True)

    print_log_message(" -Pulling data counts - 1")
    proportions = []
    for pid in weight_groups['proportion_id']:
        temp = targets.copy(deep=True)
        if package['create_targets'] == 1:
            temp['freq'] = 0.001
        else:
            temp['freq'] = 0
        temp['proportion_id'] = pid
        proportions.append(temp)

    print_log_message(" -Pulling data counts - 2")
    tg_dict = {}
    for tg in package['target_groups']:
        tg_dict[tg] = package['target_groups'][tg]['target_codes']
    tg_df = pd.DataFrame.from_dict(
        tg_dict, orient='index').stack().reset_index()
    tg_df.columns = ['target_group', 'index', 'cause']
    tg_df = tg_df[['target_group', 'cause']]
    tg_df = tg_df.merge(data[['proportion_id', 'cause', 'freq']],
                        on='cause', how='left')
    proportions.append(tg_df)

    print_log_message(" -Pulling data counts - 3")
    proportions = pd.concat(proportions)

    print_log_message(" -Pulling data counts - 4")
    proportions = proportions.sort_values(
        ['proportion_id', 'target_group', 'cause']).reset_index(drop=True)

    print_log_message(" -Pulling data counts - 5")
    proportions = proportions.set_index(
        ['proportion_id', 'target_group']).join(
            proportions.groupby(
                ['proportion_id', 'target_group']
            ).sum().rename(columns={'freq': 'total'}))

    print_log_message(" -Pulling data counts - 6")
    proportions.loc[proportions['total'] == 0, 'freq'] = 0.001
    proportions = proportions.drop('total', axis=1).reset_index()

    print_log_message(" -Pulling data counts - 7")
    proportions = pd.merge(proportions, proportion_metadata,
                           on='proportion_id')

    print_log_message(" -Pulling data counts - 8")
    print_log_message(" -Merging on cause restrictions")
    # merge on cause restrictions
    proportions = pd.merge(proportions, cause_map_evaluated,
                           on=['proportion_id', 'cause'], how='left')
    report_if_merge_fail(proportions, 'eval', ['proportion_id', 'cause'])
    # zero out if the cause is restricted
    proportions.loc[~proportions['eval'], 'freq'] = 0

    # calculate totals for each cause
    print_log_message(" -Calculating totals for each cause")
    proportions = proportions.loc[:, [
        'proportion_id', 'target_group', 'cause', 'freq'
    ]].groupby(['proportion_id', 'target_group',
                'cause']).sum().reset_index()

    # calculate totals for each target group & merge back on
    print_log_message(" -Calculating totals for each target group")
    proportions = proportions.set_index(
        ['proportion_id', 'target_group']).join(
            proportions.groupby(
                ['proportion_id', 'target_group']
            ).sum().rename(columns={'freq': 'total'}))
    proportions = pd.merge(proportions.reset_index(), weight_groups,
                           on='proportion_id')

    # merge on weights
    print_log_message(" -Merging on weights")
    weights = []
    for tg in package['target_groups']:
        wg = 0
        for wgt in package['target_groups'][tg]['weights']:
            weights.append({
                'target_group': tg,
                'weight_group': str(wg),
                'weight': wgt
            })
            wg += 1
    weights = pd.DataFrame(weights)
    proportions = pd.merge(proportions, weights,
                           on=['target_group', 'weight_group'])

    # calculate final proportions to apply
    print_log_message(" -Reformatting data type")
    for c in ['freq', 'weight', 'total']:
        proportions[c] = proportions[c].astype('float64')

    print_log_message(" -Calculating proportions")
    proportions['proportion'] = \
        (proportions.freq / proportions.total) * proportions.weight

    print_log_message(" -Adding residual causes where needed")
    # where a weight group has no deaths in the targets, send the full
    # weight to the residual cause
    zero_total = proportions.loc[
        proportions.total == 0,
        ['proportion_id', 'target_group', 'weight', 'total']
    ].drop_duplicates()
    zero_total['cause'] = residual_cause
    zero_total = zero_total.rename(columns={'weight': 'proportion'})
    zero_total = zero_total[
        ['proportion_id', 'proportion', 'cause']
    ].groupby(['proportion_id', 'cause']).sum().reset_index()
    nonzero_total = proportions.loc[
        proportions.total != 0, ['proportion_id', 'cause', 'proportion']
    ].groupby(['proportion_id', 'cause']).sum().reset_index()
    proportions = pd.concat(
        [zero_total, nonzero_total]).reset_index(drop=True)

    # again make sure everything sums to 1
    print_log_message(" -Make sure everything sums to 1")
    proportions = proportions.set_index(['proportion_id']).join(
        proportions.groupby(['proportion_id']).sum().rename(
            columns={'proportion': 'total'}))
    proportions['proportion'] = proportions.proportion / proportions.total
    proportions = proportions.reset_index()[[
        'proportion_id', 'cause', 'proportion'
    ]]
    return proportions
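
# A worked toy example of the proportion math above: within each
# proportion_id / target_group, freq is divided by the group total and
# scaled by the group weight, so proportions sum to 1 across groups.
# All values here are made up:
import pandas as pd

toy = pd.DataFrame({
    'proportion_id': [1, 1, 1],
    'target_group': ['a', 'a', 'b'],
    'cause': ['A10', 'A11', 'B20'],
    'freq': [30.0, 10.0, 5.0],
    'weight': [0.75, 0.75, 0.25],
})
toy['total'] = toy.groupby(
    ['proportion_id', 'target_group'])['freq'].transform('sum')
toy['proportion'] = toy.freq / toy.total * toy.weight
# A10: 30/40 * 0.75 = 0.5625; A11: 10/40 * 0.75 = 0.1875;
# B20: 5/5 * 0.25 = 0.25
assert abs(toy['proportion'].sum() - 1.0) < 1e-9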
def redistribute_garbage(data, proportions, package):
    """Redistribute the package's garbage deaths onto its target causes."""
    diagnostics = []

    # make sure the package contains all the codes in the proportions set
    print_log_message(" -Expanding proportions to signature id")
    temp = data[['proportion_id', 'signature_id']].drop_duplicates(
    ).reset_index(drop=True).copy(deep=True)
    proportions = pd.merge(temp, proportions, on='proportion_id')

    # tag garbage
    print_log_message(" -Tagging garbage")
    causes = data[['cause']].drop_duplicates()
    causes['garbage'] = 0
    causes.loc[causes['cause'].isin(package['garbage_codes']),
               'garbage'] = 1
    cause_garbage_map = causes.set_index('cause').to_dict()['garbage']
    data['garbage'] = data['cause'].map(cause_garbage_map)
    diagnostics.append(data.loc[data['garbage'] == 1])

    # get total number of garbage codes for each signature_id
    print_log_message(" -Summing garbage per signature id")
    temp = data.loc[data['garbage'] == 1,
                    ['proportion_id', 'signature_id', 'freq']].groupby(
        ['proportion_id', 'signature_id']).sum().reset_index()
    temp = temp.rename(columns={'freq': 'garbage'})

    print_log_message(" -Splitting garbage onto targets: merge")
    # redistribute garbage onto targets
    additions = pd.merge(proportions, temp,
                         on=['proportion_id', 'signature_id'], how='outer')
    print_log_message(" -Splitting garbage onto targets: multiply")
    for c in ['proportion', 'garbage']:
        additions[c] = additions[c].fillna(0)
    additions['freq'] = additions['proportion'] * additions['garbage']
    additions = additions.loc[
        additions['freq'] > 0,
        ['signature_id', 'proportion_id', 'cause', 'freq']]
    diagnostics.append(additions)

    print_log_message(" -Appending split garbage onto non-garbage")
    # zero out garbage codes
    data.loc[data['garbage'] == 1, 'freq'] = 0
    # tack on redistributed data
    data = pd.concat([data, additions])
    data = data.loc[:, ['proportion_id', 'signature_id', 'cause', 'freq']]
    data = data.reset_index(drop=True)

    # create diagnostics
    print_log_message(" -Making diagnostic dataframe")
    diagnostics = pd.concat(diagnostics)
    diagnostics['garbage'] = diagnostics['garbage'].fillna(0)
    # collapse to proportion id
    diagnostics = diagnostics.groupby(
        ['proportion_id', 'garbage', 'cause'])['freq'].sum().reset_index()

    # return outputs
    return data, diagnostics
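
# A tiny end-to-end demonstration of redistribute_garbage: the garbage
# code's freq is zeroed and re-added onto the targets in proportion. Toy
# inputs; the caller re-collapses by cause afterwards, as
# run_redistribution does above:
import pandas as pd

toy_data = pd.DataFrame({
    'proportion_id': [1, 1, 1],
    'signature_id': [7, 7, 7],
    'cause': ['G1', 'A', 'B'],  # 'G1' is the garbage code
    'freq': [10.0, 3.0, 1.0],
})
toy_proportions = pd.DataFrame({
    'proportion_id': [1, 1],
    'cause': ['A', 'B'],
    'proportion': [0.75, 0.25],
})
toy_package = {'garbage_codes': ['G1']}
out, diag = redistribute_garbage(toy_data, toy_proportions, toy_package)
# after collapsing by cause: A = 3 + 7.5 = 10.5, B = 1 + 2.5 = 3.5, G1 = 0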
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    if ((data_type_id in [8, 9, 10] and source != "Other_Maternal")
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(
            nid, extract_type_id, block_rerun=True,
            force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    df = df.query('sample_size != 0')
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]
    return df
def special_cause_reassignment(self, df, code_system_id):
    """Replace the actual data cause under certain conditions.

    This essentially allows mapping based on not just the cause and code
    system but based on other information like the location, NID, year,
    etc.

    Args:
        df (DataFrame): data with cause

    Returns:
        DataFrame: with any modifications
    """
    cache_args = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': 'standard',
        'cache_results': False
    }
    # some SRS codes get redistributed differently than other ICD10
    # datasets
    df = add_nid_metadata(df, 'source', **cache_args)

    if (df['source'] == "India_SRS_states_report").any():
        print_log_message("Changing SRS codes to custom garbage groups")
        assert (df['source'] == "India_SRS_states_report").all()

        df = add_code_metadata(df, 'value', code_system_id=code_system_id,
                               **cache_args)

        custom_grbg = pd.read_csv(
            self.cg.get_resource("srs_custom_garbage_groups"))
        custom_grbg = custom_grbg.query('active == 1')
        custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
        custom_grbg = add_code_metadata(custom_grbg, 'code_id',
                                        code_system_id=code_system_id,
                                        merge_col='value', **cache_args)
        custom_grbg = custom_grbg.rename(
            columns={'code_id': 'new_code_id'})
        custom_grbg = custom_grbg[['package_id', 'new_code_id']]

        gp_dfs = []
        for package_id in custom_grbg.package_id.unique():
            gp_df = get_garbage_from_package(
                code_system_id, package_id, package_arg_type="package_id")
            assert len(gp_df) != 0, \
                "Found 0 codes for package {}".format(package_id)
            gp_dfs.append(gp_df)
        gp_df = pd.concat(gp_dfs, ignore_index=True)

        gp_df = gp_df.merge(custom_grbg, how='left')
        report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
        gp_df = gp_df[['value', 'new_code_id']]
        gp_df['value'] = gp_df['value'].str.strip()

        df = df.merge(gp_df, how='left', on='value')
        df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
        df['code_id'] = df['code_id'].astype(int)
        df = df.drop(['new_code_id', 'value'], axis=1)

    df = df.drop('source', axis=1)

    china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
    # J96.00 - move five digit detail to four digit J96.0 (this should be
    # a rule in formatting: only keep 4 digit detail)
    five_dig_code = df['code_id'] == 13243
    df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242
    return df
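
# The SRS remap above is a join-then-overwrite: merge a value ->
# new_code_id lookup onto the data and replace code_id wherever a match
# was found. The same pattern on toy data:
import pandas as pd

codes = pd.DataFrame({'value': ['X1', 'X2', 'Y3'], 'code_id': [1, 2, 3]})
remap = pd.DataFrame({'value': ['X1', 'X2'], 'new_code_id': [100, 100]})
codes = codes.merge(remap, how='left', on='value')
codes.loc[codes['new_code_id'].notnull(), 'code_id'] = codes['new_code_id']
codes['code_id'] = codes['code_id'].astype(int)
codes = codes.drop('new_code_id', axis=1)
# code_id is now [100, 100, 3]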
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    # get filepaths
    cache_dir = CONF.get_directory('db_cache')
    orig_deaths = df['deaths'].sum()
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    code_system_id = get_value_from_nid(nid, 'code_system_id',
                                        extract_type_id=extract_type_id)
    code_system = get_code_system_from_id(code_system_id,
                                          **standard_cache_options)
    source = get_value_from_nid(nid, 'source',
                                extract_type_id=extract_type_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **standard_cache_options)
    is_vr = data_type_id in [9, 10]

    # run hiv correction on VR, but not Other_Maternal
    # countries to correct will be further pruned by the master cause
    # selections csv in the hiv corrector class
    if not skip_hiv_correction(source) and is_vr:
        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)
        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **standard_cache_options)
        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)
        # get the country
        iso3 = get_value_from_nid(
            nid, 'iso3', extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), \
            "Could not find iso3 for nid {}, extract_type_id {}".format(
                nid, extract_type_id)
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df,
                                     age_meta_df, correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    # apply redistribution of LRI to TB in under 15, non-neonatal ages
    # based on location/year specific proportions
    print_log_message(
        "Applying special redistribution of LRI to TB in under 15")
    lri_tb_redistributor = LRIRedistributor(df, cause_meta_df)
    df = lri_tb_redistributor.get_computed_dataframe()

    # merge in raw and rd here because recodes and bridge mapping should
    # also apply to the causes that are in previous phases (raw deaths for
    # secret codes need to be moved up to their parent cause, for example)
    df = combine_with_rd_raw(df, nid, extract_type_id,
                             location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()
    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError(
            "Change of {} deaths [{}] to [{}]".format(diff, old, new))

    return df
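
# The strata collapse above groups on every column that is not a value
# column; a minimal reusable sketch of that pattern (hypothetical helper,
# same blank site_id convention as the pipeline):
def _collapse_site(df, val_cols=('deaths', 'deaths_raw', 'deaths_corr',
                                 'deaths_rd'), blank_site_id=2):
    """Blank out site_id and sum value columns over the remaining strata."""
    df = df.copy()
    df['site_id'] = blank_site_id
    group_cols = [c for c in df.columns if c not in val_cols]
    return df.groupby(group_cols, as_index=False)[list(val_cols)].sum()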
def run_pipeline(year, source, int_cause, code_system_id,
                 code_map_version_id, cause_set_version_id, nid,
                 extract_type_id, data_type_id, inj_garbage,
                 diagnostic_acauses=None, explore=False, drop_p2=False):
    """Clean, map, and prep data for next steps."""
    print_log_message("Formatting data")
    formatting_method, args = get_formatting_method(
        source, data_type_id, year, drop_p2=drop_p2)
    df = formatting_method(*args)

    print_log_message("Dropping rows without multiple cause")
    df = drop_non_mcause(df, explore)

    print_log_message("Mapping data")
    Mapper = MCoDMapper(int_cause, code_system_id, code_map_version_id,
                        drop_p2=drop_p2)
    df = Mapper.get_computed_dataframe(df)

    cause_cols = [x for x in list(df) if "cause" in x
                  and not x.endswith("code_original")
                  and not x.endswith(int_cause)]
    # keep original "cause" information: the "cause" column is a string
    # name in the CoD cause map even after mapping to cause ids
    # (ex code id 103591)
    cause_cols.remove("cause_id")
    if source == "USA_NVSS":
        if code_system_id == 1:
            for col in cause_cols:
                df.loc[~df[col].str.match(
                    "(^[A-Z][0-9]{2,4}$)|(^0000$)"),
                    col] = df[f"{col}_code_original"]

    if inj_garbage:
        # FYI: this was a last minute addition to make plots of %X59/Y34
        # of injuries garbage for my manuscript; it's not needed for any
        # analysis
        print_log_message(
            "subsetting to only rows with UCOD as injuries garbage codes")
        package_list = pd.read_excel(
            "/homes/agesak/thesis/maps/package_list.xlsx",
            sheet_name="mohsen_vetted")
        # get a list of all injuries garbage package names
        inj_packages = package_list.package_name.unique().tolist()
        # get the garbage codes associated with these garbage packages
        garbage_df = engine_room.get_package_list(
            code_system_or_id=code_system_id, include_garbage_codes=True)
        # subset df to only rows with injuries garbage as UCOD
        df = apply_garbage_map(df, garbage_df, inj_packages)
    else:
        causes = get_most_detailed_inj_causes(
            int_cause, cause_set_version_id=cause_set_version_id,
            **{'block_rerun': True, 'force_rerun': False})
        df = df.loc[(df.cause_id.isin(causes)) |
                    ((df[int_cause] == 1) & (df.cause_id == 743))]

    df = format_for_bow(df, code_system_id)
    # subset to rows where UCOD is injuries or any death is X59/Y34
    df = df[[x for x in list(df) if not (x.endswith(int_cause)
                                         or x.endswith("code_original")
                                         or x.startswith("pII"))] +
            [int_cause, f"pII_{int_cause}", f"cause_{int_cause}"]]
    return df
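
# The USA_NVSS ICD-10 branch above keeps a mapped cause string only when
# it still looks like a code (a letter plus 2-4 digits, or the '0000'
# filler) and otherwise falls back to the *_code_original column. The
# regex on a few toy values:
import pandas as pd

vals = pd.Series(['I219', '0000', 'heart failure', 'A1'])
vals.str.match("(^[A-Z][0-9]{2,4}$)|(^0000$)")
# -> True, True, False, False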
def main(model_group, location_set_version_id, cause_set_version_id,
         launch_set_id):
    print_log_message(
        "Beginning NR modeling for model_group {}".format(model_group))
    cache_dir = CONF.get_directory('db_cache')
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }
    print_log_message("Preparing location hierarchy")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **read_file_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **read_file_cache_options)
    age_meta_df = get_ages(**read_file_cache_options)

    print_log_message("Preparing model data")
    model_df = get_model_data(model_group, location_hierarchy,
                              location_set_version_id, cause_meta_df)
    print_log_message("Got {} rows of model data".format(len(model_df)))
    if len(model_df) == 0:
        print_log_message("Exiting...")
        return

    # re-make deaths in nrmodelworker.R
    model_df = model_df.drop('deaths', axis=1)

    # get the unique cause_ids by code_system
    code_system_cause_dict = get_code_system_cause_ids(model_df)

    # square data for certain data types
    if model_group.startswith("VR") or model_group.startswith("Cancer"):
        print_log_message(
            "Bringing back zeros (squaring) so noise reduction "
            "knows to depress time series")
        squarer = Squarer(cause_meta_df, age_meta_df)
        model_df = squarer.get_computed_dataframe(model_df)
    elif "HH_SURVEYS" in model_group:
        model_df = square_dhs_data(model_df, cause_meta_df, age_meta_df,
                                   location_hierarchy)
    print_log_message(log_statistic(model_df))

    print_log_message("Restricting model data to only existing cause_ids")
    model_df = restrict_to_cause_ids(code_system_cause_dict, model_df)

    print_log_message("Adding NR location info")
    model_df = format_for_nr(model_df, location_hierarchy)

    if model_group_is_run_by_cause(model_group):
        run_phase_by_cause(model_df, model_group, launch_set_id)
    else:
        run_phase_by_model_group(model_df, model_group, launch_set_id)

    print_log_message("Job complete. Exiting...")
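
# "Squaring" inserts explicit zero observations so noise reduction sees a
# complete series rather than gaps. The Squarer class handles the full
# age/sex/cause metadata; a minimal sketch of the idea via a cartesian
# reindex on toy dimensions:
import pandas as pd

obs = pd.DataFrame({'year_id': [1990, 1992], 'cause_id': [322, 322],
                    'deaths': [4.0, 2.0]})
full = pd.MultiIndex.from_product([[1990, 1991, 1992], [322]],
                                  names=['year_id', 'cause_id'])
square = (obs.set_index(['year_id', 'cause_id'])
             .reindex(full, fill_value=0)
             .reset_index())
# 1991 now appears with deaths == 0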
def run_phase_by_cause(model_df, model_group, launch_set_id):
    """Run the model, parallelizing by country and cause."""
    # get cause list
    nocause = model_df[model_df['cause_id'].isnull()]
    if len(nocause) > 0:
        raise AssertionError("Have {} rows with missing cause: {}".format(
            len(nocause), nocause))
    causes = list(set(model_df['cause_id']))
    causes = [int(cause) for cause in causes]
    print_log_message("Writing NR input file and submitting jobs for "
                      "{} causes".format(len(causes)))

    log_base_dir = "FILEPATH" \
        "{launch_set_id}".format(user=getpass.getuser(),
                                 launch_set_id=launch_set_id)
    claude_dir = CONF.get_directory('claude_code')
    worker = "{claude}/run_phase_nrmodelworker.R".format(claude=claude_dir)

    slots = 5
    if model_group == 'VR-GBR':
        cores = 25
    else:
        cores = 15
    subnat_iso3s = CONF.get_id('subnational_modeled_iso3s')
    for subnat_iso3 in subnat_iso3s:
        if model_group == "VR-{}".format(subnat_iso3):
            slots = 18
    if model_group == 'VR-GBR':
        slots = 100
    language = "r"

    num_draws = CONF.get_resource('uncertainty_draws')
    if not modelgroup_has_redistribution_variance(model_group):
        num_draws = 0

    for cause_id in causes:
        write_nrmodel_data(model_df, model_group, launch_set_id,
                           cause_id=cause_id)
        params = [model_group, str(launch_set_id), str(num_draws),
                  str(cause_id)]
        jobname = "claude_nrmodelworker_{model_group}_{cause_id}".format(
            model_group=model_group, cause_id=cause_id)
        submit_cod(jobname, slots, language, worker, cores=cores,
                   params=params, verbose=(launch_set_id == 0),
                   logging=True, log_base_dir=log_base_dir)

    wait("claude_nrmodelworker_{model_group}".format(
        model_group=model_group), 30)

    nr_dir = CONF.get_directory('nr_process_data')
    iso_dir = "FILEPATH".format(nrdir=nr_dir, model_group=model_group)
    causes_outpath = "FILEPATH".format(iso_dir=iso_dir, lsid=launch_set_id)
    cause_path = "FILEPATH"
    for cause_id in causes:
        outpath = cause_path.format(iso_dir=iso_dir, cause_id=cause_id,
                                    lsid=launch_set_id)
        just_keep_trying(os.path.exists, args=[outpath], max_tries=250,
                         seconds_between_tries=6, verbose=True)

    print_log_message("Writing causes file to {}".format(causes_outpath))
    causes_df = pd.DataFrame({'cause_id': list(causes)})
    causes_df.to_csv(causes_outpath, index=False)
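
# just_keep_trying polls os.path.exists until each worker's output file
# lands. The real helper lives elsewhere in the codebase; a plausible
# minimal sketch matching the call signature used above:
import time

def _just_keep_trying(func, args=None, max_tries=250,
                      seconds_between_tries=6, verbose=False):
    """Call func(*args) until it returns truthy or tries are exhausted."""
    args = args or []
    for attempt in range(max_tries):
        if func(*args):
            return True
        if verbose:
            print("try {} failed, sleeping {}s".format(
                attempt + 1, seconds_between_tries))
        time.sleep(seconds_between_tries)
    raise RuntimeError("gave up after {} tries".format(max_tries))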
def get_model_data(model_group, location_hierarchy,
                   location_set_version_id, cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query(
        'level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    # need to be strings for the later test that what comes after "VA-" is
    # a super region (otherwise, would have to compare ints, and what's
    # after "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]

    super_region_to_region_ids = location_hierarchy.query('level == 2')
    # location id here is the region id, and parent id is the super region
    # id; becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict())
    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangladesh, doesn't get India data.
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
            model_group_filters['iso3'] = model_group[-3:]
    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))

    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters)

    add_cols = ['code_system_id']
    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(model_df, add_cols,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir='standard',
                                cache_results=False)

    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df
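
# The branching above is essentially a parser from model-group names to
# get_claude_data filters. A condensed sketch of just the VR branch,
# assuming the same "VR-" + iso3-or-region naming convention:
def _vr_filters(model_group, iso3s, regions, regions_to_ids):
    """Translate a VR model group name into claude data filters."""
    filters = {'data_type_id': [9, 10]}
    loc_code = model_group.replace("VR-", "")
    if loc_code in iso3s:
        filters['iso3'] = loc_code
    elif loc_code in regions:
        filters['region_id'] = regions_to_ids[loc_code]
    else:
        raise ValueError(
            "Unrecognized model group: {}".format(model_group))
    return filters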
def square_dhs_data(model_df, cause_meta_df, age_meta_df,
                    location_hierarchy):
    """Special squaring method for DHS.

    We want to represent DHS data as a continuous time series, but there
    are some gaps in location/years. The goal is to produce a squared
    dataframe that can be noise reduced and uploaded as a continuous time
    series.
    """
    # separate DHS from non-DHS data (e.g. NFHS, AHS, DLHS, RHS)
    dhs = model_df['survey_type'] == "DHS"
    non_dhs = model_df[~dhs]
    dhs = model_df[dhs]
    if len(dhs) > 0:
        # get df with id_cols to merge on after squaring
        nid_loc_df = model_df[
            ['nid', 'location_id', 'site_id', 'extract_type_id']
        ].drop_duplicates()
        print_log_message(
            "Bringing back zeros (squaring) so noise reduction "
            "knows to depress time series")
        squarer = Squarer(cause_meta_df, age_meta_df,
                          location_meta_df=location_hierarchy, dhs=True)
        dhs = squarer.get_computed_dataframe(dhs)

        # fill in some metadata
        dhs['code_system_id'].fillna(177, inplace=True)
        dhs['source'].fillna("Other_Maternal", inplace=True)
        dhs['survey_type'].fillna("DHS", inplace=True)

        # merge on NIDs, extract_type_ids
        dhs = nid_loc_df.merge(dhs, on=['location_id', 'site_id'],
                               how='right')

        # issue for Timor-Leste overlap w/ Indonesia; drop the Indonesia
        # duplicate
        tls = dhs.query('location_id == 19')
        dhs = dhs.query('location_id != 19')
        tls = tls.drop_duplicates(
            subset=['location_id', 'year_id', 'cause_id', 'age_group_id',
                    'sex_id', 'nid_y', 'extract_type_id_y'],
            keep='first')
        dhs = pd.concat([tls, dhs], ignore_index=True)

        # replace null values with merged
        dhs.loc[dhs['nid_y'].isnull(), 'nid_y'] = dhs['nid_x']
        dhs.loc[dhs['extract_type_id_y'].isnull(),
                'extract_type_id_y'] = dhs['extract_type_id_x']

        # clean up
        dhs.drop(['extract_type_id_x', 'nid_x', 'iso3'], axis=1,
                 inplace=True)
        dhs.rename(columns={'nid_y': 'nid',
                            'extract_type_id_y': 'extract_type_id'},
                   inplace=True)

        # fill in sample size for rows that were newly created;
        # want sample size to be > 0 for noise reduction
        dhs.loc[dhs['sample_size'] == 0, 'sample_size'] = 0.5

        # append all household survey types back together
        model_df = pd.concat([dhs, non_dhs], ignore_index=True)

    assert model_df.notnull().values.all()
    report_duplicates(
        model_df,
        ['year_id', 'sex_id', 'location_id', 'cause_id', 'age_group_id',
         'nid', 'extract_type_id', 'site_id'])
    return model_df
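
# The nid_x/nid_y cleanup above is a coalesce after a suffixed merge:
# prefer the right-hand value, fall back to the left. The same effect on
# toy frames using fillna:
import pandas as pd

left = pd.DataFrame({'location_id': [1, 2], 'nid': [111, 222]})
right = pd.DataFrame({'location_id': [1, 2, 2],
                      'nid': [111.0, None, 222.0],
                      'deaths': [5.0, 0.0, 3.0]})
merged = left.merge(right, on='location_id', how='right',
                    suffixes=('_x', '_y'))
merged['nid'] = merged['nid_y'].fillna(merged['nid_x'])
merged = merged.drop(['nid_x', 'nid_y'], axis=1)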
def get_model_data(model_group, location_hierarchy,
                   location_set_version_id, cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query(
        'level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict())
    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
            model_group_filters['iso3'] = model_group[-3:]
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))

    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters)

    add_cols = ['code_system_id']
    if model_group.startswith("VA") or model_group.startswith("MATERNAL") \
            or model_group in ["VR-RUS", "VR-R9"] \
            or model_group.startswith('malaria'):
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(model_df, add_cols,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir='standard',
                                cache_results=False)

    if model_group == "VR-RUS" or model_group == "VR-R9":
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(
                              cs=replace_csid, s=replace_source,
                              n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df