def project_data():
    ''' Projects previously-combined incidence data to special locations and
            to IHME location estimates, then saves the result
        Requires incidence data that are unique by
            location_id-year-sex-age-acause
    '''
    source_path = utils.get_path("combined_incidence", process="cod_mortality")
    dest_path = utils.get_path("projected_incidence", process="cod_mortality")
    projected = pd.read_csv(source_path)
    projected = project_to_special_locations(projected)
    projected = project_ihme_location_estimates(projected)
    projected.to_csv(dest_path, index=False)
    print("incidence data projected")
def get_age_frmat_map(frmat_type):
    ''' Loads map with indicators for which age groups need splitting

        -- Args
            frmat_type : str, either "im_frmat" (infant format) or "frmat"
                (standard age format)

        -- Raises
            ValueError if frmat_type is not a recognized format type
    '''
    if frmat_type == "im_frmat":
        resource = pd.read_csv(
            utils.get_path('im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat":
        resource = pd.read_csv(
            utils.get_path('frmat_map', process="mi_dataset"))
    else:
        # FIX: previously an unrecognized frmat_type fell through and raised
        # a confusing NameError on the line below; fail loudly instead
        raise ValueError("unrecognized frmat_type: {}".format(frmat_type))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return (resource)
def run():
    ''' Runs pipeline to combine previously-selected incidence data
        Requires incidence data that are unique by
            location_id-year-sex-age-acause
    '''
    out_path = utils.get_path("combined_incidence", process="cod_mortality")
    in_path = utils.get_path("prepped_incidence", process="cod_mortality")
    utils.ensure_dir(out_path)
    combined = pd.read_csv(in_path)
    print("Combining data to one single entry per uid...")
    combined = combine_incidence(combined)
    combined.to_csv(out_path, index=False)
    return (combined)
def project_incidence():
    ''' For each IHME location_id, projects estimates based in the input
            cancer rates
        Includes generation of national estimates from subnational estimates
            where national estimates are not present (note: in CoD, such
            estimates are used for validation only)
    '''
    print(" projecting data to ihme demographic specifications...")
    output_file = utils.get_path("projected_incidence", process="cod_mortality")
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input
    df.loc[:, 'rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected: only highest-SDI locations receive the
    # IHME population directly
    project_to_ihme = (df['sdi_quintile'].eq(5))
    df_sdi5 = df.loc[project_to_ihme, :].copy()
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids
    del df_sdi5['pop']
    ihme_pop = load_ihme_pop(df.loc[project_to_ihme, 'location_id'].unique())
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = df_other.append(df_sdi5)
    # reindex to allow multiplying series
    # create new column index, then set that as the new index
    output['index'] = np.arange(len(output))
    output = output.set_index('index')
    # Broadcast rates to the final population estimate for all locations
    output.loc[(output['pop'].notnull()) & (output['rate'].notnull()),
               'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    output = output.drop([
        'national_registry', 'full_coverage', 'is_subnational', 'registry_pop'
    ], axis=1, errors='ignore')
    # FIX: test the duplicate mask directly. The previous form,
    # `not output.loc[output.duplicated(...), :].any().any()`, passed
    # whenever the duplicated rows happened to contain only zero/NaN/False
    # values, so real duplicates could slip through.
    assert not output.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print(" data projected.")
    return (output)
def run():
    ''' Runs merge between incidence and MIR estimates to generate mortality
            estimate output
    '''
    out_path = utils.get_path("mortality_estimates", process='cod_mortality')
    in_path = utils.get_path("projected_incidence", process="cod_mortality")
    mortality = pd.read_csv(in_path)
    mortality = calculate_mortality(mortality)
    mortality = apply_recode(mortality)
    # Validate and save
    mortality.to_csv(out_path, index=False)
    print(" deaths calculated and recoded.")
    return (mortality)
def load_mi_estimates():
    ''' Returns the compiled MIR model results formatted for use with
        incidence selected for the CoD upload.
        Extends the terminal age group (21) into the GBD extended age groups
        (30, 31, 32, 235) when none of them are present in the input.
    '''
    print(" formatting mir estimates...")
    uid_columns = get_uid_columns()
    required_columns = uid_columns + ['mi_ratio']
    input_file = utils.get_path("compiled_mir_outputs", process="cancer_model")
    # load and subset data
    mir = pd.read_csv(input_file)
    mir.rename(columns={'sex': 'sex_id', 'year': 'year_id'}, inplace=True)
    mir = mir.loc[:, required_columns]
    # add extended age groups where absent, copying from age_group_id 21
    extended_ages = [30, 31, 32, 235]
    present_ages = mir['age_group_id'].unique()
    if not any(a in present_ages for a in extended_ages):
        template = mir.loc[mir['age_group_id'] == 21, :].copy()
        for age in extended_ages:
            template.loc[:, 'age_group_id'] = age
            mir = mir.append(template)
    # remove the (now redundant) aggregate terminal group
    mir = mir.loc[mir['age_group_id'] != 21, :]
    return (mir)
def run():
    ''' Finalizes data for CoD prep, then runs CoD prep's format code
    '''
    finalized_file = utils.get_path("formatted_CoD_data",
                                    process="cod_mortality")
    CoD_format_script = utils.get_path("finalize_CoD_input",
                                       process="cod_mortality")
    input_file = utils.get_path("mortality_estimates",
                                process="cod_mortality")
    df = pd.read_csv(input_file)
    # Ensure that there is a single entry by uid (data will not be collapsed
    # after this point)
    # FIX: check the duplicate mask directly. The previous
    # `df[df.duplicated(...)].any().any()` form passed whenever duplicated
    # rows happened to contain only zero/NaN values.
    assert not df.duplicated(get_uid_cols()).any(), \
        "Duplicate values present at input"
    df = refine_by_cc_code(df)
    df = add_subdiv(df)
    df = add_CoD_variables(df)
    df = format_CoD_variables(df)
    df = test_output(df)
    # NOTE(review): unlike the other exports in this pipeline, the index is
    # written here (no index=False) — confirm the downstream stata script
    # expects that before changing it
    df.to_csv(finalized_file)
    pydo.do_stata(CoD_format_script, arg_list=None)
    return (df)
def submit_rdp(input_data, this_dataset, is_resubmission):
    ''' Returns full dataset after redistribution. Separates data by
        submission requirement before submitting rdp for only those data
        that require it

        -- Args
            input_data : DataFrame of prepped dataset records
            this_dataset : MI_Dataset-like instance (provides dataset_id,
                data_type_id, and metric)
            is_resubmission : bool; when True, workers are launched in
                resubmission mode and result-checking is noisy
    '''
    # Predicate used to decide whether a uid's data must go through rdp
    def submission_requirement(df, uid):
        return needs_rdp(df[df['uid'] == uid], this_dataset)

    # Maps a job id tuple to the expected worker output file
    # (id[2] is the split number within the submission batch)
    def output_file_function(id):
        return get_rdp_file(this_dataset, which_file='split_output',
                            splitNum=id[2])
    # create a list of the uids that require redistribution and set aside a
    # dataframe of the uids that do not require redistribution
    rdp_code_location = utils.get_path("redistribution", base="code_repo",
                                       process="mi_dataset")
    worker_script = rdp_code_location + "/rdp_worker.py"
    output_uids = md.get_uid_cols(7)
    header = "cncRDP_{}_{}".format(this_dataset.dataset_id,
                                   this_dataset.data_type_id)
    rdp_input_file = get_rdp_file(this_dataset, which_file='rdp_input')
    #
    prepped_df = prep_input(input_data, this_dataset)
    submitted_data, unsubmitted_data = cup.split_submission_data(
        prepped_df, 'uid', submission_requirement, rdp_input_file)
    uid_list = submitted_data['uid'].unique().tolist()
    # Launch one worker per uid, pacing submissions to avoid flooding the
    # scheduler, then wait for and collect the per-split output files
    rdp_job_dict = cup.generate_prep_workers(worker_script,
                                             list_of_uids=uid_list,
                                             ds_instance=this_dataset,
                                             job_header=header,
                                             is_resubmission=is_resubmission,
                                             pace_interval=0.05)
    output_files = cup.get_results(rdp_job_dict,
                                   output_file_function,
                                   parent_process_name="rdp",
                                   noisy_checker=is_resubmission,
                                   add_resubmission_argument=is_resubmission,
                                   wait_time=5)
    # Re-combine compiled results with the set-aside data, before collapsing
    # and testing
    final_results = pe.append_files(output_files)
    final_results = final_results.append(unsubmitted_data)
    # Re-set all 'under 5' data, then collapse to combine it with any existing
    # 'under 5' data
    # NOTE(review): ages < 7 and the 91-94 band appear to be the dataset's
    # under-5 age codes, recoded here to age 2 — confirm against the age
    # format maps
    final_results.loc[final_results['age'].lt(7) |
                      (final_results['age'].gt(90) &
                       final_results['age'].lt(95)), 'age'] = 2
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return (final_results)
def load_package_set(df):
    ''' Returns the package_set_id linked to the dataframe's coding system
    '''
    code_version = df.coding_system.unique()[0]
    params_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    package_path = '{}/redistribution/packagesets_{}.dta'.format(
        params_dir, code_version)
    package = pd.read_stata(package_path)
    num_labels = len(package)
    assert num_labels == 1, "Incorrect number of source labels in "\
        "packagesets_{}. Expected 1, got {}. Redistribution failed."\
        .format(code_version, num_labels)
    return (package.package_set_id.unique()[0])
def generate_code_index(input_codes):
    ''' Returns an index of all possible ICD10 codes with attached number
        indicating order of appearance and tag for viability. "Viable" tag
        indicates either an official code or an unofficial code that exists
        in the data
    '''
    if not isinstance(input_codes, tuple):
        input_codes = tuple(input_codes)
    # Import list of ICD10 codes and define code index
    code_list_path = (utils.get_path('mi_input') + "/_resources/" +
                      "subtotal_recalculation/list_of_official_NUMERIC_ICD10_codes.csv")
    ICD10_code_list = pd.read_csv(code_list_path)
    ICD10_code_list.sort_values(by=['ICD10_code'], inplace=True)
    ICD10_code_list = tuple(ICD10_code_list['ICD10_code'])
    # PERF: build one set for O(1) viability lookups; the previous version
    # scanned both tuples linearly inside triply-nested loops
    viable_codes = set(ICD10_code_list) | set(input_codes)
    ICD10_code_index = {}
    order_num = 1
    for k in ['C', 'D']:
        # zero-padded strings preserve the 'C00'-style spelling of codes < 10
        under_10_alternate = ['00', '01', '02', '03', '04',
                              '05', '06', '07', '08', '09']
        for o in under_10_alternate + list(range(10, 100)):
            # three-character code (e.g. 'C34')
            kode = '{}{}'.format(k, o)
            ICD10_code_index[kode] = {'order': order_num,
                                      'viable': kode in viable_codes}
            order_num += 1
            for d in range(0, 10):
                # one-decimal subcode (e.g. 'C34.1')
                kode = '{}{}.{}'.format(k, o, d)
                ICD10_code_index[kode] = {'order': order_num,
                                          'viable': kode in viable_codes}
                order_num += 1
                for e in range(0, 10):
                    # two-decimal subcode (e.g. 'C34.12')
                    kode = '{}{}.{}{}'.format(k, o, d, e)
                    ICD10_code_index[kode] = {'order': order_num,
                                              'viable': kode in viable_codes}
                    order_num += 1
    return (ICD10_code_index)
def run(input_data, PACKAGE_MAP, TEMP_FOLDER):
    ''' Manages formatting and redistribution for the input data, then
        saves the output

        -- Args
            input_data : DataFrame with at least a 'freq' column plus the
                uid columns listed in proportion_uids below
            PACKAGE_MAP : name of the redistribution package subfolder
            TEMP_FOLDER : working folder (not referenced in this body)

        -- Returns
            DataFrame of redistributed data; returns the input unchanged if
            it sums to zero or no packages apply
    '''
    if int(input_data['freq'].sum()) == 0:
        print("Data sums to zero.")
        return (input_data)
    #
    # NOTE(review): output_cols is only referenced by the commented-out
    # column-subset line near the bottom — confirm whether it is still needed
    output_cols = [
        'registry_index', 'year_start', 'year_end', 'year_id', 'sex',
        'coding_system', 'split_group', 'age', 'cause', 'freq'
    ]
    proportion_uids = [
        'dev_status', 'super_region', 'region', 'subnational_level1',
        'subnational_level2', 'country', 'location_id', 'registry_index',
        'year_start', 'year_end', 'year_id', 'sex', 'coding_system',
        'split_group', 'age'
    ]
    # causes that cannot be redistributed are collected under this label
    residual_cause = 'ZZZ'
    resources_dir = utils.get_path("mi_dataset_resources",
                                   process="mi_dataset")
    package_folder = '{}/redistribution/{}'.format(resources_dir, PACKAGE_MAP)
    cause_map_file = package_folder + '/cause_map.csv'
    cause_map = pd.read_csv(cause_map_file)
    packages = get_packages(input_data, package_folder, cause_map)
    if len(packages) == 0:
        print("No packages available with which to redistribute this data.")
        return (input_data)
    # NOTE(review): start_time is never read afterwards — likely leftover
    # timing instrumentation
    start_time = time.time()
    prepped_data, proportion_metadata = prep_data(
        input_data, proportion_uids, residual_cause=residual_cause)
    evaluated_cause_map = evaluate_cause_restrictions(cause_map,
                                                      proportion_metadata,
                                                      proportion_uids)
    result, diagnostics = apply_packages(prepped_data, proportion_metadata,
                                         evaluated_cause_map, packages,
                                         residual_cause=residual_cause)
    output_data = result.merge(proportion_metadata, on='proportion_id')
    # output_data = output_data.ix[:, output_cols]
    output_data.loc[:, 'freq'].fillna(value=0, inplace=True)
    # redistribution must conserve the total event count (within rounding)
    diff = output_data['freq'].sum() - input_data['freq'].sum()
    assert abs(diff) < 1, "Difference from input after rdp is too large"
    return (output_data)
def load_surv_folder(cnf_model_run_id):
    ''' Using the run's rel_survival_version_id, returns the relative
            survival folder with the datestamp suffix of that version
    '''
    # FIX: removed duplicated assignment (`surv_folder = surv_folder = ...`)
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_run_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version',
                                     uniq_col='rel_survival_version_id',
                                     val=rs_version)
    # substitute the version's datestamp into the templated folder path
    suffix = str(this_version.at[0, 'date_updated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return (rs_folder)
def load_lambda_file(cnf_model_run_id):
    ''' Using the cnf_lambda_version_id, returns the lambda-values file path
            with the datestamp suffix of that version
    '''
    template_path = utils.get_path("lambda_values",
                                   process="nonfatal_model")
    run_record = nd.get_run_record(cnf_model_run_id)
    lambda_version = run_record.at[0, 'cnf_lambda_version_id']
    version_entry = cdb.db_api().get_entry(
        table_name='cnf_lambda_version',
        uniq_col='cnf_lambda_version_id',
        val=lambda_version)
    # substitute the version's datestamp into the templated file path
    datestamp = str(version_entry.at[0, 'date_updated'])
    return (template_path.replace("<date>", datestamp))
def generate_estimates(acause, location_id, cnf_model_run_id,
                       is_resubmission=False):
    ''' Runs a subprocess that passes all arguments to the R script that
            calculates incidence draws
    '''
    script_path = utils.get_path("calculate_incidence",
                                 process="nonfatal_model")
    shell_path = utils.get_cluster_setting("r_shell")
    resub_flag = int(is_resubmission)
    cmd = "bash {shl} {scr} {ac} {l} {id} {rs}".format(
        shl=shell_path, scr=script_path, ac=acause, l=location_id,
        id=cnf_model_run_id, rs=resub_flag)
    print(cmd)
    # NOTE(review): the subprocess exit status is ignored and True is always
    # returned; failures in the R script are not surfaced here
    subprocess.call(cmd, shell=True)
    return (True)
def update_repness(df):
    ''' Returns dataframe with updated representativeness status, superseding
            the value with one from the override file (if present)
    '''
    override_path = utils.get_path("representativeness_override",
                                   process="staging")
    overrides = pd.read_csv(override_path)
    overrides = overrides[['country_id', 'grouping', 'representative']]
    overrides.rename(columns={'representative': 'update_rep'}, inplace=True)
    df = modeled_locations.add_country_id(df)
    # tag each row as national/subnational for the merge with overrides
    is_national = df['location_id'] == df['country_id']
    df.loc[is_national, 'grouping'] = 'national'
    df.loc[df['grouping'] != 'national', 'grouping'] = 'subnational'
    df = df.merge(overrides, how='left')
    # apply overrides where present; default missing representativeness to 0
    has_override = df['update_rep'].notnull()
    df.loc[has_override, 'representative'] = df['update_rep']
    df.loc[df['representative'].isnull(), 'representative'] = 0
    df = df.drop(['update_rep', 'grouping'], axis=1)
    return (df)
def manage_split(source_cid, target_cids, proportion_meids, work_dir,
                 description):
    ''' Manages the split of the source_cid followed by saving of the
            targets, then returns boolean indication of success
    '''
    utils.ensure_dir(work_dir)
    # split model
    df = split_cod_model(source_cause_id=source_cid,
                         target_cause_ids=target_cids,
                         target_meids=proportion_meids,
                         output_dir=work_dir)
    # FIX: was print(print(...)), which printed the message and then "None"
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())
    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [
            save_args_template.format(targ=t, desc=description, dir=work_dir)
        ]
    # Start jobs
    header = description.replace(" ", "_")
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    # FIX: use the space-free `header` as the job header; it was computed for
    # this purpose but the raw description (which may contain spaces) was
    # being passed instead
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=header,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return (success)
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    # FIX: removed the unused `years` list (computed from max_year but never
    # referenced anywhere in this function)
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path('cod_liver_splits',
                                      process='cancer_model')
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    print(utils.display_timestamp())
    success = manage_split(source_cid, target_cids, proportion_meids,
                           work_dir, description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def add_location_hierarchy_info(df):
    ''' Returns the dataframe (df) with added location information: region,
            super_region, subnational_status, etc.
        Stops RDP if there is a problem with the location information
    '''
    print(" Adding location information.")
    input_len = len(df)
    # Reformat/convert variables and ages
    loc_info_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    loc_info_path = loc_info_dir + '/redistribution/location_hierarchy.dta'
    location_hierarchy = pd.read_stata(loc_info_path)
    location_hierarchy = location_hierarchy[[
        'location_id', 'dev_status', 'super_region', 'region', 'country',
        'subnational_level1', 'subnational_level2'
    ]]
    df = df.merge(location_hierarchy, how='left', on='location_id')
    # FIX: after a left merge the merge key itself can never be null, so the
    # old check on location_id could not detect unmapped ids; unmapped
    # locations surface as nulls in the merged hierarchy columns instead
    assert not df['super_region'].isnull().any(), \
        "Cannot redistribute. Unmapped location ids present."
    assert len(
        df) == input_len, "ERROR: data lost while adding location metadata"
    return (df)
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary

        -- Args
            dataset_id : id of the MI dataset to process
            data_type_id : data type within the dataset
            uid : id of the split (uniqid group) to recalculate

        -- Returns
            None (results are written to the split_output file)
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print(" output file found for uid " + str(uid))
        return(None)
    #
    # NOTE(review): negative_data_ok is never read afterwards — confirm
    # whether the exception status is still needed here
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    #
    print(" removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only added
    # if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's
    # data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are
    # removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        # single codes (no range '-' or list ',') carry their own subcodes
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            # NOTE(review): the next statement discards its result (no
            # assignment) — appears to be a no-op leftover; confirm
            df.loc[df['orig_cause'].eq(c), 'cause'].dropna().unique().tolist()
            for subcode in sorted(df['cause'].where(df['orig_cause'] == c
                                                    ).dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a
            # subcode of itself (prevents the addition of unused decimal
            # causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check
                          for check in input_cause_list
                          for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            # ranges/lists: every mapped code is both a code and a subcode
            for code in sorted(df['cause'].where(
                    df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                       ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many
    # subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print(' no subcauses present.')
    else:
        # iterate until a full pass removes nothing more
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set, subcause_issue_file,)
        print(" subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of
        # the input codes are still present, set the codes remaining as the
        # cause
        if (not len(uid_set[c]['codes_removed']) or
                ("-" not in c and "," not in c) or
                set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(
            columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = output.append(final)
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return(None)