def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe.

    Args:
        df: DataFrame of results. Must contain a 'me_tag' column for
            prevalence input; for incidence the tag is set here.
        metric_name: "incidence" or "prevalence"; selects the GBD
            measure_id attached to the outputs.
        acause: analytical cause name used to look up modelable entities.

    Raises:
        ValueError: if metric_name is not a supported metric.
    '''
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        # All incidence results are uploaded under a single phase tag
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Previously an unsupported metric fell through to a NameError on
        # measure_id below; fail fast with a clear message instead
        raise ValueError(
            "unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        # Skip tags with no modelable-entity mapping for this cause
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag),
                             uid_cols + data_cols]
        # Rename the data columns to the standard draw-column names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
def save_worker(target_id, description, input_dir):
    ''' Uploads CoD death estimates for one cause from per-location
        files under input_dir, marking the uploaded model as "best".
    '''
    print("saving {}...".format(description))
    # Upload estimates for every year from 1980 through the final
    # estimation year of the current GBD round
    final_year = int(utils.get_gbd_parameter("max_year"))
    year_list = [y for y in range(1980, final_year + 1)]
    save_results_cod(
        input_dir=input_dir,
        input_file_pattern='death_{location_id}.csv',
        cause_id=target_id,
        description=description,
        sex_id=[1, 2],
        metric_id=1,
        year_id=year_list,
        mark_best=True,
    )
    print("model saved.")
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' entry (the remaining difference between cancer
        mortality and all-cause mortality) and drops data that are not
        credible (cancer deaths > 70% of all-cause mortality).

    Args:
        df: mortality DataFrame with at least 'location_id', 'year_id',
            'sex_id', 'age_group_id', 'acause', 'deaths', and 'pop' columns.

    Returns:
        DataFrame restricted to uids with credible cc_code values, with the
        cc_code rows appended.
    '''
    # uids exclude 'acause' because cc_code aggregates across causes
    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause']]
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    env = load_mortality_envelope(df['location_id'].unique().tolist(),
                                  df['age_group_id'].unique().tolist(),
                                  df['year_id'].unique().tolist())
    deaths_df = df.loc[
        ~df['acause'].str.contains("neo_leukemia_"), :]  # remove child causes
    # Total cancer deaths per uid; 'pop' is constant within a uid, so mean
    # preserves it through the aggregation
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    cc_df = deaths_df.merge(
        env, how='inner', on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    # Convert the envelope's death rate to a death count
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df.loc[:, ['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    # Drop uids where cancer accounts for an implausible share of all deaths
    valid_estimates = (cc_df['cancer_deaths'] <= max_pct_cancer * cc_df['total_deaths'])
    cc_df = cc_df.loc[valid_estimates, :]
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    # Placeholder registry/NID/dataset identifiers for the synthetic rows
    cc_df['registry_index'] = "(\'0.0.0\',)"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'], axis=1)
    cc_df.drop_duplicates(inplace=True)
    # Attach cc_code data to main dataset and return. First subset df to only
    # those uids with valid cc_code values, then append the full cc_code values
    # subset output to only valid cc_code
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    output = output.append(cc_df)  # append
    # NOTE(review): the add_sdi_quintile result is assigned to df, which is
    # not returned — only the final assert reads df afterward. Possibly this
    # was meant to operate on `output`; confirm intended behavior.
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    assert not output[output.duplicated(get_uid_cols())].any().any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    # NOTE(review): this checks the input df, not the returned output —
    # presumably a guard that no input estimates were nulled; verify.
    assert not df['deaths'].isnull().any(), \
        "Mortality estimates lost while calulating cc_code"
    return (output)
def extract_single_nid(nid_entry):
    ''' If multiple nids are present, returns a single nid.

    Accepts a scalar, a tuple/list, or a string representation of any of
    those. Returns the single NID as an int when exactly one valid,
    nonzero numeric entry is present; otherwise returns the generic
    cancer NID.
    '''
    try:
        # Strings like "(123,)" or "[123]" are parsed into real containers
        nid_entry = literal_eval(nid_entry)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: malformed literal string;
        # TypeError: nid_entry is already a non-string object
        pass
    # BUG FIX: the original test `not isinstance(x, tuple) or
    # isinstance(x, list)` binds as `(not tuple-check) or list-check`, so
    # list inputs were wrapped again ([[...]]) and always fell through to
    # the generic NID. Wrap only genuine scalars.
    if not isinstance(nid_entry, (tuple, list)):
        nid_entry = [nid_entry]
    if len(nid_entry) == 1:
        if str(nid_entry[0]).isdigit() and str(nid_entry[0]) != '0':
            return (int(nid_entry[0]))
    # Zero, non-numeric, or multiple entries: fall back to the generic NID
    return (int(utils.get_gbd_parameter('generic_cancer_nid')))
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager.

    Splits the parent liver-cancer CoD model (cause_id 417) into its
    subtype target causes using the corresponding proportion modelable
    entities, then reports success or failure.
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path('cod_liver_splits',
                                      process='cancer_model')
    # Timestamped subdirectory keeps each split run's outputs separate
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    # NOTE: removed an unused `years` local that was computed here but
    # never passed to the split manager
    print(utils.display_timestamp())
    success = manage_split(source_cid, target_cids, proportion_meids,
                           work_dir, description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def calc_procedure_tenplus(inc_df, proportions, acause, location_id): ''' Multiplies incidence draws by the procedure proportion and the absolute survival proportion at 10 years to estimate the number of cases surviving for at least 10 years ''' # Load known values print( " calculating the incidence of procedures with surv > ten years...") uid_cols = nd.nonfatalDataset().uid_cols type_cols = nd.get_columns('incidence') draw_cols = nd.get_columns("draw_cols") abs_surv = [nd.get_columns("absolute_survival")] max_estimation_year = utils.get_gbd_parameter('max_year') max_survival_months = nd.nonfatalDataset().max_survival_months # Estimate incidence of procedure mrg_df = inc_df.merge(proportions) adj_df = mrg_df[uid_cols] num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values) adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0) # Estimate number of procedures resulting in survival beyond ten years surv_df = load_estimates('survival', acause, location_id) surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months), uid_cols + abs_surv] adj_df = adj_df.merge(surv_df) pbt_df = adj_df[uid_cols] num_procedures_10ys = adj_df[type_cols].values * \ adj_df[abs_surv].values pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0) # Update years and age categories pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply( add_decade_to_age) pbt_df.loc[:, 'year_id'] += 10 # drop data that are now out of scope pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :] # For procedures whose sequelae are fractional, if sequelae_fractions(acause): pbt_df = split_sequelae(pbt_df, acause, location_id) else: pbt_df.loc[:, 'modelable_entity_id'] = \ nd.get_modelable_entity_id(acause, 'procedure_sequelae') return (pbt_df)