def redist_nmsc_gc(df, metric):
    ''' Splits non-melanoma skin cancer (NMSC) data proportionately into
            subcauses
        ------
        Inputs:
            df : a mortality-incidence input dataset at stage 5. Must contain
                    the 'sex', 'acause' and 'age' columns used as merge keys
            metric : one of ['pop', 'cases', 'deaths']
        Returns:
            df itself when no row carries a cause listed in the NMSC
            proportions; otherwise a new frame, restricted to the columns of
            df, in which each splittable row is replaced by one row per
            mapped cause with metric scaled by the matching proportion
        Raises:
            AssertionError if any splittable row has no matching proportion
    '''
    # Rename/drop out of place so the frame handed back by the proportions
    # loader is not modified
    nmsc_props = pp.get_nmsc_proportions().rename(columns={'cause': 'acause'})
    nmsc_props = nmsc_props.drop('coding_system', axis=1)

    # subset data to split. exit if no split necessary
    splittable = df['acause'].isin(nmsc_props['acause'])
    to_split = df.loc[splittable, :]
    no_split = df.loc[~splittable, :]
    if to_split.empty:
        return df
    print("disaggregating nmsc data...")

    # merge with proportions to split causes
    is_split = to_split.merge(nmsc_props,
                              on=['sex', 'acause', 'age'],
                              how='left',
                              indicator=True)
    # A single unmatched row would carry a NaN cause and a NaN metric
    # downstream, so fail on any unmatched row rather than only when every
    # row is unmatched
    assert not is_split['_merge'].eq("left_only").any(), \
        "Error during merge with NMSC proportions"

    # apply proportions
    is_split['acause'] = is_split['mapped_cause']
    is_split['split_value'] = is_split[metric]
    is_split[metric] = is_split['proportion'] * is_split['split_value']

    # DataFrame.append no longer exists in pandas >= 2.0
    output = pd.concat([no_split, is_split])
    pt.verify_metric_total(df, output, metric, 'NMSC')
    return output.loc[:, no_split.columns.tolist()]
def split_unknown_age(dataset, wgt_df, uid_cols, metric):
    ''' Redistributes unknown-age data (rows with age == 26) across the
            known ages of the same uid group, in proportion to the weights
        --- Inputs ---
        dataset : DataFrame
                pandas dataframe containing 'age', the uid columns and metric
        wgt_df : DataFrame
                pandas dataframe with a 'wgt' column; merged onto the
                known-age rows on whatever columns the two frames share
        uid_cols : list
                column names that uniquely identify observations
                ('age' is moved to the end of the list if present)
        metric : string
                possible values: ['pop', 'cases', 'deaths']
        --- Returns ---
        DataFrame with columns uid_cols + [metric] and 'frmat' set to 131
        --- Raises ---
        AssertionError if wgt_df has no 'wgt' column
    '''
    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"

    # Ensure that 'age' is not listed as a uid
    uids_noAge = [u for u in uid_cols if u != 'age']
    uid_cols = uids_noAge + ['age']

    # Separate the unknown-age totals from the known-age rows
    unknown_age = dataset.loc[dataset['age'] == 26,
                              uids_noAge + [metric]].copy()
    unknown_age = unknown_age.rename(columns={metric: 'unknown_age_data'})
    known_age = dataset.loc[dataset['age'] != 26, :].copy()

    # Attach weights and convert them to within-group proportions
    with_weights = known_age.merge(wgt_df)
    astest.test_weights(known_age, with_weights)
    prop_df = pp.add_proportions(with_weights, uids_noAge)

    # NOTE(review): this is a default (inner) merge, so known-age rows whose
    #   uid group has no unknown-age row are not carried forward -- confirm
    #   that this is intended
    to_distribute = prop_df.merge(unknown_age)

    # Assign the filled column back: fillna(inplace=True) on a .loc[] result
    # acts on a temporary Series and is not guaranteed to reach the frame
    to_distribute['unknown_age_data'] = \
        to_distribute['unknown_age_data'].fillna(0)
    to_distribute['orig_data'] = to_distribute[metric].copy()
    to_distribute[metric] = to_distribute[metric] + \
        to_distribute['unknown_age_data'].multiply(to_distribute['proportion'])

    # Copy before adding a column so the write never targets a slice
    output = to_distribute[uid_cols + [metric]].copy()
    output['frmat'] = 131
    pt.verify_metric_total(dataset, output, metric, "split unknown age")
    return output
def split_age(dataset, wgt_df, metric, uid_cols, replacement_col='gbd_age'):
    ''' Splits aggregate ages
        --- Inputs ---
        dataset : DataFrame
                pandas dataframe containing the 'age', 'frmat' and 'im_frmat'
                columns. The caller's frame is not modified
        wgt_df : DataFrame
                pandas dataframe containing expected numbers for the metric
                at each age, in a 'wgt' column
        metric : string
                possible values: ['pop', 'cases', 'deaths']
        uid_cols : list
                column names indicating unique identifiers for the wgt_df
                (eg. cause, location_id)
        replacement_col: string
                name of the column containing the new age values in the
                wgt_df. Accepted for interface compatibility; not read here
        --- Returns ---
        DataFrame holding the rows that needed no split plus the split rows,
        with 'im_frmat' set to 9 and every 'frmat' other than 9 set to 131
        --- Raises ---
        AssertionError if wgt_df has no 'wgt' column
    '''
    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"

    # Ensure that 'age' is not listed as a uid, since it's being split
    uids_noAge = [u for u in uid_cols if 'age' not in u]

    # mark those entries that need to be split. do not split if frmat==9
    # (unknown)
    standard_age_frmats = [0]
    standard_age_im_frmats = [1, 2, 8, 9]

    # Missing im_frmat is filled with 9, one of the standard im_frmats.
    # Built as a new frame: fillna(inplace=True) on a .loc[] result acts on a
    # temporary Series and is not guaranteed to reach the frame
    dataset = dataset.assign(im_frmat=dataset['im_frmat'].fillna(9))
    dataset = add_missing_ages(dataset, uids_noAge, metric)

    # for each format type, mark which age categories need to be split per
    # the corresponding format map. split only those categories. can split
    # multiple age formats at once
    known_age = ~dataset['age'].isin([26])  # 26 is the unknown-age code
    nonstandard_frmat = ~dataset['frmat'].isin(standard_age_frmats + [9])
    # The comparison must be parenthesised: '&' binds tighter than '!=', so
    # "a & frmat != 9 & b" would evaluate as "(a & frmat) != (9 & b)"
    nonstandard_im_frmat = (
        ~dataset['im_frmat'].isin(standard_age_im_frmats)
        & (dataset['frmat'] != 9))
    dataset['need_split'] = (
        (nonstandard_frmat | nonstandard_im_frmat) & known_age).astype(int)

    # Split age for each format type
    df = dataset.loc[dataset['need_split'] == 1, :].copy(deep=True)
    for frmat_type in ['frmat', 'im_frmat']:
        df = apply_age_spilt_proportions(df, frmat_type, wgt_df, uid_cols,
                                         metric)

    # recombine with the untouched rows and reset the age formats.
    # DataFrame.append no longer exists in pandas >= 2.0
    unadjusted = dataset.loc[dataset['need_split'] == 0, :].copy(deep=True)
    output = pd.concat([unadjusted, df])
    del output['need_split']
    output.loc[:, 'im_frmat'] = 9
    output.loc[output['frmat'] != 9, 'frmat'] = 131
    pt.verify_metric_total(dataset, output, metric, "split age")
    return output
def disaggregate_acause(df, ds_instance):
    '''
    Description: Returns a dataframe in which metric values have been
        distributed across all associated acauses
    How it Works: Utilizes the create_metric_weights function to build
        weights for each acause, converts those weights to proportions,
        then applies the proportions to split the input metric value across
        the attributed acauses. Rows without a second acause are passed
        through with 'acause1' renamed to 'acause'.
    Inputs:
        df : DataFrame containing 'acause1', 'acause2' and the metric column.
            The caller's frame is not modified
        ds_instance : dataset object exposing a 'metric' attribute; also
            passed through to create_metric_weights
    Returns:
        DataFrame with a single 'acause' column; every other column whose
        name contains 'acause' is removed
    Raises:
        ValueError if df has no 'acause1' column
    '''
    metric = ds_instance.metric
    all_uids = md.get_uid_cols(5)
    uids_noAcause = [c for c in all_uids if 'acause' not in c]
    acause_cols = [a for a in df.columns if 'acause' in a]

    # A row needs splitting when it names a non-empty second acause
    needs_split = (df['acause2'].notnull() & ~df['acause2'].isin([""]))
    to_split = df.loc[needs_split, :]
    no_split = df.loc[~needs_split, :]

    # If no split needed, simply return the dataframe with a renamed acause1.
    # Built out of place so that this path, like the split path, leaves the
    # caller's frame untouched
    if to_split.empty:
        acause_cols.remove('acause1')
        renamed = df.rename(columns={'acause1': 'acause'})
        return renamed.drop(labels=acause_cols, axis=1)
    print("disaggregating acause...")

    # create weights used for splitting
    weight_df = pp.create_metric_weights(df, all_uids, ds_instance)

    # calculate proportions based on the weights
    proportions_df = pp.add_proportions(weight_df, uids_noAcause)

    # adjust by proportions
    is_split = to_split.merge(proportions_df)
    is_split['split_value'] = is_split[metric]
    is_split[metric] = is_split['proportion'] * is_split['split_value']
    is_split = md.stdz_col_formats(is_split)

    # Reshape the unsplit rows to the output layout. Done out of place:
    # no_split is a slice of df, and in-place edits on a slice are unreliable
    acause_cols.remove('acause1')
    no_split = no_split.rename(columns={'acause1': 'acause'})
    no_split = no_split.drop(labels=acause_cols, axis=1)

    # DataFrame.append no longer exists in pandas >= 2.0
    output = pd.concat([no_split, is_split])
    pt.verify_metric_total(df, output, metric, "disaggregate acause")
    return output.loc[:, no_split.columns.tolist()]
def apply_age_spilt_proportions(input_df, frmat_type, wgt_df, uid_cols,
                                metric):
    ''' Combines weights with population to calculate proportions by which
            combined age groups are to be split, then splits data by those
            proportions
        --- Inputs ---
        input_df : DataFrame
                data whose aggregate ages are to be split. Not modified
        frmat_type : string
                name of the age-format column to split by; passed through to
                mark_ages_to_be_split
        wgt_df : DataFrame
                expected values ("weights"), merged onto the expanded rows on
                whatever columns the two frames share
        uid_cols : list
                column names that uniquely identify observations
                ('age' is moved to the end of the list if present)
        metric : string
                possible values: ['pop', 'cases', 'deaths']
        --- Returns ---
        DataFrame with the columns of input_df minus 'dataset_id'. When no
        row is marked for expansion this is an unchanged copy of the input
    '''
    # remove dataset_id if present in dataframe
    split_input = input_df.copy()
    if 'dataset_id' in split_input.columns:
        del split_input['dataset_id']

    # merge with the age format map and get an expanded dataframe with the
    # ages to be split
    uids_noAge = [u for u in uid_cols if u != 'age']
    uid_cols = uids_noAge + ['age']
    marked_df = mark_ages_to_be_split(split_input, frmat_type, uid_cols,
                                      metric)
    to_expand = marked_df.loc[marked_df['to_expand'].eq(1), :].copy()
    if to_expand.empty:
        return split_input

    # merge with expected values ("weights"). The aggregate age is kept as
    # 'split_age' while 'gbd_age' takes over the 'age' column
    to_expand.rename(columns={'age': 'split_age', 'gbd_age': 'age'},
                     inplace=True)
    weighted_df = to_expand.merge(wgt_df)
    astest.test_weights(to_expand, weighted_df)

    # calculate proportions
    to_split = pp.add_proportions(weighted_df, uids_noAge + ['split_age'])

    # adjust by proportions
    to_split['split_value'] = to_split[metric]
    to_split[metric] = to_split['proportion'] * to_split['split_value']

    # collapse, then update format types of split data.
    # DataFrame.append no longer exists in pandas >= 2.0
    recombined_df = pd.concat(
        [to_split, marked_df.loc[marked_df['to_expand'] == 0, :]])
    adjusted_df = dft.collapse(recombined_df,
                               by_cols=uid_cols,
                               func='sum',
                               combine_cols=metric)
    astest.compare_pre_post_split(split_input, adjusted_df, metric)
    adjusted_df.loc[:, 'need_split'] = 1
    pt.verify_metric_total(split_input, adjusted_df, metric,
                           "apply age proportions")
    return adjusted_df[split_input.columns.values]
def redist_kaposi(df, metric, uid_cols):
    ''' Adjusts Kaposi Sarcoma data to account for HIV-attributed cases.
        ------
        Inputs:
            df : a mortality-incidence input dataset at stage 5
            metric : one of ['pop', 'cases', 'deaths']
            uid_cols : list indicating column-set that uniquely identifies
                    observations
        Returns:
            df itself when no row carries a cause listed in the Kaposi
            proportions; otherwise a new frame, restricted to the columns of
            df, made of the untouched rows, the rows split by proportion, and
            the Kaposi rows whose year fits no proportion year range
            (passed through unsplit)
        Raises:
            AssertionError if any Kaposi row has no matching proportion
    '''
    # Rename/drop out of place so the frame handed back by the proportions
    # loader is not modified
    kaposi_prop = pp.get_kaposi_proportions().rename(
        columns={'cause': 'acause'})
    kaposi_prop = kaposi_prop.drop('coding_system', axis=1)

    # subset data to split. exit if no split necessary
    is_kaposi = df['acause'].isin(kaposi_prop['acause'].unique())
    to_split = df.loc[is_kaposi, :]
    no_split = df.loc[~is_kaposi, :]
    if to_split.empty:
        return df
    print("disaggregating kaposi sarcoma data...")

    # merge with weights
    to_split = to_split.merge(kaposi_prop,
                              on=['sex', 'acause', 'age'],
                              how='left',
                              indicator=True)
    assert not to_split['_merge'].isin(["left_only"]).any(), \
        "Error: Not all Kaposi data could be merged with proportions"

    # Mark those data that are both kaposi and have the correct year range.
    # 'match' must be an explicit 0/1: if the out-of-range rows were left as
    # NaN, the group maximum would be NaN, the "== 0" selection below would
    # never match, and unmatched groups would silently drop out of the output
    to_split = add_year_id(to_split)
    within_range = ((to_split['year'] >= to_split['year_start']) &
                    (to_split['year'] <= to_split['year_end']))
    to_split['match'] = within_range.astype(int)
    split_groups = to_split.groupby(uid_cols, as_index=False)['match'].max()

    # Split only marked data. The merge keys include 'match', so only the
    # in-range rows of a matched group are kept
    is_split = to_split.merge(
        split_groups[split_groups['match'].isin([1])], how='inner')
    is_split['split_value'] = is_split[metric]
    is_split[metric] = is_split['proportion'] * is_split['split_value']
    is_split['acause'] = is_split['target']

    # Format kaposi data that did not meet any year range criteria. The
    # proportions merge repeated each of these rows, hence drop_duplicates
    cant_split = to_split.merge(
        split_groups[split_groups['match'].isin([0])], how='inner')
    cant_split = cant_split.loc[:, no_split.columns.tolist()].drop_duplicates()

    output = pd.concat([no_split, is_split, cant_split])
    pt.verify_metric_total(df, output, metric, 'kaposi_redist')
    return output.loc[:, no_split.columns.tolist()]
def validate_mapping(in_df, out_df, metric):
    ''' Tests the mapping output against its input and hands the findings to
            tests.checkTestResults (called with displayOnly=False)
        --- Inputs ---
        in_df : DataFrame
                data before mapping
        out_df : DataFrame
                data after mapping
        metric : string
                name of the column whose total is compared between the two
        --- Checks ---
        'missing entries' : uid combinations present only in in_df
        'new entries' : uid combinations present only in out_df
        'missing or extra uids' : set to "fail" when the row counts differ
    '''
    uid_cols = md.get_uid_cols(3)
    test_results = {'missing entries': [], 'new entries': []}
    std_in = md.stdz_col_formats(in_df)
    std_out = md.stdz_col_formats(out_df)

    # An outer merge with an indicator labels each uid combination by the
    # side(s) it was found on
    comparison = std_in[uid_cols].merge(std_out[uid_cols],
                                        on=uid_cols,
                                        how='outer',
                                        indicator=True)
    checks = (('missing entries', "left_only"),
              ('new entries', "right_only"))
    for label, side in checks:
        one_sided = comparison['_merge'].eq(side)
        if one_sided.any():
            test_results[label] = comparison.loc[one_sided, uid_cols].to_dict()

    if len(std_out) != len(std_in):
        test_results['missing or extra uids'] = "fail"
    pt.verify_metric_total(std_in, std_out, metric, "mapping test")
    tests.checkTestResults(test_results, 'validate mapping', displayOnly=False)
def main(dataset_id, data_type_id):
    ''' Disaggregates uids that are mapped to multiple gbd causes, including
            garbage codes, Kaposi Sarcoma, and non-melanoma skin cancer
        --- Inputs ---
        dataset_id : identifier used to construct the MI_Dataset
        data_type_id : data type of the dataset; the non-melanoma skin cancer
                redistribution runs only when this equals 2
        --- Output ---
        Nothing is returned. The collapsed result is handed to
        md.complete_prep_step
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    raw_input = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)

    # Rows carrying any of these age codes are left out of this step
    excluded_ages = [26, 3, 4, 5, 6, 91, 92, 93, 94]
    keep_rows = ~raw_input['age'].isin(excluded_ages)
    input_data = raw_input.loc[keep_rows, :]

    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)

    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)

    # update uid columns to account for reshaped acause
    non_acause_uids = [u for u in uid_cols if 'acause' not in u]
    uid_cols = non_acause_uids + ['acause']

    # Redistribute Kaposi sarcoma, then NMSC where applicable
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    adjusted_df = (core.redist_nmsc_gc(kaposi_df, metric)
                   if data_type_id == 2 else kaposi_df)
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)

    # run test functions and save output
    pt.verify_metric_total(input_data, adjusted_df, metric,
                           "cause disaggregation module")

    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    standardized_df = md.stdz_col_formats(final_df)
    collapsed_df = dft.collapse(standardized_df,
                                by_cols=output_uids,
                                func='sum',
                                combine_cols=metric)

    # save
    md.complete_prep_step(collapsed_df, this_dataset)
    print("Acause disaggregated")