def get_computed_dataframe(self, df):
    original_columns = list(df.columns)
    orig_deaths_sum = (df['cf'] * df['sample_size']).sum()

    # split the anemia in the data onto a set of target causes with
    # proportions that add to 1
    anemia_props = pd.read_csv(self.anemia_props_path)
    anemia_df = df.loc[df['cause_id'] == self.anemia_cause_id]
    anemia_df = add_location_metadata(
        anemia_df, 'ihme_loc_id',
        location_set_version_id=self.location_set_version_id,
        force_rerun=False)
    anemia_df['iso3'] = anemia_df['ihme_loc_id'].str.slice(0, 3)

    unique_iso3s = list(anemia_df['iso3'].unique())
    merge_props = anemia_props.loc[anemia_props['iso3'].isin(unique_iso3s)]

    # years before 1990 borrow the 1990 proportions
    unique_years = list(anemia_df.year_id.unique())
    years_under_90 = [u for u in unique_years if u < 1990]
    if len(years_under_90) > 0:
        props_90 = merge_props.query('year_id == 1990')
        for copy_year in years_under_90:
            copy_props = props_90.copy()
            copy_props['year_id'] = copy_year
            # DataFrame.append was removed in pandas 2.0; concat is
            # the equivalent
            merge_props = pd.concat([merge_props, copy_props],
                                    ignore_index=True)

    anemia_df = anemia_df.merge(
        merge_props,
        on=['iso3', 'year_id', 'age_group_id', 'sex_id', 'cause_id'],
        how='left')

    # use the unchanged anemia df as diag df; copy so the in-place
    # edits below do not leak into the diagnostic frame
    self.diag_df = anemia_df.copy()

    # the proportions must sum to 1 within each id group
    sum_to_one_id_cols = list(set(original_columns) - set(self.cf_cols))
    assert np.allclose(
        anemia_df.groupby(sum_to_one_id_cols)['anemia_prop'].sum(), 1)

    anemia_df['cause_id'] = anemia_df['target_cause_id']
    for cf_col in self.cf_cols:
        anemia_df[cf_col] = anemia_df[cf_col] * anemia_df['anemia_prop']

    # remove extra columns used in this adjustment
    anemia_df = anemia_df[original_columns]

    # replace the anemia data in the incoming df with the newly split
    # anemia data
    df = df.loc[df['cause_id'] != self.anemia_cause_id]
    df = pd.concat([df, anemia_df], ignore_index=True)

    # collapse together potential duplicate location-year-age-sex-causes
    # introduced from splitting anemia onto targets
    sum_cols = self.cf_cols
    group_cols = list(set(df.columns) - set(sum_cols))
    df = df.groupby(group_cols, as_index=False)[sum_cols].sum()

    # the split must not change the total number of deaths
    new_deaths_sum = (df['cf'] * df['sample_size']).sum()
    assert np.allclose(orig_deaths_sum, new_deaths_sum)
    return df
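# A minimal standalone sketch (not part of the pipeline) of the
# proportional split get_computed_dataframe performs: one anemia row is
# fanned out onto target causes whose proportions sum to 1, so total
# deaths (cf * sample_size) are preserved. All ids and values below are
# made up for illustration.
import numpy as np
import pandas as pd

anemia_row = pd.DataFrame({'cause_id': [390], 'cf': [0.10],
                           'sample_size': [1000.0]})
props = pd.DataFrame({'target_cause_id': [618, 619],
                      'anemia_prop': [0.7, 0.3]})

split = anemia_row.merge(props, how='cross')
split['cause_id'] = split['target_cause_id']
split['cf'] = split['cf'] * split['anemia_prop']

before = (anemia_row['cf'] * anemia_row['sample_size']).sum()
after = (split['cf'] * split['sample_size']).sum()
assert np.allclose(before, after)  # 100 deaths in, 100 deaths out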
def add_iso3(self, df, location_hierarchy):
    """Add iso3 to incoming dataframe."""
    df = add_location_metadata(df, 'ihme_loc_id',
                               location_meta_df=location_hierarchy)
    df['iso3'] = df['ihme_loc_id'].str[0:3]
    df.drop(['ihme_loc_id'], axis=1, inplace=True)
    return df
def prep_rti_fractions(self, df, age_meta_df, location_meta_df):
    # modify df
    df = add_age_metadata(df, 'simple_age', age_meta_df=age_meta_df)
    df = add_location_metadata(df, 'ihme_loc_id',
                               location_meta_df=location_meta_df)
    # .str[:3] takes the first three characters of each ihme_loc_id;
    # a plain [:3] would slice the first three *rows* of the column
    df['iso3'] = df['ihme_loc_id'].str[:3]
    df = df.drop('ihme_loc_id', axis=1)

    rti_fractions = CONF.get_resource("RTI_fractions")
    rti_fractions = pd.read_stata(rti_fractions)
    rti_fractions = rti_fractions.rename(columns={'age': 'simple_age',
                                                  'year': 'year_id',
                                                  'sex': 'sex_id'})
    # duplicate the 2011 fractions as 2012; copy so we do not mutate a
    # slice of rti_fractions
    twenty_eleven = rti_fractions[rti_fractions.year_id == 2011].copy()
    twenty_eleven['year_id'] = 2012
    rti_fractions = pd.concat([rti_fractions, twenty_eleven],
                              ignore_index=True)
    return df, rti_fractions
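# A toy sketch of the year-duplication trick used above (and in the
# pre-1990 backfill in get_computed_dataframe): copy one year's rows,
# relabel the year, and concatenate them back on. The frame below is
# made-up data, not the real RTI fractions file.
import pandas as pd

fractions = pd.DataFrame({'year_id': [2010, 2011],
                          'sex_id': [1, 1],
                          'fraction': [0.4, 0.5]})
latest = fractions[fractions.year_id == 2011].copy()  # copy, not a view
latest['year_id'] = 2012
fractions = pd.concat([fractions, latest], ignore_index=True)
# fractions now carries the 2011 value forward as the 2012 value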
def double_rake(self, df, location_hierarchy):
    """Rake twice when one location level sits between the most
    detailed level and the national level.

    Perhaps in time this can be expanded to handle datasets where
    multiple location levels exist between most detailed and the
    national level (GBR only?).
    """
    start = len(df)

    # flag location levels
    df = add_location_metadata(df, ['level'],
                               location_meta_df=location_hierarchy)
    report_if_merge_fail(df, 'level', 'location_id')

    # isolate the different location levels in the dataframe; assumes
    # the levels in the data are 3 (national), 4 (intermediate/state),
    # and 5 (most detailed)
    national = (df['level'] == 3)
    intermediate = (df['level'] == 4)
    detail = (df['level'] == 5)

    # some checks:
    assert len(df[detail]) > 0, \
        "Double raking assumes there is subnational data in the" \
        " dataframe, however no subnational detail is present"
    assert len(df[intermediate]) > 0, \
        "Attempting to double rake, but there are no intermediate" \
        " locations in the data to rake to first"

    intermediate_locs = df[intermediate].location_id.unique().tolist()
    df.drop('level', axis=1, inplace=True)

    # rake the detailed locs to their parent first (the first rake of
    # double raking)
    non_national = df.loc[detail | intermediate]
    single_raked_detail_locs = self.rake_detail_to_intermediate(
        non_national, location_hierarchy, intermediate_locs)

    # now take the subnationals that have been raked to the intermediate
    # locations and rake them to the national (the second rake of double
    # raking)
    detail_and_national = pd.concat(
        [single_raked_detail_locs, df.loc[national]], ignore_index=True)
    detail_df = self.standard_rake(detail_and_national, location_hierarchy)
    detail_df = detail_df.loc[detail_df.is_nat == 0]

    # rake the intermediate level to the national
    intermediate_and_national = df.loc[intermediate | national]
    intermediate_and_national = self.standard_rake(
        intermediate_and_national, location_hierarchy)

    # concat the national, single raked intermediate locs, and double
    # raked detail locs
    df = pd.concat([detail_df, intermediate_and_national],
                   ignore_index=True)

    assert start == len(df), \
        "The number of rows has changed, this really shouldn't happen."
    return df
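# A minimal sketch of the raking order double_rake implements, with a
# hypothetical proportional rake standing in for standard_rake (whose
# internals are not shown here, so the scaling rule is an assumption):
# children are scaled so their totals sum to the parent's total, first
# detail -> intermediate, then the raked detail -> national.
import pandas as pd

def rake_children(children, parent_total, value_col='deaths'):
    # scale children proportionally so they add up to the parent total
    scale = parent_total / children[value_col].sum()
    out = children.copy()
    out[value_col] = out[value_col] * scale
    return out

# two level-5 detail locs under one level-4 intermediate loc
detail = pd.DataFrame({'location_id': [101, 102], 'deaths': [30.0, 50.0]})
intermediate_total = 100.0   # level 4
national_total = 90.0        # level 3

# first rake: detail -> intermediate (30, 50 -> 37.5, 62.5)
detail = rake_children(detail, intermediate_total)
# second rake: the already-raked detail locs -> national
detail = rake_children(detail, national_total)
# detail now sums to the national total: 33.75 + 56.25 == 90.0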
def rake_detail_to_intermediate(self, df, location_hierarchy,
                                intermediate_locs):
    """Rake the detailed locations to their non-national parent.

    This has to be done individually for each intermediate location.
    """
    # add parent_id to the data
    df = add_location_metadata(df, ['parent_id'],
                               location_meta_df=location_hierarchy)

    # loop through each intermediate loc and rake
    dfs = []
    for loc in intermediate_locs:
        # copy so the relabeling below does not write back to a slice
        temp = df.loc[(df.parent_id == loc) |
                      (df.location_id == loc)].copy()
        # treating the intermediate location as a national to align with
        # the logic that identifies aggregates in the raker
        temp.loc[temp.location_id == loc, 'location_id'] = temp['parent_id']
        temp.drop('parent_id', axis=1, inplace=True)
        temp = self.standard_rake(temp, location_hierarchy)
        dfs.append(temp)
    df = pd.concat(dfs, ignore_index=True)

    # subset to just the detail locs; the intermediate and national
    # rows are added back on later
    df = df.loc[df.is_nat == 0]
    return df
def prune_cancer_registry_data(df, location_meta_df):
    """Remove non-detailed locations; a bit of a hacky place to do it.

    If these are not removed, the national aggregate locations will
    contain both data that comes from aggregating with site id 2 and
    data from the non-aggregated output with whatever original site id
    was in there. This causes problems, one of which is cause fractions
    over 1.
    """
    # Ukraine data was supposed to be without Crimea and Sevastopol
    ukraine_nid_extract = (df['nid'] == 284465) & \
        (df['extract_type_id'] == 53)
    assert (df[ukraine_nid_extract]['location_id'] == 63).all(), \
        "Now ukraine data has more than just ukraine national, and code " \
        "should be changed"
    df.loc[ukraine_nid_extract, 'location_id'] = 50559

    df = add_location_metadata(df, ['most_detailed'],
                               location_meta_df=location_meta_df)
    report_if_merge_fail(df, 'most_detailed', 'location_id')

    # keep only most-detailed locations; the relabeled Ukraine data is
    # the exception and gets split up later
    df = df.query('most_detailed == 1')
    df = df.drop('most_detailed', axis=1)
    return df
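# A toy illustration (made-up numbers) of the problem the docstring
# describes: if a national aggregate row coexists with the detailed rows
# it was built from, summing to the national level double-counts deaths
# and can push cause fractions over 1.
import pandas as pd

rows = pd.DataFrame({
    'location_id': [10, 11, 12],    # 10 = national, 11/12 = detailed
    'most_detailed': [0, 1, 1],
    'deaths': [80.0, 50.0, 30.0]})
national_sample_size = 100.0

# summing everything double-counts: 160 deaths over a sample of 100
cf_bad = rows['deaths'].sum() / national_sample_size    # 1.6
# pruning to most_detailed == 1 first keeps the total right
pruned = rows.query('most_detailed == 1')
cf_good = pruned['deaths'].sum() / national_sample_size  # 0.8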
def set_iso3_on_data(self):
    self.df = add_location_metadata(self.df, 'ihme_loc_id',
                                    location_meta_df=self.loc_meta_df)
    self.df['iso3'] = self.df['ihme_loc_id'].str[0:3]
    self.df = self.df.drop('ihme_loc_id', axis=1)