Exemplo n.º 1
0
    def get_computed_dataframe(self, df):
        """Split the anemia rows of ``df`` onto target causes.

        Rows whose ``cause_id`` equals ``self.anemia_cause_id`` are merged
        with the proportions read from the CSV at ``self.anemia_props_path``
        (keyed on iso3, year, age group, sex and cause). Each merged row
        takes ``target_cause_id`` as its cause and has every column in
        ``self.cf_cols`` multiplied by ``anemia_prop``. The split rows then
        replace the anemia rows of ``df`` and rows sharing the same id
        columns are summed.

        Side effect: ``self.diag_df`` is set to the merged anemia frame.

        Args:
            df: DataFrame holding at least ``cause_id``, ``year_id``,
                ``age_group_id``, ``sex_id``, ``cf``, ``sample_size`` and
                every column named in ``self.cf_cols``.

        Returns:
            DataFrame with the same columns as ``df`` (id columns first, in
            their incoming order, followed by ``self.cf_cols``).

        Raises:
            AssertionError: if the proportions do not sum to 1 within each
                group of id columns, or if ``(cf * sample_size).sum()``
                differs before and after the split.
        """
        original_columns = list(df.columns)
        cf_col_set = set(self.cf_cols)
        # Derive the id columns in input order; set arithmetic would make
        # the output column/row order depend on string hashing.
        id_cols = [col for col in original_columns if col not in cf_col_set]

        orig_deaths_sum = (df['cf'] * df['sample_size']).sum()

        anemia_props = pd.read_csv(self.anemia_props_path)
        anemia_df = df.loc[df['cause_id'] == self.anemia_cause_id]
        anemia_df = add_location_metadata(
            anemia_df, 'ihme_loc_id',
            location_set_version_id=self.location_set_version_id,
            force_rerun=False
        )
        # iso3 is the first three characters of ihme_loc_id
        anemia_df['iso3'] = anemia_df['ihme_loc_id'].str.slice(0, 3)
        unique_iso3s = list(anemia_df['iso3'].unique())
        merge_props = anemia_props.loc[
            anemia_props['iso3'].isin(unique_iso3s)
        ]

        # Years before 1990 reuse the 1990 proportions.
        years_under_90 = [
            year for year in anemia_df['year_id'].unique() if year < 1990
        ]
        if years_under_90:
            props_90 = merge_props.query('year_id == 1990')
            backfilled = [
                props_90.assign(year_id=copy_year)
                for copy_year in years_under_90
            ]
            # DataFrame.append was removed in pandas 2.0; a single concat
            # also avoids re-copying merge_props once per year.
            merge_props = pd.concat(
                [merge_props] + backfilled, ignore_index=True)

        anemia_df = anemia_df.merge(
            merge_props,
            on=['iso3', 'year_id', 'age_group_id', 'sex_id', 'cause_id'],
            how='left'
        )
        # NOTE(review): diag_df is the same object as anemia_df, so the
        # in-place cause_id / cf changes below are visible through
        # self.diag_df as well -- confirm that this is intended.
        self.diag_df = anemia_df

        assert np.allclose(
            anemia_df.groupby(id_cols)['anemia_prop'].sum(),
            1
        ), "anemia proportions do not sum to 1 within id columns"

        anemia_df['cause_id'] = anemia_df['target_cause_id']
        for cf_col in self.cf_cols:
            anemia_df[cf_col] = anemia_df[cf_col] * anemia_df['anemia_prop']

        # drop the helper columns added for this adjustment
        anemia_df = anemia_df[original_columns]

        df = df.loc[df['cause_id'] != self.anemia_cause_id]
        df = pd.concat([df, anemia_df], ignore_index=True)

        # collapse id combinations that now appear more than once
        sum_cols = self.cf_cols
        group_cols = [col for col in df.columns if col not in cf_col_set]
        df = df.groupby(group_cols, as_index=False)[sum_cols].sum()

        new_deaths_sum = (df['cf'] * df['sample_size']).sum()

        assert np.allclose(orig_deaths_sum, new_deaths_sum), \
            "total deaths changed while splitting anemia"

        return df
Exemplo n.º 2
0
    def get_computed_dataframe(self, df):
        """Redistribute anemia rows in ``df`` onto a set of target causes.

        The rows with ``cause_id == self.anemia_cause_id`` are joined to the
        proportions stored in the CSV at ``self.anemia_props_path`` and
        multiplied out: ``cause_id`` becomes ``target_cause_id`` and each
        column in ``self.cf_cols`` is scaled by ``anemia_prop``. The split
        rows replace the original anemia rows, and duplicated id
        combinations are summed together.

        Side effect: stores the merged anemia frame on ``self.diag_df``.

        Args:
            df: DataFrame holding at least ``cause_id``, ``year_id``,
                ``age_group_id``, ``sex_id``, ``cf``, ``sample_size`` and
                every column named in ``self.cf_cols``.

        Returns:
            DataFrame with the same columns as ``df`` (id columns in their
            incoming order, then ``self.cf_cols``).

        Raises:
            AssertionError: if proportions do not sum to 1 per group of id
                columns, or if ``(cf * sample_size).sum()`` is not preserved.
        """
        original_columns = list(df.columns)
        cf_col_set = set(self.cf_cols)

        orig_deaths_sum = (df['cf'] * df['sample_size']).sum()

        # split the anemia in the data onto a set of target causes with
        # proportions that add to 1
        anemia_props = pd.read_csv(self.anemia_props_path)
        anemia_df = df.loc[df['cause_id'] == self.anemia_cause_id]
        anemia_df = add_location_metadata(
            anemia_df,
            'ihme_loc_id',
            location_set_version_id=self.location_set_version_id,
            force_rerun=False)
        anemia_df['iso3'] = anemia_df['ihme_loc_id'].str.slice(0, 3)
        unique_iso3s = list(anemia_df['iso3'].unique())
        merge_props = anemia_props.loc[anemia_props['iso3'].isin(unique_iso3s)]

        # data years before 1990 borrow the 1990 proportions
        unique_years = list(anemia_df.year_id.unique())
        years_under_90 = [u for u in unique_years if u < 1990]
        if years_under_90:
            props_90 = merge_props.query('year_id == 1990')
            year_copies = []
            for copy_year in years_under_90:
                copy_props = props_90.copy()
                copy_props['year_id'] = copy_year
                year_copies.append(copy_props)
            # pd.concat replaces DataFrame.append (removed in pandas 2.0)
            # and stacks every copied year in one pass
            merge_props = pd.concat(
                [merge_props] + year_copies, ignore_index=True)

        anemia_df = anemia_df.merge(
            merge_props,
            on=['iso3', 'year_id', 'age_group_id', 'sex_id', 'cause_id'],
            how='left')
        # keep the merged anemia df as diag df
        # NOTE(review): this stores a reference, not a copy, so the cause_id
        # and cf changes made below also show up in self.diag_df -- confirm
        # whether an untouched snapshot was intended.
        self.diag_df = anemia_df

        # id columns in input order (set arithmetic would give an arbitrary,
        # hash-dependent order)
        sum_to_one_id_cols = [
            col for col in original_columns if col not in cf_col_set
        ]
        assert np.allclose(
            anemia_df.groupby(sum_to_one_id_cols)['anemia_prop'].sum(), 1), \
            "anemia proportions do not sum to 1 within id columns"

        anemia_df['cause_id'] = anemia_df['target_cause_id']
        for cf_col in self.cf_cols:
            anemia_df[cf_col] = anemia_df[cf_col] * anemia_df['anemia_prop']

        # remove extra columns used in this adjustment
        anemia_df = anemia_df[original_columns]

        # replace the anemia data in the incoming df with the newly split
        # anemia data
        df = df.loc[df['cause_id'] != self.anemia_cause_id]
        df = pd.concat([df, anemia_df], ignore_index=True)

        # collapse together potential duplicate location-year-age-sex-causes
        # introduced from splitting anemia onto targets
        sum_cols = self.cf_cols
        group_cols = [col for col in df.columns if col not in cf_col_set]
        df = df.groupby(group_cols, as_index=False)[sum_cols].sum()

        new_deaths_sum = (df['cf'] * df['sample_size']).sum()

        assert np.allclose(orig_deaths_sum, new_deaths_sum), \
            "total deaths changed while splitting anemia"

        return df
Exemplo n.º 3
0
 def add_iso3(self, df, location_hierarchy):
     """Return ``df`` with an ``iso3`` column added.

     ``ihme_loc_id`` is attached via ``add_location_metadata`` using
     ``location_hierarchy``; ``iso3`` is its first three characters. The
     temporary ``ihme_loc_id`` column is removed before returning.
     """
     with_loc_code = add_location_metadata(
         df, 'ihme_loc_id', location_meta_df=location_hierarchy)
     with_loc_code['iso3'] = with_loc_code['ihme_loc_id'].str.slice(0, 3)
     # drop in place so the returned object is the one produced above
     with_loc_code.drop(columns=['ihme_loc_id'], inplace=True)
     return with_loc_code
def prune_cancer_registry_data(df, location_meta_df):
    """Keep only the rows of ``df`` that sit at most-detailed locations.

    Before filtering, rows from nid 284465 / extract type 53 are checked to
    all be at location 63 (Ukraine national, per the assertion message) and
    are moved to location 50559. Note that this relabelling is written into
    the incoming ``df`` itself.

    Raises:
        AssertionError: if any of those rows is not at location 63.
    """
    is_ukraine_extract = (
        (df['nid'] == 284465) & (df['extract_type_id'] == 53)
    )
    ukraine_locations = df.loc[is_ukraine_extract, 'location_id']
    assert (ukraine_locations == 63).all(), (
        "Now ukraine data has more than just ukraine national, and code "
        "should be changed"
    )
    # NOTE(review): 50559 is presumably the Ukraine location that excludes
    # Crimea/Sevastopol -- confirm against the location hierarchy.
    df.loc[is_ukraine_extract, 'location_id'] = 50559

    flagged = add_location_metadata(
        df, ['most_detailed'], location_meta_df=location_meta_df)
    report_if_merge_fail(flagged, 'most_detailed', 'location_id')

    detailed_only = flagged.loc[flagged['most_detailed'] == 1]
    return detailed_only.drop(columns=['most_detailed'])
Exemplo n.º 5
0
    def prep_rti_fractions(self, df, age_meta_df, location_meta_df):
        """Prepare the data and the RTI fractions table.

        ``df`` gains ``simple_age`` (via ``add_age_metadata``) and ``iso3``
        (the first three characters of ``ihme_loc_id``). The fractions are
        read from the Stata file registered as the ``"RTI_fractions"``
        resource; ``age``/``year``/``sex`` are renamed to
        ``simple_age``/``year_id``/``sex_id`` and the 2011 rows are
        duplicated as 2012.

        Args:
            df: incoming data; must carry what ``add_age_metadata`` and
                ``add_location_metadata`` need to merge on.
            age_meta_df: age metadata passed to ``add_age_metadata``.
            location_meta_df: location metadata passed to
                ``add_location_metadata``.

        Returns:
            Tuple ``(df, rti_fractions)``.
        """
        # modify df
        df = add_age_metadata(df, 'simple_age', age_meta_df=age_meta_df)
        df = add_location_metadata(df, 'ihme_loc_id', location_meta_df=location_meta_df)
        # Slice each string, not the Series: ``df['ihme_loc_id'][:3]`` takes
        # the first three ROWS and leaves iso3 NaN everywhere else.
        df['iso3'] = df['ihme_loc_id'].str[:3]
        df = df.drop('ihme_loc_id', axis=1)

        rti_fractions = CONF.get_resource("RTI_fractions")
        rti_fractions = pd.read_stata(rti_fractions)
        rti_fractions = rti_fractions.rename(columns={'age': 'simple_age',
                                                      'year': 'year_id',
                                                      'sex': 'sex_id'})
        # Reuse the 2011 fractions for 2012. ``assign`` builds a new frame,
        # so nothing is written into a filtered slice of rti_fractions.
        twenty_twelve = rti_fractions.loc[
            rti_fractions['year_id'] == 2011
        ].assign(year_id=2012)
        # DataFrame.append was removed in pandas 2.0
        rti_fractions = pd.concat(
            [rti_fractions, twenty_twelve], ignore_index=True)
        return df, rti_fractions
Exemplo n.º 6
0
    def double_rake(self, df, location_hierarchy):
        """Rake twice when one location level sits between the most detailed
        level and the national level.

        Detailed locations are first raked to their intermediate parent and
        then to the national; intermediate locations are raked to the
        national directly. This might later be extended to data with several
        levels between most detailed and national (GBR only?).

        Raises:
            AssertionError: if there are no level-5 or no level-4 rows, or
                if the number of rows coming out differs from the input.
        """
        n_rows_in = len(df)

        # attach the location level so rows can be classified
        df = add_location_metadata(df, ['level'],
                                   location_meta_df=location_hierarchy)
        report_if_merge_fail(df, 'level', 'location_id')

        # levels are assumed to be 3 (national), 4 (intermediate/state)
        # and 5 (most detailed)
        is_national = df['level'] == 3
        is_intermediate = df['level'] == 4
        is_detail = df['level'] == 5

        assert is_detail.any(), (
            "Double raking assumes there is subnational data in the"
            " dataframe, however no subnational detail is present"
        )
        assert is_intermediate.any(), (
            "Attempting to double rake, but there are no"
            " intermediate locations in the data to rake to first"
        )
        intermediate_locs = (
            df.loc[is_intermediate, 'location_id'].unique().tolist()
        )
        # the masks are already computed, so the helper column can go
        df.drop(columns=['level'], inplace=True)

        # first rake: detailed locations to their intermediate parent
        once_raked_detail = self.rake_detail_to_intermediate(
            df.loc[is_detail | is_intermediate],
            location_hierarchy,
            intermediate_locs)

        # second rake: the once-raked detailed locations to the national
        twice_raked_detail = self.standard_rake(
            pd.concat([once_raked_detail, df.loc[is_national]],
                      ignore_index=True),
            location_hierarchy)
        twice_raked_detail = twice_raked_detail.loc[
            twice_raked_detail.is_nat == 0
        ]

        # intermediate locations are raked to the national on their own
        upper_levels = self.standard_rake(
            df.loc[is_intermediate | is_national], location_hierarchy)

        # national + single-raked intermediate + double-raked detail
        df = pd.concat([twice_raked_detail, upper_levels],
                       ignore_index=True)
        assert n_rows_in == len(df), (
            "The number of rows have changed,"
            " this really shouldn't happen."
        )
        return df
Exemplo n.º 7
0
 def rake_detail_to_intermediate(self, df, location_hierarchy,
                                 intermediate_locs):
     """Rake the detailed locations to their non-national parent.

     This has to be done one intermediate location at a time: each
     intermediate location and its children are passed to
     ``self.standard_rake`` together.

     Args:
         df: data containing the intermediate locations and their children.
         location_hierarchy: location metadata used to look up
             ``parent_id`` and passed on to ``standard_rake``.
         intermediate_locs: location ids of the intermediate locations.

     Returns:
         The raked rows with ``is_nat == 0``, concatenated across all
         intermediate locations.
     """
     # add parent_id to the data
     df = add_location_metadata(df, ['parent_id'],
                                location_meta_df=location_hierarchy)
     # loop through each intermediate loc and rake
     dfs = []
     for loc in intermediate_locs:
         # Explicit copy: the rows are edited below, and writing into a
         # bare ``.loc`` selection raises SettingWithCopyWarning.
         temp = df.loc[(df.parent_id == loc) | (df.location_id == loc)].copy()
         # treating the intermediate location as a national to align with
         # the logic that identifies aggregates in the raker
         temp.loc[temp.location_id == loc,
                  'location_id'] = temp['parent_id']
         temp = temp.drop('parent_id', axis=1)
         temp = self.standard_rake(temp, location_hierarchy)
         dfs.append(temp)
     df = pd.concat(dfs, ignore_index=True)
     # subset to just the detail locs, we'll add the intermediate and national on later
     df = df.loc[df.is_nat == 0]
     return df
Exemplo n.º 8
0
def prune_cancer_registry_data(df, location_meta_df):
    """Drop rows at non-detailed locations (admittedly a hacky place for it).

    Without this, the national aggregate locations would hold both data
    aggregated with site id 2 and non-aggregated output carrying whatever
    site id it originally had. Among other problems, that produces cause
    fractions over 1.

    The Ukraine extract (nid 284465, extract type 53) is relabelled from
    location 63 to 50559 first; this write goes into the incoming ``df``.

    Raises:
        AssertionError: if the Ukraine extract has rows outside location 63.
    """
    # Ukraine data was supposed to be without Crimea Sevastapol
    from_ukraine_nid = df['nid'] == 284465
    from_ukraine_extract = df['extract_type_id'] == 53
    ukraine_rows = from_ukraine_nid & from_ukraine_extract
    all_national = (df.loc[ukraine_rows, 'location_id'] == 63).all()
    assert all_national, (
        "Now ukraine data has more than just ukraine national, and code "
        "should be changed"
    )
    # relabelling here makes an exception for the Ukraine data, which gets
    # split up later
    df.loc[ukraine_rows, 'location_id'] = 50559

    df = add_location_metadata(df, ['most_detailed'],
                               location_meta_df=location_meta_df)
    report_if_merge_fail(df, 'most_detailed', 'location_id')

    keep = df['most_detailed'] == 1
    return df.loc[keep].drop(columns=['most_detailed'])
Exemplo n.º 9
0
 def set_iso3_on_data(self):
     """Add an ``iso3`` column to ``self.df``.

     ``ihme_loc_id`` is attached from ``self.loc_meta_df``, its first three
     characters become ``iso3``, and ``ihme_loc_id`` is dropped again.
     """
     def first_three(loc_code):
         return loc_code[0:3]

     self.df = add_location_metadata(
         self.df, 'ihme_loc_id', location_meta_df=self.loc_meta_df)
     self.df['iso3'] = self.df['ihme_loc_id'].apply(first_three)
     self.df = self.df.drop(columns=['ihme_loc_id'])