Example #1
    def simple_aggregate(self):
        """Aggregate causes."""
        df = add_cause_metadata(self.df,
                                ['secret_cause', 'parent_id', 'level'],
                                merge_col='cause_id',
                                cause_meta_df=self.cause_meta_df)
        # quick check that there are no secret causes
        secret_causes = df.loc[df['secret_cause'] == 1]
        if len(secret_causes) > 0:
            raise AssertionError("The following secret causes are still "
                                 "in the data: \n{}".format(
                                     secret_causes['cause_id'].unique()))
        cause_levels = sorted(range(2, 6, 1), reverse=True)

        for level in cause_levels:
            level_df = df[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the cause_id with the parent_id
                level_df['cause_id'] = level_df['parent_id']
                level_df['level'] = level_df['level'] - 1
                level_df.drop('parent_id', axis=1, inplace=True)

                level_df = add_cause_metadata(level_df, ['parent_id'],
                                              merge_col=['cause_id'],
                                              cause_meta_df=self.cause_meta_df)
                # add in deaths by each level
                df = pd.concat([level_df, df], ignore_index=True)
        return df
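None of these snippets define add_cause_metadata itself. Judging purely from the call sites collected here, it behaves like a left merge of the requested hierarchy columns onto the data; a minimal sketch of that inferred behavior (a cached variant seen in Example #9, which takes cache arguments instead of cause_meta_df, is not covered):

import pandas as pd

def add_cause_metadata(df, add_cols, merge_col='cause_id', cause_meta_df=None):
    """Hypothetical re-implementation inferred from the call sites here."""
    if isinstance(add_cols, str):
        add_cols = [add_cols]
    if isinstance(merge_col, str):
        merge_col = [merge_col]
    # pull only the merge keys plus the requested columns from the hierarchy
    cols = merge_col + [c for c in add_cols if c not in merge_col]
    meta = cause_meta_df[cols].drop_duplicates()
    return df.merge(meta, on=merge_col, how='left')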
Example #2
    def level_3_aggregate(self):
        """Aggregate level 4 causes to their level 3 parent."""
        df = add_cause_metadata(self.df,
                                ['secret_cause', 'parent_id', 'level'],
                                merge_col='cause_id',
                                cause_meta_df=self.cause_meta_df)
        # quick check that there are no secret causes
        secret_causes = df.loc[df['secret_cause'] == 1]
        if len(secret_causes) > 0:
            raise AssertionError("The following secret causes are still "
                                 "in the data: \n{}".format(
                                     secret_causes['cause_id'].unique()))

        for level in [5, 4, 3]:
            level_df = df[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the cause_id with the parent_id
                level_df['cause_id'] = level_df['parent_id']
                level_df['level'] = level_df['level'] - 1
                level_df.drop('parent_id', axis=1, inplace=True)
                # add parent_id back in for the newly changed cause_id
                # tried with mapping and this was faster
                level_df = add_cause_metadata(level_df, ['parent_id'],
                                              merge_col=['cause_id'],
                                              cause_meta_df=self.cause_meta_df)
                # add in deaths by each level
                df = pd.concat([level_df, df], ignore_index=True)
        return df
Example #3
    def level_3_aggregate(self):
        """Aggregate level 4 causes to their level 3 parent.

        This custom cause aggregation is only used where a source contains a
        single type of cause and we only want to aggregate to its parent.
        Example: data with only road traffic injuries should be aggregated
        to 'inj_trans_road', not all the way up to 'inj_trans'.
        """
        df = add_cause_metadata(self.df,
                                ['secret_cause', 'parent_id', 'level'],
                                merge_col='cause_id',
                                cause_meta_df=self.cause_meta_df)
        # quick check that there are no secret causes
        secret_causes = df.loc[df['secret_cause'] == 1]
        if len(secret_causes) > 0:
            raise AssertionError("The following secret causes are still "
                                 "in the data: \n{}".format(
                                     secret_causes['cause_id'].unique()))

        for level in [5, 4, 3]:
            level_df = df[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the cause_id with the parent_id
                level_df['cause_id'] = level_df['parent_id']
                level_df['level'] = level_df['level'] - 1
                level_df.drop('parent_id', axis=1, inplace=True)
                # add parent_id back in for the newly changed cause_id
                # tried with mapping and this was faster
                level_df = add_cause_metadata(level_df, ['parent_id'],
                                              merge_col=['cause_id'],
                                              cause_meta_df=self.cause_meta_df)
                # add in deaths by each level
                df = pd.concat([level_df, df], ignore_index=True)
        return df
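A self-contained toy run (made-up cause IDs, single 300 -> 200 -> 100 chain) showing why the descending level order matters: rows appended at one level are picked up by the next pass, so deaths cascade all the way up the hierarchy:

import pandas as pd

cause_meta = pd.DataFrame({
    'cause_id':  [300, 200, 100],
    'parent_id': [200, 100, 0],
    'level':     [3, 2, 1],
})
deaths = pd.DataFrame({'cause_id': [300], 'deaths': [10.0]})

df = deaths.merge(cause_meta, on='cause_id')
for level in [3, 2]:  # descending, so new parents feed the next pass
    level_df = df[df['level'] == level].copy()
    level_df['cause_id'] = level_df['parent_id']
    level_df['level'] -= 1
    level_df = level_df.drop(columns='parent_id').merge(
        cause_meta[['cause_id', 'parent_id']], on='cause_id')
    df = pd.concat([level_df, df], ignore_index=True)

print(df[['cause_id', 'level', 'deaths']])
# one row each for causes 300, 200, and 100, all carrying 10 deaths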
Example #4
    def simple_aggregate(self):
        """Aggregate causes."""
        df = add_cause_metadata(self.df,
                                ['secret_cause', 'parent_id', 'level'],
                                merge_col='cause_id',
                                cause_meta_df=self.cause_meta_df)
        # quick check that there are no secret causes
        secret_causes = df.loc[df['secret_cause'] == 1]
        if len(secret_causes) > 0:
            raise AssertionError("The following secret causes are still "
                                 "in the data: \n{}".format(
                                     secret_causes['cause_id'].unique()))
        cause_levels = sorted(range(2, 6, 1), reverse=True)
        # TODO
        # there's another bit in Stata where we only aggregate causes
        # that belong to the same (source, source_label, subdiv, NID) groups;
        # maybe this is not needed now that we're going by NID?
        for level in cause_levels:
            level_df = df[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the cause_id with the parent_id
                level_df['cause_id'] = level_df['parent_id']
                level_df['level'] = level_df['level'] - 1
                level_df.drop('parent_id', axis=1, inplace=True)
                # add parent_id back in for the newly changed cause_id
                # tried with mapping and this was faster
                level_df = add_cause_metadata(level_df, ['parent_id'],
                                              merge_col=['cause_id'],
                                              cause_meta_df=self.cause_meta_df)
                # add in deaths by each level
                df = pd.concat([level_df, df], ignore_index=True)
        return df
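For what it's worth, the cause_levels expression used in both simple_aggregate variants resolves to a descending list, which is what lets each pass feed the next:

cause_levels = sorted(range(2, 6, 1), reverse=True)  # [5, 4, 3, 2]
# an equivalent, arguably clearer spelling:
cause_levels = list(range(5, 1, -1))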
Example #5
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        # VA sources are the only ones where this may not work
        # might need to split dataframe by data_type_id for bridge map
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()
        df.drop(columns='data_type_id', inplace=True)

        if self.needs_bridging(has_verbal_autopsy):
            file_name = self.get_file_name(has_verbal_autopsy)
            map_df = pd.read_csv(self.bridge_map_path / file_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

            # perform zz bridge code redistribution before other bridge mapping
            bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped, map_df)

            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            # hack, this cause_id snuck in
            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
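As a rough illustration of the mapping step alone (acauses and bridge code invented; acause_to_bridge_code itself is not shown in the source): causes present in the map pick up a bridge_code, everything else is left NaN and needs no bridging:

import pandas as pd

deaths = pd.DataFrame({'acause': ['inj_war', 'cvd_ihd'], 'deaths': [5.0, 7.0]})
map_df = pd.DataFrame({'acause': ['inj_war'], 'bridge_code': ['inj_war_war']})

merged = deaths.merge(map_df, how='left', on='acause')
# inj_war now carries a bridge_code; cvd_ihd has NaN (no bridging needed)
print(merged)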
Example #6
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()

        if self.needs_bridging(has_verbal_autopsy):
            sheet_name = self.get_sheet_name(has_verbal_autopsy)
            map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)
            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
Example #7
    def get_rates_df(self, cause_meta_df):
        """Read the cause-specific relative rates used for HIV correction."""
        if self.correct_garbage:
            filepath_infix = 'PRE'
        else:
            filepath_infix = 'POST'

        rates = pd.read_stata("FILEPATH".format(fp=filepath_infix,
                                                iso=self.iso3))
        rates = add_cause_metadata(rates, ['cause_id'],
                                   merge_col='acause',
                                   cause_meta_df=cause_meta_df)
        rates.loc[rates['acause'] == "_sepsis_gc",
                  'cause_id'] = self.sepsis_cause_id

        age_df = get_cod_ages()
        age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])]
        age_df['agecat'] = age_df['age_group_years_start']
        age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
        age_df = age_df[['agecat', 'age_group_id']]
        # merge on ages to rates data
        rates = rates.merge(age_df, on='agecat', how='left')
        assert not rates['age_group_id'].isnull().any()
        # clean up columns
        rates = rates.rename(columns={'sex': 'sex_id'})
        rates = rates.drop(['acause', 'agecat'], axis=1)
        return rates
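A hand-built stand-in for the age merge (IDs and year values here are illustrative, not pulled from get_cod_ages): the Stata rates are keyed by age in years, so agecat is built from age_group_years_start, with the under-1 group collapsed to 0 so it joins cleanly:

import pandas as pd

age_df = pd.DataFrame({
    'age_group_id': [4, 5, 6],                      # invented subset
    'age_group_years_start': [0.076712, 1.0, 5.0],
})
age_df['agecat'] = age_df['age_group_years_start']
age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0  # under-1 joins as age 0

rates = pd.DataFrame({'agecat': [0.0, 1.0, 5.0], 'rate': [0.9, 0.5, 0.2]})
rates = rates.merge(age_df[['agecat', 'age_group_id']], on='agecat', how='left')
assert not rates['age_group_id'].isnull().any()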
Example #8
    def get_rates_df(self, cause_meta_df):
        """Write a nice description here."""

        if self.correct_garbage:
            filepath_infix = 'PRE'
        else:
            filepath_infix = 'POST'

        rates_path = self.conf.get_resource(
            'hivcorr_global_causespecific_relrates').format(
                pre_post=filepath_infix, iso=self.iso3)
        rates = pd.read_stata(rates_path)
        # convert acause to cause_id
        rates = add_cause_metadata(rates, ['cause_id'],
                                   merge_col='acause',
                                   cause_meta_df=cause_meta_df)
        rates.loc[rates['acause'] == "_sepsis_gc",
                  'cause_id'] = self.sepsis_cause_id
        # convert age to age_group_id
        # TODO THIS NEEDS TO BE CHANGED B/C RIGHT NOW IT QUERIES DB
        # change get_demographics to have caching option too?
        age_df = get_cod_ages()
        age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])]
        age_df['agecat'] = age_df['age_group_years_start']
        age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
        age_df = age_df[['agecat', 'age_group_id']]
        # merge on ages to rates data
        rates = rates.merge(age_df, on='agecat', how='left')
        assert not rates['age_group_id'].isnull().any()
        # clean up columns
        rates = rates.rename(columns={'sex': 'sex_id'})
        rates = rates.drop(['acause', 'agecat'], axis=1)
        return rates
Example #9
    def assert_valid_mappings(self, df, code_system_id):
        """Test that the mapping worked.

        Runs a suite of assertions to make sure that mapping was successful.

        Args:
            df (DataFrame): with at least code_id and cause_id
        Returns:
            None
        Raises:
            AssertionError: If any condition fails.
        """
        # add code value from cached code map
        print("Adding value")
        df = add_code_metadata(df, ['value'],
                               code_system_id,
                               force_rerun=False,
                               block_rerun=True,
                               cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'value', 'code_id')
        # get acause from cached cause hierarchy
        print("Adding acause")
        df = add_cause_metadata(df, ['acause'],
                                cause_set_version_id=self.cause_set_version_id,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'acause', 'cause_id')

        # Test that all causes starting with 'acause_' are mapped correctly.
        # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
        # 'acause__gc_X59' should be mapped to '_gc', etc.
        print("Checking implied acauses")
        check_df = df.loc[df['value'].str.startswith('acause_')].copy()
        check_df['implied_acause'] = \
            check_df['value'].str.replace('acause_', '', 1)

        check_df.loc[check_df['value'].str.contains("acause__gc"),
                     'implied_acause'] = "_gc"
        bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
        if len(bad_df) > 0:
            bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
            raise AssertionError(
                "These code values do not match their acause: "
                "\n{}".format(bad_stuff))

        print("Checking for bad values")
        # assert incorrect acauses are gone
        bad_acauses = [
            'acause_digest_gastrititis', 'acause_hiv_tb', 'acause_tb_drug'
        ]

        bad_values = df.loc[df['value'].isin(bad_acauses)]['value'].unique()
        if len(bad_values) > 0:
            raise AssertionError(
                "Found these bad code values in the data: {}".format(
                    bad_values))
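The implied-acause convention can be spot-checked on two toy values: 'acause_cvd' must resolve to the parent cause 'cvd' (not 'cvd_ihd'), while any 'acause__gc_*' value collapses to '_gc':

import pandas as pd

values = pd.Series(['acause_cvd', 'acause__gc_X59'])
implied = values.str.replace('acause_', '', n=1)  # strip only the first prefix
implied[values.str.contains('acause__gc')] = '_gc'
print(list(implied))  # ['cvd', '_gc']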
Example #10
    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        # important to run this full method first
        df = self.get_computed_dataframe()
        # cause metadata was changed in the process, so merge that on again
        # add on cause metadata to see parent_ids and cause levels
        df = add_cause_metadata(df, ['parent_id', 'level'],
                                merge_col='cause_id',
                                cause_meta_df=self.cause_meta_df)
        return df
Example #11
def drop_source_data(df, model_group, location_hierarchy, cause_meta_df):
    """Drop source specific data from model dataframe."""
    # for India VA
    if model_group == "VA-IND":
        srs = df['source'].str.startswith("India_SRS")
        scd = df['source'].str.startswith("India_SCD")
        df = df[~(srs | scd)]

    # special conditions for maternal sources
    if model_group.startswith("MATERNAL"):

        # grab countries for which we produce subnational estimates
        ihme_loc_dict = location_hierarchy.set_index(
            'location_id')['ihme_loc_id'].to_dict()
        df['iso3'] = df['location_id'].map(ihme_loc_dict)
        subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
        agg_locs = df['iso3'].isin(subnational_modeled_iso3s)

        # sources allowed to have location aggregates in model_df
        agg_sources = ["Other_Maternal", "Mexico_BIRMM"]
        no_loc_agg_source = ~(df['source'].isin(agg_sources))

        # drop rows where the source shouldn't have a location aggregate
        df = df[~(no_loc_agg_source & agg_locs)]

        # cleanup extra columns
        df.drop('iso3', axis=1, inplace=True)

    # these causes were likely introduced when adding in rd variance
    df = add_cause_metadata(df, 'yld_only', cause_meta_df=cause_meta_df)
    df = df.loc[df['yld_only'] != 1]
    df = df.drop('yld_only', axis=1)

    # VR data for Bolivia is dropped due to low completeness, only keep Chagas
    if model_group == 'VR-BOL':
        chagas = df['cause_id'] == 346
        df = df.loc[chagas]

    # for the malaria model groups, only keep malaria
    malaria = df['cause_id'] == 345
    if model_group.startswith('malaria'):
        df = df.loc[malaria]

    if model_group in ["malaria_IND_hypoendem", "malaria_IND_SRS_hypoendem"]:
        df = df.query('location_id != 163')

    if model_group in ["malaria_IND_mesoendem", "malaria_IND_SRS_mesoendem"]:
        df = df.loc[~(df['location_id'].isin([43902, 43938]))]

    return df
Example #12
def assign_code_to_created_target_deaths(df, code_system_id, cause_meta_df):
    """Assign a code_id to deaths rows created for new target causes."""
    created = df[df['_merge'] == 'right_only']
    original = df[df['_merge'] != 'right_only']
    created = add_cause_metadata(created,
                                 'acause',
                                 cause_meta_df=cause_meta_df)
    created['value'] = created['acause'].apply(lambda x: 'acause_' + x)
    created.drop(['code_id', 'acause'], axis=1, inplace=True)
    created = add_code_metadata(created,
                                'code_id',
                                code_system_id=code_system_id,
                                merge_col='value',
                                cache_dir=CONF.get_directory('db_cache'),
                                force_rerun=False,
                                block_rerun=True)
    report_if_merge_fail(created, 'code_id', ['value'])
    df = pd.concat([original, created])
    df.drop(['_merge', 'value'], axis=1, inplace=True)
    return df
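The _merge flag implies the upstream merge was done with indicator=True. A stripped-down sketch (toy cause_ids and acauses) of how rows created for new targets get the special 'acause_<acause>' value that is then looked up in the code system:

import pandas as pd

data = pd.DataFrame({'cause_id': [1, 2], 'deaths': [3.0, 4.0]})
targets = pd.DataFrame({'cause_id': [2, 3], 'acause': ['b', 'c']})

# indicator=True adds the _merge column the function filters on
df = data.merge(targets, on='cause_id', how='outer', indicator=True)
created = df[df['_merge'] == 'right_only'].copy()  # targets with no deaths yet
created['value'] = 'acause_' + created['acause']
print(created[['cause_id', 'acause', 'value']])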
Example #13
File: modeling.py  Project: agesak/thesis
def drop_age_restricted_cols(df):
    start = len(df)
    age_meta_df = get_ages(force_rerun=False, block_rerun=True)
    # secret causes in restrictions
    cause_meta_df = get_current_cause_hierarchy(cause_set_id=4,
                                                **{
                                                    'block_rerun': True,
                                                    'force_rerun': False
                                                })
    restrict_df = pd.read_csv(
        "/homes/agesak/thesis/maps/injuries_overrides.csv")
    restrict_df = add_cause_metadata(restrict_df,
                                     add_cols='cause_id',
                                     merge_col='acause',
                                     cause_meta_df=cause_meta_df)
    restrict_df["age_start_group"] = restrict_df["age_start_group"].fillna(0)

    orig_cols = df.columns
    df = add_age_metadata(
        df,
        add_cols=['age_group_years_start', 'age_group_years_end'],
        age_meta_df=age_meta_df)

    df = df.merge(restrict_df, on='cause_id', how='left')

    # age_group_years_end is weird, 0-14 means age_group_years_end 15
    too_young = df["age_group_years_end"] <= df["age_start_group"]
    too_old = df["age_group_years_start"] > df["age_end_group"]

    df = df[~(too_young | too_old)]
    df = df[orig_cols]
    end = len(df)
    print_log_message(
        f"dropping {start - end} rows that violate age restrictions")

    return df
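The interval logic hinges on age_group_years_end being exclusive (per the comment above, a 0-14 group has age_group_years_end 15). Toy rows with an invented 15-69 restriction show which age groups survive:

import pandas as pd

df = pd.DataFrame({
    'age_group_years_start': [0, 15, 50],
    'age_group_years_end':   [15, 50, 70],
    'age_start_group':       [15, 15, 15],   # restriction: ages 15-69 only
    'age_end_group':         [69, 69, 69],
})
too_young = df['age_group_years_end'] <= df['age_start_group']
too_old = df['age_group_years_start'] > df['age_end_group']
print(df[~(too_young | too_old)])  # keeps the 15-50 and 50-70 rows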
Example #14
def drop_source_data(df, model_group, location_hierarchy, cause_meta_df):
    """Drop source specific data from model dataframe.

    Note: technically only the DHS survey types in
    Other_Maternal should have a location aggregation going
    into nr, however, this messes up raking later.
    """
    # for India VA
    if model_group == "VA-IND":
        srs = df['source'].str.startswith("India_SRS")
        scd = df['source'].str.startswith("India_SCD")
        df = df[~(srs | scd)]

    # for South Asia VA model group, Nepal_Burden_VA needs to be dropped
    if model_group == 'VA-158':
        df = df[df.source != "Nepal_Burden_VA"]

    # special conditions for maternal sources
    if model_group.startswith("MATERNAL"):

        # grab countries for which we produce subnational estimates
        ihme_loc_dict = location_hierarchy.set_index(
            'location_id')['ihme_loc_id'].to_dict()
        df['iso3'] = df['location_id'].map(ihme_loc_dict)
        subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
        agg_locs = df['iso3'].isin(subnational_modeled_iso3s)

        # sources allowed to have location aggregates in model_df
        agg_sources = ["Other_Maternal", "Mexico_BIRMM"]
        no_loc_agg_source = ~(df['source'].isin(agg_sources))

        # drop rows where the source shouldn't have a location aggregate
        df = df[~(no_loc_agg_source & agg_locs)]

        # cleanup extra columns
        df.drop('iso3', axis=1, inplace=True)

    # these causes were likely introduced when adding in rd variance
    df = add_cause_metadata(df, 'yld_only', cause_meta_df=cause_meta_df)
    df = df.loc[df['yld_only'] != 1]
    df = df.drop('yld_only', axis=1)

    # VR data for Bolivia is dropped due to low completeness, only keep Chagas
    if model_group == 'VR-BOL':
        chagas = df['cause_id'] == 346
        df = df.loc[chagas]

    # for the malaria model groups, only keep malaria
    malaria = df['cause_id'] == 345
    if model_group.startswith('malaria'):
        df = df.loc[malaria]
    # two model groups for India, but don't want national India to be duplicated
    # drop national level India data from this malaria model group
    # it will then only be included in the malaria_IND_mesoendem group
    if model_group in ["malaria_IND_hypoendem", "malaria_IND_SRS_hypoendem"]:
        df = df.query('location_id != 163')
    # similar fix for Telangana: drop from mesoendem so it'll be in hypoendem
    if model_group in ["malaria_IND_mesoendem", "malaria_IND_SRS_mesoendem"]:
        df = df.loc[~(df['location_id'].isin([43902, 43938]))]

    return df