Example #1
def __init__(self, source, cause_meta_df, code_system):
    self.source = source
    self.code_system = code_system
    self.conf = Configurator("standard")
    self.bridge_map_path = Path(self.conf.get_directory('bridge_maps'))
    self.cause_meta_df = cause_meta_df
    self.cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': 'standard'
    }
Example #2
def __init__(self, cause_meta_df, source, code_system_id, data_type_id):
    self.source = source
    self.code_system_id = code_system_id
    self.data_type_id = data_type_id
    self.cause_meta_df = cause_meta_df
    self.conf = Configurator("standard")
    self.vr_indicators_path = self.conf.get_resource('vr_indicators')
    self.cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': self.conf.get_directory('db_cache')
    }
def run_phase(df, nid, extract_type_id, data_type_id, source,
              representative_id, code_system_id):
    """Prep source data by location, year, age, sex, and garbage level."""
    # set caching
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    print_log_message("Pulling map from cause to detail level")
    detail_level_map = get_map_to_package_metadata(code_system_id)

    # merge incoming data with cause detail level df
    print("Merging detail level onto data")
    df = merge_with_detail_map(df, detail_level_map)

    print_log_message("Determining national coverage")
    # get national representativeness
    # in Stata we used whether the source_type contained "national"
    # or "subnational", but since we no longer have that field, the best
    # we can do here is just use "representative_id"
    df = assign_nationally_representative(
        df, source, representative_id, data_type_id, cache_options
    )

    print_log_message("Collapsing data.")
    df = df.groupby(["location_id", "year_id", "nid", "extract_type_id",
                     "source", "data_type_id", "detail_level_id",
                     "nationally_representative", "age_group_id",
                     "sex_id"], as_index=False)["deaths"].sum()

    return df
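# A minimal usage sketch, assuming `df` carries code_id-level deaths plus the
# demographic ID columns used in the groupby above (all IDs hypothetical):
#
#   df = run_phase(df, nid=12345, extract_type_id=1, data_type_id=9,
#                  source="ICD10", representative_id=1, code_system_id=1)
#
# The result has one row per location/year/age/sex/detail-level combination
# with summed deaths.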
Example #4
class Recoder(CodProcess):
    """Recode cause_ids based on cause, age, sex, year, location,
    and source.
    """

    id_cols = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'age_group_id',
        'sex_id', 'cause_id', 'site_id'
    ]
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    def __init__(self, cause_meta_df, source, code_system_id, data_type_id):
        self.source = source
        self.code_system_id = code_system_id
        self.data_type_id = data_type_id
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")
        self.vr_indicators_path = self.conf.get_resource('vr_indicators')
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': self.conf.get_directory('db_cache')
        }

    def get_computed_dataframe(self, df):

        if 'data_type_id' not in df.columns:
            df = add_nid_metadata(df, "data_type_id", **self.cache_options)
        df = self.recode(df)
        df = self.conform_secret_causes(df)
        df = self.clean_up(df)

        return df

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        pass

    def recode_sids(self, df):
        path_to_4_stars_sheet = self.conf.get_resource("four_star_locations")
        four_five_star_locs = pd.read_csv(path_to_4_stars_sheet)
        four_five_star_locs = four_five_star_locs[['location_id']]
        four_five_star_locs = four_five_star_locs.location_id.unique()
        less_than_four_star = ~df['location_id'].isin(four_five_star_locs)
        is_sids = df['cause_id'] == 686
        df.loc[is_sids & less_than_four_star, 'cause_id'] = 380
        return df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df

    def conform_secret_causes(self, df):

        df = add_cause_metadata(df,
                                add_cols=['secret_cause', 'parent_id'],
                                cause_meta_df=self.cause_meta_df,
                                **self.cache_options)
        injuries_replace_parents = [722, 720, 719]
        replaced_injuries = df['cause_id'].isin(injuries_replace_parents)
        df.loc[replaced_injuries, 'parent_id'] = 723
        secret_causes = df['secret_cause'] == 1
        not_cc_code = df['cause_id'] != 919
        len_before = len(df)
        if df['parent_id'].isnull().values.any():
            raise AssertionError('There are missing parent cause_ids')
        df.loc[secret_causes & not_cc_code, 'cause_id'] = df['parent_id']
        len_after = len(df)
        if len_before != len_after:
            raise AssertionError(
                'The length of the dataframe has changed from {} to {}'.format(
                    len_before, len_after))
        df.drop(['parent_id', 'secret_cause'], axis=1, inplace=True)
        return df

    def drop_leukemia_subtypes(self, df):

        leuk_subtypes = get_all_related_causes('neo_leukemia',
                                               self.cause_meta_df)

        leuk_subtypes.remove(487)

        df.loc[(df['cause_id'].isin(leuk_subtypes)) & (df['deaths_rd'] > 0) &
               (df['deaths_raw'] <= 0), 'cause_id'] = 487

        return df

    def recode(self, df):

        cause_metadata_df = self.cause_meta_df
        cause_metadata_df = cause_metadata_df[[
            "cause_id", "path_to_top_parent", "acause"
        ]]
        ckd_cause_ids = get_all_related_causes('ckd', cause_metadata_df)
        ckd_cause_ids.remove(593)
        ckd_less_other = df['cause_id'].isin(ckd_cause_ids)
        neonate = df['age_group_id'].isin([2, 3])
        df.loc[ckd_less_other & neonate, 'cause_id'] = 652

        resp_ids = [509, 515, 516, 520]
        is_cert_resp_causes = df['cause_id'].isin(resp_ids)

        df.loc[is_cert_resp_causes & neonate, 'cause_id'] = 322

        is_asthma = df['cause_id'] == 515
        df.loc[is_asthma & (df['age_group_id'] == 4), 'cause_id'] = 322

        maternal_cause_ids = get_all_related_causes(366, cause_metadata_df)
        maternal_cause_ids = df['cause_id'].isin(maternal_cause_ids)

        non_maternal_ages = np.logical_not(df['age_group_id'].isin(
            [7, 8, 9, 10, 11, 12, 13, 14, 15, 22]))
        df.loc[maternal_cause_ids & non_maternal_ages, 'cause_id'] = 919

        alzheimers = df['cause_id'] == 543
        under_40 = df['age_group_id'].isin(range(1, 13, 1))
        df.loc[alzheimers & under_40, 'cause_id'] = 919

        cong_causes = get_all_related_causes('cong', cause_metadata_df)
        congenital = df['cause_id'].isin(cong_causes)
        over_70 = df['age_group_id'].isin([19, 20, 30, 31, 32, 235])
        df.loc[congenital & over_70, "cause_id"] = 919

        hepatitis = get_all_related_causes(400, cause_metadata_df)
        hepatitis = df['cause_id'].isin(hepatitis)
        if self.code_system_id in [7, 9]:
            df.loc[hepatitis & neonate, "cause_id"] = 380
        else:
            df.loc[hepatitis & neonate, "cause_id"] = 384

        inj_disaster_light = df['cause_id'] == 984
        df.loc[inj_disaster_light, 'cause_id'] = 716

        if self.code_system_id not in [1, 6]:
            ckd_diabetes = df['cause_id'].isin([997, 998])
            df.loc[ckd_diabetes, 'cause_id'] = 589

        if self.code_system_id not in [1, 6, 9]:
            diabetes_subtypes = df['cause_id'].isin([975, 976])
            df.loc[diabetes_subtypes, 'cause_id'] = 587

        diabetes_type_2 = df['cause_id'] == 976
        under_15 = df['age_group_id'] < 8
        df.loc[diabetes_type_2 & under_15, 'cause_id'] = 975

        iron_or_iodine = df['cause_id'].isin([388, 390])
        df.loc[iron_or_iodine, 'cause_id'] = 919

        under_1 = df['age_group_id'] < 5
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & under_1, 'cause_id'] = 643

        if 686 in df.cause_id.unique():
            df = self.recode_sids(df)

        df.loc[df.cause_id.isin([344, 409, 410, 542, 558, 669, 680, 961]),
               'cause_id'] = 919

        if self.data_type_id not in [6, 7, 8]:
            df.loc[df['cause_id'] == 687, 'cause_id'] = 919

        one_to_14 = df['age_group_id'].isin([5, 6, 7])
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & one_to_14, 'cause_id'] = 507

        cancer_recodes = get_all_related_causes([
            411, 414, 423, 426, 429, 432, 435, 438, 441, 444, 450, 453, 456,
            459, 462, 465, 468, 474, 486, 483
        ], cause_metadata_df)
        cancer_recodes = df['cause_id'].isin(cancer_recodes)
        cancer_ages = df['age_group_id'].isin(range(2, 8, 1))
        df.loc[cancer_recodes & cancer_ages, "cause_id"] = 489

        not_icd10 = self.code_system_id != 1
        neo_meso = df['cause_id'] == 483
        df.loc[neo_meso & not_icd10, "cause_id"] = 489

        if self.source.endswith("AAMSP"):
            digest_hernia = df['cause_id'].isin([531])
            df.loc[digest_hernia, "cause_id"] = 919

        if self.source == "":
            homicide_and_suicide = df['cause_id'].isin(
                [724, 725, 726, 727, 941, 718, 719, 720, 721, 722, 723])
            bad_years = df['year_id'].isin(range(2007, 2015))
            # _unintent
            df.loc[bad_years & homicide_and_suicide, "cause_id"] = 919

        inj_war = get_all_related_causes(945, cause_metadata_df)
        is_inj_war = df['cause_id'].isin(inj_war)
        jamaica = df['location_id'] == 115
        year_2005 = df['year_id'] == 2005
        vr = df['data_type_id'] == 9
        df.loc[is_inj_war & jamaica & year_2005 & vr, 'cause_id'] = 724

        inj_mech_gun = df['cause_id'] == 705
        year_2006 = df['year_id'] == 2006
        df.loc[inj_mech_gun & year_2006 & jamaica & vr, 'cause_id'] = 724

        if self.source == "ICD10":
            digest_ibd = df['cause_id'] == 532
            suriname = df['location_id'] == 118
            year_1995_2012 = df['year_id'].isin(range(1995, 2013, 1))
            df.loc[digest_ibd & suriname & year_1995_2012, 'cause_id'] = 526

        endo_procedural = df['cause_id'] == 624
        df.loc[endo_procedural, 'cause_id'] = 708

        schizo = df['cause_id'] == 559
        tibet = df['location_id'] == 518
        df.loc[schizo & tibet, 'cause_id'] = 919

        hiv = get_all_related_causes(298, cause_metadata_df)
        hiv = df['cause_id'].isin(hiv)
        pre_1980 = df['year_id'] < 1980
        df.loc[hiv & pre_1980, 'cause_id'] = 919

        diabetes_causes = get_all_related_causes(587, cause_metadata_df)
        diabetes = df['cause_id'].isin(diabetes_causes)
        df.loc[neonate & diabetes, 'cause_id'] = 380

        under_20 = df['age_group_id'].isin(range(0, 8, 1))
        stroke = get_all_related_causes('cvd_stroke', cause_metadata_df)
        stroke_deaths = df['cause_id'].isin(stroke)
        va = df['data_type_id'] == 8

        df.loc[under_20 & stroke_deaths & va, 'cause_id'] = 491

        over_95 = df['age_group_id'] == 235
        inj_trans_road_pedal = df['cause_id'] == 691
        df.loc[over_95 & inj_trans_road_pedal, 'cause_id'] = 919

        df.loc[schizo, 'cause_id'] = 919

        if self.source == "Russia_FMD_1999_2011":
            cvd_pvd = df['cause_id'] == 502
            df.loc[cvd_pvd, 'cause_id'] = 491

        if self.source == "":
            sui_homi_causes = [
                717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 941
            ]
            sui_homi = df['cause_id'].isin(sui_homi_causes)
            bad_years = df['year_id'].isin(range(2007, 2015))
            df.loc[sui_homi & bad_years, 'cause_id'] = 919

        if "India_MCCD" in self.source:
            non_neonates = np.logical_not(df['age_group_id'].isin([2, 3]))
            neonatal_sepsis = df['cause_id'].isin([])
            df.loc[non_neonates & neonatal_sepsis, 'cause_id'] = 380

        if self.source == "India_SCD_states_rural":
            warnings.warn("Implement SCD rd artifact recode")

        inj_war_execution = df['cause_id'] == 854

        if self.source == "ICD9_BTL":
            ecuador = df['location_id'] == 122
            year_1980_1990 = df['year_id'].isin(range(1980, 1991, 1))
            df.loc[inj_war_execution & ecuador & year_1980_1990,
                   'cause_id'] = 855

            bih = df['location_id'] == 44
            year_1985_1991 = df['year_id'].isin(
                [1985, 1986, 1987, 1988, 1989, 1990, 1991])
            df.loc[inj_war_execution & bih & year_1985_1991, 'cause_id'] = 855

            warnings.warn("BTL cancer recode needed")

        if self.source == "ICD10":
            irq = df['location_id'] == 143
            year_2008 = df['year_id'] == 2008
            df.loc[inj_war_execution & year_2008 & irq, 'cause_id'] = 855

        if self.source == "ICD9_detail":
            if ((df['location_id'] == 43) & (df['year_id'] == 1997)).any():
                warnings.warn("Albania homicide recode needed")

        if self.source == "ICD9_USSR_Tabulated":
            warnings.warn("Missing some homicide fixes for TJK, ARM here.")

        df = self.drop_leukemia_subtypes(df)

        if self.data_type_id in [1, 3, 5, 7]:
            maternal_causes = get_all_related_causes('maternal',
                                                     cause_metadata_df)
            injury_causes = get_all_related_causes('_inj', cause_metadata_df)
            maternal = df['cause_id'].isin(maternal_causes)
            inj = df['cause_id'].isin(injury_causes)
            df.loc[~(maternal | inj), 'cause_id'] = 919

            if self.data_type_id == 5:
                df.loc[~maternal, 'cause_id'] = 919

        return df
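# A minimal usage sketch (inputs hypothetical): `cause_meta_df` is a cached
# cause hierarchy and `df` holds deaths indexed by Recoder.id_cols.
#
#   recoder = Recoder(cause_meta_df=cause_meta_df, source="ICD10",
#                     code_system_id=1, data_type_id=9)
#   df = recoder.get_computed_dataframe(df)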
from cod_prep.downloaders.engine_room import (get_cause_map, get_package_map,
                                              get_cause_package_hierarchy)
from cod_prep.claude.hiv_maternal_pafs import HIVMatPAFs
from cod_prep.utils import (print_log_message, report_duplicates,
                            report_if_merge_fail)
from cod_prep.claude.squaring import Squarer
from cod_prep.claude.redistribution_variance import (
    dataset_has_redistribution_variance, RedistributionVarianceEstimator)

VA_DATA_TYPE = 8

POLICE_SURVEY_DATA_TYPE = [4, 5, 6, 7]

CC_CODE = 919

CONF = Configurator('standard')

MATERNAL_SQUARED = [
    "Mexico_BIRMM", "SUSENAS", "China_MMS_1996_2005", "China_MMS_2006_2012",
    "China_Child_1996_2012", "Other_Maternal"
]


def intify_cols(df):
    """Convert ids that should be integers."""
    df['cause_id'] = df['cause_id'].astype(int)
    df['age_group_id'] = df['age_group_id'].astype(int)
    df['nid'] = df['nid'].astype(int)
    return df

Example #6
class HIVMatPAFs(CodProcess):

    calc_cf_col = 'cf'
    all_cf_cols = ['cf', 'cf_raw', 'cf_corr', 'cf_rd']

    def __init__(self):
        self.configurator = Configurator('standard')
        self.cache_dir = self.configurator.get_directory('db_cache')
        self.maternal_hiv_props_path = \
            self.configurator.get_directory('maternal_hiv_props')
        # self.need_subnational_props = [51, 16, 86, 214, 165]

    def get_computed_dataframe(self, df, cause_meta_df, location_meta_df):
        restricted_maternal_df = \
            self.restrict_to_maternal_data(df, cause_meta_df)
        if restricted_maternal_df is None:
            # nothing to do if there is no maternal data to adjust
            return df
        appended_pafs = self.append_maternal_pafs(
            restricted_maternal_df.year_id.unique())
        # no longer need this step since new PAFs have been created
        # extra step to fix missing sub national proportions
        # appended_pafs = self.duplicate_national_props(appended_pafs, location_meta_df)
        merged_data = \
            self.merge_data_and_proportions(restricted_maternal_df,
                                            appended_pafs)
        percent_maternal = self.generate_percentages(merged_data)
        split_maternal = self.generate_splits(percent_maternal)
        hiv_cfs = self.create_maternal_hiv_cfs(split_maternal)
        cleaned = self.clean_adjusted_data(hiv_cfs)
        final = \
            self.append_adjusted_orig(df, restricted_maternal_df, cleaned)
        group_cols = [
            col for col in final.columns
            if col not in self.all_cf_cols and col not in ['sample_size']
        ]
        final = final.groupby(group_cols, as_index=False).agg({
            'sample_size': 'mean',
            'cf': 'sum',
            'cf_raw': 'sum',
            'cf_corr': 'sum',
            'cf_rd': 'sum'
        })
        return final

    def restrict_to_maternal_data(self, df, cause_meta_df):
        """Restrict incoming dataframe to only maternal data."""
        df = df.copy()
        # get age start and age end for maternal ages
        maternal_metadata = cause_meta_df.loc[cause_meta_df['cause_id'] == 366]
        age_start = maternal_metadata['yll_age_start']
        assert len(age_start) == 1
        age_start = age_start.iloc[0]
        age_end = maternal_metadata.yll_age_end
        assert len(age_end) == 1
        age_end = age_end.iloc[0]

        data = add_age_metadata(df,
                                add_cols=['simple_age'],
                                merge_col='age_group_id',
                                force_rerun=False,
                                block_rerun=True,
                                cache_results=False,
                                cache_dir=self.cache_dir)
        data.rename(columns={'simple_age': 'age'}, inplace=True)
        maternal_data = data.loc[(df['cause_id'] == 366)
                                 & (data['age'] >= age_start) &
                                 (data['age'] <= age_end) &
                                 (data['sex_id'] == 2) &
                                 (data['year_id'] >= 1980)]
        maternal_data.drop('age', axis=1, inplace=True)
        if len(maternal_data) == 0:
            return None
        else:
            return maternal_data

    def append_maternal_pafs(self, years):
        """Read in proportions."""
        props_list = []
        for year in years:
            year = int(year)
            props_path = "{}/maternal_hiv_props_{}.csv".format(
                self.maternal_hiv_props_path, year)
            props_list.append(pd.read_csv(props_path))
        props = pd.concat(props_list, ignore_index=True)
        props = props.rename(columns={'year': 'year_id'})
        return props

    def duplicate_national_props(self, props_df, loc_df):
        """Duplicate national proportions and fill sub national proportions.

        Note: necessary in countries that we are now modeling sub nationally,
        but since we weren't before there aren't any sub national proportions
        for maternal hiv (yet).
        """
        subnational = loc_df.loc[
            loc_df['level'] > 3,
            ['location_id', 'parent_id', 'level', 'path_to_top_parent']]

        # Russia sub nationals are level 5 while other countries are level 4
        subnational.loc[
            subnational['level'] == 5, 'parent_id'] = \
            subnational['path_to_top_parent'].str.split(',').str[3].astype(int)

        # only keep rows with the needed sub national locations
        subnational = subnational.loc[subnational['parent_id'].isin(
            self.need_subnational_props)]

        # drop level 4 sub national location_ids for Russia
        subnational = subnational.loc[~((subnational['parent_id'] == 62) &
                                        (subnational['level'] == 4))]
        subnational = subnational[['location_id', 'parent_id']]
        subnational.rename(columns={
            'location_id': 'child_location_id',
            'parent_id': 'location_id'
        },
                           inplace=True)

        # create sub national maternal_hiv proportions from national
        subnational = props_df.merge(subnational, on='location_id')
        subnational.drop('location_id', axis=1, inplace=True)
        subnational.rename(columns={'child_location_id': 'location_id'},
                           inplace=True)
        props_df = pd.concat([props_df, subnational])
        assert not props_df.duplicated().any(), 'please check maternal'\
            ' proportions, there are duplicates'
        return props_df

    def merge_data_and_proportions(self, data, props):
        """Merge restricted maternal data and proportions."""
        merged_data = data.merge(props,
                                 on=['location_id', 'age_group_id', 'year_id'],
                                 how='left')
        assert merged_data.notnull().values.all(), 'maternal proportions '\
            'were not successfully merged with incoming data'
        return merged_data

    def generate_percentages(self, df):
        """Create new 'pct_maternal column'.

        This is to prepare for calculating maternal hiv cause fractions
        """
        df['pct_maternal'] = 1 - df['pct_hiv'] - df['pct_maternal_hiv']
        df.loc[df['pct_maternal'].isnull(), 'pct_maternal'] = 1
        df.loc[df['pct_hiv'].isnull(), 'pct_hiv'] = 0
        df.loc[df['pct_maternal_hiv'].isnull(), 'pct_maternal_hiv'] = 0
        assert all(x > 0 for x in df['pct_maternal'])
        assert df[['pct_maternal', 'pct_hiv', 'pct_maternal_hiv'
                   ]].notnull().values.any(), 'there are missing percentages'
        assert all(
            abs(df['pct_maternal'] + df['pct_hiv'] + df['pct_maternal_hiv']) -
            1) < .0001
        # proportion of maternal that is aggravated by hiv
        # cannot be above 13% based on USERNAME's meta-analysis; otherwise
        # this would suggest the percentage of maternal deaths that were
        # hiv positive is >1
        assert (df['pct_maternal_hiv_vr'] <= .13).all()
        # maternal_hiv should not yet exist
        assert not (df['cause_id'] == 741).any()
        return df
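    # Hypothetical illustration of the math above: with pct_hiv = 0.05 and
    # pct_maternal_hiv = 0.02, pct_maternal = 1 - 0.05 - 0.02 = 0.93, so the
    # three proportions sum to 1 as asserted.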


    def generate_splits(self, df):
        """Create a column to indicate how the data should be split.

        (depends on source type)
        """
        df = add_nid_metadata(
            df,
            add_cols='data_type_id',
            block_rerun=True,
            cache_dir=self.cache_dir,
            force_rerun=False,
        )
        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
        df.loc[df['split_maternal'] == 0, 'pct_maternal'] = 1
        df.loc[df['split_maternal'] == 0,
               'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[df['split_maternal'] == 0, 'pct_hiv'] = 0
        df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
        return df

    def create_maternal_hiv_cfs(self, df):
        """Create cause fractions for maternal hiv."""
        df = df.copy()

        maternal_hiv_df = df.copy()
        maternal_hiv_df['cf'] = maternal_hiv_df['cf'] * \
            maternal_hiv_df['pct_maternal_hiv']
        maternal_hiv_df['cause_id'] = 741
        maternal_hiv_df['cf_raw'] = 0
        maternal_hiv_df['cf_corr'] = 0
        maternal_hiv_df['cf_rd'] = 0

        maternal_df = df.copy()
        maternal_df['cf'] = maternal_df['cf'] * maternal_df['pct_maternal']
        maternal_df['cause_id'] = 366
        df = pd.concat([maternal_hiv_df, maternal_df], ignore_index=True)

        return df

    def clean_adjusted_data(self, df):
        """Clean up adjusted data to add on to the original dataset.

        Add maternal_hiv to maternal, keep the maternal_hiv,
        split_maternal 0 observations and call them maternal
        """
        va_vr = df.loc[df['split_maternal'] == 0]
        if len(va_vr) > 0:
            assert set([741, 366]) == set(va_vr.cause_id.unique())
            va_vr = va_vr.loc[va_vr['cause_id'] != 366]
            va_vr['cause_id'] = 366
        df = pd.concat([df, va_vr], ignore_index=True)
        df = df.groupby([
            'nid', 'extract_type_id', 'location_id', 'year_id', 'site_id',
            'age_group_id', 'sex_id', 'sample_size', 'cause_id'
        ],
                        as_index=False)[self.all_cf_cols].sum()

        # it is possible that, using this method, cause fractions exceed 1.
        # this is meaningless and breaks noise reduction, so cap it
        # make sure that cf isn't something absurd, though
        assert (df['cf'] < 1.1).all()
        df.loc[df['cf'] > 1, 'cf'] = 1

        return df

    def append_adjusted_orig(self, orig, maternal_data, adjusted):
        """Remove original maternal data and append on adjusted."""
        data = orig.merge(maternal_data, how='left', indicator=True)
        data = data.loc[data['_merge'] != 'both']
        data.drop('_merge', axis=1, inplace=True)
        data = pd.concat([data, adjusted], ignore_index=True)
        return data
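# A minimal usage sketch (inputs hypothetical): `df` is cause-fraction data
# with the cf columns listed above; the hierarchies come from cached
# downloaders.
#
#   hiv_mat = HIVMatPAFs()
#   df = hiv_mat.get_computed_dataframe(df, cause_meta_df, location_meta_df)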
Example #7
class GBDCauseMapper(CodProcess):
    """Convert cause codes into cause_ids.

    Arguments:
        cause_set_version_id (int): version of the cause hierarchy
            used to validate the mapped causes
        code_map (DataFrame): cached map from code_id to cause_id
    Returns:
        df, a pandas DataFrame with addition of cause_id
        diag_df, a pandas DataFrame: assesses the difference
        between different mapping versions
    """

    id_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
               'age_group_id', 'sex_id', 'cause_id', 'code_id',
               'site_id']
    data_col = ['deaths']
    unique_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
                   'age_group_id', 'sex_id',
                   'cause_id', 'code_id', 'site_id']
    # These are acauses 'sub_total', and '_sb'
    unnecessary_causes = [920, 744]
    cache_dir = str()

    def __init__(self, cause_set_version_id, code_map):
        self.cg = Configurator("standard")
        self.cache_dir = self.cg.get_directory('db_cache')
        self.cause_set_version_id = cause_set_version_id
        self.code_map = code_map

    def get_computed_dataframe(self, df, code_system_id):
        """Map code id to cause id."""
        # make special cause adjustments
        df = self.special_cause_reassignment(df, code_system_id)

        print_log_message("Merging with cause map")
        # get code metadata from a file already cached
        df = add_code_metadata(
            df, ['cause_id'], code_system_id,
            code_map=self.code_map
        )
        report_if_merge_fail(df, 'cause_id', 'code_id')

        # Make sure the mappings are good!
        print("Asserting it's all good")
        self.assert_valid_mappings(df, code_system_id)
        df = self.drop_unnecessary_causes(df, self.unnecessary_causes)
        print("Collapsing")
        df = self.collapse_and_sum_by_deaths(df)
        return df

    def drop_unnecessary_causes(self, df, unnecessary_causes):
        # Drops causes set as unnecessary, subtotal and stillbirth
        df = df.copy()
        df = df[~df['cause_id'].isin(unnecessary_causes)]
        return df

    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        There are instances where a PI has good reason to
        believe that a certain group of deaths were assigned
        to the wrong cause, and it is known what cause to re-assign
        those deaths to. Implement here.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        It can also be used (sparingly) for hotfixes like
        changing all codes with values 'acause_digest_gastrititis'
        to be named 'acause_digest_gastritis'.

        Args:
            df (DataFrame): data with cause

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(
            df, 'source', **cache_args
        )

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(
                df, 'value', code_system_id=code_system_id,
                **cache_args
            )

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups")
            )
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(
                custom_grbg, 'code_id', code_system_id=code_system_id,
                merge_col='value', **cache_args
            )
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                # THIS QUERIES THE DATABASE - BUT THERE SHOULD NEVER BE A TON
                # OF SRS JOBS HAPPENING AT ONCE SO IT SHOULD BE OK
                gp_df = get_garbage_from_package(
                    code_system_id, package_id, package_arg_type="package_id"
                )
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
        # J96.00 - move five digit to the four digit J96.0 (this should be a
        # rule in formatting: only keep 4 digit detail)
        five_dig_code = df['code_id'] == 13243
        df.loc[
            china_cdc_2008 & five_dig_code,
            'code_id'
        ] = 13242

        return df

    def collapse_and_sum_by_deaths(self, df):
        """Group by final columns, summing across deaths.

        Directly modifies the dataframe, keeping only the columns needed
        to move on to the next Claude step. Also includes an assertion
        that there are no duplicates.
        """
        df = df.groupby(self.id_cols, as_index=False)[self.data_col].sum()
        self.assert_unique_cols_unique(df)
        return df

    def assert_valid_mappings(self, df, code_system_id):
        """Test that the mapping worked.

        Runs a suite of assertions to make sure that mapping was successful.
        Args:
            df (DataFrame): with at least code_id and cause_id
        Returns:
            None
        Raises:
            AssertionError: Any condition fails
        """
        # add code value from cached code map
        print("Adding value")
        df = add_code_metadata(
            df, ['value'], code_system_id,
            force_rerun=False,
            block_rerun=True,
            cache_dir=self.cache_dir
        )
        report_if_merge_fail(df, 'value', 'code_id')
        # get acause from cached cause hierarchy
        print("Adding acause")
        df = add_cause_metadata(
            df, ['acause'],
            cause_set_version_id=self.cause_set_version_id,
            force_rerun=False,
            block_rerun=True,
            cache_dir=self.cache_dir
        )
        report_if_merge_fail(df, 'acause', 'cause_id')

        # Test that all causes starting with 'acause_' are mapped correctly.
        # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
        # 'acause__gc_X59' should be mapped to '_gc', etc.
        print("Checking implied acauses")
        check_df = df.loc[df['value'].str.startswith('acause_')]
        check_df['implied_acause'] = \
            check_df['value'].str.replace('acause_', '', 1)

        check_df.loc[
            check_df['value'].str.contains("acause__gc"),
            'implied_acause'
        ] = "_gc"
        bad_df = check_df.loc[
            check_df['acause'] != check_df['implied_acause']
        ]
        if len(bad_df) > 0:
            bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
            raise AssertionError(
                "These code values do not match their acause: "
                "\n{}".format(bad_stuff)
            )

        print("Checking for bad values")
        # assert incorrect acauses are gone
        bad_acauses = ['acause_digest_gastrititis',
                       'acause_hiv_tb',
                       'acause_tb_drug']

        bad_df = df.loc[df['value'].isin(bad_acauses)].value.unique()
        if len(bad_df) > 0:
            raise AssertionError(
                "Found these bad code values in the data: {}".format(bad_df)
            )

    def assert_unique_cols_unique(self, df):
        """Test that columns that should uniquely identify the dataframe do."""
        assert not df.duplicated(self.unique_cols).any()
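# A minimal usage sketch (IDs hypothetical): `code_map` is the cached code map
# for the code system being processed.
#
#   mapper = GBDCauseMapper(cause_set_version_id=269, code_map=code_map)
#   df = mapper.get_computed_dataframe(df, code_system_id=1)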
Example #8
"""
07/04/2020: The N-code custom groups were incorrectly added to the mcause map
fixing that (and only that) here
"""
import re
import pandas as pd
from cod_prep.claude.configurator import Configurator

CONF = Configurator('standard')


def prep_all_inj_codes(code_system_id, mcod_map):
    '''
    Preps smaller ncode bins
    '''
    code_system_file_name_dict = {
        1: "Copy of 1-ICD10-mapping for X59 and Y34 -Jun-2019",
        6: "Copy of 2- -ICD9-mapping for X59 and Y34-Jun-2019"
    }

    filename = code_system_file_name_dict[code_system_id]
    df = pd.read_excel(
        f"/home/j/WORK/03_cod/01_database/mcod/injuries/maps_from_mohsen/{filename}.xlsx",
        sheet_name=1)

    df = df.loc[df["just for X59 and Y34"].str.contains("^Extern|^NN|^Unspeci",
                                                        flags=re.IGNORECASE,
                                                        regex=True)]

    df.rename(columns={
        "icd_name": "cause_description",
Example #9
import pandas as pd
import numpy as np
from cod_process import CodProcess
from cod_prep.claude.configurator import Configurator
from cod_prep.utils.misc import print_log_message
from cod_prep.utils import report_if_merge_fail
from cod_prep.downloaders.ages import get_cod_ages
from cod_prep.downloaders.engine_room import (get_cause_map,
                                              remove_five_plus_digit_icd_codes)

CONF = Configurator()


def remap_causes(df, remap_codes, target_dict):
    # check that all the remap codes are garbage; if not, that means
    # the mapping for these codes changed and we need to rethink this step
    assert (df.loc[df.code_id.isin(remap_codes)].cause_id == 743).all(), \
        'Map has changed, leukemia adjustment should only affect garbage'
    for code_id, age_group_ids in target_dict.items():
        df.loc[
            (df.code_id.isin(remap_codes)) &
            (df.age_group_id.isin(age_group_ids)),
            'code_id'
        ] = code_id
    return df
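# A minimal usage sketch (all IDs hypothetical, for illustration only):
# remap a garbage code onto two age-specific target codes.
#
#   target_dict = {100001: [2, 3, 4, 5], 100002: [6, 7, 8]}
#   df = remap_causes(df, remap_codes=[99999], target_dict=target_dict)
#
# Every code in remap_codes must still map to cause_id 743 (garbage),
# or the assertion above fails.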

def adjust_leukemia_subtypes(df, code_system_id, code_map_version_id):
    # will be remapping onto 4 different leukemia subtypes,
    # get the code ids to map to
    cause_map = get_cause_map(code_system_id, code_map_version_id=code_map_version_id,
                    force_rerun=False, block_rerun=True)
    cause_map = remove_five_plus_digit_icd_codes(cause_map, code_system_id=code_system_id)
Example #10
def __init__(self, df, cause_meta_df):
    self.df = df
    self.start_deaths = self.df.deaths.sum()
    self.conf = Configurator('standard')
    self.cause_meta_df = cause_meta_df
Example #11
class LRIRedistributor(CodProcess):
    """Used to redistribute a proportion of LRI deaths to tb_other"""
    lri_ages = [4, 5, 6, 7]
    id_cols = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'age_group_id',
        'sex_id', 'site_id', 'cause_id'
    ]

    def __init__(self, df, cause_meta_df):
        self.df = df
        self.start_deaths = self.df.deaths.sum()
        self.conf = Configurator('standard')
        self.cause_meta_df = cause_meta_df

    def get_computed_dataframe(self):
        # standardize the input data - anything that went through the injury corrector
        # has extra columns we don't need
        df = self.df[self.id_cols + ['deaths']]

        # use the lri deaths in the data and the proportions to create adjustment df
        adjust_df = self.get_adjust_df(df)
        # if no lri to move, exit
        if not len(adjust_df) > 0:
            return df
        moved_deaths = adjust_df.death_adjustment.sum()

        # scale down the existing lri deaths
        df = self.adjust_lri(df, adjust_df)

        # use the deaths we took from lri to scale up tb_other
        df = self.adjust_tb(df, adjust_df)

        # assert deaths unchanged and cleanup
        df = df[self.id_cols + ['deaths']]
        assert df.notnull().values.all()
        df = df.groupby(self.id_cols, as_index=False).deaths.sum()
        assert np.allclose(df.deaths.sum(), self.start_deaths), \
            'Total deaths have changed during LRI redistribution'
        print_log_message("Moved " + str(moved_deaths) +
                          " from LRI to tb_other")
        return df

    def get_adjust_df(self, df):
        lri_parent = self.cause_meta_df.loc[self.cause_meta_df.acause ==
                                            'lri']['cause_id'].unique()[0]
        lri_causes = get_all_related_causes(lri_parent,
                                            cause_meta_df=self.cause_meta_df)
        df = df.loc[(df.cause_id.isin(lri_causes))
                    & (df.age_group_id.isin(self.lri_ages))]
        # if no lri to move, exit
        if not len(df) > 0:
            return df
        # read in lri/tb redistribution proportions, proportions
        # are location/year specific and apply only to under 15, excluding 0-27 days
        prop_df = pd.read_csv(self.conf.get_resource('lri_tb_proportions'))
        prop_df = prop_df[['location_id', 'year_id', 'tb_prop']]
        df = df.merge(prop_df, on=['location_id', 'year_id'], how='left')
        report_if_merge_fail(df, 'tb_prop', ['location_id', 'year_id'])
        df['death_adjustment'] = df['deaths'] * df['tb_prop']
        df = df[self.id_cols + ['death_adjustment']]
        return df

    def adjust_lri(self, df, adjust_df):
        df = df.merge(adjust_df, on=self.id_cols, how='left')
        df['death_adjustment'] = df['death_adjustment'].fillna(0)
        df['deaths'] = df['deaths'] - df['death_adjustment']
        df.drop('death_adjustment', axis=1, inplace=True)
        return df

    def adjust_tb(self, df, adjust_df):
        # set adjust df cause id to 934, tb_other, and collapse deaths
        # we will add all the lri deaths to tb_other in the main df
        tb_other = self.cause_meta_df.loc[self.cause_meta_df.acause ==
                                          'tb_other']['cause_id'].unique()[0]
        adjust_df['cause_id'] = tb_other
        adjust_df = adjust_df.groupby(self.id_cols,
                                      as_index=False).death_adjustment.sum()
        # merge on to the data, possibility of creating new demographics for
        # tb_other that were not yet present in the data (the right only
        # merges)
        df = df.merge(adjust_df, on=self.id_cols, how='outer', indicator=True)
        df.loc[df._merge == 'both',
               'deaths'] = df['deaths'] + df['death_adjustment']
        df.loc[df._merge == 'right_only', 'deaths'] = df['death_adjustment']
        return df
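# A minimal usage sketch (inputs hypothetical): `df` must carry the id_cols
# plus a 'deaths' column; redistribution proportions are read from the
# configured 'lri_tb_proportions' resource.
#
#   redistributor = LRIRedistributor(df, cause_meta_df)
#   df = redistributor.get_computed_dataframe()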
Example #12
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000"""
    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        # initialize this to something crazy small, then adjust later when
        # nonzero floor file is read in
        self.min_possible_val = 1e-50

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.

        Make death rates and calculate the cf as if the rate were 2
        MADs below the "global" median. Every cause in the floor file
        is checked to ensure non-zero values in any non-restricted age-sex.
        So, just check and make sure there is something there for the cause,
        filling in zeroes where missing if the cause is present in the floor
        file (will break if there is a cause not present)
        """

        orig_cols = list(self.df.columns)
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]
        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)
        self.diag_df = self.df
        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))
        self.df = self.df[orig_cols]
        self.df = pd.concat([self.df, age_aggs])
        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        data_min_val = self.df[self.df > 0][self.cf_cols].min().min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        # there are so many checks before this that it would be very
        # surprising if this line does anything, but it's another round of
        # safety to make sure that cause fractions aren't being replaced
        # with null
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])
        self.min_possible_val = self.df.cf_replace.min()

    def replace_cf(self, check_cf_col):

        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']
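        # Worked illustration (numbers hypothetical): with floor = 1e-7,
        # population = 1e6, and mean_env = 1e4, a cf of 5e-6 gives a rate of
        # 5e-6 * 1e4 / 1e6 = 5e-8 < 1e-7, so the cf is raised to
        # cf_replace = 1e-7 * 1e6 / 1e4 = 1e-5.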

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def convert_nonzero_mad(self, df, cmdf):
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5,
            5: 6,
            10: 7,
            15: 8,
            20: 9,
            25: 10,
            30: 11,
            35: 12,
            40: 13,
            45: 14,
            50: 15,
            55: 16,
            60: 17,
            65: 18,
            70: 19,
            75: 20,
            80: 30,
            85: 31,
            90: 32,
            95: 235,
            91: 2,
            93: 3,
            94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)

        # make sure 2017-2018 are still missing
        missing_years = [2017, 2018]
        assert df.loc[df['year_id'].isin(
            missing_years)].floor.isnull().values.all()
        df = df.loc[~df['year_id'].isin(missing_years)]

        # We have determined that the floor is missing values for:
        # (1) certain cause/age/sexes in 2016 - we will use the 2015 floor to fill in these values
        # (2) certain cause/age/sexes across the entire time series - really nothing we
        # can do short of resetting the floor
        merge_cols = ['cause_id', 'age_group_id', 'sex_id']
        report_duplicates(df, merge_cols + ['year_id'])
        new_floor = pd.merge(df.loc[df.year_id == 2016].copy(),
                             df.loc[df.year_id == 2015].copy(),
                             how='outer',
                             on=merge_cols,
                             suffixes=('', '_2015'))
        new_floor = new_floor.fillna({'floor': new_floor['floor_2015']})\
            .loc[:, merge_cols + ['year_id', 'floor']]
        df = pd.concat([df.loc[df.year_id != 2016], new_floor],
                       ignore_index=True, sort=True)
        # If anything else is still missing, make sure it's missing for the entire time
        # series - otherwise we should write something more sophisticated to fill it in
        assert df.assign(floor_null=df.floor.isnull())\
            .groupby(merge_cols + ['floor_null'])['year_id'].apply(
            lambda x: set(x) == set(range(1980, 2017))).all()

        # copy 2016 to 2017, 2018
        for year in missing_years:
            df = pd.concat(
                [df, df.loc[df.year_id == 2016].copy().assign(year_id=year)],
                ignore_index=True)

        # Due to age restriction changes since last round, we now have data in cause/age
        # groups where we had no floor in GBD 2017
        # Add in a nonzero floor created based on GBD 2019 data for these cause/age groups
        new_cause_ages = pd.read_csv(self.conf.get_resource('nonzero_floor_new_age_restrictions'))\
            .drop('borrow_age_group_id', axis='columns')
        assert new_cause_ages.notnull().values.all()
        df = pd.concat([df, new_cause_ages], sort=True)

        # no duplicates
        df = df.loc[df.floor.notnull()]
        report_duplicates(df,
                          ['year_id', 'cause_id', 'age_group_id', 'sex_id'])

        return df

    def fill_na_floors(self, df):
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        nonzero_mad = pd.read_csv(self.conf.get_resource("nonzero_floor_mad"))
        nonzero_mad = self.convert_nonzero_mad(nonzero_mad, cmdf)
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
        if self.df.floor.isnull().any():
            self.df = self.df.groupby(['year_id', 'sex_id',
                                       'cause_id']).apply(self.fill_na_floors)
        if self.df.floor.isnull().any():
            trouble_causes = self.df[self.df.floor.isnull()].cause_id.unique()
            filler = np.median(self.df[~self.df.floor.isnull()].floor)
            print_log_message("using nonzero filler because"
                              " of these causes: {}".format(trouble_causes))
            self.df.floor = self.df.floor.fillna(filler)
        self.df.loc[self.df.cause_id == 975, 'floor'] = 1e-50

        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()
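# A minimal usage sketch (inputs hypothetical): pop_df and env_df are cached
# population and mortality-envelope frames, cmdf a cause hierarchy.
#
#   floorer = NonZeroFloorer(df)
#   df = floorer.get_computed_dataframe(pop_df, env_df, cmdf)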
import pandas as pd
import numpy as np
from cod_prep.downloaders.ages import get_ages
from cod_prep.downloaders import get_cause_map
from cod_prep.claude.formatting import finalize_formatting
from cod_prep.claude.configurator import Configurator
from cod_prep.utils import get_adult_age_codebook
from cod_prep.utils import get_infant_age_codebook
from cod_prep.utils import map_gbd2016_disagg_targets

CONF = Configurator('standard')

rdp_path = CONF.get_resource('rdp_frac_path')

ID_COLS = [
    'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', 'data_type_id',
    'representative_id', 'code_system_id', 'code_id', 'site'
]
INT_COLS = [col for col in ID_COLS if 'id' in col]
VALUE_COL = ['deaths']
FINAL_FORMATTED_COLS = ID_COLS + VALUE_COL
WRITE = False
YEARS = [2007, 2013]


def read_data(year):
    year_to_file = {2007: 'FILEPATH', 2013: 'FILEPATH'}
    df = pd.read_excel(f'FILEPATH/{year}/{year_to_file[year]}')
    if year == 2007:
        df = df.drop([f'Unnamed: {col_num}' for col_num in range(80, 83)],
                     axis='columns')
Example #14
class BridgeMapper(CodProcess):
    """Replace acauses with those in the bridge map.

    Arguments:
        source (str)
        cause_set_version_id (int)
        code_system (str)
    Returns:
        df, pandas DataFrame: only change is replacing some cause_ids
        diag_df, pandas DataFrame: shows which cause_ids have been changed
    """

    id_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
               'age_group_id', 'sex_id', 'cause_id',
               'site_id']
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    # data type id for verbal autopsy
    VA = 8

    def __init__(self, source, cause_meta_df, code_system):
        self.source = source
        self.code_system = code_system
        self.conf = Configurator("standard")
        self.bridge_map_path = self.conf.get_resource('bridge_map')
        self.cause_meta_df = cause_meta_df
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': 'standard'
        }

    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()

        if self.needs_bridging(has_verbal_autopsy):
            sheet_name = self.get_sheet_name(has_verbal_autopsy)
            map_df = pd.read_excel(self.bridge_map_path,
                                   sheet_name=sheet_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)
            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df

    def needs_bridging(self, has_verbal_autopsy):
        """Check data type and source to see if the bridge map is needed."""
        sources_to_bridge_map = [
            "India_SCD_states_rural", "India_CRS",
            "India_MCCD_states_ICD9", "India_MCCD_states_ICD10",
            "India_Maharashtra_SCD", "India_MCCD_Orissa_ICD10",
            "India_MCCD_Delhi_ICD10", "ICD9_BTL", "Russia_FMD_1989_1998",
            "China_1991_2002", "ICD9_USSR_Tabulation", "ICD10_tabulated",
            "Thailand_Public_Health_Statistics", "India_SRS_states_report",
            "ICD8A", "UKR_databank_ICD10_tab", "Russia_FMD_ICD9",
            'Iran_Mohsen_special_ICD10'
        ]

        return has_verbal_autopsy or (self.source in sources_to_bridge_map)

    def get_sheet_name(self, has_verbal_autopsy):
        """Determine the sheet name needed based on the source.

        Note: The default sheet name will be the name of the code system,
        with some exceptions.
        """
        source_to_sheet = {
            "India_MCCD_Orissa_ICD10": "India_MCCD_states_ICD10",
            "India_MCCD_Delhi_ICD10": "India_MCCD_states_ICD10",
            "Thailand_Public_Health_Statistics": "ICD10_tabulated",
            "India_SRS_states_report": "India_SRS_states_report",
            "UKR_databank_ICD10_tab": "ICD10_tabulated",
            "Russia_FMD_ICD9": "Russia_FMD_1989_1998",
            "Iran_Mohsen_special_ICD10": "Iran_Mohsen_special_ICD10"
        }
        if has_verbal_autopsy and (self.source != 'India_SRS_states_report'):
            sheet_name = 'INDEPTH_ICD10_VA'
        elif self.source in source_to_sheet.keys():
            sheet_name = source_to_sheet[self.source]
        else:
            sheet_name = self.code_system
        return sheet_name

    def acause_to_bridge_code(self, df):
        """Replace the acause with the bridge code."""
        df['swap'] = 0
        df.loc[
            (df['acause'] != df['bridge_code']) &
            (df['bridge_code'].notnull()),
            'swap'
        ] = 1
        df.loc[df['swap'] == 1, 'acause'] = df['bridge_code']
        self.causes_not_in_bridge_map(df)
        return df

    def causes_not_in_bridge_map(self, df):
        """Print causes that aren't in the bridge map, but are in the data."""
        check = set(df.loc[df['bridge_code'].isnull(), 'acause'])
        if len(check) > 0:
            print("These acauses are not in the bridge map: {}".format(check))

    def get_diagnostic_dataframe(self):
        if getattr(self, 'diag_df', None) is None:
            print("No run of get computed dataframe yet")
        else:
            return self.diag_df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df
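For reference, a minimal sketch of the swap that acause_to_bridge_code performs, using made-up acauses and bridge codes rather than a real bridge map:

import pandas as pd

df = pd.DataFrame({
    'acause': ['neo_liver', 'cvd_ihd', 'inj_war'],
    'bridge_code': ['neo_liver', 'cvd', None],
})
df['swap'] = 0
df.loc[
    (df['acause'] != df['bridge_code']) & df['bridge_code'].notnull(),
    'swap'
] = 1
df.loc[df['swap'] == 1, 'acause'] = df['bridge_code']
# 'cvd_ihd' collapses to its bridge code 'cvd'; 'inj_war' has no bridge
# code, so it is left alone and would be reported as missing from the map
print(df)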
Example #15
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000"""
    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        self.min_possible_val = 1e-50

    def _check_all_floors_exist(self, nzf_df):
        ''' Check that all expected cancers, ages, and years are present and
            have nonzero floor values.
        '''
        def _remove_ages_less_than(a, b):
            '''Drop age_group_ids below the start age group b, but keep the
            under-1 groups (2, 3, 4) in the list when b is 5.
            '''
            orig_list = a.copy()
            for val in orig_list:
                if b == 5 and val in [2, 3, 4]:
                    continue
                if val < b:
                    a.remove(val)
            return a

        print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
        # create cause_list
        db_link = cdb.db_api(db_connection_name='cancer_db')
        gbd_id = utils.get_gbd_parameter('current_gbd_round')
        registry_entity = db_link.get_table('registry_input_entity')
        registry_entity = registry_entity.loc[
            registry_entity['gbd_round_id'].eq(gbd_id)
            & registry_entity['is_active'].eq(1), ]
        cancer_metadata = registry_entity[[
            'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
        ]]
        causes_checklist = registry_entity['acause'].unique().tolist()

        # exceptions for nonzero floors
        causes_checklist.remove('neo_nmsc_bcc')
        causes_checklist.remove('neo_ben_intest')
        causes_checklist.remove('neo_ben_utr')
        causes_checklist.remove('neo_ben_other')
        causes_checklist.remove('neo_ben_brain')
        causes_checklist.remove('_gc')

        # create year_list
        year_start = utils.get_gbd_parameter('min_year_cod')
        year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
        year_checklist = list(range(year_start, year_end))

        # sex & age_group_id checklists
        age_id_checklist = [
            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
            32, 235, 2, 3, 4
        ]  # age_group_ids covering ages 0-95+
        sex_checklist = [1, 2]

        # print any causes/years/sexes that are expected and missing
        for cancer in causes_checklist:
            print('working on...{}'.format(cancer))
            subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
            age_start = int(
                cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                    'yll_age_start'])
            # convert age in years to a GBD age_group_id (5 -> 6, 10 -> 7, ...)
            age_start = (age_start // 5) + 5
            if len(subset) == 0:
                print('MISSING CAUSE... {} '.format(cancer))
            missing_ages = set(age_id_checklist) - set(
                subset['age_group_id'].unique().tolist())
            missing_ages = list(missing_ages)
            missing_ages = _remove_ages_less_than(missing_ages, age_start)
            if len(missing_ages) > 0:
                print('missing the following ages for {}: {}'.format(
                    cancer, missing_ages))
            missing_sexes = set(sex_checklist) - set(
                subset['sex_id'].unique().tolist())
            if len(missing_sexes) > 0:
                print('missing the following sexes for {}: {}'.format(
                    cancer, missing_sexes))
            missing_years = set(year_checklist) - set(
                subset['year_id'].unique().tolist())
            if len(missing_years) > 0:
                print('missing the following years for {}: {}'.format(
                    cancer, missing_years))
        return

    def format_nzf(self, nzf_df, cmdf):
        '''Attach acause names to the nonzero-floor values via cause_id.'''
        # merge acause column
        nzf_df = pd.merge(nzf_df,
                          cmdf[['acause', 'cause_id']],
                          on='cause_id',
                          how='left')
        return nzf_df

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.
        """

        orig_cols = list(self.df.columns)
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]
        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)
        self.diag_df = self.df
        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))
        self.df = self.df[orig_cols + ['rate', 'floor']]
        self.df = self.df.append(age_aggs)
        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        data_min_val = self.df.loc[self.df['cf_final'] > 0, 'cf_final'].min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def convert_nonzero_mad(self, df, cmdf):
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5,
            5: 6,
            10: 7,
            15: 8,
            20: 9,
            25: 10,
            30: 11,
            35: 12,
            40: 13,
            45: 14,
            50: 15,
            55: 16,
            60: 17,
            65: 18,
            70: 19,
            75: 20,
            80: 30,
            85: 31,
            90: 32,
            95: 235,
            91: 2,
            93: 3,
            94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)

        return df

    def compile_nonzero_floor(self, cmdf):
        '''
        For GBD2019, new floor values were generated for cancer causes that had
        updated age restrictions or were newly modeled. This function takes
        the original nonzero floor values and appends all updated values.
        '''
        work_dir = utils.get_path(process='cod_mortality',
                                  key='nonzero_floor_workspace')
        orig_nzf = pd.read_csv(
            utils.get_path(process='cod_mortality', key='orig_nonzero_file'))

        # convert age_group_ids to comply with GBD's
        formatted_orig_nzf = self.convert_nonzero_mad(orig_nzf, cmdf)

        # load nonzero floor values with new age restrictions, and that were new causes
        # for this GBD cycle
        new_age_rstrct_df = pd.read_csv(
            '{}/nonzero_floor_new_age_restrictions.csv'.format(work_dir))
        new_causes_df = pd.read_csv(
            '{}/nonzero_new_causes.csv'.format(work_dir))

        # append all nonzero values together
        comp_nzf = formatted_orig_nzf.append(new_age_rstrct_df)
        comp_nzf = comp_nzf.append(new_causes_df)

        return comp_nzf

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])

    def replace_cf(self, check_cf_col):
        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def fill_na_floors(self, df):
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        # load nonzero floor values
        nonzero_mad = self.compile_nonzero_floor(cmdf)
        nonzero_mad = self.format_nzf(nonzero_mad, cmdf)
        self._check_all_floors_exist(
            nonzero_mad)  # checks that all age_groups/cancer/year/sex exist
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.min_possible_val = nonzero_mad['floor'].min()
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
        # ensure no floor values are missing
        assert not self.df.floor.isnull().any(), "null floor values exist"
        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()
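A self-contained sketch of the floor logic in make_replace_cf and replace_cf above, with hypothetical numbers: a positive cause fraction whose implied mortality rate falls below the floor is raised to the rate-equivalent of the floor.

import pandas as pd

df = pd.DataFrame({
    'cf_final': [1e-9, 5e-4],        # cause fractions
    'population': [1e6, 1e6],
    'mean_env': [8000.0, 8000.0],    # all-cause mortality envelope
    'floor': [1e-7, 1e-7],           # minimum plausible death rate
})
df['rate'] = df['cf_final'] * df['mean_env'] / df['population']
df['cf_replace'] = df['floor'] * df['population'] / df['mean_env']
needs_floor = (df['cf_final'] > 0) & (df['rate'] < df['floor'])
df.loc[needs_floor, 'cf_final'] = df['cf_replace']
# row 0 is floored (rate 8e-9 < 1e-7); row 1 is untouched (rate 4e-6)
print(df[['cf_final', 'rate']])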
Example #16
class Recoder(CodProcess):
    """Move deaths from one thing to another based on expert opinon."""

    id_cols = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'age_group_id',
        'sex_id', 'cause_id', 'site_id'
    ]
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    def __init__(self, cause_meta_df, source, code_system_id, data_type_id):
        self.source = source
        self.code_system_id = code_system_id
        self.data_type_id = data_type_id
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")
        self.vr_indicators_path = self.conf.get_resource('vr_indicators')
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': self.conf.get_directory('db_cache')
        }

    def get_computed_dataframe(self, df):
        """Return computations."""

        # this method is de-activated until we establish how data drops
        # will be executed (new preference is through not uploading them or
        # running them through noise reduction)
        # df = self.drop_low_quality_data(self.df)
        if 'data_type_id' not in df.columns:
            df = add_nid_metadata(df, "data_type_id", **self.cache_options)
        df = self.recode(df)
        df = self.conform_secret_causes(df)
        df = self.clean_up(df)

        return df

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        pass

    def recode_sids(self, df):
        # SIDS in under-4-star locations needs to be recoded to neonatal 02/26/18
        path_to_4_stars_sheet = self.conf.get_resource("four_star_locations")
        four_five_star_locs = pd.read_csv(path_to_4_stars_sheet)
        four_five_star_locs = four_five_star_locs[['location_id']]
        four_five_star_locs = four_five_star_locs.location_id.unique()
        less_than_four_star = ~df['location_id'].isin(four_five_star_locs)
        is_sids = df['cause_id'] == 686
        df.loc[is_sids & less_than_four_star, 'cause_id'] = 380
        return df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df

    def conform_secret_causes(self, df):
        """Remove secret causes and conform to reporting cause hierarchy."""
        # set parent_id = 723 if cause is "inj_suicide_pesti",
        # "inj_suicide_fire", or "inj_suicide_hang"
        df = add_cause_metadata(df,
                                add_cols=['secret_cause', 'parent_id'],
                                cause_meta_df=self.cause_meta_df,
                                **self.cache_options)
        injuries_replace_parents = [722, 720, 719]
        replaced_injuries = df['cause_id'].isin(injuries_replace_parents)
        df.loc[replaced_injuries, 'parent_id'] = 723
        secret_causes = df['secret_cause'] == 1
        not_cc_code = df['cause_id'] != 919
        len_before = len(df)
        if df['parent_id'].isnull().values.any():
            raise AssertionError('There are missing parent cause_ids')
        df.loc[secret_causes & not_cc_code, 'cause_id'] = df['parent_id']
        len_after = len(df)
        if len_before != len_after:
            raise AssertionError(
                'The length of the dataframe has changed from {} to {}'.format(
                    len_before, len_after))
        df.drop(['parent_id', 'secret_cause'], axis=1, inplace=True)
        return df

    def drop_leukemia_subtypes(self, df):
        """Remove leukemia subtypes deaths created by redistribution.

        Deaths that are created in redistribution for leukemia subtypes should
        be recoded to the parent leukemia.
        """
        leuk_subtypes = get_all_related_causes('neo_leukemia',
                                               self.cause_meta_df)

        # remove parent leukemia cause_id
        leuk_subtypes.remove(487)

        df.loc[(df['cause_id'].isin(leuk_subtypes)) & (df['deaths_rd'] > 0) &
               (df['deaths_raw'] <= 0), 'cause_id'] = 487

        return df

    def recode(self, df):
        """Recode based on expert judgement.
        """
        cause_metadata_df = self.cause_meta_df
        cause_metadata_df = cause_metadata_df[[
            "cause_id", "path_to_top_parent", "acause"
        ]]
        # recode ckd except for ckd_other to cong_other in neonates
        ckd_cause_ids = get_all_related_causes('ckd', cause_metadata_df)
        ckd_cause_ids.remove(593)
        ckd_less_other = df['cause_id'].isin(ckd_cause_ids)
        neonate = df['age_group_id'].isin([2, 3])
        df.loc[ckd_less_other & neonate, 'cause_id'] = 652

        # recode resp_copd, resp_asthma, resp_other, resp_interstitial to lri
        # in neonates
        resp_ids = [509, 515, 516, 520]
        is_cert_resp_causes = df['cause_id'].isin(resp_ids)
        # neonate already defined
        df.loc[is_cert_resp_causes & neonate, 'cause_id'] = 322

        # recode resp_asthma to lri in perinates
        is_asthma = df['cause_id'] == 515
        df.loc[is_asthma & (df['age_group_id'] == 4), 'cause_id'] = 322

        # Drop any maternal cause below age 10 and above age 55
        # (recode to cc_code)
        maternal_cause_ids = get_all_related_causes(366, cause_metadata_df)
        maternal_cause_ids = df['cause_id'].isin(maternal_cause_ids)
        # ages not in the maternal age range
        non_maternal_ages = np.logical_not(df['age_group_id'].isin(
            [7, 8, 9, 10, 11, 12, 13, 14, 15, 22]))
        df.loc[maternal_cause_ids & non_maternal_ages, 'cause_id'] = 919

        # Drop alzheimers below age 40 to (recode to cc_code)
        # dementia cause_id = 543
        alzheimers = df['cause_id'] == 543
        under_40 = df['age_group_id'].isin(range(1, 13, 1))
        df.loc[alzheimers & under_40, 'cause_id'] = 919

        # Recode congenital causes to cc_code in ages over 70
        # (stata: substr(acause, 1, 4) == "cong")
        cong_causes = get_all_related_causes('cong', cause_metadata_df)
        congenital = df['cause_id'].isin(cong_causes)
        over_70 = df['age_group_id'].isin([19, 20, 30, 31, 32, 235])
        df.loc[congenital & over_70, "cause_id"] = 919

        # Recode neonatal-aged hepatitis
        # (and all sub-causes) to neonatal_hemolytic
        # except ICD9_USSR_Tabulated and ICD10_tabulated
        # Recode neonatal-aged hepatitis (and all sub-causes) to neonatal
        # if source is ICD9_USSR_Tabulated or ICD10_tabulated
        hepatitis = get_all_related_causes(400, cause_metadata_df)
        hepatitis = df['cause_id'].isin(hepatitis)
        if self.code_system_id in [7, 9]:
            df.loc[hepatitis & neonate, "cause_id"] = 380
        else:
            df.loc[hepatitis & neonate, "cause_id"] = 384

        # inj_disaster_light to inj_othunintent 2/07/18
        inj_disaster_light = df['cause_id'] == 984
        df.loc[inj_disaster_light, 'cause_id'] = 716

        # ckd diabetes type to ckd all but icd10 2/07/18
        # added ICD9_detail to exception 5/15/18
        if self.code_system_id not in [1, 6]:
            ckd_diabetes = df['cause_id'].isin([997, 998])
            df.loc[ckd_diabetes, 'cause_id'] = 589

        # Removing diabetes remap 7/2/2019 - want to use the results of the new
        # unspecified diabetes regression for everything
        # # diabetes subtypes to parent all but icd10 2/07/18
        # # added ICD9_detail, ICD10_tab to exception 5/15/18
        # if self.code_system_id not in [1, 6, 9]:
        #     diabetes_subtypes = df['cause_id'].isin([975, 976])
        #     df.loc[diabetes_subtypes, 'cause_id'] = 587

        # diabetes to type 1 under 15 everywhere 2/07/18
        diabetes_type_2 = df['cause_id'] == 976
        under_15 = df['age_group_id'] < 8
        df.loc[diabetes_type_2 & under_15, 'cause_id'] = 975

        # nutrition iron and iodine to zz every data 2/07/18
        iron_or_iodine = df['cause_id'].isin([388, 390])
        df.loc[iron_or_iodine, 'cause_id'] = 919

        # cvd_ihd move to cong_heart  in under one year 2/07/18
        under_1 = df['age_group_id'] < 5
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & under_1, 'cause_id'] = 643

        if 686 in df.cause_id.unique():
            df = self.recode_sids(df)

        # Need to map _neo, _mental, _infect
        # etc to cc code 2/07/18
        df.loc[df.cause_id.isin([344, 409, 410, 542, 558, 669, 680, 961]),
               'cause_id'] = 919
        # usually we also have to map _inj to cc_code, but in some VA we have
        # other sources for splitting _inj we do not move to cc_code 3/26/2018
        if self.data_type_id not in [6, 7, 8]:
            df.loc[df['cause_id'] == 687, 'cause_id'] = 919

        # cvd_ihd to cvd_other in ages 1 to 14 years 2/07/18 bridge map
        one_to_14 = df['age_group_id'].isin([5, 6, 7])
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & one_to_14, 'cause_id'] = 507
        # TODO test if the distinction between this and the above is necessary,
        # e.g. would the bridge map already map neonatal_hemolytic to neonatal?

        # Do shared cancer recodes (previously in cancer_recodes.do)
        cancer_recodes = get_all_related_causes([
            411, 414, 423, 426, 429, 432, 435, 438, 441, 444, 450, 453, 456,
            459, 462, 465, 468, 474, 486, 483
        ], cause_metadata_df)
        cancer_recodes = df['cause_id'].isin(cancer_recodes)
        cancer_ages = df['age_group_id'].isin(range(2, 8, 1))
        df.loc[cancer_recodes & cancer_ages, "cause_id"] = 489

        not_icd10 = self.code_system_id != 1
        neo_meso = df['cause_id'] == 483
        df.loc[neo_meso & not_icd10, "cause_id"] = 489

        # Recode digest_hernia to cc_code if source is Ethiopia_AAMSP
        # added Ethiopia_subnational_AAMSP in GBD2017
        if self.source.endswith("AAMSP"):
            digest_hernia = df['cause_id'].isin([531])
            df.loc[digest_hernia, "cause_id"] = 919

        # in these years we split a garbage of homicide/suicide to
        # their causes proportionally, now we want to recode the years
        # that we don't want to use in the homicide/suicide model.
        if self.source == "Iran_Mohsen_special_ICD10":
            homicide_and_suicide = df['cause_id'].isin(
                [724, 725, 726, 727, 941, 718, 719, 720, 721, 722, 723])
            bad_years = df['year_id'].isin(range(2007, 2015))
            # _unintent
            df.loc[bad_years & homicide_and_suicide, "cause_id"] = 919

        # Recode war subcauses to inj_homicide in Jamaica 2005 VR
        inj_war = get_all_related_causes(945, cause_metadata_df)
        is_inj_war = df['cause_id'].isin(inj_war)
        jamaica = df['location_id'] == 115
        year_2005 = df['year_id'] == 2005
        vr = df['data_type_id'] == 9
        df.loc[is_inj_war & jamaica & year_2005 & vr, 'cause_id'] = 724

        # Recode inj_mech_gun to inj_homicide for Jamaica 2006 VR
        # "In ICD10 2005 there a large number of deaths due to
        # homicides, but in 2006 many of these deaths have moved to
        # unintentional firearms.
        # 2006 is missing homicides deaths. USERNAME wants to move deaths from
        # unintentional firearms to homicides."
        inj_mech_gun = df['cause_id'] == 705
        year_2006 = df['year_id'] == 2006
        df.loc[inj_mech_gun & year_2006 & jamaica & vr, 'cause_id'] = 724

        # Recode digest_ibd to digest for Suriname 1995-2012 ICD10
        # "Because NR has a very bad effect on IBD in Suriname, please recode
        # all data from 1995-2012 (ICD10) for "digest_ibd" to "digest" in
        # Suriname and keep them in the recoding list for every upload"
        # TODO should this be more years than just 2012? like all of ICD10?
        if self.source == "ICD10":
            digest_ibd = df['cause_id'] == 532
            suriname = df['location_id'] == 118
            year_1995_2012 = df['year_id'].isin(range(1995, 2013, 1))
            df.loc[digest_ibd & suriname & year_1995_2012, 'cause_id'] = 526

        # Recode endo_procedural to inj_homicide, writ-large
        # "GBD2013 HACK: USERNAME and USERNAME want Endo-procedural
        # to go to inj_medical just for this round.
        # In GBD2014 it will go to endo"
        endo_procedural = df['cause_id'] == 624
        df.loc[endo_procedural, 'cause_id'] = 708

        # Recode Schizophrenia to cc_code in Tibet - USERNAME's reason:
        # "Because have very bad effect in Noise Reduction"
        schizo = df['cause_id'] == 559
        tibet = df['location_id'] == 518
        df.loc[schizo & tibet, 'cause_id'] = 919

        # Recode HIV and all sub-causes before 1980 to cc_code, writ-large
        hiv = get_all_related_causes(298, cause_metadata_df)
        hiv = df['cause_id'].isin(hiv)
        pre_1980 = df['year_id'] < 1980
        df.loc[hiv & pre_1980, 'cause_id'] = 919

        # Recode diabetes and all sub-causes to neonatal, if age is neonatal
        # "2-Any death assigned to Diabetes in neonatal period (age 0-28 days)
        # in all data format (Except ICD9 and ICD10 detail) including all MCCD,
        # DSP , Russia format, VA have to recode to the neonatal death" -USERNAME
        # TODO this should be an age restriction for GBD not a recode
        # TODO implement
        diabetes_causes = get_all_related_causes(587, cause_metadata_df)
        diabetes = df['cause_id'].isin(diabetes_causes)
        df.loc[neonate & diabetes, 'cause_id'] = 380

        # Recode cvd_stroke and all subcauses to cvd
        # in Verbal Autopsy under 20 years
        # "Any death in VA and SCD that assigned to the Stroke
        # in under age 20 years have to recode to all CVD"
        # Not done in bridge map; stata code does this for all VA
        # despite SCD comment.
        under_20 = df['age_group_id'].isin(range(0, 8, 1))
        stroke = get_all_related_causes('cvd_stroke', cause_metadata_df)
        stroke_deaths = df['cause_id'].isin(stroke)
        va = df['data_type_id'] == 8
        # cvd cause_id is 491
        df.loc[under_20 & stroke_deaths & va, 'cause_id'] = 491

        # Recode inj_trans_road_pedal to cc_code if age over 95, for everything
        # USERNAME request 1/20/2017 "remove inj_trans_road_pedal for over
        # 95 in all countries and years"
        # TODO should this be an age restriction? questionable...
        over_95 = df['age_group_id'] == 235
        inj_trans_road_pedal = df['cause_id'] == 691
        df.loc[over_95 & inj_trans_road_pedal, 'cause_id'] = 919

        # Recode mental_schizo to _mental everywhere
        # "USERNAME request 1/31/2017 to get rid of all mental_schizo as a cause
        # of death and map to _mental"
        # TODO implement
        # TODO should this be yld_only, then? questionable...
        # TODO if maintaining this, don't need restriction restricting
        # mental_schizo to cc_code in Tibet
        df.loc[schizo, 'cause_id'] = 919

        # Recode msk and all sub-causes to cc_code in all VA
        # "USERNAME and USERNAME request 2/14/2017 "msk recode to cc_code for all
        # VA and SRS"
        # this is in the bridge map already

        # Recode cvd_pvd to cvd in Russia_FMD_1999_2011
        # Russia 1999 2011 has a weird outlier for pvd, should be cvd according
        # to USERNAME 02/13/2017
        # TODO implement
        if self.source == "Russia_FMD_1999_2011":
            cvd_pvd = df['cause_id'] == 502
            df.loc[cvd_pvd, 'cause_id'] = 491

        # USERNAME said to remove this following recode 2/26/2018
        # # In all VR USERNAME wants to move mental_drug deaths in under 15
        # # to unintentional poisoning. -USERNAME 7/8/2015
        # # cause_id 562 (mental_drug_opioids) has different age restrictions,
        # # so recode it separately
        # mental_causes_no_op = df['cause_id'].isin(
        #     [560, 561, 563, 564, 565, 566]
        # )
        # mental_no_op_ages = df['age_group_id'].isin(range(2, 8, 1))
        # df.loc[mental_causes_no_op & mental_no_op_ages & vr, 'cause_id'] = 700

        # mental_op = df['cause_id'] == 562
        # mental_op_ages = df['age_group_id'].isin([4, 5, 6, 7])
        # df.loc[mental_op & mental_op_ages & vr, 'cause_id'] = 700

        # Temp fix for self imposed redistribution error
        # move suicide and homicide in these years to cc_code
        if self.source == "Iran_Mohsen_special_ICD10":
            sui_homi_causes = [
                717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 941
            ]
            sui_homi = df['cause_id'].isin(sui_homi_causes)
            bad_years = df['year_id'].isin(range(2007, 2015))
            df.loc[sui_homi & bad_years, 'cause_id'] = 919

        # In India MCCD neonatal sepsis should only be in under 1 month
        if "India_MCCD" in self.source:
            non_neonates = np.logical_not(df['age_group_id'].isin([2, 3]))
            neonatal_sepsis = df['cause_id'].isin([])
            df.loc[non_neonates & neonatal_sepsis, 'cause_id'] = 380

        # In India_SCD_states_rural we are trying to get rid of all the
        # redistribution artifacts
        if self.source == "India_SCD_states_rural":
            warnings.warn("Implement SCD rd artifact recode")

        # Recoding state actor violence to war for proper shocks tracking:
        # in ICD9_BTL & ICD10, inj_war_execution > inj_war_war in Ecuador '80-'90
        inj_war_execution = df['cause_id'] == 854

        if self.source == "ICD9_BTL":
            ecuador = df['location_id'] == 122
            year_1980_1990 = df['year_id'].isin(range(1980, 1991, 1))
            df.loc[inj_war_execution & ecuador & year_1980_1990,
                   'cause_id'] = 855

            # inj_war_execution > inj_war_war for BIH from 1985-91
            bih = df['location_id'] == 44
            year_1985_1991 = df['year_id'].isin(
                [1985, 1986, 1987, 1988, 1989, 1990, 1991])
            df.loc[inj_war_execution & bih & year_1985_1991, 'cause_id'] = 855
            # in icd9_btl there are cancer recodes to be implemented here
            warnings.warn("BTL cancer recode needed")

        if self.source == "ICD10":
            irq = df['location_id'] == 143
            year_2008 = df['year_id'] == 2008
            df.loc[inj_war_execution & year_2008 & irq, 'cause_id'] = 855

        # USERNAME said cirrhosis and hepatitis in India SRS did not go very well (5/26/19)
        # "Move any death from SRS in the final stage due to cirrhosis to hepatitis in under 15.
        # Move 30% of deaths from SRS in the final stage due to cirrhosis to hepatitis in ages 15-24"
        if self.source == "India_SRS_states_report":
            # There should be no cirrhosis subtypes in SRS, but include them in case things change
            cirrhosis_ids = [521, 522, 523, 524, 971, 525]
            hepatitis_id = 400

            # Under 15
            under_15 = df['age_group_id'] < 8
            cirrhosis = df['cause_id'].isin(cirrhosis_ids)
            df.loc[under_15 & cirrhosis, 'cause_id'] = hepatitis_id

            # 15-24
            start_deaths = df[self.val_cols].sum(axis=0)
            # Create proportions to split
            split_df = pd.DataFrame()
            for age_group_id in [8, 9]:
                for cirrhosis_id in cirrhosis_ids:
                    small_df = pd.DataFrame({
                        'new_cause_id': [cirrhosis_id, hepatitis_id],
                        'pct': [0.70, 0.30]
                    })
                    small_df['cause_id'] = cirrhosis_id
                    small_df['age_group_id'] = age_group_id
                    split_df = split_df.append(small_df, sort=True)
            # Merge in the proportions and split
            # Do not apply the split retroactively - can't take away deaths from
            # cirrhosis in earlier phases if they aren't there yet
            df = df.merge(split_df,
                          how='left',
                          on=['age_group_id', 'cause_id'])
            matches = df.new_cause_id.notnull()
            df.loc[matches, 'cause_id'] = df['new_cause_id']
            df.loc[matches, 'deaths'] = df['deaths'] * df['pct']
            for col in ['deaths_raw', 'deaths_corr', 'deaths_rd']:
                df.loc[matches & (df['new_cause_id'] == hepatitis_id), col] = 0
            df.drop(["new_cause_id", "pct"], axis='columns', inplace=True)
            assert np.allclose(start_deaths, df[self.val_cols].sum(axis=0))
            assert df.notnull().values.all()

        # USERNAME says we should not have congenital in older age groups
        # in this study. USERNAME says that since congenital is created by the
        # redistribution of sepsis for this study: "Result of redistribution on sepsis
        # have to be very low, if the problem is just this one drop result of redistribution
        # due to sepsis"
        # The larger question is if/when we should create causes in VA
        malawi_va_study = df['nid'] == 413649
        congenital = df.cause_id.isin(
            get_all_related_causes('cong', cause_metadata_df))
        df.loc[malawi_va_study & congenital, 'cause_id'] = 919

        if self.source == "ICD9_detail":
            if ((df['location_id'] == 43) & (df['year_id'] == 1997)).any():
                warnings.warn("Albania homicide recode needed")

        if self.source == "ICD9_USSR_Tabulated":
            warnings.warn("Missing some homicide fixes for TJK, ARM here.")

        df = self.drop_leukemia_subtypes(df)

        # mortuary, burial, self-reported COD, census/survey,
        # and tabulated hospital data should be reduced down to just
        # injuries, maternal, and cc_code
        if self.data_type_id in [1, 3, 5, 7]:
            maternal_causes = get_all_related_causes('maternal',
                                                     cause_metadata_df)
            injury_causes = get_all_related_causes('_inj', cause_metadata_df)
            maternal = df['cause_id'].isin(maternal_causes)
            inj = df['cause_id'].isin(injury_causes)
            df.loc[~(maternal | inj), 'cause_id'] = 919

            # for sibling history, we only want maternal and cc_code
            if self.data_type_id == 5:
                df.loc[~maternal, 'cause_id'] = 919

        return df
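The merge-based 70/30 split used for SRS cirrhosis above can be seen in isolation with toy death counts (521 is a cirrhosis subtype and 400 is hepatitis, as in the source):

import pandas as pd

df = pd.DataFrame({'age_group_id': [8], 'cause_id': [521], 'deaths': [100.0]})
split_df = pd.DataFrame({
    'age_group_id': [8, 8],
    'cause_id': [521, 521],
    'new_cause_id': [521, 400],
    'pct': [0.70, 0.30],
})
df = df.merge(split_df, how='left', on=['age_group_id', 'cause_id'])
matches = df['new_cause_id'].notnull()
df.loc[matches, 'cause_id'] = df['new_cause_id']
df.loc[matches, 'deaths'] = df['deaths'] * df['pct']
# the 100 deaths are now two rows: 70 stay cirrhosis, 30 move to hepatitis
print(df[['cause_id', 'deaths']])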
Example #17
class InjuryRedistributor(CodProcess):
    def __init__(self, df, loc_meta_df, cause_meta_df):
        self.df = df
        self.start_deaths = self.df['deaths'].sum()
        self.loc_meta_df = loc_meta_df
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")

    def get_computed_dataframe(self):
        self.set_injury_cause_list()
        self.set_iso3_on_data()
        inj_df = self.get_injury_df(self.df)
        sans_inj_df = self.df[
            ~self.df['cause_id'].isin(self.injury_cause_list)]

        inj_by_iso_sex_year = self.prep_deaths_by_iso_sex_year(inj_df)
        inj_props = self.prep_injury_proportions_file()
        props_with_deaths = inj_props.merge(inj_by_iso_sex_year,
                                            on='sex_id',
                                            how='left')
        props_with_deaths['deaths'] = (
            props_with_deaths['deaths'] * props_with_deaths['prop'])
        props_with_deaths = props_with_deaths.drop('prop', axis=1)
        inj_env = props_with_deaths.rename(columns={'deaths': 'inj_env'})

        inj_df = self.replace_poisoning_and_suicide(inj_df)
        age_pattern_df = self.get_age_pattern_df(inj_df)
        inj_df = age_pattern_df.merge(
            inj_env,
            on=['year_id', 'iso3', 'sex_id', 'cause_id'],
            how='left',
            indicator=True)
        inj_df['deaths'] = inj_df['prop'] * inj_df['inj_env']
        result = sans_inj_df.append(inj_df, ignore_index=True)
        assert np.isclose(self.start_deaths, result.deaths.sum())
        return result

    def replace_poisoning_and_suicide(self, df):
        df = add_cause_metadata(df, 'acause', cause_meta_df=self.cause_meta_df)
        inj_poison = self.cause_meta_df[
            self.cause_meta_df.acause ==
            "inj_poisoning"]['cause_id'].unique()[0]
        inj_suicide = self.cause_meta_df[self.cause_meta_df.acause ==
                                         "inj_suicide"]['cause_id'].unique()[0]
        df.loc[df['acause'].str.startswith("inj_poison"),
               'cause_id'] = inj_poison
        df.loc[df['acause'].str.startswith("inj_suicide"),
               'cause_id'] = inj_suicide
        df = df[~df['acause'].isin(['inj_homicide', 'inj_trans_road'])]
        df = df.drop("acause", axis=1)
        return df

    def get_age_pattern_df(self, df):
        df = df.groupby([col for col in df.columns if col not in ['deaths']],
                        as_index=False)['deaths'].sum()
        df['all_age_total'] = df.groupby(
            ['sex_id', 'year_id', 'cause_id',
             'location_id'])['deaths'].transform(sum)
        df['prov_total'] = df.groupby(['sex_id', 'year_id',
                                       'cause_id'])['deaths'].transform(sum)
        df['prov_split'] = df['all_age_total'] / df['prov_total']
        df['prop'] = (df['deaths'] / df['all_age_total']) * df['prov_split']
        df = df.drop(['prov_split', 'prov_total', 'all_age_total', 'deaths'],
                     axis=1)
        return df

    def prep_injury_proportions_file(self):
        filepath = self.conf.get_resource('injury_proportions')
        inj_props = pd.read_csv(filepath)
        inj_props = inj_props[inj_props['most_detailed'] == 1]
        inj_props = inj_props[['acause', 'rdp2', 'rdp1']]
        inj_props = add_cause_metadata(inj_props,
                                       'cause_id',
                                       merge_col='acause',
                                       cause_meta_df=self.cause_meta_df)
        inj_props = inj_props.loc[inj_props['cause_id'].notnull()]
        inj_props = inj_props.drop('acause', axis=1)
        inj_props = pd.melt(inj_props,
                            id_vars=['cause_id'],
                            var_name='sex_id',
                            value_name='prop')
        inj_props['sex_id'] = inj_props['sex_id'].apply(lambda x: x[3]).astype(
            int)
        inj_props['total_prop'] = inj_props.groupby(
            'sex_id')['prop'].transform(sum)
        inj_props['prop'] = inj_props['prop'] / inj_props['total_prop']
        inj_props = inj_props.drop('total_prop', axis=1)
        return inj_props

    def set_iso3_on_data(self):
        self.df = add_location_metadata(self.df,
                                        'ihme_loc_id',
                                        location_meta_df=self.loc_meta_df)
        self.df['iso3'] = self.df['ihme_loc_id'].apply(lambda x: x[0:3])
        self.df = self.df.drop('ihme_loc_id', axis=1)

    def set_injury_cause_list(self):
        inj_causes = self.cause_meta_df[
            self.cause_meta_df['acause'].str.startswith("inj")]
        self.injury_cause_list = list(inj_causes['cause_id'].unique())

    def get_injury_df(self, df):
        df = df[df.cause_id.isin(self.injury_cause_list)]
        return df

    def prep_deaths_by_iso_sex_year(self, df):
        df = df[['iso3', 'year_id', 'sex_id', 'deaths']]
        df = df.groupby(['iso3', 'year_id', 'sex_id'],
                        as_index=False)['deaths'].sum()
        return df
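A toy check of the proportions built in get_age_pattern_df: algebraically, prop reduces to deaths / prov_total, so the props sum to 1 across ages and locations within each sex/year/cause group.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'sex_id': [1, 1, 1, 1],
    'year_id': [2010] * 4,
    'cause_id': [700] * 4,
    'location_id': [10, 10, 20, 20],
    'age_group_id': [8, 9, 8, 9],
    'deaths': [10.0, 30.0, 20.0, 40.0],
})
group = ['sex_id', 'year_id', 'cause_id']
df['all_age_total'] = df.groupby(group + ['location_id'])['deaths'].transform('sum')
df['prov_total'] = df.groupby(group)['deaths'].transform('sum')
df['prov_split'] = df['all_age_total'] / df['prov_total']
df['prop'] = (df['deaths'] / df['all_age_total']) * df['prov_split']
# each sex/year/cause group's props sum to 1
assert np.allclose(df.groupby(group)['prop'].sum(), 1.0)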
Example #18
 def __init__(self, df, loc_meta_df, cause_meta_df):
     self.df = df
     self.start_deaths = self.df['deaths'].sum()
     self.loc_meta_df = loc_meta_df
     self.cause_meta_df = cause_meta_df
     self.conf = Configurator("standard")
Example #19
class MCauseLauncher(object):

    conf = Configurator('standard')
    cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': "standard",
        'cache_results': True,
        'verbose': True
    }

    source_memory_dict = {
        'TWN_MOH': '2G',
        'MEX_INEGI': '10G',
        'BRA_SIM': '15G',
        'USA_NVSS': '20G',
        'COL_DANE': '2G',
        'ITA_ISTAT': '8G'
    }

    location_set_version_id = 420
    cause_set_version_id = conf.get_id('reporting_cause_set_version')
    thesis_code = "/homes/agesak/thesis/thesis_data_prep"
    limited_sources = ["TWN_MOH", "MEX_INEGI", "BRA_SIM", "USA_NVSS"]

    def __init__(self, run_filters):
        self.run_filters = run_filters

    def prep_run_filters(self):
        datasets_kwargs = {'force_rerun': True, 'block_rerun': False}
        datasets_kwargs.update({
            k: v
            for k, v in self.run_filters.items()
            if k not in ['intermediate_causes', 'phase', 'inj_garbage']
        })
        datasets = get_datasets(**datasets_kwargs)
        # GBD 2019: Drop Europe data because only for drug overdose
        # 5/23/2020: Drop South Africa because Mohsen said bad for injuries
        datasets = datasets.loc[~(
            datasets['source'].isin(["EUROPE_INJ_POISON", "ZAF_STATSSA"]))]
        datasets = datasets.drop_duplicates(
            ['nid', 'extract_type_id']).set_index(['nid', 'extract_type_id'])[[
                'year_id', 'code_system_id', 'source', 'data_type_id'
            ]]
        datasets['code_map_version_id'] = datasets['code_system_id'].apply(
            lambda x: get_map_version(x, 'YLL', 'best'))
        datasets['remove_decimal'] = datasets['code_system_id'].apply(
            lambda x: get_remove_decimal(x))
        return datasets

    def launch_format_map(self, year, source, int_cause, code_system_id,
                          code_map_version_id, nid, extract_type_id,
                          data_type_id):
        """Submit qsub for format_map phase."""
        # remove existing output
        if self.run_filters["inj_garbage"]:
            subdirs = f"{int_cause}/thesis/inj_garbage"
        else:
            subdirs = f"{int_cause}/thesis"
        delete_claude_output('format_map',
                             nid,
                             extract_type_id,
                             sub_dirs=subdirs)
        if source in self.limited_sources:
            limited_dir = get_limited_use_directory(
                source, int_cause, self.run_filters["inj_garbage"])
            if os.path.exists(
                    f"{limited_dir}/{nid}_{extract_type_id}_format_map.csv"):
                os.remove(
                    f"{limited_dir}/{nid}_{extract_type_id}_format_map.csv")
        worker = f"{self.thesis_code}/run_phase_format_map.py"
        params = [
            int(year), source, int_cause,
            int(code_system_id),
            int(code_map_version_id),
            int(self.cause_set_version_id),
            int(nid),
            int(extract_type_id),
            int(data_type_id), self.run_filters["inj_garbage"]
        ]
        if self.run_filters["inj_garbage"]:
            jobname = f'format_map_injgarbage_{source}_{nid}_{year}_{int_cause}'
        else:
            jobname = f'format_map_{source}_{nid}_{year}_{int_cause}'
        try:
            memory = self.source_memory_dict[source]
        except KeyError:
            print(f"{source} is not in source_memory_dict. Trying with 5G.")
            memory = '5G'

        if data_type_id == 3:
            runtime = '02:00:00'
        else:
            runtime = '06:00:00'

        submit_mcod(jobname,
                    'python',
                    worker,
                    cores=1,
                    memory=memory,
                    params=params,
                    verbose=True,
                    logging=True,
                    jdrive=True,
                    runtime=runtime)

    def launch_check_output(self, source, int_cause, nid, extract_type_id,
                            year_id, phase):
        if self.run_filters["inj_garbage"]:
            subdirs = f"{int_cause}/thesis/inj_garbage"
        else:
            subdirs = f"{int_cause}/thesis"
        if source in self.limited_sources:
            limited_dir = get_limited_use_directory(
                source, int_cause, self.run_filters["inj_garbage"])
            if not os.path.exists(
                    f"{limited_dir}/{nid}_{extract_type_id}_format_map.csv"):
                print_log_message(
                    f"no output found for {source} year: {year_id} nid: {nid}, "
                    f"extract_type_id: {extract_type_id}")
        else:
            if not check_output_exists(
                    phase, nid, extract_type_id, sub_dirs=subdirs):
                print_log_message(
                    f"no output found for {source} year: {year_id} nid: {nid}, "
                    f"extract_type_id: {extract_type_id}")

    def launch(self):
        datasets = self.prep_run_filters()

        if "format_map" in self.run_filters["phase"]:
            for row in datasets.itertuples():
                nid, extract_type_id = row.Index
                for int_cause in self.run_filters['intermediate_causes']:
                    print_log_message(f"launching jobs")
                    self.launch_format_map(row.year_id, row.source, int_cause,
                                           row.code_system_id,
                                           row.code_map_version_id, nid,
                                           extract_type_id, row.data_type_id)
        elif "check_output_exists" in self.run_filters["phase"]:
            for row in datasets.itertuples():
                nid, extract_type_id = row.Index
                for int_cause in self.run_filters['intermediate_causes']:
                    self.launch_check_output(row.source, int_cause, nid,
                                             extract_type_id, row.year_id,
                                             "format_map")
Example #20
class MCoDMapper():
    """Map ICD codes to code_ids, cause_ids.

    Arguments:
        int_cause (str): the intermediate cause of interest (e.g. sepsis)
        code_system_id (int): the ICD category, determines which map to use
        code_map_version_id (int): the version of the map to use
        df (dataframe): dataframe of formatted mcod data

    Returns:
        df (dataframe): dataframe with the underlying cause mapped to code_id
        and cause_id, and the causes in the chain flagged for containing the
        intermediate cause of interest.
    """
    cache_options = {'force_rerun': False, 'block_rerun': True}
    conf = Configurator()
    inj_causes = ['x59', 'y34']
    int_cause_name_dict = {
        'x59': ['unspecified external factor x59'],
        'y34': ['external causes udi,type unspecified-y34']
    }
    possible_int_causes = list(int_cause_name_dict.keys())

    def __init__(self, int_cause, code_system_id, code_map_version_id,
                 drop_p2):
        self.int_cause = int_cause
        self.code_system_id = code_system_id
        self.code_map_version_id = code_map_version_id
        self.drop_p2 = drop_p2
        assert self.int_cause in self.possible_int_causes, \
            f"{self.int_cause} is not a valid intermediate cause"
        self.full_cause_name = self.int_cause_name_dict[self.int_cause]
        if not isinstance(self.full_cause_name, list):
            self.full_cause_name = [self.full_cause_name]

    @staticmethod
    def get_code_columns(df):
        """Get a list of raw cause columns with ICD codes as values."""
        col_names = list(df.columns)
        code_cols = [
            x for x in col_names if "multiple_cause" in x and "pII" not in x
        ] + ['cause']
        return code_cols

    @staticmethod
    def _get_cause_num(mcod_col):
        """Get sort order for cause columns.

        Assumes you have an underlying cause (cause_x) column and chain columns (multiple_cause_x)
        and that the value to sort off of is after the second underscore.
        """
        if mcod_col.startswith('cause'):
            return '0'
        else:
            assert re.match(r"^multiple_cause_[a-z]*[0-9]*", mcod_col), \
                f"column {mcod_col} does not match expected format: multiple_cause_x"
            return mcod_col.split('_')[2]

    @staticmethod
    def prep_raw_mapped_cause_dictionary(raw_cols, mapped_cols):
        """Create dictionary of raw cause columns to mapped cause columns.

        Ensures that "multiple_cause_2_mapped" is the value associated with
        "multiple_cause_2" key, e.g.
        """
        raw_cols = sorted(raw_cols, key=MCoDMapper._get_cause_num)
        mapped_cols = sorted(mapped_cols, key=MCoDMapper._get_cause_num)
        return dict(list(zip(raw_cols, mapped_cols)))

    @staticmethod
    def fix_icd_codes(df, codes, code_system_id):
        """Adjustment to icd9/10 cause codes."""
        if code_system_id == 6:
            # according to Mohsen, codes between 800 and 900 need an E prefix
            # if underlying; assume 800/900 codes in the chain are N codes and
            # don't add any prefix
            df.loc[df['cause'].str.contains('^[89]'),
                   'cause'] = 'E' + df['cause']
        elif code_system_id == 1:
            # S + T codes are always intermediate causes of death
            # V + Y codes are always the underlying cause of death
            violations = df['cause'].str.contains('^[ST]')
            num_violations = len(df[violations])
            if num_violations > 0:
                print_log_message(
                    f"Found S or T code as underlying cause, dropping {num_violations} rows"
                )
                assert np.isclose(len(df[~violations]), len(df), rtol=.10)
                df = df.loc[~violations]

            # next check violations in chain causes
            # V and Y codes can only be UCOD
            for col in codes:
                if col != 'cause':
                    violations = df[col].str.contains('^[VY]')
                    num_violations = len(df[violations])
                    if num_violations > 0:
                        print_log_message(
                            f"Setting {num_violations} rows with V/Y in chain to 0000 for {col}"
                        )
                        df.loc[violations, col] = '0000'
        return df

    @staticmethod
    def prep_cause_package_map(cause_package_map):
        """Expects cause-package map.

        Set dictionary of value: map_id since we only care about the package name
        or the cause_id, not the individual ICD code level code.
        """
        check_map = cause_package_map[['map_id', 'map_type']].drop_duplicates()
        report_duplicates(check_map, 'map_id')
        cause_package_map = cause_package_map.set_index(
            'value')['map_id'].to_dict()
        return cause_package_map

    @staticmethod
    def prep_cause_map(cause_map):
        """Clean up cause map."""
        cause_map['value'] = clean_icd_codes(cause_map['value'],
                                             remove_decimal=True)
        # duplicates are a result of weird _gc mappings; the duplicates
        # dropped all have the higher sort_order (999999)
        cause_map = cause_map.drop_duplicates(['code_system_id', 'value'])
        cause_map['code_id'] = cause_map['code_id'].astype(int)
        cause_map = cause_map.set_index('value')['code_id'].to_dict()
        return cause_map

    @staticmethod
    def map_cause_codes(df, coi_map, coi, cols_to_map=None):
        """Map cause codes to any given value (e.g. acause, category, etc.).

        Inputs
        df (pd dataframe): incoming, unmapped data with ICD codes
        coi_map (pd dataframe): special map designed just for one cause of interest
        coi (string): cause of interest
        cols_to_map (list, optional): columns to map; defaults to all code columns
        Returns
        df (pd dataframe): mapped dataframe with additional columns for each cause
        """
        df = df.copy()
        if not cols_to_map:
            cols_to_map = MCoDMapper.get_code_columns(df)
        # map chain causes using cause of interest map
        for col in cols_to_map:
            df[col] = df[col].fillna('0000')
            df[col] = df[col].astype(object)
            df[col + '_' + coi] = df[col].map(coi_map)
        return df

    @staticmethod
    def trim_and_remap(df, code_dict, cause_map, code_system_id):
        """Trim ICD codes to 4 digits, map again, then 3, and map again."""
        df = df.copy()
        # before trimming, map "null" chain causes to '0000'
        for code, mapped_code in list(code_dict.items()):
            df.loc[df[code] == '0000', mapped_code] = '0000'

        # trim and re-map null mappings
        for n in reversed(range(3, 6)):
            for code, mapped_code in list(code_dict.items()):
                temp_code = 'temp_' + code
                df[temp_code] = df[code].copy()
                try:
                    df.loc[df[mapped_code].isnull(),
                           temp_code] = df[temp_code].apply(lambda x: x[0:n])
                except TypeError:
                    # was getting a type error for some unicode issues?
                    if mapped_code != 'cause_mapped':
                        df[mapped_code] = '0000'
                    else:
                        print("problem code here..." + df[code])
                df.loc[df[mapped_code].isnull(),
                       mapped_code] = df[temp_code].map(cause_map)
                df = df.drop(temp_code, axis=1)
        return df

    def prep_int_cause_map(self):
        map_dir = self.conf.get_directory('process_inputs')
        code_system_name = {1: 'icd10', 6: 'icd9'}[self.code_system_id]
        df = pd.read_excel(f"{map_dir}/mcause_map.xlsx",
                           dtype={'icd_code': object})
        df = df[['icd_code', 'package_description',
                 'code_system']].drop_duplicates()

        # clean up strings: strip decimals from codes, lowercase descriptions
        df['icd_code'] = clean_icd_codes(df['icd_code'], remove_decimal=True)
        df[['package_description', 'code_system']] = \
            df[['package_description', 'code_system']].apply(
            lambda x: x.str.lower())

        # only keep the rows we need for this intermediate cause
        # keep n-code rows for injuries
        df = df.loc[(df['package_description'].isin(self.full_cause_name)) | (
            df['package_description'].str.contains('nn'))].drop_duplicates()

        # intermediate causes should be mutually exclusive
        report_duplicates(df, ['icd_code', 'code_system'])

        # subset to just the code system being run through
        df = df.query(f'code_system == "{code_system_name}"')

        assert len(df) > 0, \
            f"There are no mappings for {code_system_name}, {self.full_cause_name}"

        # convert to a dictionary
        mcod_map = dict(list(zip(df['icd_code'], df['package_description'])))

        return mcod_map

    def capture_int_cause(self, df, int_cause_cols):
        """Flag deaths related to the intermediate cause."""
        df[self.int_cause] = None

        for col in int_cause_cols:
            df[col] = df[col].fillna("other")
            df.loc[df[col].isin(self.full_cause_name), self.int_cause] = 1
        df[self.int_cause] = df[self.int_cause].fillna(0)

        assert df[self.int_cause].notnull().values.all()

        return df

    def set_part2_flag(self, df):
        """Mark whether or not the cause of interest is from part 2 of the death certificate."""
        p2_cols = [x for x in df.columns if 'pII' in x]
        int_cause_chains = [
            x for x in df.columns
            if (self.int_cause in x) and ('multiple' in x)
        ]
        p2_chain_dict = dict(list(zip(p2_cols, int_cause_chains)))
        df['pII_' + self.int_cause] = 0
        for p2_col, chain in sorted(p2_chain_dict.items()):
            df.loc[(df[chain].isin(self.full_cause_name)) & (df[p2_col] == 1),
                   'pII_' + self.int_cause] = 1
        return df

    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df,
                               'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df
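
A note on the trim-and-remap step used above: when a raw ICD code has no
exact match in the map, characters are dropped from the end of the code
until a mapped ancestor is found. MCoDMapper.trim_and_remap itself operates
on DataFrames and column dictionaries; what follows is only a minimal
standalone sketch of the idea, with a toy map and a hypothetical helper name:

def trim_to_match(icd_code, code_map):
    """Drop trailing characters until the code matches the map."""
    code = icd_code
    while code and code not in code_map:
        code = code[:-1]
    return code_map.get(code)

toy_map = {'I26': 'pulmonary embolism'}
assert trim_to_match('I269', toy_map) == 'pulmonary embolism'
assert trim_to_match('X999', toy_map) is None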
Example #21
class BridgeMapper(CodProcess):
    """Replace acauses with those in the bridge map.

    Arguments:
        source (str)
        cause_set_version_id (int)
        code_system (str)
    Returns:
        df, pandas DataFrame: only change is replacing some cause_ids
        diag_df, pandas DataFrame: shows which cause_ids have been changed
    """

    id_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
               'age_group_id', 'sex_id', 'cause_id',
               'site_id']
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    # data type id for verbal autopsy
    VA = 8

    def __init__(self, source, cause_meta_df, code_system):
        self.source = source
        self.code_system = code_system
        self.conf = Configurator("standard")
        self.bridge_map_path = Path(self.conf.get_directory('bridge_maps'))
        self.cause_meta_df = cause_meta_df
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': 'standard'
        }
        self.diag_df = None

    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        # VA sources are the only ones where this may not work
        # might need to split dataframe by data_type_id for bridge map
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()
        df.drop(columns='data_type_id', inplace=True)

        if self.needs_bridging(has_verbal_autopsy):
            file_name = self.get_file_name(has_verbal_autopsy)
            map_df = pd.read_csv(self.bridge_map_path / file_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

            # perform zz bridge code redistribution before other bridge mapping
            bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped, map_df)

            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            # hack, this cause_id snuck in
            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df

    def needs_bridging(self, has_verbal_autopsy):
        """
        Check data type and code_system to see if the bridge map is needed.
        """
        code_systems_to_bridge_map = [
            "ICD9_detail", "ICD9_BTL", "ICD10_tabulated",
            "ICD8_detail", "ICD8A",
            "China_1991_2002", "India_SCD_states_rural", "India_MCCD_states_ICD10",
            "India_MCCD_states_ICD9", "India_SRS_states_report",
            "Russia_FMD_1989_1998", "ICD9_USSR_Tabulation", "INDEPTH_ICD10_VA",
            "India_Maharashtra_SCD", "India_CRS", "PHL_VSR_1999_2005"
        ]
        special_sources_to_bridge_map = [
            "Russia_FMD_ICD9",
            "India_SRS_states_report", "India_MCCD_Orissa_ICD10"
        ]
        # not all VA sources use a bridge map... something to think about
        # in the future, but not necessary right now
        if has_verbal_autopsy or \
            (self.code_system in code_systems_to_bridge_map) or \
            (self.source in special_sources_to_bridge_map):
            # we need to use the bridge map!
            return True
        else:
            # we do not need to use the bridge map
            return False

    def get_file_name(self, has_verbal_autopsy):
        """Determine the file name needed based on the source or code system.

        Note: The default file name is the name of the code system, with
        some exceptions. For some sources we have specified specific files
        to bridge map with; all other sources use the file that matches
        their code_system.
        """
        source_to_sheet = {
            "India_MCCD_Orissa_ICD10": "India_MCCD_states_ICD10",
            "India_MCCD_Delhi_ICD10": "India_MCCD_states_ICD10",
            "Thailand_Public_Health_Statistics": "ICD10_tabulated",
            "India_SRS_states_report": "India_SRS_states_report",
            "UKR_databank_ICD10_tab": "ICD10_tabulated",
            "Russia_FMD_ICD9": "Russia_FMD_1989_1998",
        }
        if has_verbal_autopsy and (self.source != 'India_SRS_states_report'):
            file_name = 'INDEPTH_ICD10_VA'
        else:
            file_name = source_to_sheet.get(self.source, self.code_system)
        return file_name + '.csv'
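    # Resolution order, illustrated: a VA source (other than
    # India_SRS_states_report) always reads 'INDEPTH_ICD10_VA.csv'; a source
    # listed in source_to_sheet wins next (e.g. Russia_FMD_ICD9 ->
    # 'Russia_FMD_1989_1998.csv'); any other source falls back to its code
    # system (e.g. 'ICD9_BTL' -> 'ICD9_BTL.csv').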

    def redistribute_zz_bridge_codes(self, df, map_df):
        """
        A mini-redistribution, but only redistributes causes bridge mapped to zz codes
        """
        grouping_cols = list(set(self.id_cols) - {'cause_id'})
        start_deaths = {col: df.groupby(grouping_cols)[col].sum() for col in self.val_cols}

        zz_code_idxs = map_df['bridge_code'].str.startswith('ZZ-')
        # get the order to do the zz code redistribution in:
        # start on lowest level of hierarchy and work our way up
        zz_code_targets = (map_df
                           .loc[zz_code_idxs, ['bridge_code']]
                           .drop_duplicates()
                           .assign(acause=lambda d: d['bridge_code'].str.replace('ZZ-', '_'))
                           .merge(self.cause_meta_df, on='acause')
                           .sort_values(['level', 'acause'], ascending=False)
                           .loc[:, 'bridge_code']
                           .tolist()
                           )
        # don't distribute onto anything that maps to a zz code
        all_causes_to_zz_codes = set(map_df.loc[zz_code_idxs, 'acause'])

        for zz_code in zz_code_targets:
            child_cause_ids = get_all_related_causes(zz_code.strip().replace('ZZ-', '_'),
                                                     self.cause_meta_df)
            child_causes = self.cause_meta_df.loc[
                self.cause_meta_df['cause_id'].isin(child_cause_ids),
                'acause'].tolist()

            acauses_to_redistribute = map_df.loc[map_df['bridge_code'] == zz_code, 'acause']
            to_redistribute = df['acause'].isin(acauses_to_redistribute)
            valid_child_causes = set(child_causes) - all_causes_to_zz_codes

            print_log_message('Found ZZ code: {}, deaths: {}'
                              .format(zz_code, df.loc[to_redistribute, 'deaths'].sum()))

            # distribute onto at least all combinations of these values;
            # this ensures everything in df[to_redistribute] gets weights
            values_to_include = {
                'acause': valid_child_causes,
            }
            for col in grouping_cols:
                values_to_include[col] = df.loc[to_redistribute, col].unique()
            distributed = distribute(df[to_redistribute],
                                     based_on=df[df['acause'].isin(valid_child_causes)],
                                     distribute_over='acause',
                                     within=grouping_cols,
                                     value_col='deaths',
                                     values_to_include=values_to_include,
                                     base_value=0.001,  # this is mostly arbitrary
                                     )
            report_if_merge_fail(distributed, check_col='acause', merge_cols=grouping_cols)

            # what follows is an unfortunate side effect of having multiple value columns
            # in the data -- it makes the merging somewhat more involved than simply
            # appending distributed data to existing data
            # TODO: refactor this into a generic method in redistribution_utils
            df = df.merge(distributed[grouping_cols + ['acause', 'deaths']],
                          how='outer',
                          on=grouping_cols + ['acause'],
                          suffixes=('', '_new'),
                          )
            # default to 0 deaths in all values where new variables / IDs (i.e. new causes)
            # are in the distributed data (right only)
            # and where distributed does not have data (i.e. other causes in original
            # data that weren't distributed onto) (left only)
            df[self.val_cols + ['deaths_new']] = df[self.val_cols + ['deaths_new']].fillna(0)
            # Set values that were distributed away from their cause to 0.
            # This has the effect of moving deaths away from one cause to another.
            df.loc[df['acause'].isin(acauses_to_redistribute), 'deaths'] = 0
            # now add distributed data to old
            df['deaths'] += df['deaths_new']
            df.drop(columns='deaths_new', inplace=True)

            # make sure deaths didn't move out of a nid-etid-site-location-year-sex-age group
            for col in self.val_cols:
                end_deaths = df.groupby(grouping_cols)[col].sum()
                assert np.allclose(start_deaths[col], end_deaths), \
                    "Dropped/added deaths during ZZ code redistribution: " + \
                    "start {}: {}, end {}: {}".format(col, start_deaths[col], col, end_deaths)
        return df

    def acause_to_bridge_code(self, df):
        """Replace the acause with the bridge code."""
        # there might still be zz codes in the data because we aren't
        # performing zz code redistribution on the other value columns,
        # so if something is coded to i.e. _neo in the raw data, then
        # we keep it as _neo.
        df['acause'].update(df['bridge_code'].str.replace('ZZ-', '_'))
        return df
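    # Note on Series.update: it aligns on index and copies only non-NA
    # values, so rows whose bridge_code is NaN (acauses missing from
    # map_df) keep their original acause. For example:
    #   s = pd.Series(['_neo', 'inj_homicide'])
    #   s.update(pd.Series([None, 'ZZ-inj']).str.replace('ZZ-', '_'))
    #   # s is now ['_neo', '_inj']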

    def get_diagnostic_dataframe(self):
        """Return a diagnostic dataframe.

        Diagnostic dataframe shows all changes made due to bridge mapping.
        Maybe change this later so that it writes some sort of output.
        """
        if self.diag_df is None:
            print("No run of get computed dataframe yet")
        else:
            return self.diag_df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df
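
The bookkeeping at the end of redistribute_zz_bridge_codes follows a
reusable pandas pattern: outer-merge the distributed deaths back on, fill
both sides' gaps with 0, zero out the rows that were distributed away, then
add. A self-contained sketch of just that pattern (toy frames, one value
column):

import pandas as pd

df = pd.DataFrame({'acause': ['_zz', 'a', 'b'], 'deaths': [10.0, 5.0, 5.0]})
distributed = pd.DataFrame({'acause': ['a', 'b'], 'deaths': [6.0, 4.0]})

df = df.merge(distributed, how='outer', on='acause', suffixes=('', '_new'))
df[['deaths', 'deaths_new']] = df[['deaths', 'deaths_new']].fillna(0)
df.loc[df['acause'] == '_zz', 'deaths'] = 0  # deaths move away from the source
df['deaths'] += df['deaths_new']
df = df.drop(columns='deaths_new')
assert df['deaths'].sum() == 20  # totals preserved: 0 + 11 + 9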
Example #22
import pandas as pd
import numpy as np

from cod_prep.claude.cod_process import CodProcess
from cod_prep.downloaders.causes import get_parent_and_childen_causes
from cod_prep.downloaders.locations import (
    get_country_level_location_id, add_location_metadata
)
from cod_prep.downloaders.ages import add_age_metadata
from cod_prep.claude.configurator import Configurator

pd.options.mode.chained_assignment = None

CONF = Configurator('standard')
N_DRAWS = CONF.get_resource('uncertainty_draws')


class RTIAdjuster(CodProcess):

    death_cols = ['deaths', 'deaths_corr', 'deaths_raw', 'deaths_rd']
    rti_sources = ['Various_RTI', 'GSRRS_Bloomberg_RTI']

    def __init__(self, df, cause_meta_df, age_meta_df, location_meta_df):
        self.df = df
        self.merge_cols = ['simple_age', 'iso3', 'year_id', 'sex_id']
        self.orig_cols = df.columns
        self.cmdf = cause_meta_df
        self.amdf = age_meta_df
        self.lmdf = location_meta_df

    def get_computed_dataframe(self):
Example #23
 def __init__(self, cause_set_version_id, code_map):
     self.cg = Configurator("standard")
     self.cache_dir = self.cg.get_directory('db_cache')
     self.cause_set_version_id = cause_set_version_id
     self.code_map = code_map
Example #24
    remove national observations

keep only the data for the given nid (later refactor to allow for running
    all location data at once instead of per-nid)

force non-zero floor

aggregate
    locations
    ages

write to file for uploading

"""

CONF = Configurator('standard')
# sources containing maternal deaths that are noise reduced
MATERNAL_NR_SOURCES = [
    "Mexico_BIRMM",
    "Maternal_report",
    "SUSENAS",
    "China_MMS",
    "China_Child",
]

NR_DIR = CONF.get_directory('nr_process_data')


def get_malaria_noise_reduction_model_result(malaria_model_group,
                                             launch_set_id):
    """Read in the csv with saved malaria model result."""
Example #25
import sys
import getpass
import pandas as pd
from db_tools import ezfuncs

sys.path.append("FILEPATH")
from cod_prep.claude.claude_io import get_claude_data
from cod_prep.utils import (report_if_merge_fail, cod_timestamp,
                            print_log_message)
from cod_prep.downloaders import (add_nid_metadata, get_code_system_from_id,
                                  get_country_level_location_id,
                                  get_current_location_hierarchy)
from cod_prep.claude.configurator import Configurator

# import standard configurations
CONF = Configurator('standard')

# columns to collapse code_id down to cause_id
GROUP_COLS = [
    'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
    'extract_type_id', 'site_id'
]

# columns to sum together
VAL_COLS = ['deaths']

# where to write
new_run_id = sys.argv[1]
OUT_DIR = "FILEPATH".format(new_run_id)
OUT_FILENAME = "allcause_vr_for_mortality"
OUT_FILE = "{}/{}.csv".format(OUT_DIR, OUT_FILENAME)
Example #26
    get_country_level_location_id,
    get_cause_map,
    add_location_metadata,
    get_current_location_hierarchy
)
from cod_prep.utils import (
    print_log_message,
    report_duplicates,
    report_if_merge_fail,
    cod_timestamp
)
from cod_prep.claude.claude_io import get_claude_data, makedirs_safely
from cod_prep.claude.configurator import Configurator
from save_proportions_for_tableau import SharedPackage

CONF = Configurator()

MODEL_DATA_CODE_SYSTEMS = [1, 6]
RDP_REG_DIR = CONF.get_directory('rdp_regressions')


def get_package_code_ids(regression_specification, code_system_id):
    """Returns code_ids for garbage codes in package for given code system"""
    package_description = regression_specification[
        'package_descriptions'
    ][code_system_id]

    packages = get_package_list(code_system_id)
    package_id = packages.loc[
        packages['package_description'] == package_description,
        'package_id'
Example #27
 def __init__(self):
     self.configurator = Configurator('standard')
     self.cache_dir = self.configurator.get_directory('db_cache')
     self.maternal_hiv_props_path = \
         self.configurator.get_directory('maternal_hiv_props')
Example #28
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id, iso3):
    """Run the full pipeline, chaining together CodProcesses."""
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **cache_options)

    # get location hierarchy
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    # get envelope
    env_meta_df = get_env(env_run_id=env_run_id, **cache_options)

    # get env with HIV
    env_hiv_meta_df = get_env(env_run_id=env_run_id,
                              with_hiv=True,
                              **cache_options)

    # get age groups
    age_meta_df = get_ages(**cache_options)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    cause_map = get_cause_map(code_map_version_id=code_map_version_id,
                              **cache_options)

    package_map = get_package_map(code_system_id=code_system_id,
                                  **cache_options)

    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output("misdiagnosiscorrection", nid, extract_type_id)

    cause_package_hierarchy = get_cause_package_hierarchy(
        code_system_id, **cache_options)

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # aggregate location
    # defaults to simple location -> national aggregation
    # running full aggregation for India Survey data
    print_log_message("Aggregating location to country level")
    location_aggregator = LocationAggregator(df, location_meta_df)
    if (data_type_id == 7) & (iso3 == 'IND'):
        df = location_aggregator.get_computed_dataframe('full')
    else:
        df = location_aggregator.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        maternal_hiv_remover = MaternalHIVRemover(df, env_meta_df,
                                                  env_hiv_meta_df, source, nid)
        df = maternal_hiv_remover.get_computed_dataframe()

    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    print_log_message(log_statistic(df))

    print_log_message("Converting to cause fractions")
    df = df.loc[df['sample_size'] > 0]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    print_log_message(log_statistic(df))

    if data_type_id == VA_DATA_TYPE:
        # run VA anemia adjustment
        print_log_message("Running VA Anemia adjustment")
        va_anemia_adjuster = AnemiaAdjuster()
        df = va_anemia_adjuster.get_computed_dataframe(df)

    if data_type_id == POLICE_DATA_TYPE:
        if source == 'Various_RTI':
            rti_adjuster = RTIAdjuster(df, cause_meta_df, age_meta_df,
                                       location_meta_df)
            df = rti_adjuster.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # issue: rows with > 0 sample size are dropped
        # most common in maternal data, but relevant anywhere
        # we have only cc_code and one other cause and there
        # are 0 deaths for the other cause for a given age/sex
        cause_list = df.cause_id.unique()
        square_me = (len(cause_list) == 2) & (CC_CODE in cause_list)
        if (source in MATERNAL_SQUARED) or square_me:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    print_log_message(log_statistic(df))

    print_log_message("Splitting locations.")
    env_loc_splitter = EnvelopeLocationSplitter(df, env_meta_df, source)
    df = env_loc_splitter.get_computed_dataframe()
    print_log_message(log_statistic(df))

    # aggregate causes
    print_log_message("Aggregating causes")
    cause_aggregator = CauseAggregator(df, cause_meta_df, source)
    df = cause_aggregator.get_computed_dataframe()
    print_log_message(log_statistic(df))

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    parent_gbg_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy, cause_meta_df,
        package_map, cause_map, remove_decimal, disagg_df, misdc_df)
    df = parent_gbg_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    hmp = HIVMatPAFs()
    df = hmp.get_computed_dataframe(df, cause_meta_df, location_meta_df)
    print_log_message(log_statistic(df))

    # TODO
    # ** In the recode step for BTL some cancer deaths were moved to the
    # cancer parent. The squaring step created 0's. Get rid of the 0's in
    # country-years the recode was previously applied to.

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    hiv_shock_remover = SampleSizeCauseRemover(cause_meta_df)
    df = hiv_shock_remover.get_computed_dataframe(df)
    print_log_message(log_statistic(df))

    # not sure why we do this, but could use a comment of some kind.
    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    if dataset_has_redistribution_variance(data_type_id, source):
        # Determine the redistribution variance
        rdvar = RedistributionVarianceEstimator(
            nid,
            extract_type_id,
            cause_meta_df,
            remove_decimal,
            code_system_id,
            cause_map,
            package_map,
            code_map_version_id=code_map_version_id)
        df = rdvar.get_computed_dataframe(df, **cache_options)

    return df
Example #29
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id):
    """Run the full pipeline, chaining together CodProcesses."""
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **cache_options)

    # get location hierarchy
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    # get envelope
    env_meta_df = get_env(env_run_id=env_run_id, **cache_options)

    # get env with HIV
    env_hiv_meta_df = get_env(env_run_id=env_run_id,
                              with_hiv=True,
                              **cache_options)

    # get age groups
    age_meta_df = get_ages(**cache_options)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    cause_map = get_cause_map(code_map_version_id=code_map_version_id,
                              **cache_options)

    package_map = get_package_map(code_system_id=code_system_id,
                                  **cache_options)

    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output("misdiagnosiscorrection", nid, extract_type_id)

    cause_package_hierarchy = get_cause_package_hierarchy(code_system_id)

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # aggregate location
    print_log_message("Aggregating location to country level")
    location_aggregator = LocationAggregator(df, location_meta_df)
    df = location_aggregator.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        maternal_hiv_remover = MaternalHIVRemover(df, env_meta_df,
                                                  env_hiv_meta_df, source, nid)
        df = maternal_hiv_remover.get_computed_dataframe()

    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    print_log_message(log_statistic(df))

    print_log_message("Converting to cause fractions")
    df = df.loc[df['sample_size'] > 0]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    print_log_message(log_statistic(df))

    if data_type_id == VA_DATA_TYPE:
        # run VA anemia adjustment
        print_log_message("Running VA Anemia adjustment")
        va_anemia_adjuster = AnemiaAdjuster()
        df = va_anemia_adjuster.get_computed_dataframe(df)

    if data_type_id in POLICE_SURVEY_DATA_TYPE:

        cause_list = df.cause_id.unique()
        square_me = (len(cause_list) == 2) & (CC_CODE in cause_list)
        if (source in MATERNAL_SQUARED) or square_me:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    print_log_message(log_statistic(df))

    print_log_message("Splitting locations.")
    env_loc_splitter = EnvelopeLocationSplitter(df, env_meta_df, source)
    df = env_loc_splitter.get_computed_dataframe()
    print_log_message(log_statistic(df))

    # aggregate causes
    print_log_message("Aggregating causes")
    cause_aggregator = CauseAggregator(df, cause_meta_df, source)
    df = cause_aggregator.get_computed_dataframe()
    print_log_message(log_statistic(df))

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    parent_gbg_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy, cause_meta_df,
        package_map, cause_map, remove_decimal, disagg_df, misdc_df)
    df = parent_gbg_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    hmp = HIVMatPAFs()
    df = hmp.get_computed_dataframe(df, cause_meta_df, location_meta_df)
    print_log_message(log_statistic(df))

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    hiv_shock_remover = SampleSizeCauseRemover(cause_meta_df)
    df = hiv_shock_remover.get_computed_dataframe(df)
    print_log_message(log_statistic(df))

    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    if dataset_has_redistribution_variance(data_type_id, source):
        # Determine the redistribution variance
        rdvar = RedistributionVarianceEstimator(nid, extract_type_id,
                                                cause_meta_df, remove_decimal,
                                                code_system_id, cause_map,
                                                package_map)
        df = rdvar.get_computed_dataframe(df)

    return df
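
Both run_phase variants above chain together objects with the same
CodProcess contract: construct with whatever metadata the step needs, then
call get_computed_dataframe. A minimal stand-in process showing that
contract (the class and its behavior are illustrative, not from the
codebase; assumes CodProcess is imported as in the surrounding examples):

class ScaleDeaths(CodProcess):
    """Illustrative process that scales the deaths column."""

    def __init__(self, factor):
        self.factor = factor

    def get_computed_dataframe(self, df):
        df = df.copy()
        df['deaths'] = df['deaths'] * self.factor
        return df

# usage mirrors the phases above:
# df = ScaleDeaths(2.0).get_computed_dataframe(df)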
Example #30
class HIVMatPAFs(CodProcess):

    calc_cf_col = 'cf'
    all_cf_cols = ['cf', 'cf_raw', 'cf_corr', 'cf_rd']

    def __init__(self):
        self.configurator = Configurator('standard')
        self.cache_dir = self.configurator.get_directory('db_cache')
        self.maternal_hiv_props_path = \
            self.configurator.get_directory('maternal_hiv_props')

    def get_computed_dataframe(self, df, cause_meta_df, location_meta_df):
        restricted_maternal_df = \
            self.restrict_to_maternal_data(df, cause_meta_df)
        if restricted_maternal_df is None:
            return df
        appended_pafs = self.append_maternal_pafs(
            restricted_maternal_df.year_id.unique())

        merged_data = \
            self.merge_data_and_proportions(restricted_maternal_df,
                                            appended_pafs)
        percent_maternal = self.generate_percentages(merged_data)
        split_maternal = self.generate_splits(percent_maternal)
        hiv_cfs = self.create_maternal_hiv_cfs(split_maternal)
        cleaned = self.clean_adjusted_data(hiv_cfs)
        final = \
            self.append_adjusted_orig(df, restricted_maternal_df, cleaned)
        group_cols = [
            col for col in final.columns
            if col not in self.all_cf_cols and col not in ['sample_size']
        ]
        final = final.groupby(group_cols, as_index=False).agg({
            'sample_size': 'mean',
            'cf': 'sum',
            'cf_raw': 'sum',
            'cf_corr': 'sum',
            'cf_rd': 'sum'
        })
        return final

    def restrict_to_maternal_data(self, df, cause_meta_df):
        """Restrict incoming dataframe to only maternal data."""
        df = df.copy()
        # get age start and age end for maternal ages
        maternal_metadata = cause_meta_df.loc[cause_meta_df['cause_id'] == 366]
        age_start = maternal_metadata['yll_age_start']
        assert len(age_start) == 1
        age_start = age_start.iloc[0]
        age_end = maternal_metadata.yll_age_end
        assert len(age_end) == 1
        age_end = age_end.iloc[0]

        data = add_age_metadata(df,
                                add_cols=['simple_age'],
                                merge_col='age_group_id',
                                force_rerun=False,
                                block_rerun=True,
                                cache_results=False,
                                cache_dir=self.cache_dir)
        data.rename(columns={'simple_age': 'age'}, inplace=True)
        maternal_data = data.loc[(data['cause_id'] == 366)
                                 & (data['age'] >= age_start) &
                                 (data['age'] <= age_end) &
                                 (data['sex_id'] == 2) &
                                 (data['year_id'] >= 1980)]
        maternal_data.drop('age', axis=1, inplace=True)
        if len(maternal_data) == 0:
            return None
        else:
            return maternal_data

    def append_maternal_pafs(self, years):
        """Read in proportions."""
        props = pd.DataFrame()
        for year in years:
            year = int(year)
            if os.path.isfile("{}maternal_hiv_props_{}.csv".format(
                    self.maternal_hiv_props_path, year)):
                data = pd.read_csv("FILEPATH".format(year))
                props = pd.concat([props, data])

        props = props.rename(columns={'year': 'year_id'})
        return props

    def duplicate_national_props(self, props_df, loc_df):
        """Copy national maternal-HIV proportions onto subnational units."""
        subnational = loc_df.loc[
            loc_df['level'] > 3,
            ['location_id', 'parent_id', 'level', 'path_to_top_parent']]

        # Russia sub nationals are level 5 while other countries are level 4
        subnational.loc[
            subnational['level'] == 5, 'parent_id'] = \
            subnational['path_to_top_parent'].str.split(',').str[3].astype(int)

        # only keep rows with the needed sub national locations
        subnational = subnational.loc[subnational['parent_id'].isin(
            self.need_subnational_props)]

        # drop level 4 sub national location_ids for Russia
        subnational = subnational.loc[~((subnational['parent_id'] == 62) &
                                        (subnational['level'] == 4))]
        subnational = subnational[['location_id', 'parent_id']]
        subnational.rename(columns={
            'location_id': 'child_location_id',
            'parent_id': 'location_id'
        },
                           inplace=True)

        # create sub national maternal_hiv proportions from national
        subnational = props_df.merge(subnational, on='location_id')
        subnational.drop('location_id', axis=1, inplace=True)
        subnational.rename(columns={'child_location_id': 'location_id'},
                           inplace=True)
        props_df = pd.concat([props_df, subnational])
        assert not props_df.duplicated().any(), 'please check maternal'\
            ' proportions, there are duplicates'
        return props_df
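    # path_to_top_parent illustration: for a level-5 Russian location with
    # a path like '1,42,56,62,44882,50001' (every id here except Russia's
    # 62 is made up), str.split(',').str[3] pulls element 3, the country
    # id, so national proportions land directly on the level-5 unit.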

    def merge_data_and_proportions(self, data, props):
        """Merge restricted maternal data and proportions."""
        merged_data = data.merge(props,
                                 on=['location_id', 'age_group_id', 'year_id'],
                                 how='left')
        assert merged_data['pct_maternal_hiv'].notnull().values.any(), \
            'maternal proportions were not successfully merged with ' \
            'incoming data'
        return merged_data

    def generate_percentages(self, df):
        """Fill missing proportions and validate that they sum to 1."""
        df['pct_maternal'] = 1 - df['pct_hiv'] - df['pct_maternal_hiv']
        df.loc[df['pct_maternal'].isnull(), 'pct_maternal'] = 1
        df.loc[df['pct_hiv'].isnull(), 'pct_hiv'] = 0
        df.loc[df['pct_maternal_hiv'].isnull(), 'pct_maternal_hiv'] = 0
        assert all(x > 0 for x in df['pct_maternal'])
        assert df[['pct_maternal', 'pct_hiv', 'pct_maternal_hiv'
                   ]].notnull().values.all(), 'there are missing percentages'
        assert (
            abs(df['pct_maternal'] + df['pct_hiv'] + df['pct_maternal_hiv']
                - 1) < .0001
        ).all()

        assert (df['pct_maternal_hiv_vr'] <= .13).all()
        assert not (df['cause_id'] == 741).any()
        return df

    def generate_splits(self, df):
        """Flag rows for the full split; VA/VR rows use VR-based proportions."""
        df = add_nid_metadata(
            df,
            add_cols='data_type_id',
            block_rerun=True,
            cache_dir=self.cache_dir,
            force_rerun=False,
        )
        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
        df.loc[df['split_maternal'] == 0, 'pct_maternal'] = 1
        df.loc[df['split_maternal'] == 0,
               'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[df['split_maternal'] == 0, 'pct_hiv'] = 0
        df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
        return df

    def create_maternal_hiv_cfs(self, df):
        """Split maternal rows into maternal (366) and maternal_hiv (741)."""
        df = df.copy()

        maternal_hiv_df = df.copy()
        maternal_hiv_df['cf'] = maternal_hiv_df['cf'] * \
            maternal_hiv_df['pct_maternal_hiv']
        maternal_hiv_df['cause_id'] = 741
        maternal_hiv_df['cf_raw'] = 0
        maternal_hiv_df['cf_corr'] = 0
        maternal_hiv_df['cf_rd'] = 0

        maternal_df = df.copy()
        maternal_df['cf'] = maternal_df['cf'] * maternal_df['pct_maternal']
        maternal_df['cause_id'] = 366
        df = pd.concat([maternal_hiv_df, maternal_df], ignore_index=True)

        return df
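    # worked example of the split: a maternal row with cf = 0.10,
    # pct_maternal = 0.8 and pct_maternal_hiv = 0.2 becomes two rows,
    # cause_id 741 with cf = 0.10 * 0.2 = 0.02 (raw/corr/rd zeroed) and
    # cause_id 366 with cf = 0.10 * 0.8 = 0.08.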

    def clean_adjusted_data(self, df):
        """Collapse the adjusted data and cap cause fractions at 1."""
        va_vr = df.loc[df['split_maternal'] == 0]
        if len(va_vr) > 0:
            assert set([741, 366]) == set(va_vr.cause_id.unique())
            va_vr = va_vr.loc[va_vr['cause_id'] != 366]
            va_vr['cause_id'] = 366
        df = pd.concat([df, va_vr], ignore_index=True)
        df = df.groupby([
            'nid', 'extract_type_id', 'location_id', 'year_id', 'site_id',
            'age_group_id', 'sex_id', 'sample_size', 'cause_id'
        ],
                        as_index=False)[self.all_cf_cols].sum()

        assert (df['cf'] < 1.1).all()
        df.loc[df['cf'] > 1, 'cf'] = 1

        return df

    def append_adjusted_orig(self, orig, maternal_data, adjusted):
        """Remove original maternal data and append on adjusted."""
        data = orig.merge(maternal_data, how='left', indicator=True)
        data = data.loc[data['_merge'] != 'both']
        data.drop('_merge', axis=1, inplace=True)
        data = pd.concat([data, adjusted], ignore_index=True)
        return data
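
The removal step in append_adjusted_orig is a standard pandas anti-join; a
self-contained sketch of the idiom:

import pandas as pd

orig = pd.DataFrame({'cause_id': [366, 500], 'cf': [0.2, 0.8]})
subset = pd.DataFrame({'cause_id': [366], 'cf': [0.2]})

# left merge with indicator, then keep rows found only in orig
anti = orig.merge(subset, how='left', indicator=True)
anti = anti.loc[anti['_merge'] != 'both'].drop(columns='_merge')
# anti now holds only the cause_id 500 row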