Example #1
    def get_rates_df(self, cause_meta_df):

        if self.correct_garbage:
            filepath_infix = 'PRE'
        else:
            filepath_infix = 'POST'

        rates = pd.read_stata("FILEPATH".format(fp=filepath_infix,
                                                iso=self.iso3))
        rates = add_cause_metadata(rates, ['cause_id'],
                                   merge_col='acause',
                                   cause_meta_df=cause_meta_df)
        rates.loc[rates['acause'] == "_sepsis_gc",
                  'cause_id'] = self.sepsis_cause_id

        age_df = get_cod_ages()
        age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])]
        age_df['agecat'] = age_df['age_group_years_start']
        age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
        age_df = age_df[['agecat', 'age_group_id']]
        # merge on ages to rates data
        rates = rates.merge(age_df, on='agecat', how='left')
        assert not rates['age_group_id'].isnull().any()
        # clean up columns
        rates = rates.rename(columns={'sex': 'sex_id'})
        rates = rates.drop(['acause', 'agecat'], axis=1)
        return rates
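The key step in this example is mapping the textual acause codes in the rates file onto numeric cause_id values, with a manual override for the garbage-code sepsis cause. Below is a minimal pandas sketch of that mapping; the cause ids and rows are made up, and the plain merge stands in for the internal add_cause_metadata helper.

import pandas as pd

# Hypothetical cause metadata: acause short names mapped to cause_ids
# (the ids are illustrative only).
cause_meta_df = pd.DataFrame({
    'acause': ['hiv', 'tb', '_sepsis_gc'],
    'cause_id': [298, 297, 999],
})

# Hypothetical relative-rate rows keyed by acause.
rates = pd.DataFrame({
    'acause': ['hiv', 'tb', '_sepsis_gc'],
    'relrate': [1.0, 2.5, 1.7],
})

# Stand-in for add_cause_metadata(rates, ['cause_id'], merge_col='acause', ...).
rates = rates.merge(cause_meta_df, on='acause', how='left')

# Override the garbage-code sepsis cause with the id the class stores.
sepsis_cause_id = 368  # stand-in for self.sepsis_cause_id, value is illustrative
rates.loc[rates['acause'] == '_sepsis_gc', 'cause_id'] = sepsis_cause_id
print(rates)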
Example #2
    def get_rates_df(self, cause_meta_df):
        """Write a nice description here."""

        if self.correct_garbage:
            filepath_infix = 'PRE'
        else:
            filepath_infix = 'POST'

        rates_path = self.conf.get_resource(
            'hivcorr_global_causespecific_relrates').format(
                pre_post=filepath_infix, iso=self.iso3)
        rates = pd.read_stata(rates_path)
        # convert acause to cause_id
        rates = add_cause_metadata(rates, ['cause_id'],
                                   merge_col='acause',
                                   cause_meta_df=cause_meta_df)
        rates.loc[rates['acause'] == "_sepsis_gc",
                  'cause_id'] = self.sepsis_cause_id
        # convert age to age_group_id
        # TODO: this needs to be changed because right now it queries the DB;
        # maybe change get_demographics to have a caching option too?
        age_df = get_cod_ages()
        age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])]
        age_df['agecat'] = age_df['age_group_years_start']
        age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
        age_df = age_df[['agecat', 'age_group_id']]
        # merge on ages to rates data
        rates = rates.merge(age_df, on='agecat', how='left')
        assert not rates['age_group_id'].isnull().any()
        # clean up columns
        rates = rates.rename(columns={'sex': 'sex_id'})
        rates = rates.drop(['acause', 'agecat'], axis=1)
        return rates
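The age handling in both versions follows the same pattern: build an agecat column equal to the age-group start year (forced to 0 for age_group_id 4), left-merge it onto the rates, and then assert that no age_group_id came back null, since a left merge would otherwise hide unmapped ages. A self-contained sketch with made-up age metadata and rates follows.

import pandas as pd

# Hypothetical slice of CoD age metadata: agecat is the age-group start in
# years, with age_group_id 4 forced to 0 as in the method above.
age_df = pd.DataFrame({
    'agecat': [0, 1, 5, 10],
    'age_group_id': [4, 5, 6, 7],
})

# Hypothetical rates keyed by agecat and sex.
rates = pd.DataFrame({
    'agecat': [0, 1, 5, 10],
    'sex': [1, 2, 1, 2],
    'relrate': [1.2, 0.9, 1.0, 1.1],
})

# merge on ages to rates data
rates = rates.merge(age_df, on='agecat', how='left')
# A left merge silently leaves NaN age_group_id for any agecat missing from
# age_df, so the assert catches unmapped ages early.
assert not rates['age_group_id'].isnull().any()

# clean up columns, mirroring the method
rates = rates.rename(columns={'sex': 'sex_id'}).drop('agecat', axis=1)
print(rates)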
Example #3
def report_too_much_age_detail(nid_etid_tuple, nid_etid_df):
    '''
    Raise an assertion error if there is too much age detail in the age groups in nid_etid_df.

    For the age groups listed in nid_etid_df, report if there are age groups that can be
    aggregated to more closely match the ideal cod age groups. Note that VA, survey/
    census, and CHAMPS data are NOT checked for too much age detail when passed to this function.

    Arguments:
        nid_etid_tuple, tuple: specifies NID and extract_type_id of nid_etid_df
        nid_etid_df, pandas.DataFrame: A df of unique age groups from the data for the NID and
        extract_type specified by nid_etid_tuple. The columns of nid_etid_df are age_group_id,
        age_group_years_start, age_group_years_end, and data_type_id, and each row is a unique age
        group. The age groups are assumed to be sorted in ascending order, first by
        age_group_years_start, then by age_group_years_end. The age groups are also assumed
        to have no gaps or overlaps between them.
    '''

    data_type_id = nid_etid_df.data_type_id.unique()
    assert len(
        data_type_id
    ) == 1, "NID, extract_type {} has more than 1 data_type_id".format(
        nid_etid_tuple)

    if data_type_id[0] not in [7, 8, 12]:
        cod_ages = get_cod_ages()
        cod_ages = cod_ages[[
            'age_group_id', 'age_group_years_start', 'age_group_years_end'
        ]]
        cod_ages.rename(columns=lambda x: "cod_" + x, inplace=True)

        # For each cod age group, find age groups in the df that are contained within it
        cod_ages['df_age_group_ids'] = cod_ages.apply(
            lambda x: nid_etid_df.loc[
                ((nid_etid_df.age_group_years_start >= x[
                    'cod_age_group_years_start']) &
                 (nid_etid_df.age_group_years_end <= x[
                     'cod_age_group_years_end'])), 'age_group_id'].tolist(),
            axis='columns')

        # If a cod age group contains more than 1 df age group, then the df is too detailed
        cod_ages['is_too_detailed'] = cod_ages.df_age_group_ids.apply(
            lambda x: len(x) > 1)

        if cod_ages.is_too_detailed.any():
            raise AssertionError(
                "Age groups in the data can be aggregated to match standard "
                "cod age groups for NID, extract_type {}: \n {}".format(
                    nid_etid_tuple, cod_ages.loc[cod_ages.is_too_detailed]))
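The heart of this check is a containment test: a standard cod age group is flagged when more than one data age group fits entirely inside it, meaning the data could be aggregated up. Here is a toy reproduction of that logic with invented age ranges and ids (not the real cod age list).

import pandas as pd

# Hypothetical standard age groups (start/end in years).
cod_ages = pd.DataFrame({
    'cod_age_group_id': [6, 7],
    'cod_age_group_years_start': [5, 10],
    'cod_age_group_years_end': [10, 15],
})

# Hypothetical data age groups: 5-7 and 7-10 both sit inside the 5-10 group.
nid_etid_df = pd.DataFrame({
    'age_group_id': [101, 102, 7],
    'age_group_years_start': [5, 7, 10],
    'age_group_years_end': [7, 10, 15],
})

# For each standard group, collect data age groups fully contained in it.
cod_ages['df_age_group_ids'] = cod_ages.apply(
    lambda x: nid_etid_df.loc[
        (nid_etid_df.age_group_years_start >= x['cod_age_group_years_start'])
        & (nid_etid_df.age_group_years_end <= x['cod_age_group_years_end']),
        'age_group_id'].tolist(),
    axis='columns')

# More than one contained group means the data is more detailed than needed.
cod_ages['is_too_detailed'] = cod_ages.df_age_group_ids.apply(lambda x: len(x) > 1)
print(cod_ages[['cod_age_group_id', 'df_age_group_ids', 'is_too_detailed']])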
Example #4
    def __init__(self):
        self.cg = Configurator('standard')
        self.cache_dir = self.cg.get_directory('db_cache')
        # set test to True if you do not want to write any output files
        self.test = False
        self.cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': self.cache_dir
        }
        self.dataset_filters = {
            'data_type_id': [8, 9, 10, 12],
            'location_set_id': 35,
            'is_active': True,
            'year_id': range(1980, 2050)
        }
        self.national_nids = self.cg.get_resource("nid_replacements")

        # resources
        self.completeness = self.cg.get_resource("completeness")
        self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                                   **self.cache_options)
        self.location_meta_df = get_current_location_hierarchy(
            location_set_version_id=self.cg.get_id('location_set_version'),
            **self.cache_options)
        self.cod_ages = list(
            get_cod_ages(**self.cache_options)['age_group_id'].unique())

        # identifiers
        self.source_cols = ["source", "nid", "data_type_id"]
        self.geo_cols = ["location_id", "year_id"]
        self.meta_cols = ["nationally_representative", "detail_level_id"]
        self.value_cols = ['deaths']
        self.year_end = self.cg.get_id('year_end')
        self.full_time_series = "full_time_series"

        # directories
        self.current_best_version = "2018_04_03_151739"
        self.out_dir = "FILEPATH"
        self.arch_dir = "{}/_archive".format(self.out_dir)
        self.timestamp = cod_timestamp()
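Much of this __init__ just assembles plain dictionaries of cache options and dataset filters that later calls unpack. The helper and the metadata frame below are invented for illustration only; they show how a filter dictionary of this shape could be applied with pandas, not how this codebase actually consumes it.

import pandas as pd

dataset_filters = {
    'data_type_id': [8, 9, 10, 12],
    'location_set_id': 35,
    'is_active': True,
    'year_id': range(1980, 2050),
}

def apply_filters(df, filters):
    """Keep rows matching every filter; list/range values are treated as isin."""
    for col, allowed in filters.items():
        if col not in df.columns:
            continue
        if isinstance(allowed, (list, range)):
            df = df[df[col].isin(allowed)]
        else:
            df = df[df[col] == allowed]
    return df

# Hypothetical dataset metadata.
datasets = pd.DataFrame({
    'data_type_id': [8, 9, 6],
    'location_set_id': [35, 35, 35],
    'is_active': [True, False, True],
    'year_id': [1995, 2000, 2010],
})
print(apply_filters(datasets, dataset_filters))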
Example #5
    def get_age_weight_df(self):
        """
        We have shifted to pulling age weights based on mortality information after a
        decision by USERNAME and USERNAME. The method below replaces pulling the population
        based weights out of the db with the "get_age_weights" function. - 07/10/2019
        """
        df = get_env(env_run_id=self.cg.get_id('env_run'),
                     force_rerun=False,
                     block_rerun=True)
        # get global, both sex, for all years after 2010
        df = df.query("location_id == 1 & sex_id == 3 & year_id >= 2010")
        # collapse out year
        df = df.groupby(['age_group_id', 'location_id', 'sex_id'],
                        as_index=False).mean_env.sum()
        # total deaths for weights
        total = df.loc[df.age_group_id == 22]['mean_env'].iloc[0]
        # get the ages we care about (cod ages, under 1, and 80+)
        age_df = get_cod_ages()
        ages = age_df.age_group_id.unique().tolist()
        ages += [21, 28]
        # limit env df to relevant ages
        df = df.loc[df.age_group_id.isin(ages)]
        # group by age, and then make weights
        df = df.groupby('age_group_id', as_index=False).mean_env.sum()
        df['weight'] = df['mean_env'] / total
        # some renaming
        df.rename(columns={'weight': 'age_group_weight_value'}, inplace=True)

        # do a quick check to make sure the death totals used to create weights are sensible
        # just making sure age specific totals are within 1% of the all age total
        check_val = abs(
            (df.loc[~df.age_group_id.isin([21, 28])].mean_env.sum() / total) -
            1)
        assert check_val < 0.01

        df = df[['age_group_id', 'age_group_weight_value']]
        return df
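The weight arithmetic itself reduces to each age group's share of the all-age envelope (age_group_id 22), plus a sanity check that the age-specific groups, excluding the under-1 and 80+ aggregates (28 and 21), sum to within 1% of that total. A toy version with made-up envelope numbers:

import pandas as pd

# Hypothetical envelope deaths already collapsed over years for one location/sex.
df = pd.DataFrame({
    'age_group_id': [22, 5, 6, 21, 28],
    'mean_env': [1000.0, 300.0, 700.0, 650.0, 120.0],
})

# All-age total (age_group_id 22) used as the denominator for the weights.
total = df.loc[df.age_group_id == 22, 'mean_env'].iloc[0]

# Keep only the groups the weights are built for (drop the all-age row).
df = df.loc[df.age_group_id != 22].copy()
df['age_group_weight_value'] = df['mean_env'] / total

# The age-specific groups (everything except the 21 and 28 aggregates, which
# overlap the detailed ages) should account for roughly all deaths.
check_val = abs(df.loc[~df.age_group_id.isin([21, 28])].mean_env.sum() / total - 1)
assert check_val < 0.01, check_val

print(df[['age_group_id', 'age_group_weight_value']])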