예제 #1
0
 def _load_data_frame(self):
     """Pull population for this object's demographics.

     Exactly one location scope is forwarded to ``get_population``:
     ``location_set_id`` when it is set, else ``location_set_version_id``
     when it is set, else neither -- and only in that last case is
     ``decomp_step`` forwarded.

     Returns:
         The ``get_population`` result with any ``run_id`` column removed
         and ``population`` renamed to ``pop_scaled``.
     """
     # Work out which extra keyword the query needs.
     if self.location_set_id is not None:
         scope = {'location_set_id': self.location_set_id}
     elif self.location_set_version_id is not None:
         scope = {'location_set_version_id': self.location_set_version_id}
     else:
         scope = {'decomp_step': self.decomp_step}

     pop_df = get_population(age_group_id=self.age_group_id,
                             location_id=self.location_id,
                             year_id=self.year_id,
                             sex_id=self.sex_id,
                             gbd_round_id=self.gbd_round_id,
                             **scope)

     if 'run_id' in pop_df.columns:
         pop_df.drop('run_id', axis=1, inplace=True)
     pop_df.rename(columns={'population': 'pop_scaled'}, inplace=True)
     return pop_df
예제 #2
0
def get_sample_size(df, fix_group237=False):
    """
    Attach population to hospital data as its sample size.

    Meant for sources assumed to cover their whole population, so the
    sample size is simply the population.  When ``df`` has no
    ``age_group_id`` column it is merged on from the hospital age-group map
    using ``age_start`` and ``age_end``.

    Parameters
        df: Pandas DataFrame
            Data to attach population to; population is added to every row.
        fix_group237: bool
            When True, a second population pull is relabelled as age group
            237, summed over its remaining columns and appended to the main
            population table before the merge.

    Returns
        ``df`` with the population columns merged on.
    """
    if 'age_group_id' not in df.columns:
        # map age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()

        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), \
            "age_group_id is missing for some rows"

    pop = get_population(QUERY)

    if fix_group237:
        fix_pop = get_population(QUERY)
        fix_rows_before = fix_pop.shape[0]
        fix_pop['age_group_id'] = 237

        # sum population over every remaining column combination
        group_cols = fix_pop.columns.drop('population').tolist()
        fix_pop = fix_pop.groupby(group_cols).agg({'population': 'sum'})
        fix_pop = fix_pop.reset_index()
        # the collapse is expected to exactly halve the number of rows
        assert fix_rows_before/2 == fix_pop.shape[0]

        pop = pd.concat([pop, fix_pop], ignore_index=True)

    # make the population columns line up with the hospital data columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("run_id", axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end',
                  'age_group_id', 'sex_id']

    rows_before_pop = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before_pop == df.shape[0], \
        "number of rows don't match after merge"
    # the message (built only on failure) lists the unmatched demographics
    assert df.population.notnull().all(), (
        "population is missing for some rows. look at this df! \n {}".format(
            df.loc[df.population.isnull(), demography].drop_duplicates()))

    return df
예제 #3
0
def transform_to_rate_space(summary_df):
    '''
    Convert the val/lower/upper columns of ``summary_df`` to rates.

    Population is pulled for GBD round 5 (run_id 104), years 1950-2017, for
    the 'cod' team demographics, inner-merged onto ``summary_df`` and used
    as the denominator.  Resulting rates are bounded to the range [0, 1]
    and the population column is dropped again before returning.
    '''
    print("**TRANSFORMING TO RATE SPACE**")
    merge_keys = ['age_group_id', 'sex_id', 'year_id', 'location_id']

    demos = get_demographics(gbd_round_id=5, gbd_team='cod')
    full_pops = get_population(gbd_round_id=5,
                               age_group_id=demos['age_group_id'],
                               sex_id=demos['sex_id'],
                               year_id=list(range(1950, 2018)),
                               run_id=104,
                               location_id=demos['location_id'])
    full_pops = full_pops[merge_keys + ['population']]

    # inner merge: rows without a matching population are dropped
    summary_df = summary_df.merge(full_pops, on=merge_keys, how='inner')

    for col in ['val', 'lower', 'upper']:
        rate = summary_df[col] / summary_df['population']
        rate[rate > 1] = 1
        rate[rate < 0] = 0
        summary_df[col] = rate

    summary_df = summary_df.drop(columns=['population'])
    print("  ...Successfully transformed summary values into rate space.\n")
    return summary_df
예제 #4
0
def prep_norway_pop_weights(code_dir):
    """Compute each Norwegian location's share of the national population.

    Writes the location metadata for location set 35 (GBD round 5) to
    ``location_metadata.csv`` inside ``code_dir``, pulls population for
    location 90 and for every location whose ``parent_id`` is 90 (plus 90
    itself), and adds a ``pop_weight`` column equal to the location's
    population divided by the population of location 90 for the same
    values of the remaining columns.

    Parameters:
        code_dir: directory in which ``location_metadata.csv`` is written.
    """
    loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
    loc_meta.to_csv(os.path.join(code_dir, 'location_metadata.csv'),
                    index=False, encoding='utf8')

    norway_id = 90
    # children of Norway in the hierarchy, plus Norway itself
    norway_subs = loc_meta.loc[loc_meta.parent_id == norway_id,
                               'location_id'].tolist() + [norway_id]

    country_pop = get_population(location_id=norway_id, year_id='all',
                                 sex_id='all', age_group_id=164,
                                 gbd_round_id=5, status='best')
    # location_id is dropped so the national rows can be merged onto every
    # subnational row
    country_pop.drop('location_id', axis=1, inplace=True)

    subs_pop = get_population(location_id=norway_subs, year_id='all',
                              sex_id='all', age_group_id=164,
                              gbd_round_id=5, status='best')

    # join on every shared column except the location and the value itself
    merge_cols = [c for c in subs_pop.columns
                  if c not in ['location_id', 'population']]
    population = subs_pop.merge(country_pop,
                                on=merge_cols,
                                suffixes=('_subs', '_national'))

    # Parenthesised so the expression can span two lines; the original
    # ended a line on a bare `/`, which is a syntax error.
    population.loc[:, 'pop_weight'] = (population['population_subs'] /
                                       population['population_national'])
    # NOTE(review): the visible code neither returns nor stores
    # `population`; confirm whether callers expect the weights back.
예제 #5
0
def get_hosp_pops(df):
    """
    Write one population csv per hospital source.

    The Shiny/Rmarkdown tool which this data feeds into was running
    get_population() live but it was taking too long, so the population is
    pulled once here and a per-source subset is written to disk.

    Parameters:
        df: Pandas DataFrame
            Must contain the columns ``source``, ``location_id``,
            ``year_start`` and ``age_group_id``.

    Returns:
        None.  The only effect is the csv files that are written.
    """
    # Date stamp of the form YYYY_MM_DD.  strftime replaces the previous
    # regex substitution, whose "\W" pattern was an invalid escape sequence
    # in a non-raw string literal.
    today = datetime.datetime.now().strftime("%Y_%m_%d")

    # one pull covering every source; subset per source below
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         year_id=list(df.year_start.unique()),
                         sex_id=[1, 2])

    for source in df.source.unique():
        dat = df[df.source == source]

        locs = dat.location_id.unique()
        years = dat.year_start.unique()
        ages = dat.age_group_id.unique()

        # Each dimension is filtered independently, so the subset is the
        # cross product of the source's locations, years and ages.
        keep = (pop.location_id.isin(locs) &
                pop.year_id.isin(years) &
                pop.age_group_id.isin(ages))
        src_pop = pop[keep].copy()
        src_pop['source'] = source
        src_pop.to_csv("FILENAME"
                       r"FILEPATH".format(source, today),
                       index=False)
예제 #6
0
def get_sy_populations(dems):
    """Pull single-year-age populations for the years, sexes and locations
    in ``dems``.

    Requests age_group_ids 49 through 142 with ``single_year_age=True``.
    Per the original author's note, the under-1 age groups and 95+ are left
    out because they already come from the get_populations() call, and are
    appended in scripts if necessary.
    """
    print("Getting single-year populations")
    return db.get_population(
        year_id=dems['year_id'],
        sex_id=dems['sex_id'],
        location_id=dems["location_id"],
        age_group_id=list(range(49, 143)),
        single_year_age=True,
        gbd_round_id=help.GBD_ROUND,
    )
예제 #7
0
    def get_data(self, id_template_df):
        """Build envelope rates keyed by ``self._data_key``.

        Pulls the envelope (with HIV, without shocks) and population for
        the demographics listed in ``id_template_df``, divides
        mean/lower/upper by population, removes rows via
        ``drop_zeros_nulls``, attaches the template columns, and derives
        and aggregates standard errors.

        Raises:
            NoNonZeroValues: if nothing is left after the template merge.
        """
        merge_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
        value_cols = ["mean", "lower", "upper"]

        demographics = {
            "location_id": id_template_df.location_id.tolist(),
            "age_group_id": id_template_df.age_group_id.tolist(),
            "sex_id": id_template_df.sex_id.tolist(),
            "year_id": id_template_df.year_id.tolist(),
        }

        # deaths and population for the same demographic cells
        env_df = get_envelope(with_hiv=1, with_shock=0, **demographics)
        pop_df = get_population(**demographics)
        rates = env_df.merge(pop_df, on=merge_cols)

        # counts -> rates
        for value_col in value_cols:
            rates[value_col] = rates[value_col] / rates["population"]

        rates = self.drop_zeros_nulls(rates, *value_cols)

        # bring the template's columns (including the data key) back on
        rates = rates.merge(id_template_df, on=merge_cols)
        if rates.empty:
            raise NoNonZeroValues

        # standard error from the uncertainty interval, then aggregate
        rates = self.calc_se_from_ui(rates, *value_cols)
        rates = self.calc_aggregate_se(rates, self._data_key, "mean", "se")
        return rates.set_index(self._data_key)
예제 #8
0
def both_sex_model_results_figures(by_sex_df):
    """Collapse sex-specific rates into both-sex counts.

    Population for sexes 1 and 2 is pulled for the age groups, years and
    locations present in ``by_sex_df`` and inner-merged on.  The ``mean``,
    ``lower`` and ``upper`` rates are multiplied by population to give
    ``*_count`` columns, which are summed together with population over
    every remaining column.  The result carries ``sex_id`` 3.
    """
    merge_on_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    rate_cols = ['mean', 'lower', 'upper']

    pops = get_population(
        age_group_id=by_sex_df.age_group_id.unique().tolist(),
        location_id=by_sex_df.location_id.unique().tolist(),
        year_id=by_sex_df.year_id.unique().tolist(),
        sex_id=[1, 2])
    # not needed for the aggregation
    pops = pops.drop(columns=['process_version_map_id'])

    merged = by_sex_df.merge(pops, on=merge_on_cols, how='inner')

    # rate * population = count for each location-year-age-sex cell
    for rate_col in rate_cols:
        merged[rate_col + '_count'] = merged[rate_col] * merged['population']

    # the rates and the sex split are no longer needed
    merged = merged.drop(columns=rate_cols + ['sex_id'])

    # everything that is not summed is a grouping key
    summed_cols = ['mean_count', 'lower_count', 'upper_count', 'population']
    key_cols = [col for col in merged.columns.tolist()
                if col not in summed_cols]

    both_sex = merged.groupby(key_cols).sum().reset_index()
    both_sex['sex_id'] = 3
    return both_sex
예제 #9
0
 def pull_pop(self):
     """Return population restricted to the index columns plus
     ``population``.

     Queries location set 35 with ``location_id=-1`` for this object's age
     groups, year(s) and sex(es).
     """
     population = get_population(
         age_group_id=self.age_group_ids,
         location_id=-1,
         location_set_id=35,
         year_id=self.year_id,
         sex_id=self.sex_id,
     )
     wanted_cols = self.index_cols + ['population']
     return population[wanted_cols]
예제 #10
0
def mortQuery(sex, start_year, start_age, end_age, location_set_version_id,
              gbd_round, db_connection):
    '''
    Model parameters -> Pandas Data Frame

    Pulls the mortality envelope and population for the requested sex, the
    years ``start_year`` through ``gbd_round`` inclusive, the ages between
    ``start_age`` and ``end_age`` inclusive, and the locations of the given
    location set version.  The two are merged and returned with the columns
    renamed to age, year, sex, envelope and pop (upper, lower and the run
    ids are dropped).
    '''
    locations = locQuery(location_set_version_id, db_connection)
    loc_list = locations.location_id.values.tolist()

    # keep only the ages inside [start_age, end_age]
    all_ages_df = createAgeDF(db_connection)
    in_range = "all_ages >= {0} & all_ages <= {1}".format(start_age,
                                                          end_age)
    age_list = all_ages_df.query(in_range).all_ages.values.tolist()

    # both pulls use identical demographics
    demographics = dict(age_group_id=age_list,
                        sex_id=sex,
                        location_set=35,
                        location_id=loc_list)
    env = get_envelope(year_id=range(start_year, gbd_round+1),
                       **demographics)
    pop = get_population(year_id=range(start_year, gbd_round+1),
                         **demographics)

    merged = pd.merge(env, pop,
                      on=['age_group_id', 'location_id', 'year_id', 'sex_id'])
    # run_id exists on both sides, hence the _x/_y suffixes
    merged = merged.drop(columns=['upper', 'lower', 'run_id_x', 'run_id_y'])
    return merged.rename(columns={'age_group_id': 'age',
                                  'year_id': 'year',
                                  'sex_id': 'sex',
                                  'mean': 'envelope',
                                  'population': 'pop'})
예제 #11
0
def rate_count_switcher(df,
                        gbd_round_id,
                        decomp_step,
                        rate_cols=None):
    """Add ``count_<col>`` columns by multiplying rates by population.

    Missing id columns are filled in first: ``age_group_id`` is merged on
    from the hospital age groups by ``age_start``, ``year_start`` and
    ``year_end`` are copied from ``year_id`` when that column exists, and
    ``sex_id`` is derived from a ``sex`` column holding 'Male'/'Female'.

    Population is pulled for 1990-2017.  When every row of ``df`` spans
    ``year_start + 4 == year_end`` the population is first passed through
    ``hosp_prep.year_binner`` and averaged within each bin; otherwise the
    single-year population is used.

    Parameters:
        df: Pandas DataFrame holding the rate columns.
        gbd_round_id: forwarded to ``get_population``.
        decomp_step: forwarded to ``get_population``.
        rate_cols: names of the rate columns to convert.  Defaults to
            ``['mean', 'lower', 'upper']``.

    Returns:
        ``df`` with ``population`` and one ``count_<col>`` column per rate
        column.

    Raises:
        AssertionError: if the sex values are not exactly Male/Female, or
            if any row has no population after the merge.
    """
    # None sentinel instead of a mutable list as the default argument
    if rate_cols is None:
        rate_cols = ['mean', 'lower', 'upper']

    # snapshot of the incoming columns; the checks below refer to these
    pre_cols = df.columns

    if 'age_group_id' not in pre_cols:
        good_ages = hosp_prep.get_hospital_age_groups()
        df = df.merge(good_ages[['age_start', 'age_group_id']],
                      how='left',
                      on='age_start',
                      validate='m:1')
    if 'year_id' in pre_cols:
        df['year_start'], df['year_end'] = df['year_id'], df['year_id']
    if 'sex_id' not in pre_cols:
        assert set(df['sex'].unique()) == set(['Male', 'Female'])
        df['sex_id'] = 2
        df.loc[df['sex'] == 'Male', 'sex_id'] = 1

    years = list(np.arange(1990, 2018, 1))
    ages = df.age_group_id.unique().tolist()
    locs = df.location_id.unique().tolist()

    pop = db_queries.get_population(gbd_round_id=gbd_round_id,
                                    decomp_step=decomp_step,
                                    year_id=years,
                                    age_group_id=ages,
                                    sex_id=[1, 2],
                                    location_id=locs)
    pop['year_start'], pop['year_end'] = pop['year_id'], pop['year_id']
    pop.drop(['year_id', 'run_id'], axis=1, inplace=True)

    merge_keys = ['age_group_id', 'sex_id', 'location_id',
                  'year_start', 'year_end']

    # mean population within each year bin, for data in year bins
    agg_pop = hosp_prep.year_binner(pop.copy())
    agg_pop = agg_pop.groupby(['age_group_id', 'year_start', 'year_end',
                               'location_id', 'sex_id']).\
        agg({'population': 'mean'}).reset_index()

    # binned data gets the binned population; anything else single-year
    is_binned = (df['year_start'] + 4 == df['year_end']).all()
    pop_to_merge = agg_pop if is_binned else pop
    df = df.merge(pop_to_merge,
                  how='left',
                  on=merge_keys,
                  validate='m:1')

    assert df['population'].isnull().sum() == 0, \
        "population is missing for some rows after the merge"

    for col in rate_cols:
        df["count_{}".format(col)] = df[col] * df['population']

    return df
예제 #12
0
 def pull_pop(self):
     """Log, then return population restricted to the index columns plus
     ``population``.

     Queries location set 35 with ``location_id=-1`` for this object's age
     groups, year(s) and sex(es).
     """
     logger.info("Pulling populations...")
     population = get_population(
         age_group_id=self.age_group_ids,
         location_id=-1,
         location_set_id=35,
         year_id=self.year_id,
         sex_id=self.sex_id,
     )
     wanted_cols = self.index_cols + ['population']
     return population[wanted_cols]
예제 #13
0
    def __init__(self,
                 gbd_round_id: int = gbd.GBD_ROUND_ID,
                 decomp_step: str = gbd.decomp_step.ONE,
                 location_set_id: int = LocationSetId.OUTPUTS):
        """Store the query settings and look up the population run id.

        Args:
            gbd_round_id: GBD round forwarded to ``get_population``.
            decomp_step: decomp step forwarded to ``get_population``.
            location_set_id: location set forwarded to ``get_population``.
        """
        self.gbd_round_id: int = gbd_round_id
        self.decomp_step: str = decomp_step
        self.location_set_id: int = location_set_id

        population = get_population(
            gbd_round_id=self.gbd_round_id,
            decomp_step=self.decomp_step,
            location_set_id=self.location_set_id)
        # .item() only succeeds when run_id holds exactly one value
        self._run_id: int = population.run_id.item()
예제 #14
0
def get_population(
        gbd_round_id: int,
        decomp_step: str,
        location_set_id: int,
        year_ids: List[int],
        year_end: int,
        age_group_ids: List[int],
        sex_ids: List[int],
        square_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Pulls population estimates for the given demographics.

    Forecasted population is requested when the decomp step is iterative
    and ``year_end`` lies beyond ``demographics.FORECASTING_YEAR``.  A null
    decomp step is passed for GBD rounds below 6.

    Args:
        gbd_round_id: the GBD round for which to pull population
        decomp_step: the decomp step for which to pull population
        location_set_id: the location set for which to pull population
        year_ids: year IDs for which to pull population
        year_end: the last prediction year; only used to decide whether
            forecasted population is needed
        age_group_ids: age group IDs for which to pull population
        sex_ids: sex IDs for which to pull population
        square_df: square dataframe the population is validated against

    Returns:
        DataFrame of population estimates holding the demographics columns
        and the population column
    """
    logging.info('Pulling population')

    needs_forecast = (
        decomp_step == gbd.constants.decomp_step.ITERATIVE and
        year_end > demographics.FORECASTING_YEAR
    )

    # rounds before 6 are queried without a decomp step
    if gbd_round_id < 6:
        query_step = None
    else:
        query_step = decomp_step

    raw_population = db_queries.get_population(
        location_set_id=location_set_id,
        location_id='all',
        year_id=year_ids,
        age_group_id=age_group_ids,
        sex_id=sex_ids,
        gbd_round_id=gbd_round_id,
        decomp_step=query_step,
        forecasted_pop=needs_forecast
    )
    keep_columns = columns.DEMOGRAPHICS + [columns.POPULATION]
    population_df = raw_population[keep_columns]

    data_validation.validate_population_matches_data(
        population_df, square_df
    )
    return population_df
예제 #15
0
def get_pop(locset_id=8):
    ''' Returns population estimates for 1980-2029.

        Age group, location and sex are all requested as -1, for the given
        location set and the current decomp step and GBD round taken from
        the GBD parameters.
    '''
    current_step = utils.get_gbd_parameter('current_decomp_step')
    current_round = utils.get_gbd_parameter('current_gbd_round')
    return get_population(
        age_group_id=-1,
        location_id=-1,
        location_set_id=locset_id,
        year_id=list(range(1980, 2030)),
        sex_id=-1,
        decomp_step=current_step,
        gbd_round_id=current_round,
    )
예제 #16
0
def merge_population(df, gbd_round_id, decomp_step, use_draws):
    """
    Attach population to the DataFrame and report rows left without it.

    This has to be run after the data has been made square; a warning is
    issued when the checked column contains no zeros, since that suggests
    the data is not square.

    Parameters:
        df: Pandas DataFrame
        gbd_round_id: forwarded to get_population
        decomp_step: forwarded to get_population
        use_draws: when truthy the zero check looks at 'draw_3', otherwise
            at 'mean'
    """
    # draw_3 rather than the first draw, to make sure zeros are propagated
    chkcol = 'draw_3' if use_draws else 'mean'

    not_square_msg = """There are no rows with zeros, implying
        that the data has not been made square.  This function should be ran
        after the data is square"""
    has_zero_rows = (df[chkcol] == 0).any()
    if not has_zero_rows:
        warnings.warn(not_square_msg)

    demography = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    # population for exactly the ages/locations/years present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_id.unique()),
                         gbd_round_id=gbd_round_id,
                         decomp_step=decomp_step)
    # only the merge keys and the value are needed
    pop = pop[demography + ['population']]

    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before == df.shape[0], "number of rows don't match after merge"

    # population must not be null anywhere after the merge
    hosp_prep.report_if_merge_fail(df,
                                   check_col="population",
                                   id_cols=demography,
                                   store=True,
                                   filename="population_merge_failure")

    return df
예제 #17
0
 def get_data_frame(self, desired_index):
     """Pull population and normalise it to ``desired_index``.

     Exactly one location scope is forwarded to ``get_population``:
     ``location_set_id`` when it is set, else ``location_set_version_id``
     when it is set, else neither.

     Returns:
         The frame produced by ``DataSource.normalize_columns`` with
         ``population`` renamed to ``pop_scaled``.
     """
     # Work out which extra keyword (if any) the query needs.
     if self.location_set_id is not None:
         scope = {'location_set_id': self.location_set_id}
     elif self.location_set_version_id is not None:
         scope = {'location_set_version_id': self.location_set_version_id}
     else:
         scope = {}

     pop_df = get_population(age_group_id=self.age_group_id,
                             location_id=self.location_id,
                             year_id=self.year_id,
                             sex_id=self.sex_id,
                             gbd_round_id=self.gbd_round_id,
                             **scope)

     pop_df = DataSource.normalize_columns(pop_df, self.name, desired_index)
     pop_df.rename(columns={'population': 'pop_scaled'}, inplace=True)
     return pop_df
예제 #18
0
 def prep_dismod_results(self):
     """Add summed-over-sex and aggregated age rows to the DisMod results.

     Steps, all applied to ``self.dismod_model``:

     1. Drop age groups 1, 33, 27 and 164.
     2. Pull population for the remaining locations/years/ages (sexes 1
        and 2), store it on ``self.population`` and merge it on.
     3. Build population-weighted rows summed over sex and append them.
     4. Replace age groups 2, 3 and 4 with one population-weighted row
        labelled age group 28.
     5. Add ``se_age`` as ``(upper - mean) / 1.96``.
     """
     id_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id']
     keep_cols = id_cols + ['mean', 'population', 'upper']

     ## create all sex
     self.dismod_model = self.dismod_model.loc[~self.dismod_model.age_group_id.isin([1,33, 27, 164])]
     print(self.dismod_model)

     print('Getting population')
     locs = self.dismod_model.location_id.unique()
     years = self.dismod_model.year_id.unique().tolist()
     ages = self.dismod_model.age_group_id.unique().tolist()
     print(locs)
     print(years)
     print(ages)

     self.population = get_population(location_id=locs.tolist(),
                                      year_id=years,
                                      age_group_id=ages, sex_id=[1,2])

     self.dismod_model = self.dismod_model.merge(self.population,
                                                 on=id_cols,
                                                 how='left')

     ## aggregate to all sex
     # rate -> count, sum over sex, count -> rate
     self.all_sex = self.dismod_model.copy()
     self.all_sex['mean'] *= self.all_sex['population']
     self.all_sex['upper'] *= self.all_sex['population']
     # sex_id is summed too, so it becomes 3 wherever both a sex 1 and a
     # sex 2 row were present
     self.all_sex = self.all_sex.groupby(['location_id',  'year_id', 'age_group_id']).sum().reset_index()
     self.all_sex['mean'] /= self.all_sex.population
     self.all_sex.upper /= self.all_sex.population

     ## add sex
     self.dismod_model = pd.concat((self.dismod_model[keep_cols],
                                    self.all_sex[keep_cols]))
     del self.all_sex

     ## aggregate under 1
     is_under_1 = self.dismod_model.age_group_id.isin([2,3,4])
     # explicit copy: the slice is modified below
     under_1 = self.dismod_model.loc[is_under_1].copy()
     self.dismod_model = self.dismod_model.loc[~is_under_1]
     under_1['mean'] *= under_1['population']
     under_1['upper'] *= under_1['population']
     under_1 = under_1.groupby(['location_id',  'year_id', 'sex_id']).sum().reset_index()
     under_1['mean'] /= under_1.population
     under_1.upper /= under_1.population
     # overwrite the summed age_group_id with the aggregate's id
     under_1['age_group_id'] = 28

     ## add age
     # pd.concat replaces DataFrame.append, which pandas 2.0 removed
     self.dismod_model = pd.concat([self.dismod_model, under_1])
     self.dismod_model['se_age'] = (self.dismod_model.upper -self.dismod_model['mean'])/1.96
예제 #19
0
def prep_pop(years, upload_dir, vers):
    """Pull population for ``years`` and write it to a csv.

    Age group, location and sex are all requested as -1.  Only the index
    columns and ``population`` are kept; the frame is passed through
    ``set_sort_index`` and written to
    ``<upload_dir>/v<vers>/temps/population.csv``.

    Parameters:
        years: year id(s) forwarded to ``get_population``.
        upload_dir: base output directory.
        vers: version used to build the ``v<vers>`` sub-directory.
    """
    index_cols = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    population = get_population(year_id=years,
                                location_id=-1,
                                age_group_id=-1,
                                sex_id=-1)
    population = population[index_cols + ['population']]
    population = set_sort_index(population, index_cols)

    output_dir = '%s/v%s/temps' % (upload_dir, vers)
    if not os.path.exists(output_dir):
        # exist_ok avoids a crash when another process creates the
        # directory between the check above and this call
        os.makedirs(output_dir, exist_ok=True)
        # NOTE(review): presumably gives the filesystem time to register
        # the new directory -- confirm it is still needed
        time.sleep(1)
    population.to_csv('%s/population.csv' % output_dir)
예제 #20
0
    def prep_dismod_results(self):
        """Add summed-over-sex and aggregated age rows to the DisMod results.

        Steps, all applied to ``self.dismod_model``:

        1. Drop age groups 1, 33, 27 and 164.
        2. Pull population, store it on ``self.population`` and merge it
           on.
        3. Build population-weighted rows summed over sex and append them.
        4. Replace age groups 2, 3 and 4 with one population-weighted row
           labelled age group 28.
        5. Add ``se_age`` as ``(upper - mean) / 1.96``.
        """
        keep_cols = [
            'location_id', 'age_group_id', 'sex_id', 'year_id', 'mean',
            'population', 'upper'
        ]

        ## create all sex
        self.dismod_model = self.dismod_model.loc[
            ~self.dismod_model.age_group_id.isin([1, 33, 27, 164])]

        self.population = get_population(QUERY)

        self.dismod_model = self.dismod_model.merge(self.population,
                                                    on=[cols],
                                                    how='left')

        ## aggregate to all sex
        # rate -> count, sum over sex, count -> rate
        self.all_sex = self.dismod_model.copy()
        self.all_sex['mean'] *= self.all_sex['population']
        self.all_sex['upper'] *= self.all_sex['population']
        # sex_id is summed too, so it becomes 3 wherever both a sex 1 and
        # a sex 2 row were present
        self.all_sex = self.all_sex.groupby(
            ['location_id', 'year_id', 'age_group_id']).sum().reset_index()
        self.all_sex['mean'] /= self.all_sex.population
        self.all_sex.upper /= self.all_sex.population

        ## add sex
        self.dismod_model = pd.concat((self.dismod_model[keep_cols],
                                       self.all_sex[keep_cols]))
        del self.all_sex

        ## aggregate under 1
        is_under_1 = self.dismod_model.age_group_id.isin([2, 3, 4])
        # explicit copy: the slice is modified below
        under_1 = self.dismod_model.loc[is_under_1].copy()
        self.dismod_model = self.dismod_model.loc[~is_under_1]
        under_1['mean'] *= under_1['population']
        under_1['upper'] *= under_1['population']
        under_1 = under_1.groupby(['location_id', 'year_id',
                                   'sex_id']).sum().reset_index()
        under_1['mean'] /= under_1.population
        under_1.upper /= under_1.population
        # overwrite the summed age_group_id with the aggregate's id
        under_1['age_group_id'] = 28

        ## add age
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        self.dismod_model = pd.concat([self.dismod_model, under_1])
        self.dismod_model['se_age'] = (self.dismod_model.upper -
                                       self.dismod_model['mean']) / 1.96
예제 #21
0
def merge_population(df):
    """
    Attach population to the hospital DataFrame and report rows left
    without it.

    This has to be run after the data has been made square: the function
    asserts that ``mean_raw`` contains at least one zero.

    Parameters:
        df: Pandas DataFrame
    """
    has_zero_rows = (df.mean_raw == 0).any()
    assert has_zero_rows, """There are no rows with zeros, implying
        that the data has not been made square.  This function should be run
        after the data is square"""

    # population for exactly the ages/locations/years present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()))

    # line the population columns up with the hospital data columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    demography = [
        'location_id', 'year_start', 'year_end', 'sex_id', 'age_group_id'
    ]

    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before == df.shape[0], "number of rows don't match after merge"

    # population must not be null anywhere after the merge
    hosp_prep.report_if_merge_fail(df,
                                   check_col="population",
                                   id_cols=demography,
                                   store=True,
                                   filename="population_merge_failure")

    return df
예제 #22
0
def run_cod_age_sex_splitting(db):
    """Age/sex-split the ``best`` estimates and rescale ``low``/``high``.

    Rows are restricted to locations in location set 21 (GBD round 5) and
    to ``best`` > 0.  The high/best and low/best ratios are set aside, the
    ``best`` column is split by ``AgeSexSplitter``, and ``low``/``high``
    are rebuilt from the split ``best`` using the saved ratios.

    Raises:
        AssertionError: if a cause id in ``db`` is not in the cause
            metadata for cause set version 269.
    """
    # CHECK COMPLETENESS
    cause_set_version = 269
    cause_meta = get_cause_metadata(cause_set_version_id=cause_set_version)
    possible_causes = cause_meta['cause_id'].unique().tolist()
    for cause_id in db['cause_id'].unique().tolist():
        assert cause_id in possible_causes, \
            "Cause ID {} not in hierarchy".format(cause_id)

    loc_meta = get_location_metadata(gbd_round_id=5, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    db = db[db['location_id'].isin(possible_locs)]
    db = db[db['best'] > 0]

    # remember how far the bounds sit from the best estimate
    db['hi_best_ratio'] = db['high'] / db['best']
    db['lo_best_ratio'] = db['low'] / db['best']

    # a row identifier that survives the split
    db = db.reset_index(drop=True)
    db['unique_join'] = db.index
    db_merge_later = db[['unique_join', 'hi_best_ratio', 'lo_best_ratio']]
    db = db.drop(columns=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'])

    # NOTE(review): id_cols, cause_set_version_id and pop_run_id are
    # computed but never used below -- the splitter receives the
    # hard-coded cause_set_version, pop_run_id=104 and ['unique_join'].
    # Confirm which values are intended.
    id_cols = [
        col for col in db.columns
        if col not in ['best', 'age_group_id', 'sex_id']
    ]
    cause_set_version_id = query("""SELECT cause_set_version_id
                                    FROM ADDRESS
                                    WHERE gbd_round_id=5 AND cause_set_id=4;""",
                                 conn_def='epi').iloc[0, 0]
    pop_run_id = get_population(gbd_round_id=5,
                                status="recent")['run_id'].iloc[0]

    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version,
                              pop_run_id=104,
                              distribution_set_version_id=29,
                              id_cols=['unique_join'],
                              value_column='best')
    split_db = splitter.get_computed_dataframe(df=db,
                                               location_meta_df=loc_meta)

    # bring the ratios back and rebuild the bounds around the split values
    split_db = pd.merge(split_db, db_merge_later,
                        on=['unique_join'], how='left')
    split_db['low'] = split_db['best'] * split_db['lo_best_ratio']
    split_db['high'] = split_db['best'] * split_db['hi_best_ratio']
    return split_db.drop(
        columns=['unique_join', 'lo_best_ratio', 'hi_best_ratio'])
예제 #23
0
def split_by_pop(full_df, cause_id):
    """Redistribute one cause's estimates in proportion to population.

    Within each location-year the ``best``, ``high`` and ``low`` totals of
    ``cause_id`` are reallocated across rows by each row's share of the
    location-year population.  Rows of other causes pass through
    unchanged.  For cause 387, rows in age groups 2 and 3 are also passed
    through unchanged.

    Parameters:
        full_df: DataFrame with cause_id, location_id, year_id,
            age_group_id, sex_id, best, high and low columns.
        cause_id: the cause whose rows are redistributed.

    Returns:
        The untouched rows followed by the redistributed rows.

    Raises:
        AssertionError: if the rounded best/high/low totals change.
    """
    # totals before the split, used to check that nothing is lost
    total_b = round(full_df['best'].sum())
    total_h = round(full_df['high'].sum())
    total_l = round(full_df['low'].sum())

    final = full_df[full_df['cause_id'] != cause_id]
    df = full_df[full_df['cause_id'] == cause_id]

    if cause_id == 387:
        # The concatenated frame must be assigned back to `final`: the
        # original discarded the result of append, which silently dropped
        # these rows.
        exempt = df.query("age_group_id == 2 | age_group_id == 3")
        final = pd.concat([final, exempt])
        df = df.query("age_group_id != 2 & age_group_id != 3")

    locations = df.location_id.unique()
    years = df.year_id.unique()
    ages = df.age_group_id.unique()

    pop = get_population(age_group_id=list(ages),
                         location_id=list(locations),
                         year_id=list(years),
                         sex_id=[1, 2],
                         run_id=104)

    df = pd.merge(df,
                  pop,
                  how='left',
                  on=['age_group_id', 'location_id', 'year_id', 'sex_id'])

    # location-year totals of population and of each estimate
    loc_year = ['location_id', 'year_id']
    df['tpop'] = df.groupby(loc_year)['population'].transform('sum')
    df['tbest'] = df.groupby(loc_year)['best'].transform('sum')
    df['thigh'] = df.groupby(loc_year)['high'].transform('sum')
    df['tlow'] = df.groupby(loc_year)['low'].transform('sum')

    # each row's share of its location-year population
    df['rate'] = df['population'] / df['tpop']
    df['best'] = df['rate'] * df['tbest']
    df['high'] = df['rate'] * df['thigh']
    df['low'] = df['rate'] * df['tlow']

    # NOTE(review): 'thigh' and 'tlow' are not dropped and so remain in
    # the returned frame -- confirm whether that is intended.
    df.drop(['population', 'run_id', 'tpop', 'rate', "tbest"],
            axis=1,
            inplace=True)

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    final = pd.concat([final, df])
    assert round(final['best'].sum()) == total_b
    assert round(final['high'].sum()) == total_h
    assert round(final['low'].sum()) == total_l

    return final
예제 #24
0
def get_sample_size(df):
    """
    Attach population to hospital data as its sample size.

    Meant for sources that should cover their whole population, so the
    sample size is simply the population of each demographic cell.  If the
    data has no ``age_group_id`` column it is merged on first, using
    ``age_start``/``age_end``.

    Returns the input data with the population columns merged on; raises
    AssertionError when a merge changes the row count or leaves gaps.
    """
    # steps
    ## make sure age_group_id is present
    ## pull population for the demographics found in the data
    ## merge population onto the data

    if 'age_group_id' not in df.columns:
        # map from age_start/age_end to age_group_id
        age_map = hosp_prep.get_hospital_age_groups()

        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), (
            "age_group_id is missing for some rows")

    # population for every age group / location / year present in the data
    population = get_population(
        age_group_id=list(df.age_group_id.unique()),
        location_id=list(df.location_id.unique()),
        sex_id=[1, 2],
        year_id=list(df.year_start.unique()))

    # reshape population so its columns line up with the hospital data:
    # a single year becomes a year_start/year_end pair with equal values
    population = population.rename(columns={'year_id': 'year_start'})
    population['year_end'] = population['year_start']
    population = population.drop("process_version_map_id", axis=1)

    merge_keys = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    # a left merge must neither add nor lose hospital rows
    rows_before = df.shape[0]
    df = df.merge(population, how='left', on=merge_keys)
    assert rows_before == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(), (
        "population is missing for some rows")

    return df
예제 #25
0
def _pop_weighted_age_start(pops, record):
    """Population-weighted mean of ``age_start`` for one record, or None.

    ``pops`` is filtered to the record's location, year, age span
    (``age_start`` .. ``age_end``, inclusive) and sex.  None is returned
    when the matching population does not sum to a positive number, because
    the weighted mean is then undefined.
    """
    in_scope = pops.loc[(pops.location_id == record['location_id'])
                        & (pops.year_id == record['year_start'])
                        & (pops.age_start >= record['age_start'])
                        & (pops.age_start <= record['age_end'])]

    # Labels other than male/female/both apply no sex filter at all.
    sex_ids = {'male': [1], 'female': [2], 'both': [1, 2]}.get(
        str(record['sex']).lower())
    if sex_ids is not None:
        in_scope = in_scope.loc[in_scope.sex_id.isin(sex_ids)]

    total_pop = in_scope['population'].sum()
    if not total_pop > 0:
        return None
    return (in_scope['age_start'] * in_scope['population']).sum() / total_pop


def create_age_weight(df):
    """Create mean age weights for wide age groups.

    Adds ``age_diff`` (``age_end - age_start``) and ``age_mean`` (the
    midpoint of the age range) to ``df`` in place.  For rows whose age range
    is wider than 10, ``age_mean`` is replaced by a population-weighted mean
    age taken from ``get_population``.  If that cannot be computed, the row
    falls back to 37 for location 6 and 41 everywhere else, and the
    location id is printed.

    Fixes relative to the previous version:
      * the sex label used to be turned into a list of characters, so it
        never equalled 'male'/'female'/'both' and no sex filter was applied;
      * the Python 2 ``print`` statement and ``DataFrame.ix`` are replaced
        by forms that also work on Python 3 / current pandas;
      * the bare ``except`` is narrowed to ``Exception``;
      * an empty or zero population match now takes the documented fallback
        instead of yielding NaN;
      * the population query is skipped when there are no wide rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``age_start``, ``age_end``, ``location_id``, ``year_start``
        and ``sex``.

    Returns
    -------
    pandas.DataFrame
        Narrow rows followed by wide rows.  The wide rows carry an extra
        ``index`` column holding their original index (from
        ``reset_index``), as before.
    """
    df['age_diff'] = df['age_end'] - df['age_start']
    df['age_mean'] = df['age_end'] + df['age_start']
    df['age_mean'] /= 2

    df_final = df.loc[df.age_diff <= 10]
    # reset_index gives the wide rows a clean 0..n-1 index, so positional
    # and label-based access agree inside the loop below.
    df_sub = df.loc[df.age_diff > 10].reset_index()

    if len(df_sub):
        # NOTE(review): QUERY is not defined in this block; it looks like a
        # placeholder for the real get_population arguments - confirm.
        pops = get_population(QUERY)
        pops.rename(columns={'age_group_id': 'age_start'}, inplace=True)
        # Original comment: "correct these age_groups".  Presumably shifts
        # the age_group_ids onto ages in years - TODO confirm the offset.
        pops['age_start'] -= 49

        for i in range(len(df_sub)):
            record = df_sub.iloc[i].to_dict()
            try:
                age_mean = _pop_weighted_age_start(pops, record)
            except Exception:
                # Best effort by design: any lookup failure uses the
                # fallback below rather than aborting the whole run.
                age_mean = None

            if age_mean is None:
                # Location 6 is China, per the original comment.
                age_mean = 37. if record['location_id'] == 6 else 41
                print(record['location_id'])

            df_sub.loc[i, 'age_mean'] = age_mean

    return pd.concat((df_final, df_sub))
예제 #26
0
def run_subnational_splitting(df):
    """Split rows at non-detailed locations down to most-detailed ones.

    While ``df`` still contains a location flagged ``most_detailed == 0`` in
    location set 21, the data are passed through
    ``iterate_through_df_and_split_nationals_by_population`` (via
    ``parallelize``).  Afterwards the rows are collapsed over the
    identifying columns, summing ``low``, ``best`` and ``high``.

    Side effect: ``high``, ``low`` and ``location_id`` are re-typed on the
    caller's ``df`` before the first split.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``best``, ``high``, ``low``, ``location_id`` and every column
        listed in the final group-by.

    Returns
    -------
    pandas.DataFrame
        One row per combination of the group-by columns.  ``high`` and
        ``low`` are NaN where their sum is 0.

    Raises
    ------
    RuntimeError
        If non-detailed locations are still present when the 7th splitting
        pass is about to start.
    AssertionError
        If the total of ``best`` moves by more than 10 during splitting.
    """
    max_iterations = 7

    original_death_count = df['best'].sum()
    df['high'] = df['high'].astype("float")
    df['low'] = df['low'].astype("float")
    df['location_id'] = df['location_id'].apply(int)

    # One metadata pull serves both the "not detailed" lookup and the
    # splitter (it used to be fetched twice with identical arguments).
    locs = get_location_metadata(location_set_id=21)
    not_detailed_locations = set(
        locs.query("most_detailed == 0")['location_id'])
    pop = get_population(location_id=-1, year_id=-1, decomp_step="step1",
                         location_set_id=21)

    count = 0
    while not_detailed_locations.intersection(set(df['location_id'])):
        count += 1
        print("iteration {}".format(count))
        if count >= max_iterations:
            # NOTE(review): the body of this guard was missing from the
            # source (a syntax error).  Failing loudly is assumed to be the
            # intent of the iteration cap - confirm.
            raise RuntimeError(
                "non-detailed locations still present after {} splitting "
                "iterations".format(count - 1))
        df = parallelize(df, locs, pop,
                         iterate_through_df_and_split_nationals_by_population)

    split_death_count = df['best'].sum()

    if "not_detailed" not in df.columns:
        df['not_detailed'] = 0

    difference = split_death_count - original_death_count
    assert np.isclose(difference, 0, atol=10), (
        "deaths before split does not equal deaths after split: Difference {}".format(difference))

    # Assign instead of ``df[col].fillna(..., inplace=True)``: the chained
    # in-place form acts on a temporary and is not guaranteed to reach df.
    df['high'] = df['high'].fillna(0)
    df['low'] = df['low'].fillna(0)
    df['best'] = df['best'].apply(float)

    id_cols = ["source_event_id", "location_id", "cause_id", "year_id",
               "nid", "source_id", "raw_data_id", "year_split_data_id",
               "split_status", "not_detailed", "event_name", "sex_id",
               "age_group_id", "notes"]
    # NOTE(review): groupby drops rows whose key contains NaN by default,
    # and this happens after the death-count check above - confirm that
    # none of the id columns can be missing.
    # Columns are selected with a list; the tuple form is no longer
    # accepted by current pandas.
    df = df.groupby(id_cols, as_index=False)[['low', 'best', 'high']].sum()

    # The bounds were zero-filled only so they could be summed; turn empty
    # bounds back into missing values.
    df['high'] = df['high'].replace(0, float('nan'))
    df['low'] = df['low'].replace(0, float('nan'))
    return df
예제 #27
0
def load_mortality_envelope(location_id_list, age_group_list, year_list):
    """Return all-cause death rates built from the mortality envelope.

    Pulls the envelope and the population for both sexes over the requested
    locations, years and age groups, joins them on the demographic keys and
    divides envelope by population.

    Returns a DataFrame with the columns ``location_id``, ``year_id``,
    ``sex_id``, ``age_group_id`` and ``death_rate``.
    """
    demographic_keys = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    envelope = get_envelope(sex_id=[1, 2],
                            location_id=location_id_list,
                            year_id=year_list,
                            age_group_id=age_group_list)
    # the envelope's point estimate arrives in a column called 'mean'
    envelope = envelope.rename(columns={'mean': 'envelope'})

    population = get_population(sex_id=[1, 2],
                                location_id=location_id_list,
                                year_id=year_list,
                                age_group_id=age_group_list)

    rates = envelope.merge(population, on=demographic_keys)
    rates['death_rate'] = rates['envelope'] / rates['population']
    return rates[demographic_keys + ['death_rate']]
예제 #28
0
def assemble_most_detailed_map(not_detailed_locs, loc_meta):
    '''
    For each of the locations that is not listed as most detailed, create a
    dataframe of all the most detailed locations that fall within that
    non-detailed location.
    Inputs:
      not_detailed_locs: A list of all locations to create the dataframe for
        (duplicates are ignored)
      loc_meta: Dataframe of location metadata for a given location set; must
        have "location_id", "most_detailed" and "path_to_top_parent"
    Outputs:
      detailed_map: A dataframe with three fields: "map_to_loc", the most
        detailed location_ids; "split_fraction", the proportion of the total
        population in the non-detailed loc that can be found within the most
        detailed loc; and "map_from_loc", the non-detailed location_id.
        Empty (but with those three columns) when not_detailed_locs is empty.
    '''
    output_cols = ['map_to_loc', 'split_fraction', 'map_from_loc']
    parent_locs = list(set(not_detailed_locs))
    if not parent_locs:
        # Nothing to map: skip the population query and avoid the
        # "No objects to concatenate" error from pd.concat([]).
        return pd.DataFrame(columns=output_cols)

    # Keep only most detailed locations
    detailed = loc_meta.loc[loc_meta['most_detailed'] == 1, :]
    # Get population data for all most detailed locations
    location_ids = detailed['location_id'].unique().tolist()
    pops = get_population(location_id=location_ids).loc[
        :, ['location_id', 'population']]
    detailed = pd.merge(left=detailed,
                        right=pops,
                        on=['location_id'],
                        how='inner')

    # A parent is found by searching the path for ",<id>,".  Prefixing the
    # path with a comma lets the first id in the path match too; without it
    # a parent listed first could never be found.  The trailing id has no
    # comma after it, so a location still never matches itself.
    # (assumes path_to_top_parent is a comma-separated id string)
    paths = "," + detailed['path_to_top_parent'].astype(str)

    sub_map_dfs = []
    for parent_loc in parent_locs:
        # Subset to all most-detailed locations that fall under this location
        token = ",{},".format(int(parent_loc))
        in_parent = paths.str.contains(token, regex=False)
        # Copy so the assignments below act on an independent frame
        sub_df = detailed.loc[in_parent, ['location_id', 'population']].copy()
        # Fraction of the parent's total population in each detailed location
        sub_df['split_fraction'] = (sub_df['population']
                                    / sub_df['population'].sum())
        sub_df['map_from_loc'] = parent_loc
        # Keep only needed columns
        sub_map_dfs.append(
            sub_df[['location_id', 'split_fraction', 'map_from_loc']])

    # Concatenate all sub-dataframes
    detailed_map = pd.concat(sub_map_dfs).rename(
        columns={"location_id": "map_to_loc"})
    return detailed_map
예제 #29
0
def run_shared_funcs(mat):
    """
    Pull the central inputs needed downstream: population plus the ASFR and
    IFD covariates.

    Locations and age groups are taken from ``mat``; years are fixed to
    1988 through 2017.  Returns the tuple ``(pop, asfr, ifd)``.
    """
    location_ids = mat.location_id.unique().tolist()
    age_group_ids = mat.age_group_id.unique().tolist()
    year_ids = list(np.arange(1988, 2018, 1))

    # population is pulled for sex_id 2 only
    population = get_population(age_group_id=age_group_ids,
                                location_id=location_ids,
                                year_id=year_ids,
                                sex_id=[2])

    # ASFR (covariate 13) is restricted to the same age/location/year ...
    asfr_estimates = get_covariate_estimates(covariate_id=13,
                                             location_id=location_ids,
                                             age_group_id=age_group_ids,
                                             year_id=year_ids)
    # ... while IFD (covariate 51) is requested with no filters at all
    ifd_estimates = get_covariate_estimates(covariate_id=51)

    return population, asfr_estimates, ifd_estimates
def get_sample(df):
    """Merge a population-based ``sample_size`` column onto ``df``.

    Population is pulled for location 69, both sexes, and the years and age
    groups present in ``df`` plus age groups 31, 32 and 235.  Those three
    groups are relabelled as age group 160 and summed before the merge.

    Raises AssertionError if any row ends up without a sample size.
    """
    collapsed_ages = [31, 32, 235]
    merge_keys = ['age_group_id', 'location_id', 'sex_id', 'year_id']

    requested_ages = df.age_group_id.unique().tolist() + collapsed_ages
    population = get_population(year_id=df.year_id.unique().tolist(),
                                location_id=69,
                                age_group_id=requested_ages,
                                sex_id=[1, 2])
    population = population.drop('run_id', axis=1)
    population = population.rename(columns={'population': 'sample_size'})

    # fold the three collapsed groups into age group 160, then re-aggregate
    # NOTE(review): if get_population also returns a row for 160 itself,
    # that row is summed together with the relabelled ones - confirm.
    is_collapsed = population.age_group_id.isin(collapsed_ages)
    population.loc[is_collapsed, 'age_group_id'] = 160
    id_cols = population.columns.drop('sample_size').tolist()
    population = population.groupby(id_cols).agg({
        'sample_size': 'sum'
    }).reset_index()

    # attach the sample size; every row must find a match
    df = df.merge(population, how='left', on=merge_keys)
    assert df.sample_size.isnull().sum() == 0
    return df