def get_rates_df(self, cause_meta_df):
    """Load the rates file and key it by cause_id, age_group_id and sex_id.

    The file read depends on ``self.correct_garbage`` ('PRE' when truthy,
    'POST' otherwise) and on ``self.iso3``.

    Arguments:
        cause_meta_df: cause metadata, passed straight through to
            ``add_cause_metadata`` to map 'acause' to 'cause_id'.

    Returns:
        pandas.DataFrame: the rates data with 'cause_id' and 'age_group_id'
        added, 'sex' renamed to 'sex_id', and the 'acause' and 'agecat'
        columns dropped.

    Raises:
        AssertionError: if any rates row has an 'agecat' value that does
            not match an age group.
    """
    filepath_infix = 'PRE' if self.correct_garbage else 'POST'
    rates = pd.read_stata(
        "FILEPATH".format(fp=filepath_infix, iso=self.iso3))

    # convert acause to cause_id
    rates = add_cause_metadata(rates, ['cause_id'],
                               merge_col='acause',
                               cause_meta_df=cause_meta_df)
    # "_sepsis_gc" gets its cause_id assigned directly from the instance
    # rather than from the metadata merge
    rates.loc[rates['acause'] == "_sepsis_gc",
              'cause_id'] = self.sepsis_cause_id

    # convert age to age_group_id
    age_df = get_cod_ages()
    # Explicit copy: columns are added below, and assigning onto a filtered
    # slice is ambiguous in pandas (SettingWithCopyWarning).
    age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])].copy()
    age_df['agecat'] = age_df['age_group_years_start']
    # NOTE(review): presumably the rates file has a single under-1 category
    # coded 0, with age group 4 standing in for it once groups 2 and 3 are
    # dropped -- confirm against the file.
    age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
    age_df = age_df[['agecat', 'age_group_id']]

    # merge on ages to rates data
    rates = rates.merge(age_df, on='agecat', how='left')
    # Raised explicitly (not a bare assert) so the check survives python -O.
    if rates['age_group_id'].isnull().any():
        raise AssertionError(
            "rates data has agecat values with no matching age_group_id")

    # clean up columns
    rates = rates.rename(columns={'sex': 'sex_id'})
    rates = rates.drop(['acause', 'agecat'], axis=1)
    return rates
def get_rates_df(self, cause_meta_df):
    """Load the cause-specific rates file keyed by cause, age group and sex.

    The file path comes from the 'hivcorr_global_causespecific_relrates'
    resource of ``self.conf``, formatted with 'PRE' or 'POST' (depending on
    ``self.correct_garbage``) and ``self.iso3``.

    Arguments:
        cause_meta_df: cause metadata, passed straight through to
            ``add_cause_metadata`` to map 'acause' to 'cause_id'.

    Returns:
        pandas.DataFrame: the rates data with 'cause_id' and 'age_group_id'
        added, 'sex' renamed to 'sex_id', and the 'acause' and 'agecat'
        columns dropped.

    Raises:
        AssertionError: if any rates row has an 'agecat' value that does
            not match an age group.
    """
    filepath_infix = 'PRE' if self.correct_garbage else 'POST'
    rates_path = self.conf.get_resource(
        'hivcorr_global_causespecific_relrates').format(
            pre_post=filepath_infix, iso=self.iso3)
    rates = pd.read_stata(rates_path)

    # convert acause to cause_id
    rates = add_cause_metadata(rates, ['cause_id'],
                               merge_col='acause',
                               cause_meta_df=cause_meta_df)
    # "_sepsis_gc" gets its cause_id assigned directly from the instance
    # rather than from the metadata merge
    rates.loc[rates['acause'] == "_sepsis_gc",
              'cause_id'] = self.sepsis_cause_id

    # convert age to age_group_id
    # TODO THIS NEEDS TO BE CHANGED B/C RIGHT NOW IT QUERIES DB
    # change get_demographics to have caching option too?
    age_df = get_cod_ages()
    # Explicit copy: columns are added below, and assigning onto a filtered
    # slice is ambiguous in pandas (SettingWithCopyWarning).
    age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])].copy()
    age_df['agecat'] = age_df['age_group_years_start']
    # NOTE(review): presumably the rates file has a single under-1 category
    # coded 0, with age group 4 standing in for it once groups 2 and 3 are
    # dropped -- confirm against the file.
    age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
    age_df = age_df[['agecat', 'age_group_id']]

    # merge on ages to rates data
    rates = rates.merge(age_df, on='agecat', how='left')
    # Raised explicitly (not a bare assert) so the check survives python -O.
    if rates['age_group_id'].isnull().any():
        raise AssertionError(
            "rates data has agecat values with no matching age_group_id")

    # clean up columns
    rates = rates.rename(columns={'sex': 'sex_id'})
    rates = rates.drop(['acause', 'agecat'], axis=1)
    return rates
def report_too_much_age_detail(nid_etid_tuple, nid_etid_df):
    """Raise an AssertionError if the age groups in nid_etid_df are too detailed.

    For the age groups listed in nid_etid_df, report if there are age groups
    that can be aggregated to more closely match the ideal cod age groups.

    Note that VA, survey/census, and CHAMPS data are NOT checked for too much
    age detail when passed to this function (data_type_id 7, 8 or 12 returns
    without checking).

    Arguments:
        nid_etid_tuple, tuple: specifies NID and extract_type_id of
            nid_etid_df; only used in error messages.
        nid_etid_df, pandas.DataFrame: A df of unique age groups from the
            data for the NID and extract_type specified by nid_etid_tuple.
            The columns of nid_etid_df are age_group_id,
            age_group_years_start, age_group_years_end, and data_type_id,
            and each row is a unique age group. The age groups are assumed
            to be sorted in ascending order, first by age_group_years_start,
            then by age_group_years_end. The age groups are also assumed to
            have no gaps or overlaps between them.

    Raises:
        AssertionError: if nid_etid_df does not have exactly one
            data_type_id, or if more than one of its age groups falls
            inside a single cod age group.
    """
    data_type_ids = nid_etid_df.data_type_id.unique()
    # Raised explicitly (not a bare assert) so the check survives python -O
    # and matches the explicit raise used further down.
    if len(data_type_ids) != 1:
        raise AssertionError(
            "NID, extract_type {} has more than 1 data_type_id".format(
                nid_etid_tuple))

    # These data types are exempt from the age-detail check.
    if data_type_ids[0] in [7, 8, 12]:
        return

    cod_ages = get_cod_ages()
    # Non-inplace rename returns a fresh frame, so the column assignments
    # below never write onto a slice of the frame get_cod_ages() returned.
    cod_ages = cod_ages[[
        'age_group_id', 'age_group_years_start', 'age_group_years_end'
    ]].rename(columns=lambda col: "cod_" + col)

    def _contained_age_group_ids(cod_row):
        # age groups in the df that lie entirely within this cod age group
        is_within = (
            (nid_etid_df.age_group_years_start
             >= cod_row['cod_age_group_years_start'])
            & (nid_etid_df.age_group_years_end
               <= cod_row['cod_age_group_years_end'])
        )
        return nid_etid_df.loc[is_within, 'age_group_id'].tolist()

    # For each cod age group, find age groups in the df that are contained
    # within it
    cod_ages['df_age_group_ids'] = cod_ages.apply(
        _contained_age_group_ids, axis='columns')

    # If a cod age group contains more than 1 df age group, then the df is
    # too detailed
    cod_ages['is_too_detailed'] = cod_ages.df_age_group_ids.apply(
        lambda ids: len(ids) > 1)

    if cod_ages.is_too_detailed.any():
        raise AssertionError(
            "Age groups in the data can be aggregated to match standard "
            "cod age groups for NID, extract_type {}: \n {}".format(
                nid_etid_tuple, cod_ages.loc[cod_ages.is_too_detailed]))
def __init__(self):
    """Set up configuration, cached metadata and column/path constants.

    Reads settings from the 'standard' Configurator, loads the envelope,
    location hierarchy and cod age groups using ``self.cache_options``, and
    records the column groupings and output locations used by the instance.
    """
    self.cg = Configurator('standard')
    self.cache_dir = self.cg.get_directory('db_cache')

    # if you do not want to write any output files then set test to "True"
    self.test = False

    # options forwarded to every cached metadata pull below
    self.cache_options = dict(
        force_rerun=True,
        block_rerun=False,
        cache_dir=self.cache_dir,
    )
    self.dataset_filters = dict(
        data_type_id=[8, 9, 10, 12],
        location_set_id=35,
        is_active=True,
        year_id=range(1980, 2050),
    )
    self.national_nids = self.cg.get_resource("nid_replacements")

    # resources
    self.completeness = self.cg.get_resource("completeness")
    env_run_id = self.cg.get_id('env_run')
    self.env_meta_df = get_env(env_run_id=env_run_id, **self.cache_options)
    location_set_version_id = self.cg.get_id('location_set_version')
    self.location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **self.cache_options)
    cod_age_df = get_cod_ages(**self.cache_options)
    self.cod_ages = list(cod_age_df['age_group_id'].unique())

    # identifiers
    self.source_cols = ["source", "nid", "data_type_id"]
    self.geo_cols = ["location_id", "year_id"]
    self.meta_cols = ["nationally_representative", "detail_level_id"]
    self.value_cols = ['deaths']
    self.year_end = self.cg.get_id('year_end')
    self.full_time_series = "full_time_series"

    # directories
    self.current_best_version = "2018_04_03_151739"
    self.out_dir = "FILEPATH"
    self.arch_dir = "{}/_archive".format(self.out_dir)
    self.timestamp = cod_timestamp()
def get_age_weight_df(self):
    """Build age weights from the mortality envelope.

    Age weights are pulled from mortality information rather than from the
    population based weights in the db; this method replaces the
    "get_age_weights" function for that purpose (change dated 07/10/2019,
    following a decision by USERNAME and USERNAME).

    Each weight is an age group's summed ``mean_env`` divided by the summed
    ``mean_env`` of age group 22, using rows with location_id 1, sex_id 3
    and year_id >= 2010.

    Returns:
        pandas.DataFrame: columns 'age_group_id' and
        'age_group_weight_value', one row per cod age group plus age groups
        21 and 28.

    Raises:
        AssertionError: if the cod age groups' deaths differ from the
            age group 22 total by 1% or more.
    """
    envelope = get_env(env_run_id=self.cg.get_id('env_run'),
                       force_rerun=False,
                       block_rerun=True)

    # get global, both sex, for all years after 2010
    envelope = envelope.query(
        "location_id == 1 & sex_id == 3 & year_id >= 2010")

    # collapse out year
    by_age = envelope.groupby(
        ['age_group_id', 'location_id', 'sex_id'],
        as_index=False)['mean_env'].sum()

    # total deaths for weights
    all_age_total = by_age.loc[by_age.age_group_id == 22,
                               'mean_env'].iloc[0]

    # get the ages we care about (cod ages, under 1, and 80+)
    extra_age_ids = [21, 28]
    wanted_age_ids = (
        get_cod_ages().age_group_id.unique().tolist() + extra_age_ids)

    # limit env df to relevant ages, group by age, and then make weights
    weights = by_age.loc[by_age.age_group_id.isin(wanted_age_ids)]
    weights = weights.groupby('age_group_id',
                              as_index=False)['mean_env'].sum()
    weights['age_group_weight_value'] = weights['mean_env'] / all_age_total

    # do a quick check to make sure the death totals used to create weights
    # are sensible: age specific totals (leaving out the two extra groups)
    # must be within 1% of the all age total
    is_extra = weights.age_group_id.isin(extra_age_ids)
    age_specific_total = weights.loc[~is_extra, 'mean_env'].sum()
    check_val = abs((age_specific_total / all_age_total) - 1)
    assert check_val < 0.01

    return weights[['age_group_id', 'age_group_weight_value']]