def main(nid, extract_type_id, code_system_id, launch_set_id):
    """Run the disaggregation pipeline for one (nid, extract_type) pair.

    Reads the 'formatted' phase data for the source, runs the full pipeline,
    and writes the 'disaggregation' phase output.

    Args:
        nid: source NID identifying the dataset.
        extract_type_id: extraction type of the dataset.
        code_system_id: code system (e.g. ICD revision) of the data.
        launch_set_id: identifier of this launch, used for output bookkeeping.

    Raises:
        AssertionError: if no 'formatted' data exists for this nid/extract.
    """
    # Resolve all run/version ids from the central configuration.
    cause_set_version_id = CONF.get_id('cause_set_version')
    location_set_version_id = CONF.get_id('location_set_version')
    pop_run_id = CONF.get_id('pop_run')
    env_run_id = CONF.get_id('env_run')
    distribution_set_version_id = CONF.get_id('distribution_set_version')

    # Download data from input database.
    df = get_claude_data(
        'formatted', nid=nid, extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    # BUG FIX: the original adjacent literals were missing a separating space
    # and rendered as "...source is inthe input database?".
    assert len(df) != 0, (
        "Dataframe is empty. Are you sure this source is in "
        "the input database?")

    # Run the pipeline (non-diagnostic mode).
    df = run_pipeline(nid, extract_type_id, launch_set_id, df, code_system_id,
                      cause_set_version_id, location_set_version_id,
                      pop_run_id, env_run_id, distribution_set_version_id,
                      diagnostic=False)

    # Upload to database.
    write_phase_output(df, 'disaggregation', nid, extract_type_id,
                       launch_set_id)
def main(nid, extract_type_id, launch_set_id):
    """Read the data, run the phase, write the output."""
    print_log_message("Reading {} data".format(PHASE_ANTECEDENT))
    df = get_claude_data(
        PHASE_ANTECEDENT, nid=nid, extract_type_id=extract_type_id)

    # Resolve the run/version ids from configuration, coercing each to int.
    # NOTE: the generator preserves the original lookup order.
    env_run_id, pop_run_id, location_set_version_id, cause_set_version_id = (
        int(CONF.get_id(key))
        for key in ('env_run', 'pop_run',
                    'location_set_version', 'cause_set_version')
    )

    # The upstream phase names the column 'cf'; downstream expects 'cf_final'.
    df = df.rename(columns={'cf': 'cf_final'})

    df = run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
                   location_set_version_id, cause_set_version_id)

    msg = (
        "Writing {n} rows of output for launch set {ls}, nid {nid}, extract "
        "{e}".format(n=len(df), ls=launch_set_id, nid=nid, e=extract_type_id)
    )
    print_log_message(msg)
    write_phase_output(df, PHASE_NAME, nid, extract_type_id, launch_set_id)
def main(nid, extract_type_id, code_system_id, launch_set_id):
    """Run the disaggregation pipeline for one (nid, extract_type) pair.

    Like the standard launcher, but selects a Norway-specific age/sex
    distribution set when the source's iso3 is 'NOR'.

    Args:
        nid: source NID identifying the dataset.
        extract_type_id: extraction type of the dataset.
        code_system_id: code system (e.g. ICD revision) of the data.
        launch_set_id: identifier of this launch, used for output bookkeeping.

    Raises:
        AssertionError: if no 'formatted' data exists for this nid/extract.
    """
    cause_set_version_id = CONF.get_id('cause_set_version')
    location_set_version_id = CONF.get_id('location_set_version')
    pop_run_id = CONF.get_id('pop_run')
    env_run_id = CONF.get_id('env_run')
    # BUG FIX: code_map_version_id was passed to run_pipeline below but never
    # defined, which raised NameError at runtime. It presumably comes from the
    # configuration like the other version ids — TODO confirm the config key.
    code_map_version_id = CONF.get_id('code_map_version')

    # Need to use special age/sex distribution for Norway based on
    # National data.
    if get_value_from_nid(nid, 'iso3', extract_type_id=extract_type_id) == 'NOR':
        distribution_set_version_id = CONF.get_id(
            'NOR_distribution_set_version')
    else:
        distribution_set_version_id = CONF.get_id('distribution_set_version')

    # Download data from input database.
    df = get_claude_data(
        'formatted', nid=nid, extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    # BUG FIX: the original adjacent literals were missing a separating space
    # and rendered as "...source is inthe input database?".
    assert len(df) != 0, (
        "Dataframe is empty. Are you sure this source is in "
        "the input database?")

    # Run the pipeline (non-diagnostic mode).
    df = run_pipeline(nid, extract_type_id, launch_set_id, df, code_system_id,
                      code_map_version_id, cause_set_version_id,
                      location_set_version_id, pop_run_id, env_run_id,
                      distribution_set_version_id, diagnostic=False)

    # Upload to database.
    write_phase_output(df, 'disaggregation', nid, extract_type_id,
                       launch_set_id)
def get_redistribution_envelope(self, agg_df, agg_cause_ids):
    """Build the per-cause garbage (redistribution) envelope.

    For each aggregate cause in ``agg_cause_ids``, sums the garbage deaths
    (cause_id 743) in the raw disaggregation data whose packages target that
    cause (or any related cause), squared against every demographic group
    present in ``agg_df``. For sources not in a small exclusion list, Package 1
    deaths are added the same way.

    Args:
        agg_df: DataFrame containing at least the demographic group columns
            (location_id, year_id, age_group_id, sex_id, site_id).
        agg_cause_ids: iterable of aggregate cause_ids to build envelopes for.

    Returns:
        DataFrame with the group columns, 'cause_id', and
        'garbage_targeting_cause' (summed deaths targeting each cause).
        Empty DataFrame if the raw data contains no garbage at all.
    """
    print_log_message("Getting cause-package map")
    cause_package_map = get_cause_package_map(
        self.code_system_id, remove_decimal=self.remove_decimal,
        cause_map=self.cause_map, package_map=self.package_map)
    print_log_message("Getting claude data - disaggregation")
    raw_df = get_claude_data("disaggregation", nid=self.nid,
                             extract_type_id=self.extract_type_id)
    # if there is no garbage in the raw data, then set to 0
    # (743 is the garbage cause_id; nothing to redistribute without it)
    if 743 not in raw_df.cause_id.unique():
        return pd.DataFrame()
    print_log_message("Getting package-targets relation")
    package_targets = get_package_targets(
        self.code_system_id, recurse_garbage_targets=False,
        remove_decimal=self.remove_decimal, force_rerun=False,
        block_rerun=True)
    package_targets = self.remove_package_1(package_targets)
    # Keep only rows mapping code_id -> package_id (drop other map types).
    id_to_package_map = \
        cause_package_map.query('map_type == "package_id"')[
            ['code_id', 'map_id']
        ]
    id_to_package_map.rename(columns={'map_id': 'package_id'}, inplace=True)
    print_log_message("Mapping raw data to package ids")
    raw_df = raw_df.merge(id_to_package_map, on='code_id', how='left')
    # Every garbage row (cause_id == 743) must have resolved to a package.
    report_if_merge_fail(raw_df.query('cause_id == 743'), 'package_id',
                         'code_id')
    dfs = []
    print_log_message("Looping over {} causes".format(len(agg_cause_ids)))
    group_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'site_id'
    ]
    # One row per demographic group present in the aggregate data; used to
    # "square" results so every group appears even with zero garbage.
    square_df = agg_df[group_cols].drop_duplicates()
    # Sources for which Package 1 deaths are deliberately skipped
    # (presumably known-bad extractions — TODO confirm provenance of list).
    bad_nid_extract_pairs = [(69913, 1), (69918, 1), (69922, 1), (93739, 1)]
    nid_extract_pair = (self.nid, self.extract_type_id)
    if nid_extract_pair not in bad_nid_extract_pairs:
        p1_df = self.get_package_1_deaths()
    for cause_id in agg_cause_ids:
        # All causes related to this aggregate (e.g. its descendants).
        cause_ids = get_all_related_causes(cause_id, self.cause_hierarchy)
        # Packages whose redistribution targets include any related cause.
        package_ids = list(
            package_targets[package_targets['cause_id'].isin(
                cause_ids)]['package_id'].unique())
        df = raw_df[raw_df['package_id'].isin(package_ids)].copy()
        df = df.groupby(group_cols, as_index=False)['deaths'].sum()
        # Square against all groups; groups with no garbage get 0 deaths.
        df = square_df.merge(df, how='left')
        df['deaths'] = df['deaths'].fillna(0)
        df = df.rename(columns={'deaths': 'garbage_targeting_cause'})
        df['cause_id'] = cause_id
        dfs.append(df)
        if nid_extract_pair not in bad_nid_extract_pairs:
            # Do the same for Package 1 deaths
            p1_cause_df = p1_df[p1_df.cause_id.isin(cause_ids)]
            # NOTE: grouped without site_id — p1 data presumably has no
            # site detail; site_id is filled by the square merge below.
            p1_cause_df = p1_cause_df.groupby(
                ['location_id', 'year_id', 'age_group_id', 'sex_id'],
                as_index=False).freq.sum()
            p1_cause_df = square_df.merge(p1_cause_df, how='left')
            p1_cause_df['freq'] = p1_cause_df['freq'].fillna(0)
            p1_cause_df = p1_cause_df.rename(
                columns={'freq': 'garbage_targeting_cause'})
            p1_cause_df['cause_id'] = cause_id
            dfs.append(p1_cause_df)
    print_log_message("Concatenating")
    df = pd.concat(dfs, ignore_index=True)
    # Collapse the garbage and package-1 contributions for the same
    # group/cause into a single row.
    df = df.groupby(group_cols + ['cause_id'], as_index=False).sum()
    return df