def summarize_loc_rei(source, location_id, rei_id, year_id, change_intervals,
                      gbd_round_id, pop, aw):
    """Aggregate age and sex, then summarize draws for one location/risk pair.

    For every year in ``year_id`` the draws are read from ``source``; the
    output of ``combine_sexes_indf`` and then of ``combine_ages`` is stacked
    onto them and the result is summarized with ``get_summary``. Years that
    appear in ``change_intervals`` are also kept at draw level so that
    ``pct_change`` can be summarized for each interval.

    Args:
        source: object whose ``content(filters=...)`` returns a DataFrame for
            the requested location, year and rei.
        location_id: location to read.
        rei_id: risk (rei) to read.
        year_id: iterable of years to summarize individually.
        change_intervals: iterable of ``(start_year, end_year)`` pairs, or a
            falsy value (``None`` / empty) when no change is requested.
        gbd_round_id: not used by this function; kept so callers do not break.
        pop: passed through to ``combine_sexes_indf`` and ``combine_ages``.
        aw: passed through to ``combine_ages``.

    Returns:
        ``(single, multi)``. ``single`` holds one summary row set per year
        with the mean exposed as ``val``. ``multi`` holds one summary row set
        per change interval with ``pct_change_means`` exposed as ``val``; it
        is an empty frame with the same columns when ``change_intervals`` is
        falsy.
    """
    single_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id', 'rei_id', 'mean', 'lower', 'upper']
    multi_cols = [
        'location_id', 'year_start_id', 'year_end_id', 'age_group_id',
        'sex_id', 'measure_id', 'rei_id', 'metric_id', 'pct_change_means',
        'lower', 'upper']

    change_intervals = change_intervals or []
    change_years = set(itertools.chain.from_iterable(change_intervals))

    multi_yrs = []
    single = []
    for year in year_id:
        df = source.content(filters={'location_id': location_id,
                                     'year_id': year,
                                     'rei_id': rei_id})
        # Discard stray index columns that a CSV round trip leaves behind.
        df.drop(df.columns[df.columns.str.contains('^Unnamed')],
                axis=1, inplace=True)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        both_sex = combine_sexes_indf(df, pop)
        df = pd.concat([df, both_sex])
        age_agg = combine_ages(df, pop, aw, gbd_compare_ags=True)
        df = pd.concat([df, age_agg])
        draw_cols = [c for c in df if c.startswith('draw_')]
        single.append(get_summary(df, draw_cols))
        if year in change_years:
            multi_yrs.append(df)

    single = pd.concat(single, sort=True)[single_cols]
    single = single.rename(columns={'mean': 'val'})

    if not change_intervals:
        # Nothing to compare: the original crashed here on pd.concat([]).
        empty_cols = ['val' if c == 'pct_change_means' else c
                      for c in multi_cols]
        return single, pd.DataFrame(columns=empty_cols)

    multi_yrs = pd.concat(multi_yrs, sort=True)
    # The draw columns of the stacked frame do not change between intervals.
    stacked_draw_cols = [c for c in multi_yrs if c.startswith('draw_')]
    multi = []
    for ci in change_intervals:
        chg_df = pct_change(multi_yrs, ci[0], ci[1], 'year_id',
                            stacked_draw_cols)
        chg_draw_cols = [c for c in chg_df if c.startswith('draw_')]
        multi.append(get_summary(chg_df, chg_draw_cols))

    multi = pd.concat(multi, sort=True)[multi_cols]
    multi = multi.rename(columns={'pct_change_means': 'val'})
    return single, multi
def _calc_mean_upper_lower(df: pd.DataFrame, draw_cols: List[str]) -> pd.DataFrame:
    """Summarize the draw columns and relabel the result with HALE names.

    Args:
        df: frame holding the draw columns to summarize.
        draw_cols: names of the draw columns handed to
            ``cm_summarize.get_summary``.

    Returns:
        The summary frame with its mean/lower/upper columns renamed to the
        HALE equivalents and the median column removed.
    """
    summary = cm_summarize.get_summary(df, draw_cols)
    hale_names = {
        columns.MEAN: columns.HALE_MEAN,
        columns.LOWER: columns.HALE_LOWER,
        columns.UPPER: columns.HALE_UPPER,
    }
    relabelled = summary.rename(columns=hale_names)
    return relabelled.drop(columns=columns.MEDIAN)
def summarize_draws(self, arc_draws):
    """Summarize ``arc_draws`` over a year interval.

    ``self.index_cols`` is switched in place from ``year_id`` to
    ``year_start_id`` / ``year_end_id``. The switch is guarded so that calling
    this method more than once neither raises on the missing ``year_id`` nor
    adds the interval columns twice.

    Args:
        arc_draws: frame containing the index columns, the columns named in
            ``self.draw_cols`` and a ``pct_change_means`` column.

    Returns:
        The ``get_summary`` output without its ``median``, ``index`` and
        ``pct_change_means`` columns, merged back onto the original
        ``pct_change_means`` values, which are exposed as ``val``.
    """
    for interval_col in ('year_start_id', 'year_end_id'):
        if interval_col not in self.index_cols:
            self.index_cols.append(interval_col)
    if 'year_id' in self.index_cols:
        self.index_cols.remove('year_id')

    # Keep the pct change of means aside; it replaces the summarized one.
    summaries_mean = arc_draws[self.index_cols + ['pct_change_means']]
    summaries = get_summary(arc_draws, self.draw_cols)
    summaries.drop(['median', 'index', 'pct_change_means'], axis=1,
                   inplace=True)
    summaries = summaries.merge(summaries_mean, on=self.index_cols)
    summaries.rename(columns={'pct_change_means': 'val'}, inplace=True)
    return summaries
def summarize_draws(df, index_cols):
    """Collapse the ``draw_`` columns of ``df`` into mean/upper/lower.

    Args:
        df: frame whose columns containing ``draw_`` are summarized.
        index_cols: accepted for interface compatibility; the body does not
            read it.

    Returns:
        The ``get_summary`` output, index reset, without the ``index`` and
        ``median`` columns, restricted to a fixed column order.
    """
    draw_names = df.filter(like='draw_').columns.tolist()
    summary = get_summary(df, draw_names).reset_index()
    summary = summary.drop(columns=['index', 'median'])
    return summary[[
        'measure_id', 'year_id', 'location_id', 'sex_id', 'age_group_id',
        'cause_id', 'rei_id', 'star_id', 'metric_id', 'mean', 'upper',
        'lower',
    ]]
def compute_estimates(df, point_estimate="mean"):
    """Summarize the draw columns of ``df``.

    Args:
        df: frame whose columns containing ``"draw_"`` are summarized.
        point_estimate: ``"mean"`` keeps the mean and drops the median;
            ``None`` drops both.

    Returns:
        The ``get_summary`` output with the unwanted point-estimate columns
        removed.

    Raises:
        ValueError: if ``point_estimate`` is neither ``"mean"`` nor ``None``.
    """
    draw_names = [name for name in df.columns if "draw_" in name]
    summary = get_summary(df, data_cols=draw_names)

    if point_estimate == "mean":
        unwanted = ["median"]
    elif point_estimate is None:
        unwanted = ["median", "mean"]
    else:
        raise ValueError("point_estimate must be one of ['mean', None]")

    summary.drop(unwanted, axis=1, inplace=True)
    return summary
def get_data_frame(self):
    """Validate ``self.in_df`` and return its draw summary.

    The frame is checked with ``validate_measure_and_metric``, its ``draw_``
    columns are summarized with ``get_summary``, and the ``index`` and
    ``median`` columns are removed. When a ``pct_change_means`` column is
    present it overwrites ``mean``.

    Returns:
        The summary restricted to ``self.write_out_columns``.
    """
    logger.info("BEGIN compute summaries")
    self.validate_measure_and_metric(self.in_df, "incoming dataframe")
    logger.debug("validated")

    draw_names = self.in_df.filter(like='draw_').columns
    summary = get_summary(self.in_df, draw_names).reset_index()
    summary = summary.drop(columns=['index', 'median'])

    if 'pct_change_means' in summary.columns:
        logger.info("replacing mean of pct change distribution with pct "
                    "change of means")
        summary['mean'] = summary['pct_change_means']

    return summary[self.write_out_columns]
# NOTE(review): this line holds the tail of a function body (its `def` is not
# visible here; the body ends at `return data[keep_cols]`) followed by the
# script's argparse set-up.
# Function tail: pivots "mort" by "sim" into wide columns renamed to
# draw_<n> for n in 0..999, passes every column containing 'draw' to
# get_summary, and returns index_cols plus mean/lower/upper.
# NOTE(review): the --location_id help text repeats 'The version_id to run';
# looks like a copy-paste slip -- confirm and correct.
data = data[input_keep_cols] # Format columns data['year_id'] = data['year'].astype('int64') data['sex_id'] = 3 data['age_group_id'] = 1 data['estimate_stage_id'] = 3 data['sim'] = data['sim'].astype('int64') # Reshape draws wide index_cols = ['location_id', 'ihme_loc_id', 'year_id', 'year', 'sex_id', 'age_group_id', 'estimate_stage_id'] data = data.pivot_table(values="mort", index=index_cols, columns="sim") data = data.reset_index() data = data.rename(columns={x: 'draw_{}'.format(x) for x in range(1000)}) # Get the summary statistics draw_cols = [col for col in data.columns if 'draw' in col] data = get_summary(data, draw_cols) # Format for upload keep_cols = index_cols + ['mean', 'lower', 'upper'] return data[keep_cols] # Parse arguments parser = argparse.ArgumentParser() parser.add_argument('--version_id', type=int, required=True, action='store', help='The version_id to run') parser.add_argument('--location_id', type=int, required=True, action='store', help='The version_id to run') args = parser.parse_args() version_id = args.version_id location_id = args.location_id
def summarize_draws(self, mmr_draws):
    """Summarize the MMR draws and expose the mean as ``val``.

    Args:
        mmr_draws: frame containing the columns named in ``self.draw_cols``.

    Returns:
        The ``get_summary`` output with ``median`` removed and ``mean``
        renamed to ``val``.
    """
    logger.info("Summarizing MMR draws")
    summary = get_summary(mmr_draws, self.draw_cols)
    # Both steps stay in place so the object get_summary returned is the one
    # handed back.
    summary.drop(columns=['median'], inplace=True)
    summary.rename(columns={'mean': 'val'}, inplace=True)
    return summary
# Load the draw-level file for this location
data = pd.read_csv("{}/{}.csv".format(input_dir, location_id))

# Melt the per-age metric columns into a single long "draw" column
index_cols = ['ihme_loc_id', 'year', 'sex']
data_cols = ['pys1', 'pys2', 'pys3', 'pys4', 'pysenn', 'pyslnn', 'pyspnn',
             'pyspna', 'pyspnb']
data = data[index_cols + ['sim'] + data_cols]
data = data.melt(id_vars=(index_cols + ['sim']), value_vars=data_cols,
                 var_name="age_group", value_name='draw')

# Spread the simulations out to one column per draw
data = reshape_wide(data, index_cols + ['age_group'], ['draw'], 'sim')
data = data.sort_values(['ihme_loc_id', 'year', 'sex', 'age_group'])
data = data.reset_index(drop=True)
# Strip the metric prefix so only the age-group suffix remains
data['age_group'] = data['age_group'].map(
    lambda name: name.replace("pys", ""))

# Collapse the draws to point estimates
index_cols = ['ihme_loc_id', 'year', 'sex']
draw_cols = ['draw_{}'.format(x) for x in range(1000)]
summary_data = get_summary(data, data.filter(like='draw_').columns)
summary_data = summary_data.reset_index(drop=True)

# Attach the id columns expected in the output
summary_data['location_id'] = location_id
summary_data['year_id'] = summary_data['year'].astype('int64')
summary_data.loc[summary_data['sex'].eq("male"), 'sex_id'] = 1
summary_data.loc[summary_data['sex'].eq("female"), 'sex_id'] = 2
summary_data['sex_id'] = summary_data['sex_id'].astype('int64')
summary_data = summary_data[[
    'location_id', 'ihme_loc_id', 'year_id', 'sex_id', 'age_group',
    'mean', 'lower', 'upper',
]]

# Write the summary to disk
summary_data.to_csv(output_file, index=False)