def split(self):
    """Split the input draws by the split proportions and push the result.

    Reads the input draws and the split-proportion draws from the two draw
    sources, combines them with ``merge_split`` over the index and data
    columns of ``self.dimensions``, labels every row with its
    ``child_meid`` as ``modelable_entity_id``, replaces missing values
    with 0 and hands the resulting frame to ``self.pusher``.

    Neither ``self.demo_filters`` nor ``self.ss_filters`` is modified.
    """
    # get input draws
    draws = self._epi_draw_source.content(filters=self.demo_filters.copy())

    # get split props -- combine the filters on a copy so the demographic
    # filters are not written back into self.ss_filters on every call
    filters = self.ss_filters.copy()
    filters.update(self.demo_filters)
    gprops = self._ss_draw_source.content(filters=filters)

    splits = merge_split(draws,
                         gprops,
                         group_cols=self.dimensions.index_names,
                         value_cols=self.dimensions.data_list())
    splits = splits.assign(modelable_entity_id=splits['child_meid'])
    splits = splits[self.dimensions.index_names +
                    ["modelable_entity_id"] +
                    self.dimensions.data_list()]
    splits = splits.fillna(0)
    self.pusher.push(splits, append=False)
def _parallel_merge_split(meid_cause_map, interp_files, output_dir, tmpdir, location_id): try: epi_draw = [] for f in interp_files: epi_draw.append( pd.read_hdf(f, 'draws', where=["location_id=={}".format(location_id)])) epi_draws = pd.concat(epi_draw) cd = pd.read_hdf(os.path.join(tmpdir.name, 'source_cause_draws.h5'), 'draws', where=['location_id=={}'.format(location_id)]) draw_cols = [col for col in cd.columns if 'draw_' in col] epi_draws = epi_draws[epi_draws['age_group_id'].isin( cd['age_group_id'].unique())] # these columns are not needed and cause maths.merge_split to break drop_cols = ['measure_id', 'model_version_id', 'metric_id'] cd.drop(drop_cols, axis=1, inplace=True, errors='ignore') epi_draws.drop(drop_cols, axis=1, inplace=True, errors='ignore') cout = merge_split( cd, epi_draws, ['year_id', 'age_group_id', 'sex_id', 'location_id'], draw_cols) cout = cout.merge(cd[[ 'year_id', 'age_group_id', 'sex_id', 'location_id', 'envelope' ]], how='left') cout['cause_id'] = cout['modelable_entity_id'] cout['cause_id'] = cout['cause_id'].replace(meid_cause_map) cout['measure_id'] = 1 for cid in cout.cause_id.unique(): cid_dir = '{}/{}'.format(output_dir, int(cid)) cid_dir = cid_dir.replace("\r", "") if not os.path.exists(cid_dir): makedirs_safely(cid_dir) fn = '{}/death_{}.csv'.format(cid_dir, location_id) cout.query('cause_id=={}'.format(cid)).to_csv(fn, index=False) return location_id, 0 except Exception: tb_str = traceback.format_exc() return location_id, tb_str
def filet(source_meid, target_prop_map, location_id, split_meas_ids,
          prop_meas_id, gbd_round_id, mvid_map, source_mvid, decomp_step,
          n_draws, downsample):
    """
    Splits the draws for source_meid to the target meids given in
    target_prop_map by the proportions estimated in the prop_meids. The split
    is applied to all GBD years associated with the given gbd_round_id for the
    specified location_id. The 'best' version of the meids will be used by
    default.

    Arguments:
        source_meid (int): meid for the draws to be split.
        target_prop_map (dict): dictionary whose keys are the target meids and
            whose values are the meids for the corresponding proportion
            models.
        location_id (int): location_id to operate on.
        split_meas_ids (list of ints): The measure_ids from source_meid to be
            split.
        prop_meas_id (int): The measure_id that identifies the proportion in
            prop_meids to use for the split.
        gbd_round_id (int): the gbd_round_id for models being split.
        mvid_map (dict): model version id to request for each proportion
            meid, keyed by proportion meid. If None, no version is requested.
        source_mvid (int): source model version id. If None, no version is
            requested.
        decomp_step (str): Decomposition step. Allowed values are None,
            'iterative', 'step1', 'step2', 'step3', 'step4', and 'step5'
            depending on the value of gbd_round_id.
        n_draws (Optional[int]): forwarded to get_draws for the proportion
            draws.
        downsample (Optional[bool]): forwarded to get_draws for the proportion
            draws.

    Returns:
        A DataFrame containing the draws for the target meids

    Raises:
        ValueError: if target_prop_map is empty, if the proportion and source
            draws have a different number of draw columns, or if no measure
            in split_meas_ids has both source and proportion rows to split.
    """
    prop_frames = []
    for prop_meid in target_prop_map.values():
        version_id = mvid_map[prop_meid] if mvid_map is not None else None
        prop_frames.append(
            get_draws(gbd_id_type='modelable_entity_id',
                      gbd_id=prop_meid,
                      source='epi',
                      measure_id=prop_meas_id,
                      location_id=location_id,
                      version_id=version_id,
                      gbd_round_id=gbd_round_id,
                      decomp_step=decomp_step,
                      n_draws=n_draws,
                      downsample=downsample))
    if not prop_frames:
        raise ValueError(
            "target_prop_map is empty; there are no proportion models to "
            "split by")

    props = pd.concat(prop_frames).reset_index(drop=True)
    prop_to_target = {v: k for k, v in target_prop_map.items()}
    props['target_modelable_entity_id'] = (
        props.modelable_entity_id.replace(prop_to_target))
    props_drawcols = [col for col in props.columns if 'draw_' in col]

    # Proportions only need rescaling when there is more than one target.
    force_scale = len(target_prop_map) > 1
    group_cols = [
        'year_id', 'age_group_id', 'sex_id', 'location_id', 'measure_id'
    ]
    # These columns are not needed and break maths.merge_split
    drop_cols = ['modelable_entity_id', 'model_version_id', 'metric_id']

    splits = []
    for measure_id in split_meas_ids:
        # NOTE(review): n_draws/downsample are not forwarded here, unlike the
        # proportion draws above -- confirm whether that is intended.
        source = get_draws(gbd_id_type='modelable_entity_id',
                           gbd_id=[source_meid],
                           source='epi',
                           measure_id=measure_id,
                           location_id=location_id,
                           version_id=source_mvid,
                           gbd_round_id=gbd_round_id,
                           decomp_step=decomp_step)
        source_drawcols = [col for col in source.columns if 'draw_' in col]

        # Restrict a per-measure copy of the proportions to the demographics
        # present in this measure's source draws. The full set in `props`
        # must stay intact: filtering it in place would let one measure's
        # age/sex coverage shrink the proportions used for later measures.
        in_source = (
            props.age_group_id.isin(source.age_group_id.unique())
            & props.sex_id.isin(source.sex_id.unique()))
        measure_props = props[in_source].copy()
        measure_props['measure_id'] = measure_id

        source = source.drop(drop_cols, axis=1, errors='ignore')

        if len(source) == 0 or len(measure_props) == 0:
            # Nothing to split for this measure.
            continue
        if len(props_drawcols) != len(source_drawcols):
            raise ValueError(
                "props and source drawcols are different lengths")
        splits.append(
            merge_split(source,
                        measure_props,
                        list(group_cols),
                        props_drawcols,
                        force_scale=force_scale))

    if not splits:
        raise ValueError(
            "no draws to split: none of the measures in split_meas_ids has "
            "both source and proportion rows")

    splits = pd.concat(splits)
    splits = splits[[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'target_modelable_entity_id'
    ] + props_drawcols]
    return splits.rename(
        columns={'target_modelable_entity_id': 'modelable_entity_id'})