def _parallel_merge_split(meid_cause_map, interp_files, output_dir, tmpdir,
                          location_id):
    """Split one location's source-cause draws by proportion-model draws.

    Reads the interpolated proportion draws for ``location_id`` from
    ``interp_files`` and the source cause draws from the shared HDF file in
    ``tmpdir``, runs ``merge_split`` over them, tags the result with cause
    ids via ``meid_cause_map``, and writes one CSV per cause under
    ``output_dir``.

    Returns:
        (location_id, 0) on success, or (location_id, traceback_string)
        if any step raised — the caller inspects the second element.
    """
    try:
        loc_filter = 'location_id=={}'.format(location_id)
        # Proportion draws for this location, stacked across all interp files.
        prop_draws = pd.concat(
            [pd.read_hdf(path, 'draws', where=[loc_filter])
             for path in interp_files])
        source_draws = pd.read_hdf(
            os.path.join(tmpdir.name, 'source_cause_draws.h5'),
            'draws', where=[loc_filter])
        draw_cols = [c for c in source_draws.columns if 'draw_' in c]
        # Keep only the age groups the source model actually has.
        prop_draws = prop_draws[
            prop_draws['age_group_id'].isin(
                source_draws['age_group_id'].unique())]
        # these columns are not needed and cause maths.merge_split to break
        for frame in (source_draws, prop_draws):
            frame.drop(['measure_id', 'model_version_id', 'metric_id'],
                       axis=1, inplace=True, errors='ignore')
        demo_cols = ['year_id', 'age_group_id', 'sex_id', 'location_id']
        split = merge_split(source_draws, prop_draws, demo_cols, draw_cols)
        # Re-attach the envelope column from the source draws.
        split = split.merge(source_draws[demo_cols + ['envelope']],
                            how='left')
        # Map each proportion me_id onto its target cause_id; deaths measure.
        split['cause_id'] = split['modelable_entity_id'].replace(
            meid_cause_map)
        split['measure_id'] = 1
        for cid in split.cause_id.unique():
            cid_dir = '{}/{}'.format(output_dir, int(cid)).replace("\r", "")
            if not os.path.exists(cid_dir):
                makedirs_safely(cid_dir)
            out_file = '{}/death_{}.csv'.format(cid_dir, location_id)
            split.query('cause_id=={}'.format(cid)).to_csv(out_file,
                                                           index=False)
        return location_id, 0
    except Exception:
        # Never let a worker crash the pool; report the traceback instead.
        return location_id, traceback.format_exc()
def run_all(self, n_processes: int = 8):
    """Run the full pipeline end to end.

    Creates a process_version (with db tables), performs cause
    decomposition, saves to file, and uploads results to the db, then
    marks the process version active.

    Args:
        n_processes: how many parallel processes to run simultaneously.
            Keep in mind that each hits the database a couple of times.
    """
    version_id = self.create_process_version()
    print(f"Process version id {version_id} created on the "
          f"gbd {self.env.value} database.")
    makedirs_safely(self.write_dir)
    print(f"Output directory {self.write_dir} created.")
    self.run_decomp(n_processes=n_processes)
    self.upload()
    # Flip the freshly-uploaded version to ACTIVE so it becomes visible.
    version = GBDProcessVersion(version_id, env=self.env)
    version._update_status(gbd.gbd_process_version_status['ACTIVE'])
    print(f"Process version id {version_id} marked active. Run completed")
def split_epi_model(source_meid,
                    target_meids,
                    prop_meids,
                    split_measure_ids=None,
                    prop_meas_id=18,
                    gbd_round_id=gbd.GBD_ROUND_ID,
                    decomp_step=None,
                    output_dir=None):
    """
    Splits modelable entity draws based on proportion models. Outputs are
    new draw files for each proportion model, tagged with the target_meid.

    Arguments:
        source_meid(int): The id of the me to be split.
        target_meids(intlist): A list of modelable entity ids that will be
            used to tag the new draws. Essentially, these are the me_ids for
            the newly created models.
        prop_meids (intlist): A list of modelable entity ids corresponding
            to the proportion models used to do the split.

            Note: target_meids and proportion_meids need to have 1-1 parity
            with each other and are order dependent.

            Example:
                target_meids = [2891, 2892, 2893, 2894]
                proportion_meids = [1922, 1920, 1921, 1923]

            Proportion model 1922 will be used to create model 2891,
            proportion model 1920 will be used to create model 2892 and
            so on.
        split_measure_ids(int or intlist, optional): A list of measure_ids
            from the source model, to be split according to the proportion
            models. Default is [5, 6], prevalence and incidence.
        prop_meas_id(int, optional): The measure id used in the proportion
            models. Default is 18, proportion.
        gbd_round_id(int, optional): The gbd_round_id for the proportion
            models being used. This argument is used to retrieve the best
            model for a given round. Default is the current GBD round.
        decomp_step (str): Decomposition step. Allowed values are None,
            'iterative', 'step1', 'step2', 'step3', 'step4', and 'step5'
            depending on the value of gbd_round_id.
        output_dir(str, optional): The directory where the draw files are
            created. Subdirectories are created for each target modelable
            entity id, and files are tagged by location id.
            Example: output_dir/2891/35.h5 is a draw file for newly created
            model 2891, location_id 35.

    Returns:
        Pandas.DataFrame: The output directory where either the draws or an
        errors logfile can be found.

    Raises:
        IllegalSplitEpiArgument: If source_meid and proportion_measure_id
            are not integers. If target_meids and prop_meids are not lists
            of ints or split_measure_ids is not an int or list of ints.
            If target_meids and prop_meids do not have the same length.
        ValueError: If the decomp_step argument is invalid.
    """
    # Avoid a shared mutable default argument; [5, 6] (prevalence and
    # incidence) remains the effective default.
    if split_measure_ids is None:
        split_measure_ids = [5, 6]

    # Validate Arguments
    validate_decomp_step_input(decomp_step, gbd_round_id)
    validate_source_meid(source_meid)
    validate_measure_id(prop_meas_id)
    validate_meids(target_meids, 'Target')
    validate_meids(prop_meids, 'Proportion')
    validate_split_measure_ids(split_measure_ids)

    if len(target_meids) != len(prop_meids):
        raise IllegalSplitEpiArgument(
            "Target modelable_entity_ids and proportion modelable_entity_ids "
            "lists must represent a 1-to-1 mapping of modelable_entity_ids. "
            "Received: {t_ids} target ids and {p_ids} proportion ids.".format(
                t_ids=len(target_meids), p_ids=len(prop_meids)))

    if output_dir is None:
        output_dir = 'FILEPATH'
    # Best-effort directory creation: a pre-existing directory (or a race
    # with another process creating it) is not an error here.
    try:
        makedirs_safely(output_dir)
    except Exception:
        pass

    res = launch_epi_splits(source_meid, target_meids, prop_meids,
                            split_measure_ids, prop_meas_id, output_dir,
                            gbd_round_id, decomp_step)
    # Each result is (location_id, 0) on success or (location_id, error).
    errors = [r for r in res if r[1] != 0]
    if len(errors) == 0:
        return pd.DataFrame({'output_dir': output_dir}, index=[0])
    else:
        logfile = '{}/{}_errors.log'.format(output_dir, str(source_meid))
        with open(logfile, 'w') as f:
            estr = "\n".join([str(r) for r in errors])
            f.write(estr)
        return pd.DataFrame({'error_dir': logfile}, index=[0])
    # NOTE(review): continuation of an argument-parsing helper whose `def`
    # line (and the `parser = argparse.ArgumentParser(...)` setup,
    # presumably) sits outside this chunk — confirm against the full file.
    parser.add_argument('--gbd_round_id', type=int)
    parser.add_argument('--intermediate_dir', type=str)
    parser.add_argument('--decomp_step', type=str, default=None)
    args = parser.parse_args()
    return (args.gbd_id, args.proportion_measure_id, args.gbd_round_id,
            args.sex_id, args.intermediate_dir, args.decomp_step)


if __name__ == '__main__':
    # Script entry point: interpolate epi draws for one me_id/measure/sex
    # across the full reporting period and write them to outdir.
    (gbd_id, measure_id, gbd_round_id, sex_id, outdir,
     decomp_step) = parse_arguments()
    if not os.path.exists(outdir):
        makedirs_safely(outdir)
    # The round's final year bounds the interpolation window.
    end_year = int(gbd_round_from_gbd_round_id(gbd_round_id))
    df = interpolate(gbd_id=gbd_id,
                     gbd_id_type='modelable_entity_id',
                     source='epi',
                     measure_id=measure_id,
                     reporting_year_start=1980,
                     reporting_year_end=end_year,
                     sex_id=sex_id,
                     gbd_round_id=gbd_round_id,
                     decomp_step=decomp_step,
                     num_workers=30)
    # Identify the demographic/index columns; the chunk ends here — the
    # remainder of this __main__ block continues past this view.
    id_cols = [col for col in df.columns if col.endswith('_id')]
def _launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                       prop_meas_id, gbd_round_id, decomp_step, output_dir,
                       project):
    """
    Split the given source_cause_id given target_meid proportions, saved to
    the target_cause_ids in output_dir.

    Arguments:
        source_cause_id (int): cause_id for the draws to be split
        target_cause_ids (intlist): list of cause ids that you want the new
            outputted subcauses to be identified by
        target_meids (intlist): list of proportion models'
            modelable_entity_ids that you want the source_cause_id to be
            split by, to make the target_cause_ids. Target_cause_ids and
            target_me_ids must be specified in the same order
        prop_meas_id (int): The measure_id that identifies the proportion in
            the target_meids to use for the split.
        gbd_round_id (int): the gbd_round_id for models being split.
        decomp_step (str): Specifies which decomposition step the returned
            estimates should be from. If using interpolate for GBD round 6
            and above, must specify one of 'step1', 'step2', 'step3',
            'step4', 'step5', or 'iterative'.
        output_dir (str): directory where you want final results stored
        project (str): The SGE project to launch split_cod_model subjobs to
            using SplitCodSwarm.

    Returns:
        A list of tuples with each location_id paired with either 0, or an
        error message. This is then parsed in the central function
        draw_ops.split_cod_model into errors or success messages
    """
    # setup years, sex restrictions, most detailed locations, etc.
    if gbd_round_id >= 6:
        cause_set_id = COMPUTATION_CAUSE_SET_ID
    else:
        cause_set_id = REPORTING_CAUSE_SET_ID
    causes = get_cause_metadata(
        cause_set_id=cause_set_id,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step).query("cause_id==@source_cause_id")
    # Only split for sexes the source cause is not restricted to.
    sex_ids = []
    if causes['male'].item() != 0:
        sex_ids.append(1)
    if causes['female'].item() != 0:
        sex_ids.append(2)
    if not sex_ids:
        raise ValueError(
            "Source_cause_id {} is restricted for both males and females, "
            "according to cause metadata".format(source_cause_id))
    most_detailed_locs = list(
        get_location_metadata(35, gbd_round_id=gbd_round_id,
                              decomp_step=decomp_step).query(
                                  'most_detailed==1').location_id.unique())
    meid_cause_map = dict(zip(target_meids, target_cause_ids))

    # run interpolating/extrapolating
    intermediate_dir = os.path.join(output_dir,
                                    'intermediate_{}'.format(source_cause_id))
    if not os.path.exists(intermediate_dir):
        makedirs_safely(intermediate_dir)
    swarm = SplitCoDSwarm(source_id=source_cause_id,
                          proportion_ids=target_meids,
                          proportion_measure_id=prop_meas_id,
                          sex_ids=sex_ids,
                          gbd_round_id=gbd_round_id,
                          decomp_step=decomp_step,
                          intermediate_dir=intermediate_dir,
                          outdir=output_dir,
                          project=project)
    swarm.add_interpolate_tasks()
    exit_code = swarm.run()
    if exit_code != 0:
        raise RuntimeError(
            "Interpolating CoD years failed. Check logs in {}.".format(
                output_dir))

    # run splitting
    for cid in target_cause_ids:
        cid_dir = os.path.join(output_dir, str(cid))
        if not os.path.exists(cid_dir):
            makedirs_safely(cid_dir)
    file_list = glob.glob(os.path.join('{}/*.h5'.format(intermediate_dir)))

    # read in draws for source cause
    source = _get_draws(source_cause_id, gbd_round_id, decomp_step, sex_ids)

    # create a temporary directory to store all the draws from the source
    # cause; guarded by try/finally so it is cleaned up even if a worker or
    # the HDF write raises (previously it leaked on failure).
    tmpdir = tempfile.TemporaryDirectory(dir=output_dir)
    try:
        # save source cause draws to temporary directory
        source.to_hdf(
            os.path.join(tmpdir.name, 'source_cause_draws.h5'),
            key='draws',
            mode='w',
            format='table',
            data_columns=['location_id', 'year_id', 'sex_id',
                          'age_group_id'])
        run_splits = functools.partial(_parallel_merge_split, meid_cause_map,
                                       file_list, output_dir, tmpdir)
        pool = Pool(30)
        try:
            res = pool.map(run_splits, most_detailed_locs)
        finally:
            # Always release worker processes, even if map() raised.
            pool.close()
            pool.join()
    finally:
        # clean up tempdir
        tmpdir.cleanup()
    return res
def split_cod_model(source_cause_id,
                    target_cause_ids,
                    target_meids,
                    project,
                    prop_meas_id=18,
                    decomp_step=None,
                    gbd_round_id=gbd.GBD_ROUND_ID,
                    output_dir=None):
    """Split a CoD model into sub-cause draws using proportion models.

    Returns a dataframe containing only the name of the out_dir where this
    function will save your new draws.

    Arguments:
        source_cause_id (int): the cause_id to be split into
            target_cause_ids.
        target_cause_ids (intlist): the cause_ids that should be produced
            as a result of the split. These should be provided in the same
            order as the target_meids which define the split proportions.
        target_meids (intlist): the modelable_entity_ids containing the
            proportion models by which to split the source_cause_id. These
            should be provided in the same order as the target_cause_ids.
        project (str): the team-specific SGE cluster project to launch the
            subjobs of the split_cod_model swarm to.
        prop_meas_id (int): The measure_id that identifies the proportion
            in the target_meids to use for the split. Defaults to
            measure_id 18, proportion.
        decomp_step (str): Specifies which decomposition step the returned
            estimates should be from. Default to None. "Allowed values are
            None, iterative', 'step1', 'step2', 'step3', 'step4', and
            'step5' depending on the value of gbd_round_id."
        gbd_round_id (int): the gbd_round_id for models being split.
            Defaults to current GBD round id.
        output_dir (str): place where you want new draws to be saved.

    Returns:
        Pandas.DataFrame: The output directory where either the draws or an
        errors logfile can be found.

    Raises:
        IllegalSplitCoDArgument: If the source_cause_id, any of the
            target_cause_ids or target_meids are invalid, or if the lists
            of target causes and target meids are not one-to-one.
        RuntimeError: If any under-the-hood errors thrown directing the
            user to the log file.
    """
    # Validate all incoming arguments.
    validate_decomp_step_input(decomp_step, gbd_round_id)
    validate_ids(source_cause_id, VALID_CAUSE_IDS, 'source_cause_id')
    validate_ids(target_cause_ids, VALID_CAUSE_IDS, 'target_cause_ids')
    validate_ids(target_meids, VALID_MEIDS, 'proportion_meids')
    if len(target_cause_ids) != len(target_meids):
        raise IllegalSplitCoDArgument(
            "target_cause_ids and target_meids lists must represent a 1-to-1 "
            "mapping of cause_ids to modelable_entity_ids. Received: {t_ids} "
            "target ids and {p_ids} proportion ids.".format(
                t_ids=len(target_cause_ids), p_ids=len(target_meids)))

    if not output_dir:
        output_dir = 'FILEPATH'
    if not os.path.exists(output_dir):
        makedirs_safely(output_dir)

    results = _launch_cod_splits(source_cause_id, target_cause_ids,
                                 target_meids, prop_meas_id, gbd_round_id,
                                 decomp_step, output_dir, project)
    # Each result is (location_id, 0) on success or (location_id, error).
    failures = [r for r in results if r[1] != 0]
    if not failures:
        return pd.DataFrame({'output_dir': output_dir}, index=[0])

    # At least one location failed: dump the errors to a logfile and point
    # the caller at it.
    logfile = '{}/{}_errors.log'.format(output_dir, str(source_cause_id))
    with open(logfile, 'w') as f:
        f.write("\n".join(str(r) for r in failures))
    return pd.DataFrame({'error_dir': logfile}, index=[0])