def create_post_jobs(self):
    """Depends on GPR jobs. Calculates fit statistics and cleans up
    file folders."""
    for ko in range(self.holdouts + 1):
        for param_group in range(len(self.param_groups)):
            submit_params = ','.join(
                [str(x) for x in self.param_groups[param_group]])
            task = PythonTask(
                script=POST_SCRIPT,
                args=[
                    self.run_id, self.output_path, ko, self.run_type,
                    self.holdouts, submit_params
                ],
                name=f'post_{self.run_id}_{ko}_{param_group}',
                num_cores=1,
                m_mem_free='2G',
                max_runtime_seconds=300,
                max_attempts=2,
                tag='stgpr_post',
                queue='all.q',
                resource_scales=RESOURCE_SCALES,
                hard_limits=True)

            # add GPR upstreams
            for loc_group in range(self.nparallel):
                gp_label = 'gpr_{}_{}_{}_{}'.format(
                    self.run_id, ko, param_group, loc_group)
                upstream_job = self.gpr_jobs[gp_label]
                task.add_upstream(upstream_job)

            self.workflow.add_task(task)
            self.post_jobs[task.name] = task
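# RESOURCE_SCALES is shared by the ST-GPR tasks here but defined
# elsewhere; a minimal sketch, assuming jobmon's convention of bumping
# a resource by the given fraction on each retry (the values are
# illustrative, not the production settings):
RESOURCE_SCALES = {
    'm_mem_free': 0.5,           # retry with 50% more memory
    'max_runtime_seconds': 0.5,  # retry with 50% more runtime
}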
def create_descanso_jobs(self):
    """Depends on spacetime jobs; computes amplitude and NSV."""
    for ko in range(self.holdouts + 1):
        for param_group in range(len(self.param_groups)):
            submit_params = ','.join(
                [str(x) for x in self.param_groups[param_group]])
            runtime = 3600 if self.is_diet_model else 300
            task = PythonTask(
                script=IM_SCRIPT,
                args=[
                    self.run_id, self.output_path, ko, self.draws,
                    self.nparallel, submit_params
                ],
                name=f'descanso_{self.run_id}_{ko}_{param_group}',
                num_cores=1,
                m_mem_free='20G',  # upped from 5 to 20 for variance simulation
                max_runtime_seconds=runtime,
                max_attempts=2,
                tag='stgpr_amp_nsv',
                queue='all.q',
                resource_scales=RESOURCE_SCALES,
                hard_limits=True)

            # add ST upstreams
            for loc_group in range(self.nparallel):
                st_label = 'st_{}_{}_{}_{}'.format(
                    self.run_id, ko, param_group, loc_group)
                upstream_job = self.st_jobs[st_label]
                task.add_upstream(upstream_job)

            self.workflow.add_task(task)
            self.descanso_jobs[task.name] = task
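# The hyperparameters travel to the worker scripts as one comma-joined
# CLI argument (submit_params above). A round-trip sketch with made-up
# values:
param_group = [0, 3, 7]
submit_params = ','.join(str(x) for x in param_group)  # '0,3,7'
parsed = [int(x) for x in submit_params.split(',')]    # [0, 3, 7]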
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_malaria_{args.decomp_step}_{today_string}',
        name=f'anemia_malaria_{args.decomp_step}_{today_string}',
        description=f'Anemia: Malaria pre-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    # first submit the subtract clinical jobs
    subtract_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for loc in demo['location_id']:
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--location_id", loc,
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--out_dir", args.out_dir
            ],
            name=f"malaria_subtract_{loc}",
            tag="malaria_subtract",
            num_cores=2,
            m_mem_free="8G",
            max_attempts=3,
            max_runtime_seconds=60 * 60 * 3,
            queue='all.q')
        subtract_tasks.append(task)
    workflow.add_tasks(subtract_tasks)

    # once the new draws exist, save results
    for modelable_entity_id in [19390, 19394]:
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--modelable_entity_id", modelable_entity_id,
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--out_dir", args.out_dir
            ],
            name=f"malaria_save_{modelable_entity_id}",
            tag="malaria_save",
            upstream_tasks=subtract_tasks,
            num_cores=8,
            m_mem_free="100G",
            max_attempts=3,
            max_runtime_seconds=60 * 60 * 24,
            queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
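# parse_args() is not shown here; a minimal sketch of the interface
# main() relies on, inferred from the attribute accesses above:
import argparse

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('--decomp_step', type=str, required=True)
    parser.add_argument('--gbd_round_id', type=int, required=True)
    parser.add_argument('--out_dir', type=str, required=True)
    return parser.parse_args()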
def add_job(self, wf, job_dag, mvm):
    """
    Create a PythonTask, add it to the workflow, and update the
    DagNode to contain a reference to the task.

    Args:
        wf (jobmon.client.swarm.workflow.workflow): jobmon workflow
        job_dag (dict[str, DagNode]): a mapping of job name to DagNode
        mvm (cascade_ode.importer.Importer.model_version_meta): a
            dataframe of model settings

    The varnish job does the following:
        1. Uploads fits
        2. Uploads adjusted data
        3. Computes fit statistics
        4. Uploads fit statistics
        5. Attempts to generate diagnostic plots
        6. Computes and uploads finals (save-results)
        7. Updates the status of the model to finished
    """
    slots, memory, runtime = sge.cluster_limits(
        'varnish', mvm, details=self.details)
    environment_variables_to_add = settings['env_variables']
    environment_variables_to_add['OMP_NUM_THREADS'] = slots

    # Varnish memory allocation shouldn't change with job size, and
    # large memory allocations will prohibit scheduling.
    _, runtime = sge.update_ME_specific_allocations(
        modelable_entity_id=mvm.modelable_entity_id.unique()[0],
        memory=memory,
        runtime=runtime)

    upstream_tasks = []
    for upstream_jobname in self.upstream_jobs:
        upstream_tasks.extend(job_dag[upstream_jobname].task)

    task = PythonTask(
        script=self.script,
        args=self.args,
        name=self.job_name,
        upstream_tasks=upstream_tasks,
        env_variables=environment_variables_to_add,
        num_cores=slots,
        max_runtime_seconds=runtime,
        queue=queue_from_runtime(runtime),
        m_mem_free=f"{memory}G",
        max_attempts=self.max_attempts,
        tag='upload',
        j_resource=True)
    task.context_args = add_if_set(self.host_type)
    self.task.append(task)
    wf.add_task(task)
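# queue_from_runtime() is defined elsewhere; a sketch of the idea,
# assuming all.q enforces a runtime ceiling and long.q takes anything
# longer (the threshold is an assumption, not the cluster's actual
# limit):
def queue_from_runtime(runtime_seconds: int) -> str:
    ALL_Q_MAX_SECONDS = 3 * 24 * 60 * 60  # assumed 3-day ceiling
    return 'all.q' if runtime_seconds <= ALL_Q_MAX_SECONDS else 'long.q'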
def create_apply_correction_tasks(self) -> None:
    for sex in self.parameters.sex_ids:
        for location in self.parameters.most_detailed_location_ids:
            correct_task = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.CORRECT
                ),
                args=[
                    '--action', DAG.Tasks.Type.CORRECT,
                    '--parent_dir', self.parameters.parent_dir,
                    '--gbd_round_id', self.parameters.gbd_round_id,
                    '--version_id', self.parameters.version_id,
                    '--location_id', location,
                    '--sex_id', sex,
                    '--env_version_id', self.parameters.envelope_version_id
                ],
                name=DAG.Tasks.Name.APPLY_CORRECTION.format(
                    location=location, sex=sex
                ),
                num_cores=DAG.Tasks.Cores.APPLY_CORRECTION,
                m_mem_free=DAG.Tasks.Memory.APPLY_CORRECTION,
                upstream_tasks=[
                    self.task_map[DAG.Tasks.Type.CACHE][
                        DAG.Tasks.Name.CACHE_MORTALITY.format(
                            mort_process=MortalityInputs.ENVELOPE_DRAWS
                        )
                    ],
                    self.task_map[DAG.Tasks.Type.CACHE][
                        DAG.Tasks.Name.CACHE_MORTALITY.format(
                            mort_process=MortalityInputs.ENVELOPE_SUMMARY
                        )
                    ]
                ],
                max_runtime_seconds=DAG.Tasks.Runtime.APPLY_CORRECTION,
                tag=DAG.Tasks.Type.CORRECT
            )
            for mvid in self.parameters.best_model_version_ids:
                upstream_name = DAG.Tasks.Name.VALIDATE_DRAWS.format(
                    model_version_id=mvid
                )
                correct_task.add_upstream(
                    self.task_map[DAG.Tasks.Type.VALIDATE][upstream_name]
                )
            self.task_map[DAG.Tasks.Type.CORRECT][
                correct_task.name
            ] = correct_task
            self.workflow.add_task(correct_task)
def create_draw_validate_tasks(self) -> None:
    """
    Adds tasks to validate the input draws against the expected
    hypercube.

    Creates one job per model_version_id in our parameters of best
    models. Doesn't utilize any cached assets, so these jobs don't
    have any upstream dependencies.
    """
    for model_version_id in self.parameters.best_model_version_ids:
        draw_validate_task = PythonTask(
            script=os.path.join(
                self.code_dir, DAG.Executables.VALIDATE_DRAWS
            ),
            args=[
                '--version_id', self.parameters.version_id,
                '--machine_process', self.parameters.process,
                '--model_version_id', model_version_id
            ],
            name=DAG.Tasks.Name.VALIDATE_DRAWS.format(
                model_version_id=model_version_id
            ),
            num_cores=DAG.Tasks.Cores.VALIDATE_DRAWS,
            m_mem_free=DAG.Tasks.Memory.VALIDATE_DRAWS,
            max_runtime_seconds=DAG.Tasks.Runtime.VALIDATE_DRAWS,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.VALIDATE
        )
        self.task_map[DAG.Tasks.Type.VALIDATE][
            draw_validate_task.name
        ] = draw_validate_task
        self.workflow.add_task(draw_validate_task)
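# task_map is indexed as task_map[task_type][task_name] throughout the
# create_*_tasks methods; a plausible shape (an assumption -- the real
# constructor is not shown) is a two-level dict:
from collections import defaultdict

task_map = defaultdict(dict)  # task_type -> {task_name: PythonTask}
task_map['validate']['validate_draws_123'] = 'task'  # stand-in value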
def get_task(self, parent_dir, input_dir, input_file_pattern,
             modelable_entity_id, description, measure_id, year_id,
             decomp_step, n_draws) -> PythonTask:
    # make task
    name = self.get_task_name(modelable_entity_id)
    task = PythonTask(
        script=this_file,
        args=[
            "--parent_dir", parent_dir,
            "--input_dir", input_dir,
            "--input_file_pattern", input_file_pattern,
            "--modelable_entity_id", str(modelable_entity_id),
            "--description", description,
            "--measure_id", " ".join([str(x) for x in measure_id]),
            "--year_id", " ".join([str(x) for x in year_id]),
            "--decomp_step", decomp_step,
            "--n_draws", str(n_draws)
        ],
        name=name,
        num_cores=17,
        m_mem_free="80.0G",
        max_attempts=DAG.Tasks.MAX_ATTEMPTS,
        tag=DAG.Tasks.SAVE,
        queue=DAG.Tasks.QUEUE)
    return task
def create_cause_aggregation_tasks(self) -> None:
    for sex in self.parameters.sex_ids:
        for location in self.parameters.most_detailed_location_ids:
            agg_cause_task = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.CAUSE_AGG
                ),
                args=[
                    '--version_id', self.parameters.version_id,
                    '--parent_dir', self.parameters.parent_dir,
                    '--location_id', location,
                    '--sex_id', sex,
                ],
                name=DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                    location=location, sex=sex
                ),
                num_cores=DAG.Tasks.Cores.CAUSE_AGGREGATION,
                m_mem_free=DAG.Tasks.Memory.CAUSE_AGGREGATION,
                upstream_tasks=[
                    self.task_map[DAG.Tasks.Type.CORRECT][
                        DAG.Tasks.Name.APPLY_CORRECTION.format(
                            location=location, sex=sex
                        )
                    ]
                ],
                max_runtime_seconds=DAG.Tasks.Runtime.CAUSE_AGGREGATION,
                tag=DAG.Tasks.Type.CAUSE_AGG
            )
            self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                agg_cause_task.name
            ] = agg_cause_task
            self.workflow.add_task(agg_cause_task)
def generate_finalizer_task(self, upstream_tasks):
    job_hash_name = "finalizer_run_{}".format(
        self.no_shock_death_number_estimate_version)
    num_cores = 1
    m_mem_free = "2G"
    runfile = "{}/finalizer_run_all.py".format(self.finalizer_code_dir)

    # Args forwarded to the finalizer run_all.
    args = [
        "--shock_death_number_estimate_version",
        self.shock_death_number_estimate,
        "--shock_life_table_estimate_version",
        self.shock_life_table_estimate,
        "--no_shock_death_number_estimate_version",
        self.no_shock_death_number_estimate_version,
        "--no_shock_life_table_estimate_version",
        self.no_shock_life_table_estimate_version,
        "--username", self.user,
        "--gbd_year", self.gbd_year,
        "--mark_best", self.mark_best,
        "--shock_version_id", self.shock_aggregator_version,
        "--aggregate_locations", self.aggregate_locations,
        "--workflow_args", self.shock_death_number_estimate
    ]

    return PythonTask(
        name=job_hash_name,
        num_cores=num_cores,
        m_mem_free=m_mem_free,
        script=runfile,
        args=args,
        upstream_tasks=upstream_tasks,
        queue="long.q",
        max_runtime_seconds=180000,
        j_resource=True,
        max_attempts=1)
def create_gpr_jobs(self):
    # set runtime and memory based on draws
    gpr_runtime = 1200
    gpr_memory = 4
    if self.draws == 100:
        gpr_runtime = 1500
        gpr_memory = 7
    elif self.draws == 1000:
        gpr_runtime = 1800
        gpr_memory = 10
    if self.is_diet_model:
        gpr_runtime *= 3
        gpr_memory *= 3

    for ko in range(self.holdouts + 1):
        for param_group in range(len(self.param_groups)):
            upstream_job = self.descanso_jobs['descanso_{}_{}_{}'.format(
                self.run_id, ko, param_group)]
            for loc_group in range(self.nparallel):
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])
                jname = 'gpr_{}_{}_{}_{}'.format(
                    self.run_id, ko, param_group, loc_group)
                task = PythonTask(
                    script=GPR_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.draws,
                        submit_params, self.nparallel, loc_group
                    ],
                    name=jname,
                    num_cores=1,
                    m_mem_free=f'{gpr_memory}G',
                    max_runtime_seconds=gpr_runtime,
                    max_attempts=2,
                    tag='stgpr_gpr',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)
                task.add_upstream(upstream_job)
                self.workflow.add_task(task)
                self.gpr_jobs[task.name] = task
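# The draws -> resource ladder above, restated as a lookup for clarity
# (the helper name is ours, not the pipeline's; behavior matches the
# branches above):
def gpr_resources(draws: int, is_diet_model: bool) -> tuple:
    runtime, memory = {100: (1500, 7), 1000: (1800, 10)}.get(
        draws, (1200, 4))
    if is_diet_model:
        runtime, memory = runtime * 3, memory * 3
    return runtime, memory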
def add_job(self, wf, job_dag, mvm):
    """
    Create a PythonTask, add it to the workflow, and update the
    DagNode to contain a reference to the task.

    Args:
        wf (jobmon.client.swarm.workflow.workflow): jobmon workflow
        job_dag (dict[str, DagNode]): a mapping of job name to DagNode
        mvm (cascade_ode.importer.Importer.model_version_meta): a
            dataframe of model settings
    """
    num_children = self.details['num_children']
    slots, memory, runtime = sge.cluster_limits(
        'node', mvm, num_children, details=self.details)
    environment_variables_to_add = settings['env_variables']
    environment_variables_to_add['OMP_NUM_THREADS'] = slots

    memory, runtime = sge.update_ME_specific_allocations(
        modelable_entity_id=mvm.modelable_entity_id.unique()[0],
        memory=memory,
        runtime=runtime)

    upstream_tasks = []
    for upstream_jobname in self.upstream_jobs:
        upstream_tasks.extend(job_dag[upstream_jobname].task)

    task = PythonTask(
        script=self.script,
        args=self.args,
        name=self.job_name,
        upstream_tasks=upstream_tasks,
        env_variables=environment_variables_to_add,
        num_cores=slots,
        max_runtime_seconds=runtime,
        queue=queue_from_runtime(runtime),
        m_mem_free=f"{memory}G",
        max_attempts=self.max_attempts,
        tag='node',
        j_resource=False)
    task.context_args = add_if_set(self.host_type)
    self.task.append(task)
    wf.add_task(task)
def get_task(self, measure_id, location_id):
    upstream_tasks = []
    d = self.como_version.nonfatal_dimensions.get_simulation_dimensions(
        measure_id)
    # if location is not most detailed then it's dependent on loc agg
    if location_id not in d.index_dim.get_level("location_id"):
        location_set_version_id = self.agg_loc_set_map[location_id]
        for component in self.como_version.components:
            for year_id in self._year_set:
                for sex_id in [sex.MALE, sex.FEMALE]:
                    if not (component == "impairment"
                            and measure_id == measures.INCIDENCE):
                        task_name = LocationAggTaskFactory.get_task_name(
                            component=component,
                            year_id=year_id,
                            sex_id=sex_id,
                            measure_id=measure_id,
                            location_set_version_id=location_set_version_id
                        )
                        upstream_tasks.append(
                            self.task_registry[task_name])
    # otherwise it is dependent on simulation tasks or incidence
    else:
        if measure_id == measures.INCIDENCE:
            for sex_id in [sex.MALE, sex.FEMALE]:
                task_name = IncidenceTaskFactory.get_task_name(
                    location_id=location_id, sex_id=sex_id)
                upstream_tasks.append(self.task_registry[task_name])
        else:
            for year_id in self._year_set:
                for sex_id in [sex.MALE, sex.FEMALE]:
                    task_name = SimulationTaskFactory.get_task_name(
                        location_id=location_id,
                        sex_id=sex_id,
                        year_id=year_id)
                    upstream_tasks.append(self.task_registry[task_name])

    name = self.get_task_name(measure_id, location_id)
    task = PythonTask(
        script=THIS_FILE,
        args=[
            "--como_dir", self.como_version.como_dir,
            "--measure_id", measure_id,
            "--location_id", location_id
        ],
        name=name,
        upstream_tasks=upstream_tasks,
        num_cores=1,
        m_mem_free="100G",
        max_attempts=5,
        max_runtime_seconds=(60 * 60 * 6),
        tag="summary")
    self.task_registry[name] = task
    return task
def create_cleanup_jobs(self):
    """Saves rake summaries and removes tempfiles no longer needed."""
    runtime = 600 if self.draws == 0 else 7200
    for ko in range(self.holdouts + 1):
        task = PythonTask(
            script=CLEANUP_SCRIPT,
            args=[
                self.run_id, self.output_path, self.run_type, ko,
                self.draws
            ],
            name=f'clean_{self.run_id}_{ko}',
            num_cores=1,
            m_mem_free='1G',
            max_runtime_seconds=runtime,
            max_attempts=1,
            tag='stgpr_clean',
            queue='all.q')
        if ko == 0:
            for loc in self.subnat_locations:
                task.add_upstream(self.rake_jobs['rake_{}_{}'.format(
                    self.run_id, loc)])
        else:
            task.add_upstream(self.eval_jobs['eval_{}'.format(
                self.run_id)])
        self.workflow.add_task(task)
        self.cleanup_jobs[task.name] = task
def create_st_jobs(self):
    for ko in range(self.holdouts + 1):
        upstream_job = self.stage1_jobs['stage1_{}_{}'.format(
            self.run_id, ko)]
        for param_group in range(len(self.param_groups)):
            for loc_group in range(self.nparallel):
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])
                jname = 'st_{}_{}_{}_{}'.format(
                    self.run_id, ko, param_group, loc_group)
                memory = 50
                runtime = 1500
                if self.is_diet_model:
                    memory = 120
                    runtime = 28800  # 8 hours
                task = PythonTask(
                    script=ST_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko,
                        self.run_type, submit_params, self.nparallel,
                        loc_group
                    ],
                    name=jname,
                    num_cores=6,
                    m_mem_free=f'{memory}G',
                    max_attempts=3,
                    max_runtime_seconds=runtime,
                    tag='stgpr_spacetime',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)
                task.add_upstream(upstream_job)
                self.workflow.add_task(task)
                self.st_jobs[task.name] = task
def create_eval_jobs(self):
    """
    For hyperparameter selection runs only, determine the best
    hyperparameter set based on in-sample or out-of-sample RMSE:

    - run_type = "in_sample_selection"
    - run_type = "oos_selection"

    For runs with only one set of parameters, set best_param_set to
    the *only* param_set (param_set 0) for consistency in rake inputs.

    Lastly, collect the disparate fit_stats files and combine them
    into a single file, saved as fit_stats.csv, for all run types.
    """
    task = PythonTask(
        script=EVAL_SCRIPT,
        args=[
            self.run_id, self.output_path, self.run_type,
            self.holdouts, self.n_parameter_sets
        ],
        name=f'eval_{self.run_id}',
        num_cores=1,
        m_mem_free='500M',
        max_runtime_seconds=180,
        max_attempts=2,
        tag='stgpr_eval',
        queue='all.q',
        resource_scales=RESOURCE_SCALES,
        hard_limits=True)

    for ko in range(self.holdouts + 1):
        for param_group in range(len(self.param_groups)):
            post_label = 'post_{}_{}_{}'.format(
                self.run_id, ko, param_group)
            task.add_upstream(self.post_jobs[post_label])

    self.workflow.add_task(task)
    self.eval_jobs[task.name] = task
def get_task(self, component, year_id, sex_id, measure_id,
             location_set_version_id):
    loc_trees = dbtrees.loctree(
        location_set_version_id=location_set_version_id,
        return_many=True)

    # put all aggregate locations in a mapping dict for summary dependency
    agg_locs = []
    for loc_tree in loc_trees:
        agg_locs.extend([
            node for node in loc_tree.node_ids
            if node not in [x.id for x in loc_tree.leaves()]
        ])
    for location_id in agg_locs:
        self.agg_loc_set_map[location_id] = location_set_version_id

    # hunt down upstream task list; note the leaves are taken from the
    # last loc_tree of the loop above
    upstream_tasks = []
    for location_id in [node.id for node in loc_tree.leaves()]:
        if measure_id == measures.INCIDENCE:
            upstream_name = IncidenceTaskFactory.get_task_name(
                location_id, sex_id)
        elif (measure_id == measures.PREVALENCE
              and component in ["sequela", "injuries", "impairment"]):
            upstream_name = SimulationInputTaskFactory.get_task_name(
                location_id, sex_id)
        else:
            upstream_name = SimulationTaskFactory.get_task_name(
                location_id, sex_id, year_id)
        upstream_tasks.append(self.task_registry[upstream_name])

    name = self.get_task_name(component, year_id, sex_id, measure_id,
                              location_set_version_id)
    task = PythonTask(
        script=THIS_FILE,
        args=[
            "--como_dir", self.como_version.como_dir,
            "--component", component,
            "--year_id", year_id,
            "--sex_id", sex_id,
            "--measure_id", measure_id,
            "--location_set_version_id", location_set_version_id
        ],
        name=name,
        upstream_tasks=upstream_tasks,
        num_cores=4,
        m_mem_free="300G",
        max_attempts=5,
        max_runtime_seconds=(60 * 60 * 20),
        tag="loc_agg")
    self.task_registry[name] = task
    return task
def generate_run_mv_task(self, upstream_tasks, loc):
    job_hash_name = "run_mv_{}_{}".format(self.version_id, loc)
    num_cores = 5
    m_mem_free = "80G"
    runfile = "{}/predict_machine_vision.py".format(self.code_dir)
    args = ["--version_id", self.version_id, "--ihme_loc_id", loc]
    return PythonTask(
        name=job_hash_name,
        num_cores=num_cores,
        m_mem_free=m_mem_free,
        max_runtime_seconds=90000,
        j_resource=False,
        script=runfile,
        args=args,
        upstream_tasks=upstream_tasks,
        queue="all.q")
def get_task(self, location_id, sex_id, n_processes):
    name = self.get_task_name(location_id, sex_id)
    task = PythonTask(
        script=THIS_FILE,
        args=[
            "--como_dir", self.como_version.como_dir,
            "--location_id", location_id,
            "--sex_id", sex_id,
            "--n_processes", n_processes
        ],
        name=name,
        num_cores=25,
        m_mem_free="60G",
        max_attempts=5,
        max_runtime_seconds=(60 * 60 * 6),
        tag="inci")
    self.task_registry[name] = task
    return task
def create_calculate_ylls_tasks(self) -> None:
    """
    Adds tasks to calculate YLLs from our scaled draws by location,
    sex, and year.

    Dependent on the pred_ex cache and on completed cause aggregation
    for the respective locations and sexes.
    """
    for sex in self.parameters.sex_ids:
        for location in self.parameters.most_detailed_location_ids:
            calc_ylls = PythonTask(
                script=os.path.join(self.code_dir, DAG.Executables.YLLS),
                args=[
                    '--action', DAG.Tasks.Type.CALCULATE,
                    '--machine_process', self.parameters.process,
                    '--parent_dir', self.parameters.parent_dir,
                    '--location_id', location,
                    '--sex_id', sex
                ],
                name=DAG.Tasks.Name.CALC_YLLS.format(
                    location=location, sex=sex
                ),
                num_cores=DAG.Tasks.Cores.YLLS,
                m_mem_free=DAG.Tasks.Memory.YLLS,
                max_runtime_seconds=DAG.Tasks.Runtime.CALCULATE,
                upstream_tasks=[
                    self.task_map[DAG.Tasks.Type.CACHE][
                        DAG.Tasks.Name.CACHE_PRED_EX
                    ],
                    self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                        DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                            location=location, sex=sex
                        )
                    ]
                ],
                tag=DAG.Tasks.Type.CALCULATE
            )
            self.task_map[DAG.Tasks.Type.CALCULATE][
                calc_ylls.name
            ] = calc_ylls
            self.workflow.add_task(calc_ylls)
def create_imported_cases_jobs(self):
    """Generates the tasks and adds them to the task_dag."""
    # TODO: profile and revise core/mem allocation.
    for cause in self.cause_ids:
        task = PythonTask(
            script=os.path.join(self.code_dir, 'imported_cases.py'),
            args=[
                self.version_id,
                '--cause_id', cause,
                '--decomp_step', self.decomp_step,
                '--gbd_round_id', self.gbd_round_id,
                '--output_dir', self.out_dir
            ],
            name='imported_cases_{}_{}'.format(self.version_id, cause),
            num_cores=42,
            m_mem_free="100.0G",
            max_attempts=3,
            tag='imported_cases',
            queue='all.q')
        self.workflow.add_task(task)
def add_interpolate_tasks(self):
    for meid in self.proportion_ids:
        for sex in self.sex_ids:
            arglist = [
                '--gbd_id', meid,
                '--proportion_measure_id', self.proportion_measure_id,
                '--sex_id', sex,
                '--gbd_round_id', self.gbd_round_id,
                '--intermediate_dir', self.intermediate_dir
            ]
            if self.decomp_step:
                arglist.extend(['--decomp_step', self.decomp_step])
            task = PythonTask(
                script=os.path.join(self._CODEDIR, 'split_interp.py'),
                args=arglist,
                name='split_model_interpolate_{}_{}'.format(meid, sex),
                num_cores=30,
                m_mem_free='60G',
                max_runtime_seconds=14400,
                max_attempts=10)
            self.workflow.add_task(task)
def get_task(self, node, output_dir, location_id, year_id, sex_id,
             decomp_step, n_draws, dependency_list):
    # make task
    name = self.get_task_name(node, location_id, year_id, sex_id)
    task = PythonTask(
        script=this_file,
        args=[
            "--output_dir", output_dir,
            "--location_id", str(location_id),
            "--year_id", str(year_id),
            "--sex_id", str(sex_id),
            "--decomp_step", decomp_step,
            "--n_draws", str(n_draws)
        ],
        upstream_tasks=dependency_list,
        name=name,
        num_cores=10,
        m_mem_free="100.0G",
        max_attempts=DAG.Tasks.MAX_ATTEMPTS,
        tag=DAG.Tasks.SUPER_SQUEEZE,
        queue=DAG.Tasks.QUEUE)
    return task
def get_task(self, node, process_graph, output_dir, decomp_step,
             year_id, n_draws):
    dep_list = get_dependencies(node, process_graph, self.task_registry)
    # make task
    name = self.get_task_name(node)
    task = PythonTask(
        script=this_file,
        args=[
            "--process_name", node,
            "--output_dir", output_dir,
            "--decomp_step", decomp_step,
            "--year_id", " ".join([str(x) for x in year_id]),
            "--n_draws", str(n_draws)
        ],
        upstream_tasks=dep_list,
        name=name,
        num_cores=25,
        m_mem_free="300.0G",
        max_attempts=DAG.Tasks.MAX_ATTEMPTS,
        tag=DAG.Tasks.EX_ADJUST,
        queue=DAG.Tasks.QUEUE)
    return task
def get_task(self, location_id, sex_id, year_id, n_simulants,
             n_processes):
    # get upstream
    dep_name = SimulationInputTaskFactory.get_task_name(
        location_id, sex_id)
    dep = self.task_registry[dep_name]

    # make task
    name = self.get_task_name(location_id, sex_id, year_id)
    task = PythonTask(
        script=THIS_FILE,
        args=[
            "--como_dir", self.como_version.como_dir,
            "--location_id", location_id,
            "--sex_id", sex_id,
            "--year_id", year_id,
            "--n_simulants", n_simulants,
            "--n_processes", n_processes
        ],
        name=name,
        upstream_tasks=[dep],
        num_cores=25,
        m_mem_free="60G",
        max_attempts=5,
        max_runtime_seconds=(60 * 60 * 3),
        tag="sim")
    self.task_registry[name] = task
    return task
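# Illustrative wiring (factory name and values made up): the
# simulation-input task must already be in task_registry before
# get_task() is called, since the factory looks its dependency up by
# name.
sim_task = sim_task_factory.get_task(
    location_id=101, sex_id=1, year_id=2019,
    n_simulants=20000, n_processes=23)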
def create_location_aggregation_tasks(self) -> None:
    """
    Adds tasks to aggregate up the location hierarchy for each
    location set id, sex, year, and measure in our FauxCorrect run.

    Dependent on the location-, sex-, and year-specific scalar or
    calculate-YLLs tasks completing for the respective measure ids.
    """
    for location_set_id in self.parameters.location_set_ids:
        for year in self.parameters.year_ids:
            for measure in self.parameters.measure_ids:
                for loc_agg_type in LocationAggregation.Type.CODCORRECT:
                    if (FilePaths.UNSCALED_DIR in loc_agg_type
                            and measure == Measures.Ids.YLLS):
                        continue
                    agg_task = PythonTask(
                        script=os.path.join(
                            self.code_dir, DAG.Executables.LOC_AGG
                        ),
                        args=[
                            '--action', DAG.Tasks.Type.LOC_AGG,
                            '--parent_dir', self.parameters.parent_dir,
                            '--gbd_round_id', self.parameters.gbd_round_id,
                            '--aggregation_type', loc_agg_type,
                            '--location_set_id', location_set_id,
                            '--year_id', year,
                            '--measure_id', measure
                        ],
                        name=DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                            aggregation_type=loc_agg_type.replace("/", "_"),
                            location_set=location_set_id,
                            measure=measure,
                            year=year
                        ),
                        num_cores=DAG.Tasks.Cores.LOCATION_AGGREGATION,
                        m_mem_free=DAG.Tasks.Memory.LOCATION_AGGREGATION,
                        max_runtime_seconds=(
                            DAG.Tasks.Runtime.LOCATION_AGGREGATION
                        ),
                        queue=DAG.QUEUE,
                        upstream_tasks=[
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_REGIONAL_SCALARS
                            ]
                        ],
                        tag=DAG.Tasks.Type.LOC_AGG
                    )
                    # Attach upstream dependencies: all locations for
                    # sex and year.
                    for sex in self.parameters.sex_ids:
                        for loc in (
                                self.parameters.most_detailed_location_ids):
                            if measure == Measures.Ids.YLLS:
                                # If aggregating measure 4 (YLLs),
                                # attach calc ylls jobs as upstream.
                                agg_task.add_upstream(
                                    self.task_map[
                                        DAG.Tasks.Type.CALCULATE][
                                        DAG.Tasks.Name.CALC_YLLS.format(
                                            location=loc, sex=sex
                                        )
                                    ]
                                )
                            else:
                                # If measure is not 4 (YLLs), then add
                                # cause agg jobs as upstream.
                                agg_task.add_upstream(
                                    self.task_map[
                                        DAG.Tasks.Type.CAUSE_AGG][
                                        DAG.Tasks.Name.CAUSE_AGGREGATION
                                        .format(location=loc, sex=sex)
                                    ]
                                )
                    if location_set_id not in [
                            LocationSetId.OUTPUTS, LocationSetId.SDI,
                            LocationSetId.STANDARD
                    ]:
                        # If the location set is one of the special sets
                        # that central computation only aggregates at the
                        # end of a round, add the outputs location set as
                        # an upstream dependency so it finishes first.
                        agg_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.LOC_AGG][
                                DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                                    aggregation_type=(
                                        loc_agg_type.replace("/", "_")
                                    ),
                                    location_set=LocationSetId.OUTPUTS,
                                    measure=measure,
                                    year=year
                                )
                            ]
                        )
                    self.task_map[DAG.Tasks.Type.LOC_AGG][
                        agg_task.name
                    ] = agg_task
                    self.workflow.add_task(agg_task)
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_post_interp_temp_{args.decomp_step}_{today_string}',
        name=f'anemia_post_{args.decomp_step}_{today_string}',
        description=f'Anemia: Post-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    anemia_causes = ('hiv', 'pud', 'gastritis', 'esrd_dialysis', 'ckd3',
                     'ckd4', 'ckd5', 'cirrhosis')

    for anemia_cause in anemia_causes:
        # load in the info table
        info_df = pd.read_csv("FILEPATH")
        new_me_id_list = info_df['proportion_me'].tolist()

        # submit compute job for each me_id
        compute_prop_tasks = []
        for year in args.year_id:
            task = PythonTask(
                script="FILEPATH",
                args=[
                    "--year_id", year,
                    "--anemia_cause", anemia_cause,
                    "--gbd_round_id", args.gbd_round_id,
                    "--decomp_step", args.decomp_step,
                    "--out_dir", args.out_dir
                ],
                name=f"make_{anemia_cause}_props_{year}",
                tag="compute_props",
                num_cores=1,
                m_mem_free="12G",
                max_attempts=3,
                max_runtime_seconds=60 * 60 * 2,
                queue='all.q')
            compute_prop_tasks.append(task)
        workflow.add_tasks(compute_prop_tasks)

        # submit save result jobs after compute jobs finish
        for new_me_id in new_me_id_list:
            task = PythonTask(
                script="FILEPATH",
                args=[
                    "--modelable_entity_id", new_me_id,
                    "--year_ids", " ".join(
                        [str(yr) for yr in args.year_id]),
                    "--gbd_round_id", args.gbd_round_id,
                    "--decomp_step", args.decomp_step,
                    "--save_dir", 'FILEPATH'
                ],
                name=f"save_props_{new_me_id}",
                tag="save_props",
                upstream_tasks=compute_prop_tasks,
                num_cores=8,
                m_mem_free="90G",
                max_attempts=3,
                max_runtime_seconds=60 * 60 * 8,
                queue='long.q')
            workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
def create_cache_tasks(self) -> None:
    """
    Adds up to five cache tasks that must occur at the beginning of
    each FauxCorrect run:

        1. envelope
        2. population
        3. spacetime restrictions
        4. regional scalars
        5. pred_ex

    These jobs kick off the CoDCorrect process and do not have any
    upstream dependencies.
    """
    # Create mortality inputs cache jobs.
    for mort_process, memory in zip(
            MortalityInputs.ALL_INPUTS,
            DAG.Tasks.Memory.CACHE_ALL_MORTALITY_INPUTS):
        cache_mortality_task = PythonTask(
            script=os.path.join(
                self.code_dir, DAG.Executables.CACHE_MORT_INPUT
            ),
            args=[
                '--mort_process', mort_process,
                '--machine_process', self.parameters.process,
                '--version_id', self.parameters.version_id
            ],
            name=DAG.Tasks.Name.CACHE_MORTALITY.format(
                mort_process=mort_process
            ),
            num_cores=DAG.Tasks.Cores.CACHE_MORTALITY,
            m_mem_free=memory,
            max_runtime_seconds=DAG.Tasks.Runtime.CACHE_MORTALITY,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.CACHE
        )
        self.task_map[DAG.Tasks.Type.CACHE][cache_mortality_task.name] = (
            cache_mortality_task
        )
        self.workflow.add_task(cache_mortality_task)

    # Create cache_spacetime_restrictions job.
    cache_spacetime_restrictions = PythonTask(
        script=os.path.join(self.code_dir, DAG.Executables.CORRECT),
        args=[
            '--action', DAG.Tasks.Type.CACHE,
            '--parent_dir', self.parameters.parent_dir,
            '--gbd_round_id', self.parameters.gbd_round_id
        ],
        name=DAG.Tasks.Name.CACHE_SPACETIME,
        num_cores=DAG.Tasks.Cores.CACHE_SPACETIME_RESTRICTIONS,
        m_mem_free=DAG.Tasks.Memory.CACHE_SPACETIME_RESTRICTIONS,
        max_runtime_seconds=DAG.Tasks.Runtime.CACHE_SPACETIME_RESTRICTIONS,
        queue=DAG.QUEUE,
        tag=DAG.Tasks.Type.CACHE
    )
    self.task_map[DAG.Tasks.Type.CACHE][
        cache_spacetime_restrictions.name] = cache_spacetime_restrictions
    self.workflow.add_task(cache_spacetime_restrictions)

    # Create cache_regional_scalars job.
    cache_regional_scalars = PythonTask(
        script=os.path.join(self.code_dir, DAG.Executables.LOC_AGG),
        args=[
            '--action', DAG.Tasks.Type.CACHE,
            '--parent_dir', self.parameters.parent_dir,
            '--gbd_round_id', self.parameters.gbd_round_id
        ],
        name=DAG.Tasks.Name.CACHE_REGIONAL_SCALARS,
        num_cores=DAG.Tasks.Cores.CACHE_REGIONAL_SCALARS,
        m_mem_free=DAG.Tasks.Memory.CACHE_REGIONAL_SCALARS,
        max_runtime_seconds=DAG.Tasks.Runtime.CACHE_REGIONAL_SCALARS,
        queue=DAG.QUEUE,
        tag=DAG.Tasks.Type.CACHE
    )
    self.task_map[DAG.Tasks.Type.CACHE][cache_regional_scalars.name] = (
        cache_regional_scalars
    )
    self.workflow.add_task(cache_regional_scalars)

    if gbd.measures.YLL in self.parameters.measure_ids:
        # Create cache_pred_ex job.
        cache_pred_ex = PythonTask(
            script=os.path.join(self.code_dir, DAG.Executables.YLLS),
            args=[
                '--action', DAG.Tasks.Type.CACHE,
                '--parent_dir', self.parameters.parent_dir,
                '--gbd_round_id', self.parameters.gbd_round_id
            ],
            name=DAG.Tasks.Name.CACHE_PRED_EX,
            num_cores=DAG.Tasks.Cores.CACHE_PRED_EX,
            m_mem_free=DAG.Tasks.Memory.CACHE_PRED_EX,
            max_runtime_seconds=DAG.Tasks.Runtime.CACHE_PRED_EX,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.CACHE
        )
        self.task_map[DAG.Tasks.Type.CACHE][cache_pred_ex.name] = (
            cache_pred_ex
        )
        self.workflow.add_task(cache_pred_ex)
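# ALL_INPUTS and CACHE_ALL_MORTALITY_INPUTS are zipped pairwise above,
# so the two sequences must stay aligned; a sketch of the assumed
# shape (names and memory values illustrative):
ALL_INPUTS = ('envelope_draws', 'envelope_summary', 'population')
CACHE_ALL_MORTALITY_INPUTS = ('50G', '10G', '5G')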
def create_upload_tasks(self) -> None:
    """
    Adds tasks to upload summaries to gbd and / or cod databases.
    """
    # Determine if percent change is part of the workflow.
    if self.parameters.year_start_ids is not None:
        upload_types = ['single', 'multi']
    else:
        upload_types = ['single']

    # gbd upload tasks
    for measure in self.parameters.measure_ids:
        for upload_type in upload_types:
            upload_gbd_task = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.UPLOAD
                ),
                args=[
                    '--machine_process', self.parameters.process,
                    '--version_id', self.parameters.version_id,
                    '--database', DataBases.GBD,
                    '--measure_id', measure,
                    '--upload_type', upload_type
                ],
                name=DAG.Tasks.Name.UPLOAD.format(
                    database=DataBases.GBD,
                    uploadtype=upload_type,
                    measure=measure
                ),
                num_cores=DAG.Tasks.Cores.UPLOAD,
                m_mem_free=DAG.Tasks.Memory.UPLOAD,
                max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
                queue=DAG.UPLOAD_QUEUE,
                tag=DAG.Tasks.Type.UPLOAD
            )
            # Add gbd summarization tasks as upstream dependencies.
            if upload_type == 'single':
                for year in self.parameters.year_ids:
                    for location in self.parameters.location_ids:
                        upload_gbd_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.SUMMARIZE][
                                DAG.Tasks.Name.SUMMARIZE_GBD.format(
                                    measure=measure,
                                    location=location,
                                    year=year
                                )
                            ]
                        )
            else:
                for year_index in range(
                        len(self.parameters.year_start_ids)):
                    for location in self.parameters.location_ids:
                        upload_gbd_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.SUMMARIZE][
                                DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                                    measure=measure,
                                    location=location,
                                    year_start=(
                                        self.parameters.year_start_ids[
                                            year_index]),
                                    year_end=self.parameters.year_end_ids[
                                        year_index]
                                )
                            ]
                        )
            self.task_map[DAG.Tasks.Type.UPLOAD][
                upload_gbd_task.name
            ] = upload_gbd_task
            self.workflow.add_task(upload_gbd_task)

    # cod upload tasks - DEATHS only
    if DataBases.COD in self.parameters.databases:
        upload_cod_task = PythonTask(
            script=os.path.join(self.code_dir, DAG.Executables.UPLOAD),
            args=[
                '--machine_process', self.parameters.process,
                '--version_id', self.parameters.version_id,
                '--database', DataBases.COD,
                '--measure_id', Measures.Ids.DEATHS,
            ],
            name=DAG.Tasks.Name.UPLOAD.format(
                database=DataBases.COD,
                uploadtype='single',
                measure=Measures.Ids.DEATHS
            ),
            num_cores=DAG.Tasks.Cores.UPLOAD,
            m_mem_free=DAG.Tasks.Memory.UPLOAD,
            max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.UPLOAD
        )
        # Add cod summarization tasks as upstream dependencies.
        for year in self.parameters.year_ids:
            for location in self.parameters.location_ids:
                upload_cod_task.add_upstream(
                    self.task_map[DAG.Tasks.Type.SUMMARIZE][
                        DAG.Tasks.Name.SUMMARIZE_COD.format(
                            measure=Measures.Ids.DEATHS,
                            location=location,
                            year=year
                        )
                    ]
                )
        self.task_map[DAG.Tasks.Type.UPLOAD][
            upload_cod_task.name
        ] = upload_cod_task
        self.workflow.add_task(upload_cod_task)
def create_summarize_tasks(self) -> None:
    """
    Adds tasks to summarize draw level estimates for each location,
    year, and measure in our FauxCorrect run.

    Dependent on the mortality input caching tasks as well as the
    append shocks tasks.
    """
    for measure in self.parameters.measure_ids:
        for year in self.parameters.year_ids:
            for location in self.parameters.location_ids:
                # Create summarize tasks for gbd schema.
                summarize_gbd_task = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.SUMMARIZE_GBD
                    ),
                    args=[
                        '--parent_dir', self.parameters.parent_dir,
                        '--gbd_round_id', self.parameters.gbd_round_id,
                        '--location_id', location,
                        '--year_id', year,
                        '--measure_id', measure,
                        '--machine_process', self.parameters.process
                    ],
                    name=DAG.Tasks.Name.SUMMARIZE_GBD.format(
                        measure=measure, location=location, year=year
                    ),
                    num_cores=DAG.Tasks.Cores.SUMMARIZE,
                    m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                    max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                    queue=DAG.QUEUE,
                    upstream_tasks=[
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=MortalityInputs.ENVELOPE_DRAWS
                            )
                        ],
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=(
                                    MortalityInputs.ENVELOPE_SUMMARY
                                )
                            )
                        ],
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=MortalityInputs.POPULATION
                            )
                        ]
                    ],
                    tag=DAG.Tasks.Type.SUMMARIZE
                )
                for append_sex in self.parameters.sex_ids:
                    summarize_gbd_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.APPEND][
                            DAG.Tasks.Name.APPEND_SHOCKS.format(
                                location=location, sex=append_sex
                            )
                        ]
                    )
                self.task_map[DAG.Tasks.Type.SUMMARIZE][
                    summarize_gbd_task.name
                ] = summarize_gbd_task
                self.workflow.add_task(summarize_gbd_task)

                if (measure == Measures.Ids.YLLS
                        or DataBases.COD not in self.parameters.databases):
                    continue

                # Create summarize tasks for deaths and cod schema.
                summarize_cod_task = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.SUMMARIZE_COD
                    ),
                    args=[
                        '--version_id', self.parameters.version_id,
                        '--parent_dir', self.parameters.parent_dir,
                        '--gbd_round_id', self.parameters.gbd_round_id,
                        '--location_id', location,
                        '--year_id', year
                    ],
                    name=DAG.Tasks.Name.SUMMARIZE_COD.format(
                        measure=Measures.Ids.DEATHS,
                        location=location,
                        year=year
                    ),
                    num_cores=DAG.Tasks.Cores.SUMMARIZE,
                    m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                    max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                    queue=DAG.QUEUE,
                    upstream_tasks=[
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=MortalityInputs.ENVELOPE_DRAWS
                            )
                        ],
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=(
                                    MortalityInputs.ENVELOPE_SUMMARY
                                )
                            )
                        ],
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=MortalityInputs.POPULATION
                            )
                        ]
                    ],
                    tag=DAG.Tasks.Type.SUMMARIZE
                )
                for append_sex in self.parameters.sex_ids:
                    summarize_cod_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.APPEND][
                            DAG.Tasks.Name.APPEND_SHOCKS.format(
                                location=location, sex=append_sex
                            )
                        ]
                    )
                self.task_map[DAG.Tasks.Type.SUMMARIZE][
                    summarize_cod_task.name
                ] = summarize_cod_task
                self.workflow.add_task(summarize_cod_task)

        # pct_change summarization tasks
        if self.parameters.year_start_ids:
            for year_index in range(len(self.parameters.year_start_ids)):
                for pctc_location in self.parameters.location_ids:
                    summarize_pct_change_task = PythonTask(
                        script=os.path.join(
                            self.code_dir,
                            DAG.Executables.SUMMARIZE_PCT_CHANGE
                        ),
                        args=[
                            '--parent_dir', self.parameters.parent_dir,
                            '--gbd_round_id', self.parameters.gbd_round_id,
                            '--location_id', pctc_location,
                            '--year_start_id',
                            self.parameters.year_start_ids[year_index],
                            '--year_end_id',
                            self.parameters.year_end_ids[year_index],
                            '--measure_id', measure,
                            '--machine_process', self.parameters.process
                        ],
                        name=DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                            measure=measure,
                            location=pctc_location,
                            year_start=self.parameters.year_start_ids[
                                year_index],
                            year_end=self.parameters.year_end_ids[
                                year_index]
                        ),
                        num_cores=DAG.Tasks.Cores.PCT_CHANGE,
                        m_mem_free=DAG.Tasks.Memory.PCT_CHANGE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=[
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=(
                                        MortalityInputs.ENVELOPE_DRAWS
                                    )
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=(
                                        MortalityInputs.ENVELOPE_SUMMARY
                                    )
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=MortalityInputs.POPULATION
                                )
                            ]
                        ],
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
                    for append_sex in self.parameters.sex_ids:
                        summarize_pct_change_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.APPEND][
                                DAG.Tasks.Name.APPEND_SHOCKS.format(
                                    location=pctc_location,
                                    sex=append_sex
                                )
                            ]
                        )
                    self.task_map[DAG.Tasks.Type.SUMMARIZE][
                        summarize_pct_change_task.name
                    ] = summarize_pct_change_task
                    self.workflow.add_task(summarize_pct_change_task)
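# year_start_ids and year_end_ids are parallel lists defining the
# percent-change intervals summarized above, e.g. (illustrative):
year_start_ids = [1990, 2010]
year_end_ids = [2019, 2019]  # -> percent change for 1990-2019 and 2010-2019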
def create_append_shocks_tasks(self) -> None:
    """
    Adds tasks to append shocks to Deaths and YLLs by location, sex,
    and year.

    Dependent on completed correction application for the respective
    locations, sexes, and years. Also dependent on location
    aggregation.
    """
    for sex in self.parameters.sex_ids:
        for location in self.parameters.location_ids:
            most_detailed_location = (
                location in self.parameters.most_detailed_location_ids
            )
            append_shocks = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.APPEND_SHOCKS
                ),
                args=[
                    '--parent_dir', self.parameters.parent_dir,
                    '--machine_process', self.parameters.process,
                    '--measure_ids',
                    " ".join([str(x) for x in self.parameters.measure_ids]),
                    '--location_id', location,
                    '--most_detailed_location', most_detailed_location,
                    '--sex_id', sex
                ],
                name=DAG.Tasks.Name.APPEND_SHOCKS.format(
                    location=location, sex=sex
                ),
                num_cores=DAG.Tasks.Cores.APPEND_SHOCKS,
                m_mem_free=DAG.Tasks.Memory.APPEND_SHOCKS,
                max_runtime_seconds=DAG.Tasks.Runtime.APPEND,
                tag=DAG.Tasks.Type.APPEND
            )
            if (most_detailed_location
                    and Measures.Ids.YLLS in self.parameters.measure_ids):
                # Attach calc ylls jobs as upstream.
                append_shocks.add_upstream(
                    self.task_map[DAG.Tasks.Type.CALCULATE][
                        DAG.Tasks.Name.CALC_YLLS.format(
                            location=location, sex=sex
                        )
                    ]
                )
            elif (most_detailed_location
                  and Measures.Ids.YLLS not in self.parameters.measure_ids):
                # Add cause agg jobs as upstream.
                append_shocks.add_upstream(
                    self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                        DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                            location=location, sex=sex
                        )
                    ]
                )
            else:
                # Add location aggregation tasks as upstream dependencies.
                for location_set_id in self.parameters.location_set_ids:
                    for loc_agg_year in self.parameters.year_ids:
                        for loc_agg_measure in self.parameters.measure_ids:
                            for loc_agg_type in (
                                    LocationAggregation.Type.CODCORRECT):
                                if (FilePaths.UNSCALED_DIR in loc_agg_type
                                        and loc_agg_measure
                                        == Measures.Ids.YLLS):
                                    continue
                                append_shocks.add_upstream(
                                    self.task_map[DAG.Tasks.Type.LOC_AGG][
                                        DAG.Tasks.Name.LOCATION_AGGREGATION
                                        .format(
                                            aggregation_type=(
                                                loc_agg_type
                                                .replace("/", "_")
                                            ),
                                            location_set=location_set_id,
                                            measure=loc_agg_measure,
                                            year=loc_agg_year,
                                        )
                                    ]
                                )
            self.task_map[DAG.Tasks.Type.APPEND][
                append_shocks.name
            ] = append_shocks
            self.workflow.add_task(append_shocks)
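# A plausible driver for the create_*_tasks methods above; the call
# order is an assumption inferred from the upstream wiring (cache and
# validate jobs have no upstreams, everything else hangs off them):
dag.create_cache_tasks()
dag.create_draw_validate_tasks()
dag.create_apply_correction_tasks()
dag.create_cause_aggregation_tasks()
dag.create_calculate_ylls_tasks()
dag.create_location_aggregation_tasks()
dag.create_append_shocks_tasks()
dag.create_summarize_tasks()
dag.create_upload_tasks()
status = dag.workflow.run()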