Example #1
    def create_post_jobs(self):
        """Depends on rake jobs. Calculates fit stats
        and cleans up file folders, no mas."""

        for ko in range(self.holdouts + 1):
            for param_group in range(len(self.param_groups)):

                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                task = PythonTask(
                    script=POST_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.run_type,
                        self.holdouts, submit_params
                    ],
                    name=f'post_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    m_mem_free='2G',
                    max_runtime_seconds=300,
                    max_attempts=2,
                    tag='stgpr_post',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add GPR upstreams
                for loc_group in range(self.nparallel):
                    gp_label = 'gpr_{}_{}_{}_{}'.format(
                        self.run_id, ko, param_group, loc_group)
                    upstream_job = self.gpr_jobs[gp_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.post_jobs[task.name] = task
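Note: RESOURCE_SCALES is referenced throughout these ST-GPR snippets but never defined in them. A minimal sketch, assuming jobmon's convention of a dict mapping resource names to the fraction by which they grow on each retry (the values here are hypothetical, not taken from the source):

# Hypothetical module-level constant; jobmon scales the named resources by
# these fractions each time a task is retried after hitting a resource limit.
RESOURCE_SCALES = {
    'm_mem_free': 0.5,
    'max_runtime_seconds': 0.5,
}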
Example #2
    def create_descanso_jobs(self):
        """Depends on aggregate locations coming out of loc agg jobs"""
        for ko in range(self.holdouts + 1):
            for param_group in range(len(self.param_groups)):

                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                runtime = 3600 if self.is_diet_model else 300
                task = PythonTask(
                    script=IM_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.draws,
                        self.nparallel, submit_params
                    ],
                    name=f'descanso_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    m_mem_free='20G',  # upped from 5G to 20G for variance simulation
                    max_runtime_seconds=runtime,
                    max_attempts=2,
                    tag='stgpr_amp_nsv',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add ST upstreams
                for loc_group in range(self.nparallel):
                    st_label = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                       param_group, loc_group)
                    upstream_job = self.st_jobs[st_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.descanso_jobs[task.name] = task
Example #3
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_malaria_{args.decomp_step}_{today_string}',
        name=f'anemia_malaria_{args.decomp_step}_{today_string}',
        description=f'Anemia: Malaria pre-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    # first submit the subtract clinical jobs
    subtract_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for loc in demo['location_id']:
        task = PythonTask(script="FILEPATH",
                          args=[
                              "--location_id", loc, "--gbd_round_id",
                              args.gbd_round_id, "--decomp_step",
                              args.decomp_step, "--out_dir", args.out_dir
                          ],
                          name=f"malaria_subtract_{loc}",
                          tag="malaria_subtract",
                          num_cores=2,
                          m_mem_free="8G",
                          max_attempts=3,
                          max_runtime_seconds=60 * 60 * 3,
                          queue='all.q')
        subtract_tasks.append(task)
    workflow.add_tasks(subtract_tasks)

    # once the new draws exist, save results
    for modelable_entity_id in [19390, 19394]:
        task = PythonTask(script="FILEPATH",
                          args=[
                              "--modelable_entity_id", modelable_entity_id,
                              "--gbd_round_id", args.gbd_round_id,
                              "--decomp_step", args.decomp_step, "--out_dir",
                              args.out_dir
                          ],
                          name=f"malaria_save_{modelable_entity_id}",
                          tag="malaria_save",
                          upstream_tasks=subtract_tasks,
                          num_cores=8,
                          m_mem_free="100G",
                          max_attempts=3,
                          max_runtime_seconds=60 * 60 * 24,
                          queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
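The parse_args helper is not shown in this example. A minimal sketch, assuming it exposes only the arguments that main() reads (flag names and types are inferred from usage, not confirmed by the source):

import argparse

def parse_args() -> argparse.Namespace:
    """Hypothetical CLI for the anemia/malaria pre-processing workflow."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--decomp_step', type=str, required=True)
    parser.add_argument('--gbd_round_id', type=int, required=True)
    parser.add_argument('--out_dir', type=str, required=True)
    return parser.parse_args()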
Example #4
    def add_job(self, wf, job_dag, mvm):
        """
        Create a PythonTask, add it to the workflow, and update the DagNode to
        contain a reference to the task.

        Args:
            wf (jobmon.client.swarm.workflow.workflow): jobmon workflow
            job_dag (dict[str, DagNode]): a mapping of job name to DagNode
            mvm (cascade_ode.importer.Importer.model_version_meta): a dataframe
                of model settings

        The varnish job does the following:
           1. Uploads fits
           2. Uploads adjusted data
           3. Computes fit statistics
           4. Uploads fit statistics
           5. Attempts to generate diagnostic plots
           6. Computes and uploads finals (save-results)
           7. Updates the status of the model to finished
        """
        slots, memory, runtime = sge.cluster_limits('varnish',
                                                    mvm,
                                                    details=self.details)
        environment_variables_to_add = settings['env_variables']
        environment_variables_to_add['OMP_NUM_THREADS'] = slots

        # Varnish memory allocation shouldn't change with job size, and
        # large memory allocations will prohibit scheduling.
        _, runtime = sge.update_ME_specific_allocations(
            modelable_entity_id=mvm.modelable_entity_id.unique()[0],
            memory=memory,
            runtime=runtime)

        upstream_tasks = []
        for upstream_jobname in self.upstream_jobs:
            upstream_tasks.extend(job_dag[upstream_jobname].task)

        task = PythonTask(script=self.script,
                          args=self.args,
                          name=self.job_name,
                          upstream_tasks=upstream_tasks,
                          env_variables=environment_variables_to_add,
                          num_cores=slots,
                          max_runtime_seconds=runtime,
                          queue=queue_from_runtime(runtime),
                          m_mem_free=f"{memory}G",
                          max_attempts=self.max_attempts,
                          tag='upload',
                          j_resource=True)
        task.context_args = add_if_set(self.host_type)

        self.task.append(task)
        wf.add_task(task)
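The queue_from_runtime and add_if_set helpers are not shown. A hedged sketch of queue_from_runtime, assuming the queue is chosen purely from the requested runtime (the three-day threshold is a guess based on typical all.q limits, not taken from the source):

def queue_from_runtime(runtime_seconds):
    """Hypothetical: route long-running jobs to long.q, everything else to all.q."""
    three_days = 3 * 24 * 60 * 60
    return 'all.q' if runtime_seconds <= three_days else 'long.q'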
Example #5
    def create_apply_correction_tasks(self) -> None:
        for sex in self.parameters.sex_ids:
            for location in self.parameters.most_detailed_location_ids:
                correct_task = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.CORRECT
                    ),
                    args=[
                        '--action', DAG.Tasks.Type.CORRECT,
                        '--parent_dir', self.parameters.parent_dir,
                        '--gbd_round_id', self.parameters.gbd_round_id,
                        '--version_id', self.parameters.version_id,
                        '--location_id', location,
                        '--sex_id', sex,
                        '--env_version_id', self.parameters.envelope_version_id
                    ],
                    name=DAG.Tasks.Name.APPLY_CORRECTION.format(
                        location=location,
                        sex=sex
                    ),
                    num_cores=DAG.Tasks.Cores.APPLY_CORRECTION,
                    m_mem_free=DAG.Tasks.Memory.APPLY_CORRECTION,
                    upstream_tasks=[
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=MortalityInputs.ENVELOPE_DRAWS
                            )
                        ],
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_MORTALITY.format(
                                mort_process=(
                                    MortalityInputs.ENVELOPE_SUMMARY
                                )
                            )
                        ]
                    ],
                    max_runtime_seconds=DAG.Tasks.Runtime.APPLY_CORRECTION,
                    tag=DAG.Tasks.Type.CORRECT
                )
                for mvid in self.parameters.best_model_version_ids:
                    upstream_name = DAG.Tasks.Name.VALIDATE_DRAWS.format(
                        model_version_id=mvid
                    )
                    correct_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.VALIDATE][upstream_name]
                    )
                self.task_map[DAG.Tasks.Type.CORRECT][
                    correct_task.name
                ] = correct_task

                self.workflow.add_task(correct_task)
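The DAG constants used above (and in several later examples) are defined elsewhere. A hypothetical sketch of the shape they imply: nested namespaces of executables, name templates, cores, memory, and runtimes keyed by task type. All values below are invented for illustration only:

class DAG:
    QUEUE = 'all.q'

    class Executables:
        CORRECT = 'apply_correction.py'        # hypothetical script names
        VALIDATE_DRAWS = 'validate_draws.py'

    class Tasks:
        class Type:
            CACHE = 'cache'
            CORRECT = 'correct'
            VALIDATE = 'validate'

        class Name:
            APPLY_CORRECTION = 'apply_correction_{location}_{sex}'
            VALIDATE_DRAWS = 'validate_draws_{model_version_id}'
            CACHE_MORTALITY = 'cache_mortality_{mort_process}'

        class Cores:
            APPLY_CORRECTION = 1
            VALIDATE_DRAWS = 1

        class Memory:
            APPLY_CORRECTION = '15G'
            VALIDATE_DRAWS = '10G'

        class Runtime:
            APPLY_CORRECTION = 3600
            VALIDATE_DRAWS = 3600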
Example #6
    def create_draw_validate_tasks(self) -> None:
        """
        Adds tasks to validate the input draws against the expected hypercube.

        Creates one job per model_version_id in our parameters' best model versions.
        Doesn't utilize any cached assets, so these jobs don't have any
        upstream dependencies.
        """
        for model_version_id in self.parameters.best_model_version_ids:
            draw_validate_task = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.VALIDATE_DRAWS
                ),
                args=[
                    '--version_id', self.parameters.version_id,
                    '--machine_process', self.parameters.process,
                    '--model_version_id', model_version_id
                ],
                name=DAG.Tasks.Name.VALIDATE_DRAWS.format(
                    model_version_id=model_version_id
                ),
                num_cores=DAG.Tasks.Cores.VALIDATE_DRAWS,
                m_mem_free=DAG.Tasks.Memory.VALIDATE_DRAWS,
                max_runtime_seconds=DAG.Tasks.Runtime.VALIDATE_DRAWS,
                queue=DAG.QUEUE,
                tag=DAG.Tasks.Type.VALIDATE
            )
            self.task_map[DAG.Tasks.Type.VALIDATE][
                draw_validate_task.name
            ] = draw_validate_task
            self.workflow.add_task(draw_validate_task)
Example #7
    def get_task(self, parent_dir, input_dir, input_file_pattern,
                 modelable_entity_id, description, measure_id, year_id,
                 decomp_step, n_draws) -> PythonTask:
        # make task
        name = self.get_task_name(modelable_entity_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--parent_dir", parent_dir, "--input_dir",
                              input_dir, "--input_file_pattern",
                              input_file_pattern, "--modelable_entity_id",
                              str(modelable_entity_id), "--description",
                              description, "--measure_id",
                              " ".join([str(x) for x in measure_id]),
                              "--year_id", " ".join([str(x) for x in year_id]),
                              "--decomp_step", decomp_step, "--n_draws",
                              str(n_draws)
                          ],
                          name=name,
                          num_cores=17,
                          m_mem_free="80.0G",
                          max_attempts=DAG.Tasks.MAX_ATTEMPTS,
                          tag=DAG.Tasks.SAVE,
                          queue=DAG.Tasks.QUEUE)

        return task
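Not shown here: measure_id and year_id are passed as space-joined strings, which suggests the receiving script parses them back into lists. A hedged sketch, assuming argparse with nargs (flag names taken from the call above, everything else assumed):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--measure_id", type=int, nargs='+')
parser.add_argument("--year_id", type=int, nargs='+')
parser.add_argument("--n_draws", type=int)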
Example #8
    def create_cause_aggregation_tasks(self) -> None:
        for sex in self.parameters.sex_ids:
            for location in self.parameters.most_detailed_location_ids:
                agg_cause_task = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.CAUSE_AGG
                    ),
                    args=[
                        '--version_id', self.parameters.version_id,
                        '--parent_dir', self.parameters.parent_dir,
                        '--location_id', location,
                        '--sex_id', sex,
                    ],
                    name=DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                        location=location,
                        sex=sex
                    ),
                    num_cores=DAG.Tasks.Cores.CAUSE_AGGREGATION,
                    m_mem_free=DAG.Tasks.Memory.CAUSE_AGGREGATION,
                    upstream_tasks=[
                        self.task_map[DAG.Tasks.Type.CORRECT][
                            DAG.Tasks.Name.APPLY_CORRECTION.format(
                                location=location,
                                sex=sex
                            )
                        ]
                    ],
                    max_runtime_seconds=DAG.Tasks.Runtime.CAUSE_AGGREGATION,
                    tag=DAG.Tasks.Type.CAUSE_AGG
                )
                self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                    agg_cause_task.name
                ] = agg_cause_task
                self.workflow.add_task(agg_cause_task)
Example #9
    def generate_finalizer_task(self, upstream_tasks):
        job_hash_name = "finalizer_run_{}".format(
            self.no_shock_death_number_estimate_version)
        num_cores = 1
        m_mem_free = "2G"
        runfile = "{}/finalizer_run_all.py".format(self.finalizer_code_dir)

        # Arguments passed through to the finalizer run_all script
        args = [
            "--shock_death_number_estimate_version",
            self.shock_death_number_estimate,
            "--shock_life_table_estimate_version",
            self.shock_life_table_estimate,
            "--no_shock_death_number_estimate_version",
            self.no_shock_death_number_estimate_version,
            "--no_shock_life_table_estimate_version",
            self.no_shock_life_table_estimate_version, "--username", self.user,
            "--gbd_year", self.gbd_year, "--mark_best", self.mark_best,
            "--shock_version_id", self.shock_aggregator_version,
            "--aggregate_locations", self.aggregate_locations,
            "--workflow_args", self.shock_death_number_estimate
        ]

        return PythonTask(name=job_hash_name,
                          num_cores=num_cores,
                          m_mem_free=m_mem_free,
                          script=runfile,
                          args=args,
                          upstream_tasks=upstream_tasks,
                          queue="long.q",
                          max_runtime_seconds=180000,
                          j_resource=True,
                          max_attempts=1)
Example #10
    def create_gpr_jobs(self):
        # set runtime and memory based on draws
        gpr_runtime = 1200
        gpr_memory = 4
        if self.draws == 100:
            gpr_runtime = 1500
            gpr_memory = 7
        elif self.draws == 1000:
            gpr_runtime = 1800
            gpr_memory = 10

        if self.is_diet_model:
            gpr_runtime *= 3
            gpr_memory *= 3

        for ko in range(self.holdouts + 1):
            for param_group in range(len(self.param_groups)):
                upstream_job = self.descanso_jobs['descanso_{}_{}_{}'.format(
                    self.run_id, ko, param_group)]
                for loc_group in range(self.nparallel):

                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])

                    jname = 'gpr_{}_{}_{}_{}'.format(self.run_id, ko,
                                                     param_group, loc_group)

                    task = PythonTask(script=GPR_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.draws, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=1,
                                      m_mem_free=f'{gpr_memory}G',
                                      max_runtime_seconds=gpr_runtime,
                                      max_attempts=2,
                                      tag='stgpr_gpr',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.gpr_jobs[task.name] = task
Example #11
    def add_job(self, wf, job_dag, mvm):
        """
        Create a PythonTask, add it to the workflow, and update the DagNode to
        contain a reference to the task.

        Args:
            wf (jobmon.client.swarm.workflow.workflow): jobmon workflow
            job_dag (dict[str, DagNode]): a mapping of job name to DagNode
            mvm (cascade_ode.importer.Importer.model_version_meta): a dataframe
                of model settings
        """
        num_children = self.details['num_children']
        slots, memory, runtime = sge.cluster_limits('node',
                                                    mvm,
                                                    num_children,
                                                    details=self.details)
        environment_variables_to_add = settings['env_variables']
        environment_variables_to_add['OMP_NUM_THREADS'] = slots

        memory, runtime = sge.update_ME_specific_allocations(
            modelable_entity_id=mvm.modelable_entity_id.unique()[0],
            memory=memory,
            runtime=runtime)

        upstream_tasks = []
        for upstream_jobname in self.upstream_jobs:
            upstream_tasks.extend(job_dag[upstream_jobname].task)

        task = PythonTask(script=self.script,
                          args=self.args,
                          name=self.job_name,
                          upstream_tasks=upstream_tasks,
                          env_variables=environment_variables_to_add,
                          num_cores=slots,
                          max_runtime_seconds=runtime,
                          queue=queue_from_runtime(runtime),
                          m_mem_free=f"{memory}G",
                          max_attempts=self.max_attempts,
                          tag='node',
                          j_resource=False)
        task.context_args = add_if_set(self.host_type)

        self.task.append(task)
        wf.add_task(task)
Example #12
    def get_task(self, measure_id, location_id):
        upstream_tasks = []
        d = self.como_version.nonfatal_dimensions.get_simulation_dimensions(
            measure_id)

        # if location is not most detailed then it's dependent on loc agg
        if location_id not in d.index_dim.get_level("location_id"):
            location_set_version_id = self.agg_loc_set_map[location_id]
            for component in self.como_version.components:
                for year_id in self._year_set:
                    for sex_id in [sex.MALE, sex.FEMALE]:
                        if not (component == "impairment" and
                                measure_id == measures.INCIDENCE):
                            task_name = LocationAggTaskFactory.get_task_name(
                                component=component,
                                year_id=year_id,
                                sex_id=sex_id,
                                measure_id=measure_id,
                                location_set_version_id=location_set_version_id
                            )
                            upstream_tasks.append(self.task_registry[task_name])
        # otherwise it is dependent on simulation tasks or incidence
        else:
            if measure_id == measures.INCIDENCE:
                for sex_id in [sex.MALE, sex.FEMALE]:
                    task_name = IncidenceTaskFactory.get_task_name(
                        location_id=location_id,
                        sex_id=sex_id)
                    upstream_tasks.append(self.task_registry[task_name])
            else:
                for year_id in self._year_set:
                    for sex_id in [sex.MALE, sex.FEMALE]:
                        task_name = SimulationTaskFactory.get_task_name(
                            location_id=location_id,
                            sex_id=sex_id,
                            year_id=year_id)
                        upstream_tasks.append(self.task_registry[task_name])

        name = self.get_task_name(measure_id, location_id)
        task = PythonTask(
            script=THIS_FILE,
            args=[
                "--como_dir", self.como_version.como_dir,
                "--measure_id", measure_id,
                "--location_id", location_id
            ],
            name=name,
            upstream_tasks=upstream_tasks,
            num_cores=1,
            m_mem_free="100G",
            max_attempts=5,
            max_runtime_seconds=(60 * 60 * 6),
            tag="summary")
        self.task_registry[name] = task
        return task
Example #13
    def create_cleanup_jobs(self):
        """Saves rake summaries and removes
        tempfiles no longer needed"""
        runtime = 600 if self.draws == 0 else 7200

        for ko in range(self.holdouts + 1):
            task = PythonTask(script=CLEANUP_SCRIPT,
                              args=[
                                  self.run_id, self.output_path, self.run_type,
                                  ko, self.draws
                              ],
                              name=f'clean_{self.run_id}_{ko}',
                              num_cores=1,
                              m_mem_free='1G',
                              max_runtime_seconds=runtime,
                              max_attempts=1,
                              tag='stgpr_clean',
                              queue='all.q')

            if ko == 0:
                for loc in self.subnat_locations:
                    task.add_upstream(self.rake_jobs['rake_{}_{}'.format(
                        self.run_id, loc)])
            else:
                task.add_upstream(self.eval_jobs['eval_{}'.format(
                    self.run_id)])

            self.workflow.add_task(task)
            self.cleanup_jobs[task.name] = task
Example #14
    def create_st_jobs(self):
        for ko in range(0, self.holdouts + 1):
            upstream_job = self.stage1_jobs['stage1_{}_{}'.format(
                self.run_id, ko)]
            for param_group in range(0, len(self.param_groups)):
                for loc_group in range(0, self.nparallel):

                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])
                    jname = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group, loc_group)

                    memory = 50
                    runtime = 1500
                    if self.is_diet_model:
                        memory = 120
                        runtime = 28800  # 8 hours

                    task = PythonTask(script=ST_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.run_type, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=6,
                                      m_mem_free=f'{memory}G',
                                      max_attempts=3,
                                      max_runtime_seconds=runtime,
                                      tag='stgpr_spacetime',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.st_jobs[task.name] = task
Example #15
    def create_eval_jobs(self):
        """
        For hyperparameter selection runs only, determine best hyperparameter
        set based on in-sample or out-of-sample RMSE.
        - run_type = "in_sample_selection"
        - run_type = "oos_selection"

        For runs with only one set of parameters, set the best_param_set
        to the *only* param_set (param_set 0) for consistency in rake inputs.

        Lastly, collect the separate fit_stats files and combine them into
        a single file, saved as fit_stats.csv, for all run types.
        """

        task = PythonTask(script=EVAL_SCRIPT,
                          args=[
                              self.run_id, self.output_path, self.run_type,
                              self.holdouts, self.n_parameter_sets
                          ],
                          name=f'eval_{self.run_id}',
                          num_cores=1,
                          m_mem_free='500M',
                          max_runtime_seconds=180,
                          max_attempts=2,
                          tag='stgpr_eval',
                          queue='all.q',
                          resource_scales=RESOURCE_SCALES,
                          hard_limits=True)

        for ko in range(self.holdouts + 1):
            for param_group in range(len(self.param_groups)):
                post_label = 'post_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group)
                task.add_upstream(self.post_jobs[post_label])

        self.workflow.add_task(task)
        self.eval_jobs[task.name] = task
Example #16
    def get_task(self, component, year_id, sex_id, measure_id,
                 location_set_version_id):
        loc_trees = dbtrees.loctree(
            location_set_version_id=location_set_version_id, return_many=True)
        # put all aggregate locations in a mapping dict for summary dependency
        agg_locs = []
        for loc_tree in loc_trees:
            agg_locs.extend([
                node for node in loc_tree.node_ids
                if node not in [x.id for x in loc_tree.leaves()]
            ])
        for location_id in agg_locs:
            self.agg_loc_set_map[location_id] = location_set_version_id

        # hunt down upstream task list
        upstream_tasks = []
        for location_id in [node.id for node in loc_tree.leaves()]:
            if measure_id == measures.INCIDENCE:
                upstream_name = IncidenceTaskFactory.get_task_name(
                    location_id, sex_id)
            elif measure_id == measures.PREVALENCE and component in [
                    "sequela", "injuries", "impairment"
            ]:
                upstream_name = SimulationInputTaskFactory.get_task_name(
                    location_id, sex_id)
            else:
                upstream_name = SimulationTaskFactory.get_task_name(
                    location_id, sex_id, year_id)
            upstream_tasks.append(self.task_registry[upstream_name])

        name = self.get_task_name(component, year_id, sex_id, measure_id,
                                  location_set_version_id)
        task = PythonTask(script=THIS_FILE,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--component", component, "--year_id", year_id,
                              "--sex_id", sex_id, "--measure_id", measure_id,
                              "--location_set_version_id",
                              location_set_version_id
                          ],
                          name=name,
                          upstream_tasks=upstream_tasks,
                          num_cores=4,
                          m_mem_free="300G",
                          max_attempts=5,
                          max_runtime_seconds=(60 * 60 * 20),
                          tag="loc_agg")
        self.task_registry[name] = task
        return task
Example #17
    def generate_run_mv_task(self, upstream_tasks, loc):
        job_hash_name = "run_mv_{}_{}".format(self.version_id, loc)
        num_cores = 5
        m_mem_free = "80G"
        runfile = "{}/predict_machine_vision.py".format(self.code_dir)
        args = ["--version_id", self.version_id, "--ihme_loc_id", loc]
        return PythonTask(name=job_hash_name,
                          num_cores=num_cores,
                          m_mem_free=m_mem_free,
                          max_runtime_seconds=90000,
                          j_resource=False,
                          script=runfile,
                          args=args,
                          upstream_tasks=upstream_tasks,
                          queue="all.q")
Example #18
    def get_task(self, location_id, sex_id, n_processes):
        name = self.get_task_name(location_id, sex_id)
        task = PythonTask(script=THIS_FILE,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--location_id", location_id, "--sex_id", sex_id,
                              "--n_processes", n_processes
                          ],
                          name=name,
                          num_cores=25,
                          m_mem_free="60G",
                          max_attempts=5,
                          max_runtime_seconds=(60 * 60 * 6),
                          tag="inci")
        self.task_registry[name] = task
        return task
Example #19
    def create_calculate_ylls_tasks(self) -> None:
        """
        Adds tasks to calculate YLLs from our scaled draws by location, sex,
        and year.

        Dependent on completed scalar application for respective locations,
        sexes, and years.
        """
        for sex in self.parameters.sex_ids:
            for location in self.parameters.most_detailed_location_ids:
                calc_ylls = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.YLLS
                    ),
                    args=[
                        '--action', DAG.Tasks.Type.CALCULATE,
                        '--machine_process', self.parameters.process,
                        '--parent_dir', self.parameters.parent_dir,
                        '--location_id', location,
                        '--sex_id', sex
                    ],
                    name=DAG.Tasks.Name.CALC_YLLS.format(
                        location=location,
                        sex=sex
                    ),
                    num_cores=DAG.Tasks.Cores.YLLS,
                    m_mem_free=DAG.Tasks.Memory.YLLS,
                    max_runtime_seconds=DAG.Tasks.Runtime.CALCULATE,
                    upstream_tasks=[
                        self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_PRED_EX
                        ],
                        self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                            DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                                location=location,
                                sex=sex
                            )
                        ]
                    ],
                    tag=DAG.Tasks.Type.CALCULATE
                )
                self.task_map[DAG.Tasks.Type.CALCULATE][
                    calc_ylls.name
                ] = calc_ylls
                self.workflow.add_task(calc_ylls)
Example #20
    def create_imported_cases_jobs(self):
        """Generates the tasks and adds them to the task_dag."""
        # TODO: profile and revise core/mem allocation.
        for cause in self.cause_ids:
            task = PythonTask(
                script=os.path.join(self.code_dir, 'imported_cases.py'),
                args=[self.version_id,
                      '--cause_id', cause,
                      '--decomp_step', self.decomp_step,
                      '--gbd_round_id', self.gbd_round_id,
                      '--output_dir', self.out_dir],
                name='imported_cases_{}_{}'.format(self.version_id, cause),
                num_cores=42,
                m_mem_free="100.0G",
                max_attempts=3,
                tag='imported_cases',
                queue='all.q')
            self.workflow.add_task(task)
Example #21
    def add_interpolate_tasks(self):
        for meid in self.proportion_ids:
            for sex in self.sex_ids:
                arglist = [
                    '--gbd_id', meid, '--proportion_measure_id',
                    self.proportion_measure_id, '--sex_id', sex,
                    '--gbd_round_id', self.gbd_round_id, '--intermediate_dir',
                    self.intermediate_dir
                ]
                if self.decomp_step:
                    arglist.extend(['--decomp_step', self.decomp_step])

                task = PythonTask(script=os.path.join(self._CODEDIR,
                                                      'split_interp.py'),
                                  args=arglist,
                                  name='split_model_interpolate_{}_{}'.format(
                                      meid, sex),
                                  num_cores=30,
                                  m_mem_free='60G',
                                  max_runtime_seconds=14400,
                                  max_attempts=10)
                self.workflow.add_task(task)
Example #22
    def get_task(self, node, output_dir, location_id, year_id, sex_id,
                 decomp_step, n_draws, dependency_list):

        # make task
        name = self.get_task_name(node, location_id, year_id, sex_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--output_dir", output_dir, "--location_id",
                              str(location_id), "--year_id",
                              str(year_id), "--sex_id",
                              str(sex_id), "--decomp_step", decomp_step,
                              "--n_draws",
                              str(n_draws)
                          ],
                          upstream_tasks=dependency_list,
                          name=name,
                          num_cores=10,
                          m_mem_free="100.0G",
                          max_attempts=DAG.Tasks.MAX_ATTEMPTS,
                          tag=DAG.Tasks.SUPER_SQUEEZE,
                          queue=DAG.Tasks.QUEUE)

        return task
Example #23
    def get_task(self, node, process_graph, output_dir, decomp_step,
                 year_id, n_draws):

        dep_list = get_dependencies(node, process_graph, self.task_registry)
        # make task
        name = self.get_task_name(node)
        task = PythonTask(
            script=this_file,
            args=[
                "--process_name", node,
                "--output_dir", output_dir,
                "--decomp_step", decomp_step,
                "--year_id", " ".join([str(x) for x in year_id]),
                "--n_draws", str(n_draws)
            ],
            upstream_tasks=dep_list,
            name=name,
            num_cores=25,
            m_mem_free="300.0G",
            max_attempts=DAG.Tasks.MAX_ATTEMPTS,
            tag=DAG.Tasks.EX_ADJUST,
            queue=DAG.Tasks.QUEUE)

        return task
Example #24
    def get_task(self, location_id, sex_id, year_id, n_simulants, n_processes):
        # get upstream
        dep_name = SimulationInputTaskFactory.get_task_name(
            location_id, sex_id)
        dep = self.task_registry[dep_name]

        # make task
        name = self.get_task_name(location_id, sex_id, year_id)
        task = PythonTask(script=THIS_FILE,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--location_id", location_id, "--sex_id", sex_id,
                              "--year_id", year_id, "--n_simulants",
                              n_simulants, "--n_processes", n_processes
                          ],
                          name=name,
                          upstream_tasks=[dep],
                          num_cores=25,
                          m_mem_free="60G",
                          max_attempts=5,
                          max_runtime_seconds=(60 * 60 * 3),
                          tag="sim")
        self.task_registry[name] = task
        return task
Example #25
    def create_location_aggregation_tasks(self) -> None:
        """
        Adds tasks to aggregate up the location hierarchy for each location set
        id, sex, year, and measure in our FauxCorrect run.

        Dependent on each location, sex, and year specific group of scalar or
        calculate ylls tasks to be completed for their respective measure ids.
        """
        for location_set_id in self.parameters.location_set_ids:
            for year in self.parameters.year_ids:
                for measure in self.parameters.measure_ids:
                    for loc_agg_type in LocationAggregation.Type.CODCORRECT:
                        if (
                            FilePaths.UNSCALED_DIR in loc_agg_type
                            and
                            measure == Measures.Ids.YLLS
                        ):
                            continue
                        agg_task = PythonTask(
                            script=os.path.join(
                                self.code_dir,
                                DAG.Executables.LOC_AGG
                            ),
                            args=[
                                '--action', DAG.Tasks.Type.LOC_AGG,
                                '--parent_dir', self.parameters.parent_dir,
                                '--gbd_round_id', (
                                    self.parameters.gbd_round_id
                                ),
                                '--aggregation_type', loc_agg_type,
                                '--location_set_id', location_set_id,
                                '--year_id', year,
                                '--measure_id', measure
                            ],
                            name=DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                                aggregation_type=(
                                    loc_agg_type.replace("/","_")
                                ),
                                location_set=location_set_id,
                                measure=measure,
                                year=year
                            ),
                            num_cores=DAG.Tasks.Cores.LOCATION_AGGREGATION,
                            m_mem_free=(
                                DAG.Tasks.Memory.LOCATION_AGGREGATION
                            ),
                            max_runtime_seconds=(
                                DAG.Tasks.Runtime.LOCATION_AGGREGATION
                            ),
                            queue=DAG.QUEUE,
                            upstream_tasks=[
                                self.task_map[DAG.Tasks.Type.CACHE][
                                    DAG.Tasks.Name.CACHE_REGIONAL_SCALARS
                                ]
                            ],
                            tag=DAG.Tasks.Type.LOC_AGG
                        )
                        # Attach upstream dependencies, all locations for
                        # sex and year.
                        for sex in self.parameters.sex_ids:
                            for loc in (
                                self.parameters.most_detailed_location_ids
                            ):
                                if measure == Measures.Ids.YLLS:
                                    # If aggregating measure 4 (YLLs), attach
                                    # calc ylls jobs as upstream
                                    agg_task.add_upstream(
                                        self.task_map[DAG.Tasks.Type.CALCULATE][
                                            DAG.Tasks.Name.CALC_YLLS.format(
                                                location=loc,
                                                sex=sex
                                            )
                                        ]
                                    )
                                else:
                                    # If measure is not 4 (YLLs), then add
                                    # cause agg jobs as upstream.
                                    agg_task.add_upstream(
                                        self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                                            DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                                                location=loc,
                                                sex=sex
                                            )
                                        ]
                                    )
                        if location_set_id not in [
                            LocationSetId.OUTPUTS, LocationSetId.SDI,
                            LocationSetId.STANDARD
                        ]:
                            # If location set is one of the special
                            # sets that central computation only aggregates
                            # at the end of a round, add the outputs location
                            # set as upstream dependency so it finishes first.
                            agg_task.add_upstream(
                                self.task_map[DAG.Tasks.Type.LOC_AGG][
                                    DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                                        aggregation_type=(
                                            loc_agg_type.replace("/","_")
                                        ),
                                        location_set=LocationSetId.OUTPUTS,
                                        measure=measure,
                                        year=year
                                    )
                                ]
                            )
                        self.task_map[DAG.Tasks.Type.LOC_AGG][
                            agg_task.name
                        ] = agg_task

                        self.workflow.add_task(agg_task)
Example #26
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_post_interp_temp_{args.decomp_step}_{today_string}',
        name=f'anemia_post_{args.decomp_step}_{today_string}',
        description=f'Anemia: Post-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    anemia_causes = ('hiv', 'pud', 'gastritis', 'esrd_dialysis', 'ckd3',
                     'ckd4', 'ckd5', 'cirrhosis')

    for anemia_cause in anemia_causes:

        # load in the info table
        info_df = pd.read_csv("FILEPATH")
        new_me_id_list = info_df['proportion_me'].tolist()

        # submit compute job for each me_id
        compute_prop_tasks = []
        for year in args.year_id:
            task = PythonTask(script="FILEPATH",
                              args=[
                                  "--year_id", year, "--anemia_cause",
                                  anemia_cause, "--gbd_round_id",
                                  args.gbd_round_id, "--decomp_step",
                                  args.decomp_step, "--out_dir", args.out_dir
                              ],
                              name=f"make_{anemia_cause}_props_{year}",
                              tag="compute_props",
                              num_cores=1,
                              m_mem_free="12G",
                              max_attempts=3,
                              max_runtime_seconds=60 * 60 * 2,
                              queue='all.q')
            compute_prop_tasks.append(task)
        workflow.add_tasks(compute_prop_tasks)

        # submit save result jobs after compute jobs finish
        for new_me_id in new_me_id_list:
            task = PythonTask(script="FILEPATH",
                              args=[
                                  "--modelable_entity_id", new_me_id,
                                  "--year_ids",
                                  " ".join([str(yr) for yr in args.year_id
                                            ]), "--gbd_round_id",
                                  args.gbd_round_id, "--decomp_step",
                                  args.decomp_step, "--save_dir", 'FILEPATH'
                              ],
                              name=f"save_props_{new_me_id}",
                              tag="save_props",
                              upstream_tasks=compute_prop_tasks,
                              num_cores=8,
                              m_mem_free="90G",
                              max_attempts=3,
                              max_runtime_seconds=60 * 60 * 8,
                              queue='long.q')
            workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
Example #27
    def create_cache_tasks(self) -> None:
        """
        Adds up to five cache tasks that must occur at the beginning of each
        FauxCorrect run:

            1. envelope
            2. population
            3. spacetime restrictions
            4. regional scalars
            5. pred_ex

        These jobs kick off the CoDCorrect process and do not have any
        upstream dependencies.
        """
        # Create mortality inputs cache jobs.
        for mort_process, memory in zip(
                MortalityInputs.ALL_INPUTS,
                DAG.Tasks.Memory.CACHE_ALL_MORTALITY_INPUTS):
            cache_mortality_task = PythonTask(
                script=os.path.join(
                    self.code_dir, DAG.Executables.CACHE_MORT_INPUT
                ),
                args=[
                    '--mort_process', mort_process,
                    '--machine_process', self.parameters.process,
                    '--version_id', self.parameters.version_id
                ],
                name=DAG.Tasks.Name.CACHE_MORTALITY.format(
                    mort_process=mort_process
                ),
                num_cores=DAG.Tasks.Cores.CACHE_MORTALITY,
                m_mem_free=memory,
                max_runtime_seconds=DAG.Tasks.Runtime.CACHE_MORTALITY,
                queue=DAG.QUEUE,
                tag=DAG.Tasks.Type.CACHE
            )
            self.task_map[DAG.Tasks.Type.CACHE][cache_mortality_task.name] = (
                cache_mortality_task
            )
            self.workflow.add_task(cache_mortality_task)

        # Create cache_spacetime_restrictions job
        cache_spacetime_restrictions = PythonTask(
            script=os.path.join(
                self.code_dir, DAG.Executables.CORRECT
            ),
            args=[
                '--action', DAG.Tasks.Type.CACHE,
                '--parent_dir', self.parameters.parent_dir,
                '--gbd_round_id', self.parameters.gbd_round_id
            ],
            name=DAG.Tasks.Name.CACHE_SPACETIME,
            num_cores=DAG.Tasks.Cores.CACHE_SPACETIME_RESTRICTIONS,
            m_mem_free=DAG.Tasks.Memory.CACHE_SPACETIME_RESTRICTIONS,
            max_runtime_seconds=DAG.Tasks.Runtime.CACHE_SPACETIME_RESTRICTIONS,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.CACHE
        )
        self.task_map[DAG.Tasks.Type.CACHE][
            cache_spacetime_restrictions.name] = cache_spacetime_restrictions
        self.workflow.add_task(cache_spacetime_restrictions)

        # Create cache_regional_scalars job
        cache_regional_scalars = PythonTask(
            script=os.path.join(self.code_dir, DAG.Executables.LOC_AGG),
            args=[
                '--action', DAG.Tasks.Type.CACHE,
                '--parent_dir', self.parameters.parent_dir,
                '--gbd_round_id', self.parameters.gbd_round_id
            ],
            name=DAG.Tasks.Name.CACHE_REGIONAL_SCALARS,
            num_cores=DAG.Tasks.Cores.CACHE_REGIONAL_SCALARS,
            m_mem_free=DAG.Tasks.Memory.CACHE_REGIONAL_SCALARS,
            max_runtime_seconds=DAG.Tasks.Runtime.CACHE_REGIONAL_SCALARS,
            queue=DAG.QUEUE,
            tag=DAG.Tasks.Type.CACHE
        )
        self.task_map[DAG.Tasks.Type.CACHE][cache_regional_scalars.name] = (
            cache_regional_scalars
        )
        self.workflow.add_task(cache_regional_scalars)

        if gbd.measures.YLL in self.parameters.measure_ids:
            # Create cache_pred_ex job
            cache_pred_ex = PythonTask(
                script=os.path.join(self.code_dir, DAG.Executables.YLLS),
                args=[
                    '--action', DAG.Tasks.Type.CACHE,
                    '--parent_dir', self.parameters.parent_dir,
                    '--gbd_round_id', self.parameters.gbd_round_id
                ],
                name=DAG.Tasks.Name.CACHE_PRED_EX,
                num_cores=DAG.Tasks.Cores.CACHE_PRED_EX,
                m_mem_free=DAG.Tasks.Memory.CACHE_PRED_EX,
                max_runtime_seconds=DAG.Tasks.Runtime.CACHE_PRED_EX,
                queue=DAG.QUEUE,
                tag=DAG.Tasks.Type.CACHE
            )
            self.task_map[DAG.Tasks.Type.CACHE][cache_pred_ex.name] = (
                cache_pred_ex
            )
            self.workflow.add_task(cache_pred_ex)
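MortalityInputs is another constants container referenced here and in the apply-correction and summarize examples but never shown. A hypothetical sketch consistent with how it is used (member names come from the snippets; the values and the ALL_INPUTS ordering are assumptions):

class MortalityInputs:
    ENVELOPE_DRAWS = 'envelope_draws'        # hypothetical values
    ENVELOPE_SUMMARY = 'envelope_summary'
    POPULATION = 'population'
    # zipped against DAG.Tasks.Memory.CACHE_ALL_MORTALITY_INPUTS above
    ALL_INPUTS = (ENVELOPE_DRAWS, ENVELOPE_SUMMARY, POPULATION)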
Example #28
    def create_upload_tasks(self) -> None:
        """
        Adds tasks to upload summaries to the gbd and/or cod databases.
        """
        # Determine if percent change is part of workflow
        if self.parameters.year_start_ids is not None:
            upload_types = ['single', 'multi']
        else:
            upload_types = ['single']

        # gbd upload tasks
        for measure in self.parameters.measure_ids:
            for upload_type in upload_types:
                upload_gbd_task = PythonTask(
                    script=os.path.join(
                        self.code_dir,
                        DAG.Executables.UPLOAD
                    ),
                    args=[
                        '--machine_process', self.parameters.process,
                        '--version_id', self.parameters.version_id,
                        '--database', DataBases.GBD,
                        '--measure_id', measure,
                        '--upload_type', upload_type
                    ],
                    name=DAG.Tasks.Name.UPLOAD.format(
                        database=DataBases.GBD,
                        uploadtype=upload_type,
                        measure=measure
                    ),
                    num_cores=DAG.Tasks.Cores.UPLOAD,
                    m_mem_free=DAG.Tasks.Memory.UPLOAD,
                    max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
                    queue=DAG.UPLOAD_QUEUE,
                    tag=DAG.Tasks.Type.UPLOAD
                )
                # add gbd summarization tasks as upstream dependencies
                if upload_type == 'single':
                    for year in self.parameters.year_ids:
                        for location in self.parameters.location_ids:
                            upload_gbd_task.add_upstream(
                                self.task_map[DAG.Tasks.Type.SUMMARIZE][
                                    DAG.Tasks.Name.SUMMARIZE_GBD.format(
                                        measure=measure,
                                        location=location,
                                        year=year
                                    )
                                ]
                            )
                else:
                    for year_index in range(len(self.parameters.year_start_ids)):
                        for location in self.parameters.location_ids:
                            upload_gbd_task.add_upstream(
                                self.task_map[DAG.Tasks.Type.SUMMARIZE][
                                    DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                                        measure=measure,
                                        location=location,
                                        year_start=self.parameters.year_start_ids[
                                            year_index],
                                        year_end=self.parameters.year_end_ids[
                                            year_index]
                                    )
                                ]
                            )
                self.task_map[DAG.Tasks.Type.UPLOAD][
                    upload_gbd_task.name
                ] = upload_gbd_task

                self.workflow.add_task(upload_gbd_task)

        # cod upload tasks - DEATHS only
        if DataBases.COD in self.parameters.databases:
            upload_cod_task = PythonTask(
                script=os.path.join(
                    self.code_dir,
                    DAG.Executables.UPLOAD
                ),
                args=[
                    '--machine_process', self.parameters.process,
                    '--version_id', self.parameters.version_id,
                    '--database', DataBases.COD,
                    '--measure_id', Measures.Ids.DEATHS,
                ],
                name=DAG.Tasks.Name.UPLOAD.format(
                    database=DataBases.COD,
                    uploadtype='single',
                    measure=Measures.Ids.DEATHS
                ),
                num_cores=DAG.Tasks.Cores.UPLOAD,
                m_mem_free=DAG.Tasks.Memory.UPLOAD,
                max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
                queue=DAG.QUEUE,
                tag=DAG.Tasks.Type.UPLOAD
            )
            # add cod summarization tasks as upstream dependencies
            for year in self.parameters.year_ids:
                for location in self.parameters.location_ids:
                    upload_cod_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.SUMMARIZE][
                            DAG.Tasks.Name.SUMMARIZE_COD.format(
                                measure=Measures.Ids.DEATHS,
                                location=location,
                                year=year
                            )
                        ]
                    )
            self.task_map[DAG.Tasks.Type.UPLOAD][
                upload_cod_task.name
            ] = upload_cod_task

            self.workflow.add_task(upload_cod_task)
Example #29
    def create_summarize_tasks(self) -> None:
        """
        Adds tasks to summarize draw level estimates for each location, year,
        and measure in our FauxCorrect run.

        Dependent on the mortality input caching tasks as well as the append
        shocks tasks.

        """
        for measure in self.parameters.measure_ids:
            for year in self.parameters.year_ids:
                for location in self.parameters.location_ids:
                    # Create summarize tasks for gbd schema.
                    summarize_gbd_task = PythonTask(
                        script=os.path.join(
                            self.code_dir,
                            DAG.Executables.SUMMARIZE_GBD
                        ),
                        args=[
                            '--parent_dir', self.parameters.parent_dir,
                            '--gbd_round_id', self.parameters.gbd_round_id,
                            '--location_id', location,
                            '--year_id', year,
                            '--measure_id', measure,
                            '--machine_process', self.parameters.process
                        ],
                        name=DAG.Tasks.Name.SUMMARIZE_GBD.format(
                            measure=measure, location=location, year=year
                        ),
                        num_cores=DAG.Tasks.Cores.SUMMARIZE,
                        m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=[
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=MortalityInputs.ENVELOPE_DRAWS
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=(
                                        MortalityInputs.ENVELOPE_SUMMARY
                                    )
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=MortalityInputs.POPULATION
                                )
                            ]
                        ],
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
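                    # Shocks must be appended for every sex in this location
                    # before the GBD summary can run.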
                    for append_sex in self.parameters.sex_ids:
                        summarize_gbd_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.APPEND][
                                DAG.Tasks.Name.APPEND_SHOCKS.format(
                                    location=location,
                                    sex=append_sex
                                )
                            ]
                        )
                    self.task_map[DAG.Tasks.Type.SUMMARIZE][
                        summarize_gbd_task.name
                    ] = summarize_gbd_task

                    self.workflow.add_task(summarize_gbd_task)

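                    # Deaths are the only measure uploaded to the COD
                    # database; skip YLLs and runs that do not target COD.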
                    if (
                        measure == Measures.Ids.YLLS
                        or
                        DataBases.COD not in self.parameters.databases
                    ):
                        continue
                    # Create the summarize task for deaths (COD schema).
                    summarize_cod_task = PythonTask(
                        script=os.path.join(
                            self.code_dir,
                            DAG.Executables.SUMMARIZE_COD
                        ),
                        args=[
                            '--version_id', self.parameters.version_id,
                            '--parent_dir', self.parameters.parent_dir,
                            '--gbd_round_id', self.parameters.gbd_round_id,
                            '--location_id', location,
                            '--year_id', year
                        ],
                        name=DAG.Tasks.Name.SUMMARIZE_COD.format(
                            measure=Measures.Ids.DEATHS,
                            location=location,
                            year=year
                        ),
                        num_cores=DAG.Tasks.Cores.SUMMARIZE,
                        m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=[
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=MortalityInputs.ENVELOPE_DRAWS
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=(
                                        MortalityInputs.ENVELOPE_SUMMARY
                                    )
                                )
                            ],
                            self.task_map[DAG.Tasks.Type.CACHE][
                                DAG.Tasks.Name.CACHE_MORTALITY.format(
                                    mort_process=MortalityInputs.POPULATION
                                )
                            ]
                        ],
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
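                    # As with the GBD summaries, wait for the append-shocks
                    # tasks for all sexes in this location.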
                    for append_sex in self.parameters.sex_ids:
                        summarize_cod_task.add_upstream(
                            self.task_map[DAG.Tasks.Type.APPEND][
                                DAG.Tasks.Name.APPEND_SHOCKS.format(
                                    location=location,
                                    sex=append_sex
                                )
                            ]
                        )
                    self.task_map[DAG.Tasks.Type.SUMMARIZE][
                        summarize_cod_task.name
                    ] = summarize_cod_task

                    self.workflow.add_task(summarize_cod_task)

            # Percent-change summarization tasks
            if self.parameters.year_start_ids:
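                # year_start_ids and year_end_ids are parallel lists; each
                # index defines one percent-change interval.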
                for year_index in range(len(self.parameters.year_start_ids)):
                    for pctc_location in self.parameters.location_ids:
                        summarize_pct_change_task = PythonTask(
                            script=os.path.join(
                                self.code_dir,
                                DAG.Executables.SUMMARIZE_PCT_CHANGE
                            ),
                            args=[
                                '--parent_dir', self.parameters.parent_dir,
                                '--gbd_round_id', self.parameters.gbd_round_id,
                                '--location_id', pctc_location,
                                '--year_start_id', self.parameters.year_start_ids[
                                    year_index],
                                '--year_end_id', self.parameters.year_end_ids[
                                    year_index],
                                '--measure_id', measure,
                                '--machine_process', self.parameters.process
                            ],
                            name=DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                                measure=measure, location=pctc_location,
                                year_start=self.parameters.year_start_ids[year_index],
                                year_end=self.parameters.year_end_ids[year_index]
                            ),
                            num_cores=DAG.Tasks.Cores.PCT_CHANGE,
                            m_mem_free=DAG.Tasks.Memory.PCT_CHANGE,
                            max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                            queue=DAG.QUEUE,
                            upstream_tasks=[
                                self.task_map[DAG.Tasks.Type.CACHE][
                                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                                        mort_process=MortalityInputs.ENVELOPE_DRAWS
                                    )
                                ],
                                self.task_map[DAG.Tasks.Type.CACHE][
                                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                                        mort_process=(
                                            MortalityInputs.ENVELOPE_SUMMARY
                                        )
                                    )
                                ],
                                self.task_map[DAG.Tasks.Type.CACHE][
                                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                                        mort_process=MortalityInputs.POPULATION
                                    )
                                ]
                            ],
                            tag=DAG.Tasks.Type.SUMMARIZE
                        )
                        for append_sex in self.parameters.sex_ids:
                            summarize_pct_change_task.add_upstream(
                                self.task_map[DAG.Tasks.Type.APPEND][
                                    DAG.Tasks.Name.APPEND_SHOCKS.format(
                                        location=pctc_location,
                                        sex=append_sex
                                    )
                                ]
                            )
                        self.task_map[DAG.Tasks.Type.SUMMARIZE][
                            summarize_pct_change_task.name
                        ] = summarize_pct_change_task

                        self.workflow.add_task(summarize_pct_change_task)
Example #30
    def create_append_shocks_tasks(self) -> None:
        """
        Adds tasks to append shocks to Deaths and YLLs by location, sex,
        and year.

        Dependent on completed correction application for respective locations,
        sexes, and years. Also dependent on location aggregation.
        """
        for sex in self.parameters.sex_ids:
            for location in self.parameters.location_ids:
                most_detailed_location = (
                    location in self.parameters.most_detailed_location_ids
                )
                append_shocks = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.APPEND_SHOCKS
                    ),
                    args=[
                        '--parent_dir', self.parameters.parent_dir,
                        '--machine_process', self.parameters.process,
                        '--measure_ids',
                        " ".join([str(x) for x in self.parameters.measure_ids]),
                        '--location_id', location,
                        '--most_detailed_location', most_detailed_location,
                        '--sex_id', sex
                    ],
                    name=DAG.Tasks.Name.APPEND_SHOCKS.format(
                        location=location, sex=sex
                    ),
                    num_cores=DAG.Tasks.Cores.APPEND_SHOCKS,
                    m_mem_free=DAG.Tasks.Memory.APPEND_SHOCKS,
                    max_runtime_seconds=DAG.Tasks.Runtime.APPEND,
                    tag=DAG.Tasks.Type.APPEND
                )
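                # Pick upstream dependencies based on whether this is a
                # most-detailed location and which measures are requested.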
                if (
                    most_detailed_location
                    and
                    Measures.Ids.YLLS in self.parameters.measure_ids
                ):
                    # Attach calculate-YLLs jobs as upstream dependencies.
                    append_shocks.add_upstream(
                        self.task_map[DAG.Tasks.Type.CALCULATE][
                            DAG.Tasks.Name.CALC_YLLS.format(
                                location=location,
                                sex=sex
                            )
                        ]
                    )
                elif (
                    most_detailed_location
                    and
                    Measures.Ids.YLLS not in self.parameters.measure_ids
                ):
                    # Add cause aggregation jobs as upstream dependencies.
                    append_shocks.add_upstream(
                        self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                            DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                                location=location,
                                sex=sex
                            )
                        ]
                    )
                else:
                    # Add location aggregation tasks as upstream dependencies.
                    for location_set_id in self.parameters.location_set_ids:
                        for loc_agg_year in self.parameters.year_ids:
                            for loc_agg_measure in self.parameters.measure_ids:
                                for loc_agg_type in (
                                    LocationAggregation.Type.CODCORRECT
                                ):
                                    if (
                                        FilePaths.UNSCALED_DIR in loc_agg_type
                                        and
                                        loc_agg_measure == Measures.Ids.YLLS
                                    ):
                                        continue
                                    append_shocks.add_upstream(
                                        self.task_map[DAG.Tasks.Type.LOC_AGG][
                                            DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                                                aggregation_type=loc_agg_type.replace(
                                                    "/", "_"),
                                                location_set=location_set_id,
                                                measure=loc_agg_measure,
                                                year=loc_agg_year,
                                            )
                                        ]
                                    )
                self.task_map[DAG.Tasks.Type.APPEND][
                    append_shocks.name
                ] = append_shocks
                self.workflow.add_task(append_shocks)
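
For orientation, the short sketch below shows how a DAG-builder class exposing the create_* methods from Examples #29 and #30 might be driven. It is a minimal, hypothetical sketch: the builder object, its create_upload_tasks wrapper, and the exact call order are assumptions inferred from the task_map lookups in the examples, not part of the code above.

def build_and_run(builder) -> None:
    """Assemble the DAG in dependency order, then hand it to jobmon to run."""
    # Order matters: each create_* method looks up its upstream tasks in
    # builder.task_map by formatted name, so those tasks must already exist
    # (cache, correction, and aggregation steps are assumed to have been
    # created on this hypothetical builder beforehand).
    builder.create_append_shocks_tasks()  # looks up calc-YLLs / cause-agg / loc-agg tasks
    builder.create_summarize_tasks()      # looks up cache and append-shocks tasks
    builder.create_upload_tasks()         # hypothetical wrapper for the upload tasks shown above

    # Interpretation of run()'s return value depends on the jobmon version in
    # use; here it is simply surfaced for the caller to inspect.
    status = builder.workflow.run()
    print(f'Workflow finished with status: {status}')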