def schedule(cls, job, dsl_parser, canceled=False):
    """Run one scheduling round over the initiator-side tasks of ``job``.

    The round has two phases:

    1. Status refresh. For every task returned by ``JobSaver.get_tasks_asc``
       the federated status is recomputed. When it differs from the status
       held on the task object, the object is updated and the new status is
       synced through ``FederatedScheduler.sync_task_status``. Tasks that are
       ``WAITING`` after the refresh are remembered; tasks whose status just
       changed to one accepted by ``EndStatus.contains`` are stopped through
       ``FederatedScheduler.stop_task``.
    2. Start phase (skipped when ``canceled`` is truthy). Each remembered
       waiting task is started via ``cls.start_task`` once every upstream
       component reported by ``dsl_parser`` has a task in ``SUCCESS``.
       The phase ends early when a start returns ``NO_RESOURCE`` or
       ``FAILED``.

    :param job: job record; its ``f_job_id``, ``f_role``, ``f_party_id`` and
        ``f_runtime_conf_on_party`` attributes are read here.
    :param dsl_parser: object providing
        ``get_upstream_dependent_components(component_name=...)``.
    :param canceled: when truthy no task is started in this round.
    :return: ``(scheduling_status_code, tasks)`` where ``tasks`` is the
        ``.values()`` of the task group loaded at the start of the round.
        The code is ``NO_NEXT`` when no start was attempted, ``FAILED`` when
        a start returned ``FAILED``, and ``HAVE_NEXT`` otherwise (it is set
        before ``start_task`` is called, so it is also returned when the
        start reported ``NO_RESOURCE``).
    """
    schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
    tasks_by_key = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)

    # ---- phase 1: refresh the status of every initiator task ----
    waiting_tasks = []
    for current in tasks_by_key.values():
        collect_type = job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"]
        if collect_type == FederatedCommunicationType.PULL:
            # In PULL mode, only collect from the parties when their recorded
            # statuses disagree or at least one of them is still RUNNING.
            party_tasks = JobSaver.query_task(task_id=current.f_task_id, task_version=current.f_task_version)
            party_statuses = {party_task.f_status for party_task in party_tasks}
            if len(party_statuses) > 1 or TaskStatus.RUNNING in party_statuses:
                # NOTE(review): another call site of collect_task_of_all_party
                # in this file passes the task as ``initiator_task=`` -
                # confirm the ``task=`` keyword matches the callee signature.
                cls.collect_task_of_all_party(job=job, task=current)

        refreshed_status = cls.federated_task_status(
            job_id=current.f_job_id,
            task_id=current.f_task_id,
            task_version=current.f_task_version,
        )
        status_changed = refreshed_status != current.f_status
        if status_changed:
            current.f_status = refreshed_status
            FederatedScheduler.sync_task_status(job=job, task=current)

        if current.f_status == TaskStatus.WAITING:
            waiting_tasks.append(current)
        elif status_changed and EndStatus.contains(current.f_status):
            # Only a task that just moved into an end status is stopped.
            FederatedScheduler.stop_task(job=job, task=current, stop_status=current.f_status)

    # ---- phase 2: start the waiting tasks whose upstream is done ----
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if canceled:
        schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
    else:
        for waiting_task in waiting_tasks:
            upstream_components = dsl_parser.get_upstream_dependent_components(
                component_name=waiting_task.f_component_name
            )
            # any() stops at the first upstream task that is not SUCCESS.
            blocked = any(
                tasks_by_key[
                    JobSaver.task_key(
                        task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                        role=job.f_role,
                        party_id=job.f_party_id,
                    )
                ].f_status != TaskStatus.SUCCESS
                for component in upstream_components
            )
            if blocked:
                # can not start this task yet; look at the next waiting one
                continue

            # all upstream dependent tasks have been successful, can start this task
            scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
            status_code = cls.start_task(job=job, task=waiting_task)
            if status_code == SchedulingStatusCode.NO_RESOURCE:
                # wait for the next round of scheduling
                schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                break
            if status_code == SchedulingStatusCode.FAILED:
                scheduling_status_code = SchedulingStatusCode.FAILED
                break

    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, tasks_by_key.values()
def stop_job(cls, job_id, role, party_id, stop_status):
    """Ask every party to stop a job, acting as the job's initiator.

    The job is looked up with ``JobSaver.query_job(..., is_initiator=True)``.
    When ``stop_status`` equals ``JobStatus.CANCELED`` the cancel signal is
    set first through ``cls.cancel_signal``. The first matching job then gets
    ``f_status`` assigned on the in-memory object and the stop request is
    sent through ``FederatedScheduler.stop_job``.

    If the federated call does not report success, every initiator task of
    the job is passed to ``TaskScheduler.collect_task_of_all_party`` with
    ``set_status=stop_status`` before the error is returned.

    :param job_id: id of the job to stop.
    :param role: role used for the job lookup.
    :param party_id: party id used for the job lookup.
    :param stop_status: status to stop the job with.
    :return: ``(ret_code, message)``:
        ``(RetCode.SUCCESS, "success")`` when the federated stop succeeded,
        ``(RetCode.FEDERATED_ERROR, json_dumps(response))`` when it did not,
        and ``(RetCode.SUCCESS, "can not found job")`` when the lookup
        returned nothing.
    """
    schedule_logger(job_id=job_id).info(f"request stop job {job_id} with {stop_status}")
    found_jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id, is_initiator=True)
    if len(found_jobs) == 0:
        # A missing job is reported with a SUCCESS code, not as an error.
        return RetCode.SUCCESS, "can not found job"

    if stop_status == JobStatus.CANCELED:
        schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
        set_cancel_status = cls.cancel_signal(job_id=job_id, set_or_reset=True)
        schedule_logger(job_id=job_id).info(f"set job {job_id} cancel signal {set_cancel_status}")

    initiator_job = found_jobs[0]
    initiator_job.f_status = stop_status
    schedule_logger(job_id=job_id).info(f"request stop job {job_id} with {stop_status} to all party")
    # The first element is looked up again for the call; presumably it is the
    # same object updated just above - confirm query_job returns a plain list.
    status_code, response = FederatedScheduler.stop_job(job=found_jobs[0], stop_status=stop_status)

    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job_id).info(f"stop job {job_id} with {stop_status} successfully")
        return RetCode.SUCCESS, "success"

    # Federated stop failed: go through the initiator tasks one by one and
    # hand each to the task scheduler together with the requested status.
    tasks_by_key = JobSaver.get_tasks_asc(
        job_id=initiator_job.f_job_id,
        role=initiator_job.f_role,
        party_id=initiator_job.f_party_id,
    )
    for pending_task in tasks_by_key.values():
        TaskScheduler.collect_task_of_all_party(initiator_job, initiator_task=pending_task, set_status=stop_status)
    schedule_logger(job_id=job_id).info(f"stop job {job_id} with {stop_status} failed, {response}")
    return RetCode.FEDERATED_ERROR, json_dumps(response)