@classmethod
def update_job_status(cls, job_info):
    update_status = JobSaver.update_job_status(job_info=job_info)
    if update_status and EndStatus.contains(job_info.get("status")):
        # job reached an end status: release the resources it was holding
        ResourceManager.return_job_resource(job_id=job_info["job_id"], role=job_info["role"], party_id=job_info["party_id"])
    return update_status
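
# One task-level scheduling pass over a job (see `schedule` below): refresh each
# initiator task's status (pulling it from all parties when the job uses PULL-style
# status collection), sync and stop tasks that changed to an end status, then try to
# start every WAITING task whose upstream components have all reached SUCCESS.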
@classmethod
def schedule(cls, job, dsl_parser, canceled=False):
    schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    for initiator_task in initiator_tasks_group.values():
        # collect task party status from all parties
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            cls.collect_task_of_all_party(job=job, initiator_task=initiator_task)
        new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id,
                                                    task_id=initiator_task.f_task_id,
                                                    task_version=initiator_task.f_task_version)
        task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)

    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[JobSaver.task_key(
                    task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                    role=job.f_role,
                    party_id=job.f_party_id)]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # can not start task
                    break
            else:
                # all upstream dependent tasks have been successful, can start this task
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # wait for the next round of scheduling
                    schedule_logger(job_id=job.f_job_id).info(
                        f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, "
                        f"wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    waiting_task.f_status = StatusSet.FAILED
                    FederatedScheduler.sync_task_status(job, waiting_task)
                    break
    else:
        schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, initiator_tasks_group.values()
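
# One round of the job-level scheduler (see `run_do` below): pick up WAITING jobs
# FIFO (at most one per round), advance RUNNING jobs, revisit jobs carrying ready
# or rerun signals, and re-check recently ended jobs whose status may still need a
# final update. Each phase logs its progress and isolates failures per job.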
def run_do(self):
    schedule_logger().info("start schedule waiting jobs")
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} waiting jobs")
    if len(jobs):
        # FIFO
        job = jobs[0]
        schedule_logger().info(f"schedule waiting job {job.f_job_id}")
        try:
            self.schedule_waiting_jobs(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule waiting job {job.f_job_id} failed")
    schedule_logger().info("schedule waiting jobs finished")

    schedule_logger().info("start schedule running jobs")
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.RUNNING, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} running jobs")
    for job in jobs:
        schedule_logger().info(f"schedule running job {job.f_job_id}")
        try:
            self.schedule_running_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule running jobs finished")

    # some ready job exit before start
    schedule_logger().info("start schedule ready jobs")
    jobs = JobSaver.query_job(is_initiator=True, ready_signal=True, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} ready jobs")
    for job in jobs:
        schedule_logger().info(f"schedule ready job {job.f_job_id}")
        try:
            self.schedule_ready_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule ready job {job.f_job_id} failed:\n{e}")
    schedule_logger().info("schedule ready jobs finished")

    schedule_logger().info("start schedule rerun jobs")
    jobs = JobSaver.query_job(is_initiator=True, rerun_signal=True, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} rerun jobs")
    for job in jobs:
        schedule_logger().info(f"schedule rerun job {job.f_job_id}")
        try:
            self.schedule_rerun_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule rerun jobs finished")

    schedule_logger().info("start schedule end status jobs to update status")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=set(EndStatus.status_list()),
                              end_time=[current_timestamp() - END_STATUS_JOB_SCHEDULING_TIME_LIMIT, current_timestamp()])
    schedule_logger().info(f"have {len(jobs)} end status jobs")
    for job in jobs:
        schedule_logger().info(f"schedule end status job {job.f_job_id}")
        try:
            update_status = self.end_scheduling_updates(job_id=job.f_job_id)
            if not update_status:
                schedule_logger(job.f_job_id).info("the number of updates has been exceeded")
                continue
            self.schedule_running_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule end status jobs finished")