예제 #1
0
 def stop_job(cls, job_id, role, party_id, stop_status):
     """Ask all parties to stop a job and report how the request went.

     The job is looked up with ``is_initiator=True``. When ``stop_status`` is
     ``JobStatus.CANCELED`` the cancel signal is set first. The first matching
     job then gets ``f_status = stop_status`` and is passed to
     ``FederatedScheduler.stop_job``. If that call does not report success,
     ``stop_status`` is applied task by task through
     ``TaskScheduler.collect_task_of_all_party``.

     Args:
         job_id: id of the job to stop.
         role: role used to look the job up.
         party_id: party id used to look the job up.
         stop_status: status to stop the job with.

     Returns:
         A ``(code, message)`` tuple:
         ``(RetCode.SUCCESS, "success")`` when the federated stop succeeded,
         ``(RetCode.FEDERATED_ERROR, json_dumps(response))`` when it did not,
         and ``(RetCode.SUCCESS, "can not found job")`` when no job matched.
     """
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status}")
     found_jobs = JobSaver.query_job(job_id=job_id,
                                     role=role,
                                     party_id=party_id,
                                     is_initiator=True)
     if len(found_jobs) == 0:
         # No matching job: this is still answered with a success code.
         return RetCode.SUCCESS, "can not found job"

     if stop_status == JobStatus.CANCELED:
         schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
         set_cancel_status = cls.cancel_signal(job_id=job_id,
                                               set_or_reset=True)
         schedule_logger(job_id=job_id).info(
             f"set job {job_id} cancel signal {set_cancel_status}")

     target_job = found_jobs[0]
     target_job.f_status = stop_status
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status} to all party")
     status_code, response = FederatedScheduler.stop_job(
         job=found_jobs[0], stop_status=stop_status)

     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job_id).info(
             f"stop job {job_id} with {stop_status} successfully")
         return RetCode.SUCCESS, "success"

     # The federated stop did not succeed: apply the stop status to each
     # initiator task individually before reporting the failure.
     tasks_by_key = JobSaver.get_tasks_asc(job_id=target_job.f_job_id,
                                           role=target_job.f_role,
                                           party_id=target_job.f_party_id)
     for one_task in tasks_by_key.values():
         TaskScheduler.collect_task_of_all_party(target_job,
                                                 initiator_task=one_task,
                                                 set_status=stop_status)
     schedule_logger(job_id=job_id).info(
         f"stop job {job_id} with {stop_status} failed, {response}")
     return RetCode.FEDERATED_ERROR, json_dumps(response)
예제 #2
0
    def schedule_running_job(cls, job: Job, force_sync_status=False):
        """Run one scheduling pass over a running job and sync what changed.

        Tasks are scheduled through ``TaskScheduler.schedule``; the job status
        and progress are then derived from the returned tasks. Progress is
        synced only when its integer part is greater than the stored progress.
        A changed status is synced too, and the pipelined model is saved first
        when the new status is an end status. ``cls.finish`` is called whenever
        the job's status is an end status after that, and any tasks returned
        for auto rerun are handed to ``cls.set_job_rerun``.

        Args:
            job: the job to schedule; its ``f_progress`` and ``f_status`` may
                be updated in place.
            force_sync_status: when true, sync the job status once more at the
                end regardless of whether it changed.
        """
        schedule_logger(job.f_job_id).info(f"scheduling running job")

        parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        scheduling_code, rerun_candidates, scheduled_tasks = TaskScheduler.schedule(
            job=job, dsl_parser=parser, canceled=job.f_cancel_signal)

        # Component name -> task status, in the order the tasks were returned.
        tasks_status = {
            one_task.f_component_name: one_task.f_status
            for one_task in scheduled_tasks
        }
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=scheduling_code,
            tasks_status=tasks_status.values())
        # A job that would still be waiting but has a cancel signal is canceled.
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED

        task_total, task_finished = cls.calculate_job_progress(
            tasks_status=tasks_status)
        progress_now = float(task_finished) / task_total * 100
        schedule_logger(job.f_job_id).info(
            f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
        )

        if new_job_status != job.f_status or progress_now != job.f_progress:
            # Progress and status are pushed as two separate updates; the
            # original note says these two fields update with "anti-weight
            # logic".
            if int(progress_now) - job.f_progress > 0:
                job.f_progress = progress_now
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])

        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)

        if rerun_candidates:
            schedule_logger(job.f_job_id).info("job have auto rerun tasks")
            cls.set_job_rerun(job_id=job.f_job_id,
                              initiator_role=job.f_initiator_role,
                              initiator_party_id=job.f_initiator_party_id,
                              tasks=rerun_candidates,
                              auto=True)

        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job.f_job_id).info("finish scheduling running job")
예제 #3
0
    def schedule_running_job(cls, job, force_sync_status=False):
        """Run one scheduling pass over a running job and sync what changed.

        Tasks are scheduled through ``TaskScheduler.schedule``; the job status
        and progress are then derived from the statuses of the returned tasks.
        Progress is synced only when its integer part is greater than the
        stored progress. A changed status is synced too, and the pipelined
        model is saved first when the new status is an end status.
        ``cls.finish`` is called whenever the job's status is an end status
        after that.

        Args:
            job: the job to schedule; its ``f_progress`` and ``f_status`` may
                be updated in place.
            force_sync_status: when true, sync the job status once more at the
                end regardless of whether it changed.
        """
        schedule_logger(job_id=job.f_job_id).info("scheduling job {}".format(
            job.f_job_id))

        parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        scheduling_code, scheduled_tasks = TaskScheduler.schedule(
            job=job, dsl_parser=parser, canceled=job.f_cancel_signal)

        status_list = [one_task.f_status for one_task in scheduled_tasks]
        status_now = cls.calculate_job_status(
            task_scheduling_status_code=scheduling_code,
            tasks_status=status_list)
        # A job that would still be waiting but has a cancel signal is canceled.
        if status_now == JobStatus.WAITING and job.f_cancel_signal:
            status_now = JobStatus.CANCELED

        task_total, task_finished = cls.calculate_job_progress(
            tasks_status=status_list)
        progress_now = float(task_finished) / task_total * 100
        schedule_logger(job_id=job.f_job_id).info(
            "Job {} status is {}, calculate by task status list: {}".format(
                job.f_job_id, status_now, status_list))

        if status_now != job.f_status or progress_now != job.f_progress:
            # Progress and status are pushed as two separate updates; the
            # original note says these two fields update with "anti-weight
            # logic".
            if int(progress_now) - job.f_progress > 0:
                job.f_progress = progress_now
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if status_now != job.f_status:
                job.f_status = status_now
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])

        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)

        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job.f_job_id).info(
            "finish scheduling job {}".format(job.f_job_id))
예제 #4
0
    def set_job_rerun(cls,
                      job_id,
                      initiator_role,
                      initiator_party_id,
                      auto,
                      force=False,
                      tasks: typing.List[Task] = None,
                      component_name: typing.Union[str, list] = None):
        """Prepare a job's tasks for rerun and set the job's rerun signal.

        When ``tasks`` is given, exactly those tasks are prepared. Otherwise
        the tasks are queried from ``JobSaver``: all of the initiator's tasks
        for the job when no component is requested (or the requested component
        is the pipeline component), else the requested components plus every
        component the DSL parser reports as downstream-dependent on them.

        Args:
            job_id: id of the job to rerun.
            initiator_role: role of the initiator owning the job.
            initiator_party_id: party id of the initiator owning the job.
            auto: forwarded to ``TaskScheduler.prepare_rerun_task``.
            force: passed through ``cls.get_rerun_component`` and then
                forwarded to ``TaskScheduler.prepare_rerun_task``.
            tasks: explicit tasks to rerun; when empty or ``None`` the tasks
                are selected from ``component_name`` instead.
            component_name: a component name or a list of names to rerun.

        Returns:
            ``True`` when at least one task was prepared for rerun (the rerun
            signal is then set), ``False`` when no task could be rerun.

        Raises:
            RuntimeError: if the job cannot be found on the initiator.
        """
        schedule_logger(job_id).info(
            f"try to rerun job on initiator {initiator_role} {initiator_party_id}"
        )

        found_jobs = JobSaver.query_job(job_id=job_id,
                                        role=initiator_role,
                                        party_id=initiator_party_id)
        if not found_jobs:
            raise RuntimeError(
                f"can not found job on initiator {initiator_role} {initiator_party_id}"
            )
        job = found_jobs[0]

        parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        component_name, force = cls.get_rerun_component(
            component_name, job, parser, force)
        schedule_logger(job_id).info(f"rerun component: {component_name}")

        if tasks:
            schedule_logger(job_id).info(
                f"require {[task.f_component_name for task in tasks]} to rerun"
            )
        else:
            query_filters = {
                'job_id': job_id,
                'role': initiator_role,
                'party_id': initiator_party_id,
            }

            if not component_name or component_name == job_utils.job_pipeline_component_name(
            ):
                # No component filter: every task of the job is a candidate.
                schedule_logger(job_id).info(
                    "require all component of pipeline to rerun")
            else:
                if isinstance(component_name, str):
                    _require_reruns = {component_name}
                else:
                    _require_reruns = set(component_name)

                # Start from the requested components and add whatever the
                # parser reports as downstream-dependent on each of them.
                _should_reruns = _require_reruns.copy()
                for requested in _require_reruns:
                    downstream = dsl_downstream = parser.get_downstream_dependent_components(
                        requested)
                    for dependent in downstream:
                        _should_reruns.add(dependent.get_name())

                schedule_logger(job_id).info(
                    f"require {_require_reruns} to rerun, "
                    f"and then found {_should_reruns} need be to rerun")
                query_filters['component_name'] = _should_reruns

            tasks = JobSaver.query_task(**query_filters)

        # Built as a full list on purpose: every task must go through
        # prepare_rerun_task, so any() must not short-circuit the calls.
        prepared = [
            TaskScheduler.prepare_rerun_task(
                job=job,
                task=task,
                dsl_parser=parser,
                auto=auto,
                force=force,
            ) for task in tasks
        ]
        if not any(prepared):
            FederatedScheduler.sync_job_status(job=job)
            schedule_logger(job_id).info("job no task to rerun")
            return False

        schedule_logger(job_id).info("job set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        schedule_logger(job_id).info(
            f"job set rerun signal {'successfully' if status else 'failed'}")
        return True