Example #1
@classmethod
def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
    if role and party_id:
        jobs = JobSaver.query_job(job_id=job_id,
                                  role=role,
                                  party_id=party_id)
    else:
        jobs = JobSaver.query_job(job_id=job_id)
    kill_status = True
    kill_details = {}
    for job in jobs:
        kill_job_status, kill_job_details = cls.stop_job(
            job=job, stop_status=stop_status)
        kill_status = kill_status & kill_job_status
        kill_details[job_id] = kill_job_details
    return kill_status, kill_details
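Note that every example on this page shares one calling convention: JobSaver.query_job takes model-field filters as keyword arguments and returns a list of matching Job records, with an empty list when nothing matches. A minimal sketch of that convention, assuming a FATE-Flow 1.x runtime (the import path is an assumption and differs between releases):

# Sketch only: the import path is an assumption and varies across FATE-Flow versions.
from fate_flow.operation.job_saver import JobSaver

def find_job(job_id, role=None, party_id=None):
    # Each keyword narrows the match; query_job always returns a list.
    if role and party_id:
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
    else:
        jobs = JobSaver.query_job(job_id=job_id)
    return jobs[0] if jobs else None  # callers must handle the empty case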
Example #2
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info(f"stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(
            job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(
            f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(
            f"request stop job {jobs[0]} to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0],
            stop_status=stop_status,
            command_body=jobs[0].to_json())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(
                retcode=RetCode.SUCCESS,
                retmsg=f"stop job on this party {kill_status};\n"
                f"stop job on all party success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="stop job on this party {};\n"
                                   "stop job failed:\n{}".format(
                                       kill_status,
                                       json_dumps(response, indent=4)))
    else:
        schedule_logger(job_id).info(f"cannot find job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="cannot find job")
Example #3
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                           runtime_conf=job.f_runtime_conf_on_party,
                                                           train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"], only_latest=True)
            for task in tasks:
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                           runtime_conf=job_info.get('job_runtime_conf', {}),
                                                           train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
Example #4
@classmethod
def clean_task(cls, job_id, task_id, task_version, role, party_id,
               content_type: TaskCleanResourceType):
    status = set()
    if content_type == TaskCleanResourceType.METRICS:
        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          task_id=task_id,
                          task_version=task_version)
        status.add(tracker.clean_metrics())
    elif content_type == TaskCleanResourceType.TABLE:
        jobs = JobSaver.query_job(job_id=job_id,
                                  role=role,
                                  party_id=party_id)
        if jobs:
            job = jobs[0]
            job_parameters = RunParameters(
                **job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              task_id=task_id,
                              task_version=task_version,
                              job_parameters=job_parameters)
            status.add(tracker.clean_task(job.f_runtime_conf_on_party))
    if len(status) == 1 and True in status:
        return True
    else:
        return False
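The set-based bookkeeping at the end of Example #4 returns True only when at least one cleanup ran and none of them reported failure. An equivalent, more direct form (a sketch, not the library's code):

def all_cleanups_succeeded(status: set) -> bool:
    # status collects one boolean per cleanup call, as in Example #4;
    # it equals {True} exactly when something ran and nothing failed.
    return status == {True}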
Example #5
@classmethod
def start_job(cls, job_id, initiator_role, initiator_party_id):
    schedule_logger(job_id=job_id).info(
        "try to start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))
    job_info = {}
    job_info["job_id"] = job_id
    job_info["role"] = initiator_role
    job_info["party_id"] = initiator_party_id
    job_info["status"] = JobStatus.RUNNING
    job_info["party_status"] = JobStatus.RUNNING
    job_info["start_time"] = current_timestamp()
    job_info["tag"] = 'end_waiting'
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
        FederatedScheduler.start_job(job=job)
        schedule_logger(job_id=job_id).info(
            "start job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
    else:
        schedule_logger(job_id=job_id).error(
            "cannot find job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
Example #6
def get_job_table_list():
    jobs = JobSaver.query_job(**request.json)
    if jobs:
        job = jobs[0]
        tables = get_job_all_table(job)
        return get_json_result(data=tables)
    else:
        return get_json_result(retcode=101, retmsg='job not found')
Example #7
def clean_queue():
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for job in jobs:
        status_code, response = FederatedScheduler.request_stop_job(
            job=job, stop_status=JobStatus.CANCELED)
        clean_status[job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
Example #8
def get_job_table_list():
    detect_utils.check_config(config=request.json, required_arguments=['job_id', 'role', 'party_id'])
    jobs = JobSaver.query_job(**request.json)
    if jobs:
        job = jobs[0]
        tables = get_job_all_table(job)
        return get_json_result(data=tables)
    else:
        return get_json_result(retcode=101, retmsg='job not found')
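Example #8 is Example #6 plus input validation: detect_utils.check_config raises when a required key is missing, so query_job(**request.json) never runs with an under-specified filter. The same guard outside a Flask handler (a sketch; the payload values are made up and the import path is assumed):

from fate_flow.utils import detect_utils  # assumed import path

payload = {"job_id": "20220101000000000001", "role": "guest", "party_id": 9999}  # example values
detect_utils.check_config(config=payload,
                          required_arguments=["job_id", "role", "party_id"])
jobs = JobSaver.query_job(**payload)  # safe: all required filters are present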
Example #9
def query_job():
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=0,
                               retmsg='no job could be found',
                               data=[])
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=[job.to_json() for job in jobs])
Example #10
def component_output_data_table():
    request_data = request.json
    detect_utils.check_config(config=request_data, required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if jobs:
        job = jobs[0]
        return jsonify(FederatedScheduler.tracker_command(job, request_data, 'output/table'))
    else:
        return get_json_result(retcode=100, retmsg='job not found')
Example #11
@classmethod
def detect_running_task(cls):
    detect_logger().info('start to detect running task...')
    count = 0
    try:
        running_tasks = JobSaver.query_task(
            party_status=TaskStatus.RUNNING, only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            if not task.f_engine_conf and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST and not task.f_run_on_this_party:
                continue
            count += 1
            try:
                process_exist = build_engine(
                    task.f_engine_conf.get("computing_engine")).is_alive(task)
                if not process_exist:
                    msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                    detect_logger(job_id=task.f_job_id).info(
                        f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
                    time.sleep(3)
                    _tasks = JobSaver.query_task(
                        task_id=task.f_task_id,
                        task_version=task.f_task_version,
                        role=task.f_role,
                        party_id=task.f_party_id)
                    if _tasks:
                        if _tasks[0].f_party_status == TaskStatus.RUNNING:
                            stop_job_ids.add(task.f_job_id)
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has been checked twice, try to stop job")
                        else:
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has changed to {_tasks[0].f_party_status}, "
                                f"may have been stopped by task_controller.stop_task, skip stopping the job again")
                    else:
                        detect_logger(task.f_job_id).warning(
                            f"{msg} cannot be found in db")
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="task executor process abort",
                              stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
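Example #11 deliberately re-queries the task after a three-second sleep before condemning it: a liveness probe can race with a normal status transition, so the task is only treated as dead once it has been observed RUNNING twice. The pattern in isolation (a sketch; is_alive stands in for the engine probe above, and JobSaver/TaskStatus are the same names used throughout this page):

import time

def confirm_task_dead(task, is_alive):
    if is_alive(task):  # hypothetical probe, e.g. a process or engine check
        return False
    time.sleep(3)  # give a racing status update time to land in the db
    fresh = JobSaver.query_task(task_id=task.f_task_id,
                                task_version=task.f_task_version,
                                role=task.f_role,
                                party_id=task.f_party_id)
    # Stop the job only if the task is still recorded as RUNNING.
    return bool(fresh) and fresh[0].f_party_status == TaskStatus.RUNNING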
Example #12
def component_output_data_table():
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if jobs:
        job = jobs[0]
        return jsonify(
            FederatedScheduler.tracker_command(job, request_data,
                                               'output/table'))
    else:
        return get_json_result(retcode=100, retmsg='job not found')
Example #13
def check_dependence(job_id, role, party_id):
    job = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)[0]
    status = DependenceManager.check_job_dependence(job)
    if status:
        return get_json_result(retcode=0, retmsg='success')
    else:
        return get_json_result(
            retcode=RetCode.RUNNING,
            retmsg=f"dependency check for job {job_id} failed, "
            f"dependencies are being installed automatically, it may take a few minutes"
        )
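Unlike most handlers here, Examples #13, #17, #25 and #28 index query_job(...)[0] without checking for an empty result, so an unknown job id surfaces as an IndexError instead of a clean error response. A defensive variant (a sketch; the message style mirrors the other examples on this page):

def get_single_job(job_id, role, party_id):
    jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
    if not jobs:
        raise Exception(f"job {job_id} not found on {role} {party_id}")
    return jobs[0]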
Example #14
@classmethod
def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
    jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
    if not jobs:
        raise Exception("Failed to update job status on initiator")
    job_info = initiator_job.to_human_model_dict(
        only_primary_with=update_fields)
    for field in update_fields:
        job_info[field] = getattr(initiator_job, "f_%s" % field)
    for job in jobs:
        job_info["role"] = job.f_role
        job_info["party_id"] = job.f_party_id
        JobSaver.update_job_status(job_info=job_info)
        JobSaver.update_job(job_info=job_info)
Example #15
def update_parameters():
    job_info = request.json
    component_parameters = job_info.pop("component_parameters", None)
    job_parameters = job_info.pop("job_parameters", None)
    job_info["is_initiator"] = True
    jobs = JobSaver.query_job(**job_info)
    if not jobs:
        return get_json_result(
            retcode=RetCode.DATA_ERROR,
            retmsg=log_utils.failed_log(f"query job by {job_info}"))
    else:
        retcode, retdata = DAGScheduler.update_parameters(
            jobs[0], job_parameters, component_parameters)
        return get_json_result(retcode=retcode, data=retdata)
Example #16
def get_url():
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'),
                              role=request_data.get('role'),
                              party_id=request_data.get('party_id'))
    if jobs:
        board_urls = []
        for job in jobs:
            board_url = job_utils.get_board_url(job.f_job_id, job.f_role,
                                                job.f_party_id)
            board_urls.append(board_url)
        return get_json_result(data={'board_url': board_urls})
    else:
        return get_json_result(retcode=101, retmsg='job not found')
Example #17
def _run(self):
    job = JobSaver.query_job(job_id=self.args.job_id,
                             role=self.args.role,
                             party_id=self.args.party_id)[0]
    try:
        JobController.job_reload(job)
    except Exception as e:
        traceback.print_exc()
        JobSaver.update_job(
            job_info={
                "job_id": job.f_job_id,
                "role": job.f_role,
                "party_id": job.f_party_id,
                "inheritance_status": JobInheritanceStatus.FAILED
            })
        LOGGER.exception(e)
Example #18
def rerun_job():
    job_id = request.json.get("job_id")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        status_code, response = FederatedScheduler.request_rerun_job(
            job=jobs[0], command_body=request.json)
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(retcode=RetCode.SUCCESS,
                                   retmsg="rerun job success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="rerun job failed:\n{}".format(
                                       json_dumps(response)))
    else:
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="cannot find job")
Example #19
def update_job():
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'],
                              party_id=job_info['party_id'],
                              role=job_info['role'])
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    else:
        JobSaver.update_job(
            job_info={
                'description': job_info.get('notes', ''),
                'job_id': job_info['job_id'],
                'role': job_info['role'],
                'party_id': job_info['party_id']
            })
        return get_json_result(retcode=0, retmsg='success')
Example #20
@classmethod
def query_resource(cls, resource_in_use=True, engine_name=None):
    if not engine_name:
        engine_name = ENGINES.get(EngineType.COMPUTING)
    use_resource_jobs = JobSaver.query_job(resource_in_use=resource_in_use)
    used = []
    for job in use_resource_jobs:
        used.append({
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "core": job.f_cores,
            "memory": job.f_memory
        })
    computing_engine_resource = cls.get_engine_registration_info(
        engine_type=EngineType.COMPUTING, engine_name=engine_name)
    return used, (computing_engine_resource.to_dict()
                  if computing_engine_resource else {})
Example #21
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                               runtime_conf=job.f_runtime_conf_on_party,
                                                               train_runtime_conf=job.f_train_runtime_conf)
        else:
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                               runtime_conf=job_info.get('job_runtime_conf', {}),
                                                               train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"], party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
Example #22
@classmethod
def return_resource(cls, job_id):
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        raise Exception(f'job {job_id} not found')
    return_resource_job_list = []
    for job in jobs:
        job_info = {
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "resource_in_use": job.f_resource_in_use,
            "resource_return_status": False
        }
        if job.f_resource_in_use:
            return_status = cls.return_job_resource(
                job.f_job_id, job.f_role, job.f_party_id)
            job_info["resource_return_status"] = return_status
        return_resource_job_list.append(job_info)
    return return_resource_job_list
Example #23
@classmethod
def detect_running_job(cls):
    detect_logger().info('start detect running job')
    try:
        running_jobs = JobSaver.query_job(status=JobStatus.RUNNING,
                                          is_initiator=True)
        stop_jobs = set()
        for job in running_jobs:
            try:
                if job_utils.check_job_is_timeout(job):
                    stop_jobs.add(job)
            except Exception as e:
                detect_logger(job_id=job.f_job_id).exception(e)
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="running timeout",
                              stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect running job')
Example #24
def job_config():
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    else:
        job = jobs[0]
        response_data = dict()
        response_data['job_id'] = job.f_job_id
        response_data['dsl'] = job.f_dsl
        response_data['runtime_conf'] = job.f_runtime_conf
        response_data['train_runtime_conf'] = job.f_train_runtime_conf

        adapter = JobRuntimeConfigAdapter(job.f_runtime_conf)
        job_parameters = adapter.get_common_parameters().to_dict()
        response_data['model_info'] = {
            'model_id': job_parameters.get('model_id'),
            'model_version': job_parameters.get('model_version')
        }
        return get_json_result(retcode=0, retmsg='success', data=response_data)
Example #25
@classmethod
def output_reload(cls, job, source_tasks: dict, target_tasks: dict):
    # model reload
    schedule_logger(job.f_job_id).info("start reload model")
    source_job = JobSaver.query_job(
        job_id=job.f_inheritance_info.get("job_id"))[0]
    cls.output_model_reload(job, source_job)
    cls.checkpoint_reload(job, source_job)
    schedule_logger(job.f_job_id).info("start reload data")
    source_tracker_dict = cls.load_task_tracker(source_tasks)
    target_tracker_dict = cls.load_task_tracker(target_tasks)
    for key, source_tracker in source_tracker_dict.items():
        target_tracker = target_tracker_dict[key]
        table_infos = source_tracker.get_output_data_info()
        # data reload
        schedule_logger(job.f_job_id).info(f"table infos: {table_infos}")
        for table in table_infos:
            target_tracker.log_output_data_info(
                data_name=table.f_data_name,
                table_namespace=table.f_table_namespace,
                table_name=table.f_table_name)

        # cache reload
        schedule_logger(job.f_job_id).info("start reload cache")
        cache_list = source_tracker.query_output_cache_record()
        for cache in cache_list:
            schedule_logger(job.f_job_id).info(
                f"start reload cache name: {cache.f_cache_name}")
            target_tracker.tracking_output_cache(
                cache.f_cache, cache_name=cache.f_cache_name)

        # summary reload
        schedule_logger(job.f_job_id).info("start reload summary")
        target_tracker.reload_summary(source_tracker=source_tracker)

        # metric reload
        schedule_logger(job.f_job_id).info("start reload metric")
        target_tracker.reload_metric(source_tracker=source_tracker)

    schedule_logger(job.f_job_id).info("reload output success")
Example #26
@classmethod
def detect_running_task(cls):
    detect_logger().info('start to detect running task...')
    count = 0
    try:
        running_tasks = JobSaver.query_task(
            party_status=TaskStatus.RUNNING,
            run_on_this_party=True,
            run_ip=RuntimeConfig.JOB_SERVER_HOST,
            only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            count += 1
            try:
                process_exist = job_utils.check_job_process(
                    int(task.f_run_pid))
                if not process_exist:
                    detect_logger(job_id=task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'
                        .format(task.f_job_id, task.f_task_id,
                                task.f_task_version, task.f_role,
                                task.f_party_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info(
                'start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="task executor process abort",
                              stop_status=JobStatus.CANCELED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
Example #27
@classmethod
def stop_job(cls, job_id, role, party_id, stop_status):
    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status}")
    jobs = JobSaver.query_job(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              is_initiator=True)
    if len(jobs) > 0:
        if stop_status == JobStatus.CANCELED:
            schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
            set_cancel_status = cls.cancel_signal(job_id=job_id,
                                                  set_or_reset=True)
            schedule_logger(job_id=job_id).info(
                f"set job {job_id} cancel signal {set_cancel_status}")
        job = jobs[0]
        job.f_status = stop_status
        schedule_logger(job_id=job_id).info(
            f"request stop job {job_id} with {stop_status} to all party")
        status_code, response = FederatedScheduler.stop_job(
            job=jobs[0], stop_status=stop_status)
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            schedule_logger(job_id=job_id).info(
                f"stop job {job_id} with {stop_status} successfully")
            return RetCode.SUCCESS, "success"
        else:
            initiator_tasks_group = JobSaver.get_tasks_asc(
                job_id=job.f_job_id,
                role=job.f_role,
                party_id=job.f_party_id)
            for initiator_task in initiator_tasks_group.values():
                TaskScheduler.collect_task_of_all_party(
                    job,
                    initiator_task=initiator_task,
                    set_status=stop_status)
            schedule_logger(job_id=job_id).info(
                f"stop job {job_id} with {stop_status} failed, {response}")
            return RetCode.FEDERATED_ERROR, json_dumps(response)
    else:
        return RetCode.SUCCESS, "job not found"
Example #28
def component_rerun_check(job_id, role, party_id):
    job = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)[0]
    component_list = DependenceManager.component_check(job, check_type="rerun")
    return get_json_result(data=component_list)
Example #29
def run_do(self):
    schedule_logger().info("start schedule waiting jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=JobStatus.WAITING,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} waiting jobs")
    if len(jobs):
        # FIFO
        job = jobs[0]
        schedule_logger().info(f"schedule waiting job {job.f_job_id}")
        try:
            self.schedule_waiting_jobs(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(
                f"schedule waiting job {job.f_job_id} failed")
    schedule_logger().info("schedule waiting jobs finished")

    schedule_logger().info("start schedule running jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=JobStatus.RUNNING,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} running jobs")
    for job in jobs:
        schedule_logger().info(f"schedule running job {job.f_job_id}")
        try:
            self.schedule_running_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule running jobs finished")

    # some ready jobs exit before start
    schedule_logger().info("start schedule ready jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              ready_signal=True,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} ready jobs")
    for job in jobs:
        schedule_logger().info(f"schedule ready job {job.f_job_id}")
        try:
            self.schedule_ready_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(
                f"schedule ready job {job.f_job_id} failed:\n{e}")
    schedule_logger().info("schedule ready jobs finished")

    schedule_logger().info("start schedule rerun jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              rerun_signal=True,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} rerun jobs")
    for job in jobs:
        schedule_logger().info(f"schedule rerun job {job.f_job_id}")
        try:
            self.schedule_rerun_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule rerun jobs finished")

    schedule_logger().info(
        "start schedule end status jobs to update status")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=set(EndStatus.status_list()),
                              end_time=[
                                  current_timestamp() -
                                  END_STATUS_JOB_SCHEDULING_TIME_LIMIT,
                                  current_timestamp()
                              ])
    schedule_logger().info(f"have {len(jobs)} end status jobs")
    for job in jobs:
        schedule_logger().info(f"schedule end status job {job.f_job_id}")
        try:
            update_status = self.end_scheduling_updates(
                job_id=job.f_job_id)
            if not update_status:
                schedule_logger(job.f_job_id).info(
                    "the number of updates has been exceeded")
                continue
            self.schedule_running_job(job=job, force_sync_status=True)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule end status jobs finished")
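Example #29 leans on ordered queries: order_by="create_time" with reverse=False returns the oldest jobs first, so taking jobs[0] from the WAITING query is exactly what makes the scheduler FIFO. The same idea in isolation (a sketch; the JobStatus import path is an assumption and varies by FATE-Flow version):

from fate_flow.entity.run_status import JobStatus  # assumed path

waiting = JobSaver.query_job(is_initiator=True,
                             status=JobStatus.WAITING,
                             order_by="create_time",
                             reverse=False)
if waiting:
    next_job = waiting[0]  # oldest WAITING job, i.e. FIFO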
Example #30
@classmethod
def rerun_job(cls, job_id, initiator_role, initiator_party_id,
              component_name):
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
    )
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(
            f"cannot find job {job_id} on initiator {initiator_role} {initiator_party_id}"
        )
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=initiator_role,
                                    party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=initiator_role,
                                    party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, skip rerun"
            )
        else:
            # stop the old version of the task
            FederatedScheduler.stop_task(job=job,
                                         task=task,
                                         stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job,
                                          task=task,
                                          content_type="metrics")
            # create a new version of the task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # save the status information of all participants on the initiator for scheduling
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version}"
            )
            for _role, _party_ids in job.f_runtime_conf_on_party[
                    "role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(
                        job_id,
                        _role,
                        _party_id,
                        False,
                        job.f_initiator_role,
                        job.f_initiator_party_id,
                        RunParameters(
                            **job.f_runtime_conf_on_party["job_parameters"]),
                        dsl_parser,
                        component_name=task.f_component_name,
                        task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version} successfully"
            )
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(
            job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(
                f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(
                f"job {job_id} set rerun signal failed")
    else:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(
            job_id=job_id).info(f"job {job_id} no task to rerun")