def create_job(job_id, role, party_id): try: JobController.create_job(job_id=job_id, role=role, party_id=int(party_id), job_info=request.json) return get_json_result(retcode=0, retmsg='success') except RuntimeError as e: return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg=str(e))
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name): schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}") jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id) if jobs: job = jobs[0] else: raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}") if component_name != job_utils.job_virtual_component_name(): tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name) else: tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id) job_can_rerun = False dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl, runtime_conf=job.f_runtime_conf, train_runtime_conf=job.f_train_runtime_conf) for task in tasks: if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}: if task.f_status == TaskStatus.WAITING: job_can_rerun = True schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun") else: # stop old version task FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED) FederatedScheduler.clean_task(job=job, task=task, content_type="metrics") # create new version task task.f_task_version = task.f_task_version + 1 task.f_run_pid = None task.f_run_ip = None FederatedScheduler.create_task(job=job, task=task) # Save the status information of all participants in the initiator for scheduling schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}") for _role, _party_ids in job.f_runtime_conf["role"].items(): for _party_id in _party_ids: if _role == initiator_role and _party_id == initiator_party_id: continue JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version) schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully") job_can_rerun = True if job_can_rerun: if EndStatus.contains(job.f_status): job.f_status = JobStatus.WAITING job.f_end_time = None job.f_elapsed = None job.f_progress = 0 schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun") status, response = FederatedScheduler.sync_job_status(job=job) if status == FederatedSchedulingStatusCode.SUCCESS: FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"]) JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id) schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully") else: schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed") else: # status updates may be delayed, and in a very small probability they will be executed after the rerun command schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task") else: schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name): schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}") jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id) if jobs: job = jobs[0] else: raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}") if component_name != job_utils.job_virtual_component_name(): tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name) else: tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id) job_can_rerun = False dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl, runtime_conf=job.f_runtime_conf_on_party, train_runtime_conf=job.f_train_runtime_conf) for task in tasks: if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}: if task.f_status == TaskStatus.WAITING: job_can_rerun = True schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun") else: # stop old version task FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED) FederatedScheduler.clean_task(job=job, task=task, content_type="metrics") # create new version task task.f_task_version = task.f_task_version + 1 task.f_run_pid = None task.f_run_ip = None FederatedScheduler.create_task(job=job, task=task) # Save the status information of all participants in the initiator for scheduling schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}") for _role, _party_ids in job.f_runtime_conf_on_party["role"].items(): for _party_id in _party_ids: if _role == initiator_role and _party_id == initiator_party_id: continue JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version) schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully") job_can_rerun = True if job_can_rerun: schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal") status = cls.rerun_signal(job_id=job_id, set_or_reset=True) if status: schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully") else: schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed") else: FederatedScheduler.sync_job_status(job=job) schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def stop_job(): job_id = request.json.get('job_id') stop_status = request.json.get("stop_status", "canceled") jobs = JobSaver.query_job(job_id=job_id) if jobs: schedule_logger(job_id).info(f"stop job on this party") kill_status, kill_details = JobController.stop_jobs( job_id=job_id, stop_status=stop_status) schedule_logger(job_id).info( f"stop job on this party status {kill_status}") schedule_logger(job_id).info( f"request stop job {jobs[0]} to {stop_status}") status_code, response = FederatedScheduler.request_stop_job( job=jobs[0], stop_status=stop_status, command_body=jobs[0].to_json()) if status_code == FederatedSchedulingStatusCode.SUCCESS: return get_json_result( retcode=RetCode.SUCCESS, retmsg=f"stop job on this party {kill_status};\n" f"stop job on all party success") else: return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg="stop job on this party {};\n" "stop job failed:\n{}".format( kill_status, json_dumps(response, indent=4))) else: schedule_logger(job_id).info(f"can not found job {job_id} to stop") return get_json_result(retcode=RetCode.DATA_ERROR, retmsg="can not found job")
def stop_job(job_id, role, party_id, stop_status): kill_status, kill_details = JobController.stop_jobs( job_id=job_id, stop_status=stop_status, role=role, party_id=party_id) return get_json_result( retcode=RetCode.SUCCESS if kill_status else RetCode.EXCEPTION_ERROR, retmsg='success' if kill_status else 'failed', data=kill_details)
def cancel_job(job_id, role, party_id): status = JobController.cancel_job(job_id=job_id, role=role, party_id=int(party_id)) if status: return get_json_result(retcode=0, retmsg='cancel job success') else: return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg='cancel job failed')
def stop_job(cls, job_id, role, party_id, stop_status): schedule_logger(job_id=job_id).info(f"request stop job {job_id}") jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id, is_initiator=True) if len(jobs) > 0: schedule_logger(job_id=job_id).info(f"initiator cancel job {job_id}") JobController.cancel_job(job_id=job_id, role=role, party_id=party_id) job = jobs[0] job.f_status = stop_status schedule_logger(job_id=job_id).info(f"request cancel job {job_id} to all party") status_code, response = FederatedScheduler.stop_job(job=jobs[0], stop_status=stop_status) if status_code == FederatedSchedulingStatusCode.SUCCESS: schedule_logger(job_id=job_id).info(f"cancel job {job_id} successfully") return RetCode.SUCCESS, "success" else: schedule_logger(job_id=job_id).info(f"cancel job {job_id} failed, {response}") return RetCode.FEDERATED_ERROR, json_dumps(response) else: schedule_logger(job_id=job_id).info(f"can not found job {job_id} to stop, delete event on {role} {party_id}") JobQueue.delete_event(job_id=job_id) return RetCode.SUCCESS, "can not found job, delete job waiting event"
def stop_job(job_id, role, party_id, stop_status): jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id) kill_status = True kill_details = {} for job in jobs: kill_job_status, kill_job_details = JobController.stop_job( job=job, stop_status=stop_status) kill_status = kill_status & kill_job_status kill_details[job_id] = kill_job_details return get_json_result( retcode=RetCode.SUCCESS if kill_status else RetCode.EXCEPTION_ERROR, retmsg='success' if kill_status else 'failed', data=kill_details)
def job_status(job_id, role, party_id, status): job_info = {} job_info.update({ "job_id": job_id, "role": role, "party_id": party_id, "status": status }) if JobController.update_job_status(job_info=job_info): return get_json_result(retcode=0, retmsg='success') else: return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg="update job status failed")
def align_input_data_partitions(cls, job: Job, dsl_parser): job_args = dsl_parser.get_args_input() dataset = JobController.get_dataset(is_initiator=True, role=job.f_initiator_role, party_id=job.f_initiator_party_id, roles=job.f_roles, job_args=job_args) input_data_aligned_partitions = 0 # Large integers are not used status, partys_response = FederatedScheduler.align_args(job=job, command_body=dataset) schedule_logger(job_id=job.f_job_id).info("align partition partys response: {}".format(partys_response)) for role, party_args in partys_response.items(): for party_id, response in party_args.items(): if not response["retcode"]: partitions = response.get('data').get('min_input_data_partition') if partitions: if input_data_aligned_partitions == 0 or partitions < input_data_aligned_partitions: input_data_aligned_partitions = partitions return input_data_aligned_partitions
def query_job_input_args(job_id, role, party_id): job_input_args = JobController.query_job_input_args( input_data=request.json, role=role, party_id=party_id) return get_json_result(retcode=0, retmsg='success', data=job_input_args)
def update_job(job_id, role, party_id): job_info = {} job_info.update(request.json) job_info.update({"job_id": job_id, "role": role, "party_id": party_id}) JobController.update_job(job_info=job_info) return get_json_result(retcode=0, retmsg='success')
def start_job(job_id, role, party_id): JobController.start_job(job_id=job_id, role=role, party_id=int(party_id), extra_info=request.json) return get_json_result(retcode=0, retmsg='success')
def submit(cls, job_data, job_id=None): if not job_id: job_id = job_utils.generate_job_id() schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data)) job_dsl = job_data.get('job_dsl', {}) job_runtime_conf = job_data.get('job_runtime_conf', {}) job_initiator = job_runtime_conf['initiator'] job_parameters = RunParameters(**job_runtime_conf['job_parameters']) cls.backend_compatibility(job_parameters=job_parameters) job_utils.check_job_runtime_conf(job_runtime_conf) if job_parameters.job_type != 'predict': # generate job model info job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role']) job_parameters.model_version = job_id train_runtime_conf = {} else: detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version']) # get inference dsl from pipeline model as job dsl tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'], model_id=job_parameters.model_id, model_version=job_parameters.model_version) pipeline_model = tracker.get_output_model('pipeline') if not job_dsl: job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl) train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf) path_dict = job_utils.save_job_conf(job_id=job_id, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf, pipeline_dsl=None) job = Job() job.f_job_id = job_id job.f_dsl = job_dsl job_runtime_conf["job_parameters"] = job_parameters.to_dict() job.f_runtime_conf = job_runtime_conf job.f_train_runtime_conf = train_runtime_conf job.f_roles = job_runtime_conf['role'] job.f_work_mode = job_parameters.work_mode job.f_initiator_role = job_initiator['role'] job.f_initiator_party_id = job_initiator['party_id'] initiator_role = job_initiator['role'] initiator_party_id = job_initiator['party_id'] if initiator_party_id not in job_runtime_conf['role'][initiator_role]: schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id)) raise Exception("initiator party id error {}".format(initiator_party_id)) dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf) cls.adapt_job_parameters(job_parameters=job_parameters) # update runtime conf job_runtime_conf["job_parameters"] = job_parameters.to_dict() job.f_runtime_conf = job_runtime_conf status_code, response = FederatedScheduler.create_job(job=job) if status_code != FederatedSchedulingStatusCode.SUCCESS: raise Exception("create job failed: {}".format(response)) if job_parameters.work_mode == WorkMode.CLUSTER: # Save the status information of all participants in the initiator for scheduling for role, party_ids in job_runtime_conf["role"].items(): for party_id in party_ids: if role == job_initiator['role'] and party_id == job_initiator['party_id']: continue JobController.initialize_tasks(job_id, role, party_id, False, job_initiator, job_parameters, dsl_parser) # push into queue try: JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id) except Exception as e: raise Exception(f'push job into queue failed:\n{e}') schedule_logger(job_id).info( 'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id)) board_url = "http://{}:{}{}".format( ServiceUtils.get_item("fateboard", "host"), ServiceUtils.get_item("fateboard", "port"), FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id']) logs_directory = job_utils.get_job_log_directory(job_id) return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \ {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url
def clean(job_id, role, party_id): JobController.clean_job(job_id=job_id, role=role, party_id=party_id, roles=request.json) return get_json_result(retcode=0, retmsg='success')
def submit(cls, job_data, job_id=None): if not job_id: job_id = job_utils.generate_job_id() schedule_logger(job_id).info('submit job, job_id {}, body {}'.format( job_id, job_data)) job_dsl = job_data.get('job_dsl', {}) job_runtime_conf = job_data.get('job_runtime_conf', {}) job_utils.check_job_runtime_conf(job_runtime_conf) job_initiator = job_runtime_conf['initiator'] conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf) common_job_parameters = conf_adapter.get_common_parameters() if common_job_parameters.job_type != 'predict': # generate job model info common_job_parameters.model_id = model_utils.gen_model_id( job_runtime_conf['role']) common_job_parameters.model_version = job_id train_runtime_conf = {} else: # check predict job parameters detect_utils.check_config(common_job_parameters.to_dict(), ['model_id', 'model_version']) # get inference dsl from pipeline model as job dsl tracker = Tracker( job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'], model_id=common_job_parameters.model_id, model_version=common_job_parameters.model_version) pipeline_model = tracker.get_output_model('pipeline') if not job_dsl: job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl) train_runtime_conf = json_loads( pipeline_model['Pipeline'].train_runtime_conf) job = Job() job.f_job_id = job_id job.f_dsl = job_dsl job.f_train_runtime_conf = train_runtime_conf job.f_roles = job_runtime_conf['role'] job.f_work_mode = common_job_parameters.work_mode job.f_initiator_role = job_initiator['role'] job.f_initiator_party_id = job_initiator['party_id'] path_dict = job_utils.save_job_conf( job_id=job_id, role=job.f_initiator_role, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf, job_runtime_conf_on_party={}, train_runtime_conf=train_runtime_conf, pipeline_dsl=None) if job.f_initiator_party_id not in job_runtime_conf['role'][ job.f_initiator_role]: schedule_logger(job_id).info("initiator party id error:{}".format( job.f_initiator_party_id)) raise Exception("initiator party id error {}".format( job.f_initiator_party_id)) # create common parameters on initiator JobController.backend_compatibility( job_parameters=common_job_parameters) JobController.adapt_job_parameters( role=job.f_initiator_role, job_parameters=common_job_parameters, create_initiator_baseline=True) job.f_runtime_conf = conf_adapter.update_common_parameters( common_parameters=common_job_parameters) dsl_parser = schedule_utils.get_job_dsl_parser( dsl=job.f_dsl, runtime_conf=job.f_runtime_conf, train_runtime_conf=job.f_train_runtime_conf) # initiator runtime conf as template job.f_runtime_conf_on_party = job.f_runtime_conf.copy() job.f_runtime_conf_on_party[ "job_parameters"] = common_job_parameters.to_dict() if common_job_parameters.work_mode == WorkMode.CLUSTER: # Save the status information of all participants in the initiator for scheduling for role, party_ids in job.f_roles.items(): for party_id in party_ids: if role == job.f_initiator_role and party_id == job.f_initiator_party_id: continue JobController.initialize_tasks(job_id, role, party_id, False, job.f_initiator_role, job.f_initiator_party_id, common_job_parameters, dsl_parser) status_code, response = FederatedScheduler.create_job(job=job) if status_code != FederatedSchedulingStatusCode.SUCCESS: job.f_status = JobStatus.FAILED job.f_tag = "submit_failed" FederatedScheduler.sync_job_status(job=job) raise Exception("create job failed", response) schedule_logger(job_id).info( 'submit job successfully, job id is {}, model id is {}'.format( job.f_job_id, common_job_parameters.model_id)) board_url = "http://{}:{}{}".format( ServiceUtils.get_item("fateboard", "host"), ServiceUtils.get_item("fateboard", "port"), FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id']) logs_directory = job_utils.get_job_log_directory(job_id) submit_result = { "job_id": job_id, "model_info": { "model_id": common_job_parameters.model_id, "model_version": common_job_parameters.model_version }, "logs_directory": logs_directory, "board_url": board_url } submit_result.update(path_dict) return submit_result
def save_pipelined_model(job_id, role, party_id): JobController.save_pipelined_model(job_id=job_id, role=role, party_id=party_id) return get_json_result(retcode=0, retmsg='success')