def create_job(job_id, role, party_id):
    """Create a job through JobController using the current request's JSON body.

    ``party_id`` is converted to ``int`` before being passed on.

    Returns:
        A success JSON result, or an ``OPERATING_ERROR`` JSON result whose
        message is the text of the ``RuntimeError`` that was raised.
    """
    try:
        numeric_party_id = int(party_id)
        payload = request.json
        JobController.create_job(
            job_id=job_id,
            role=role,
            party_id=numeric_party_id,
            job_info=payload,
        )
        return get_json_result(retcode=0, retmsg='success')
    except RuntimeError as e:
        return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg=str(e))
def stop_job(job_id, role, party_id, stop_status):
    """Stop the matching jobs through JobController and report the outcome.

    Returns:
        A JSON result carrying the kill details as ``data``; the return code
        is ``SUCCESS`` when the kill status is truthy, ``EXCEPTION_ERROR``
        otherwise.
    """
    kill_status, kill_details = JobController.stop_jobs(
        job_id=job_id,
        stop_status=stop_status,
        role=role,
        party_id=party_id,
    )
    if kill_status:
        retcode, retmsg = RetCode.SUCCESS, 'success'
    else:
        retcode, retmsg = RetCode.EXCEPTION_ERROR, 'failed'
    return get_json_result(retcode=retcode, retmsg=retmsg, data=kill_details)
def _run(self):
    """Reload the job identified by ``self.args``.

    The first job returned by the query is reloaded. If reloading raises,
    the traceback is printed, the job's inheritance status is saved as
    ``FAILED``, and the exception is logged; it is not re-raised.
    """
    args = self.args
    matched_jobs = JobSaver.query_job(
        job_id=args.job_id,
        role=args.role,
        party_id=args.party_id,
    )
    # Indexing happens outside the try block, so an empty result raises.
    job = matched_jobs[0]
    try:
        JobController.job_reload(job)
    except Exception as e:
        traceback.print_exc()
        failure_record = {
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "inheritance_status": JobInheritanceStatus.FAILED,
        }
        JobSaver.update_job(job_info=failure_record)
        LOGGER.exception(e)
def stop_job():
    """Stop a job locally, then request a federated stop.

    Reads ``job_id`` and ``stop_status`` (default ``"canceled"``) from the
    request's JSON body.

    Returns:
        ``DATA_ERROR`` when no job matches ``job_id``; ``SUCCESS`` when the
        federated stop request succeeds; ``OPERATING_ERROR`` (with the
        JSON-dumped response in the message) otherwise.
    """
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")

    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    target_job = jobs[0]

    # Local stop first; its status is only reported, not acted upon.
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, kill_details = JobController.stop_jobs(
        job_id=job_id, stop_status=stop_status)
    schedule_logger(job_id).info(
        f"stop job on this party status {kill_status}")

    # Then forward the stop request through the federated scheduler.
    schedule_logger(job_id).info(
        f"request stop job {target_job} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=target_job,
        stop_status=stop_status,
        command_body=target_job.to_json())

    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(
            retcode=RetCode.OPERATING_ERROR,
            retmsg="stop job on this party {};\n"
                   "stop job failed:\n{}".format(
                       kill_status, json_dumps(response, indent=4)))

    return get_json_result(
        retcode=RetCode.SUCCESS,
        retmsg=f"stop job on this party {kill_status};\n"
               f"stop job on all party success")
def test_gen_updated_parameters(self):
    """Check that gen_updated_parameters reflects the input component parameters.

    Calls ``JobController.gen_updated_parameters`` for a fixed stored job with
    empty job parameters and a set of component-parameter overrides, prints
    the results, and asserts that the first element returned by
    ``check(input_component_parameters, updated_component_parameters)`` is
    truthy.
    """
    job_id = "202110211127411105150"
    initiator_role = "guest"
    initiator_party_id = 9999

    # The test runs with no job-parameter overrides. An earlier dict with
    # "auto_retries"/"auto_retry_delay" was immediately overwritten by this
    # empty one and has been removed as a dead store.
    input_job_parameters = {}
    input_component_parameters = {
        "common": {
            "hetero_lr_0": {
                "alpha": 0.02,
            },
        },
        "role": {
            "guest": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_guest",
                            "namespace": "unitest_experiment",
                        },
                    },
                    "homo_nn_0": {
                        "with_label": True,
                        "output_format": "dense",
                    },
                },
            },
            "host": {
                "1": {
                    "dataio_0": {
                        "with_label": True,
                        "output_format": "dense",
                    },
                    "evaluation_0": {
                        "need_run": True,
                    },
                },
            },
        },
    }

    job_configuration = job_utils.get_job_configuration(
        job_id=job_id,
        role=initiator_role,
        party_id=initiator_party_id)
    # Not used below; these lookups raise KeyError if the stored runtime conf
    # lacks either section.
    origin_job_parameters = job_configuration.runtime_conf["job_parameters"]
    origin_component_parameters = job_configuration.runtime_conf["component_parameters"]

    updated_job_parameters, updated_component_parameters, updated_components = JobController.gen_updated_parameters(
        job_id=job_id,
        initiator_role=initiator_role,
        initiator_party_id=initiator_party_id,
        input_job_parameters=input_job_parameters,
        input_component_parameters=input_component_parameters)

    jprint(updated_job_parameters)
    jprint(updated_component_parameters)
    self.assertTrue(
        check(input_component_parameters, updated_component_parameters)[0])
def create_new_version_task(cls, job, task, dsl_parser, auto):
    """Cancel the current version of ``task`` and create its next version.

    The task's version is incremented in place; when ``auto`` is truthy its
    remaining auto-retry count is decremented as well. After the federated
    create succeeds, tasks are initialized for every role/party in the
    response except the initiator.

    Raises:
        Exception: if the federated task creation does not return SUCCESS.
    """
    # stop old version task
    FederatedScheduler.stop_task(
        job=job, task=task, stop_status=TaskStatus.CANCELED)
    FederatedScheduler.clean_task(
        job=job, task=task, content_type=TaskCleanResourceType.METRICS)

    # create new version task
    task.f_task_version += 1
    if auto:
        task.f_auto_retries -= 1
    task.f_run_pid = None
    task.f_run_ip = None

    # todo: FederatedScheduler.create_task and JobController.initialize_tasks will create task twice
    status_code, response = FederatedScheduler.create_task(job=job, task=task)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception(f"create {task.f_task_id} new version failed")

    # create the task holder in db to record information of all participants in the initiator for scheduling
    for _role in response:
        for _party_id in response[_role]:
            is_initiator = (_role == job.f_initiator_role
                            and _party_id == job.f_initiator_party_id)
            if not is_initiator:
                JobController.initialize_tasks(
                    job_id=job.f_job_id,
                    role=_role,
                    party_id=_party_id,
                    run_on_this_party=False,
                    initiator_role=job.f_initiator_role,
                    initiator_party_id=job.f_initiator_party_id,
                    job_parameters=RunParameters(
                        **job.f_runtime_conf_on_party["job_parameters"]),
                    dsl_parser=dsl_parser,
                    components=[task.f_component_name],
                    task_version=task.f_task_version,
                    auto_retries=task.f_auto_retries)

    schedule_logger(job.f_job_id).info(
        f"create task {task.f_task_id} new version {task.f_task_version} successfully"
    )
def job_status(job_id, role, party_id, status):
    """Update a job's status through JobController.

    Returns:
        A success JSON result when the update call returns a truthy value,
        otherwise an ``OPERATING_ERROR`` JSON result.
    """
    job_info = {
        "job_id": job_id,
        "role": role,
        "party_id": party_id,
        "status": status,
    }
    updated = JobController.update_job_status(job_info=job_info)
    if not updated:
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg="update job status failed")
    return get_json_result(retcode=0, retmsg='success')
def job_status(job_id, role, party_id, status):
    """Update a job's status using the request body plus the given identifiers.

    The request's JSON body is used as the job info and is updated in place
    with ``job_id``, ``role``, ``party_id`` and ``status``.

    Returns:
        A success JSON result when the update call returns a truthy value,
        otherwise a ``NOT_EFFECTIVE`` JSON result.
    """
    job_info = request.json
    # some value of job_info is initiator, should be updated
    job_info.update(job_id=job_id, role=role, party_id=party_id, status=status)

    took_effect = JobController.update_job_status(job_info=job_info)
    if not took_effect:
        return get_json_result(retcode=RetCode.NOT_EFFECTIVE,
                               retmsg="update job status does not take effect")
    return get_json_result(retcode=0, retmsg='success')
def component_check(cls, job, check_type="inheritance"):
    """Find components whose output tables have no storage meta.

    For ``check_type == "rerun"`` the job's own latest successful tasks are
    inspected; for any other value the tasks named in
    ``job.f_inheritance_info`` are loaded instead.

    Returns:
        ``"rerun"``: the list of component names with a missing table (a name
        is appended once per missing table).
        ``"inheritance"``: the inheritance component list minus those
        components, as a list built from a set difference.
        Any other ``check_type``: ``None``.
    """
    if check_type == "rerun":
        succeeded_tasks = JobSaver.query_task(
            job_id=job.f_job_id,
            party_id=job.f_party_id,
            role=job.f_role,
            status=TaskStatus.SUCCESS,
            only_latest=True)
        tasks = {t.f_component_name: t for t in succeeded_tasks}
    else:
        tasks = JobController.load_tasks(
            component_list=job.f_inheritance_info.get("component_list", []),
            job_id=job.f_inheritance_info.get("job_id"),
            role=job.f_role,
            party_id=job.f_party_id)

    tracker_dict = JobController.load_task_tracker(tasks)
    missing_components = []
    # data dependence
    for tracker in tracker_dict.values():
        for table in tracker.get_output_data_info():
            table_meta = storage.StorageTableMeta(
                name=table.f_table_name,
                namespace=table.f_table_namespace)
            if not table_meta:
                missing_components.append(tracker.component_name)

    if check_type == "rerun":
        return missing_components
    if check_type == "inheritance":
        # reload component list
        inherited = set(job.f_inheritance_info.get("component_list", []))
        return list(inherited - set(missing_components))
    return None
def update_parameters(cls, job, job_parameters, component_parameters):
    """Generate updated parameters for ``job`` and push them via the federated scheduler.

    Returns:
        ``(RetCode.SUCCESS, updated_parameters)`` when the federated update
        succeeds, otherwise ``(RetCode.OPERATING_ERROR, response)``.
    """
    generated = JobController.gen_updated_parameters(
        job_id=job.f_job_id,
        initiator_role=job.f_initiator_role,
        initiator_party_id=job.f_initiator_party_id,
        input_job_parameters=job_parameters,
        input_component_parameters=component_parameters)
    updated_job_parameters, updated_component_parameters, updated_components = generated

    schedule_logger(job.f_job_id).info(
        f"components {updated_components} parameters has been updated")

    updated_parameters = {
        "job_parameters": updated_job_parameters,
        "component_parameters": updated_component_parameters,
        "components": updated_components,
    }
    status_code, response = FederatedScheduler.update_parameter(
        job, updated_parameters=updated_parameters)

    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        return RetCode.OPERATING_ERROR, response
    return RetCode.SUCCESS, updated_parameters
def create_job(job_id, role, party_id):
    """Create a job after checking the sender's FATE version.

    When the request body carries ``src_fate_ver`` and it compares as ``'lt'``
    against ``'1.7.0'``, an ``INCOMPATIBLE_FATE_VER`` result is returned and
    no job is created.

    Returns:
        A success JSON result carrying the controller's return value as
        ``data``, or an ``OPERATING_ERROR`` JSON result (with the job id as
        ``data``) when ``RuntimeError`` is raised.
    """
    src_fate_ver = request.json.get('src_fate_ver')
    too_old = (src_fate_ver is not None
               and compare_version(src_fate_ver, '1.7.0') == 'lt')
    if too_old:
        version_info = {
            'src_fate_ver': src_fate_ver,
            "current_fate_ver": RuntimeConfig.get_env('FATE'),
        }
        return get_json_result(retcode=RetCode.INCOMPATIBLE_FATE_VER,
                               retmsg='Incompatible FATE versions',
                               data=version_info)

    try:
        result = JobController.create_job(job_id=job_id,
                                          role=role,
                                          party_id=int(party_id),
                                          job_info=request.json)
        return get_json_result(retcode=0, retmsg='success', data=result)
    except RuntimeError as e:
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg=str(e),
                               data={"job_id": job_id})
def start_job(job_id, role, party_id):
    """Start a job through JobController, passing the request's JSON body as extra info.

    ``party_id`` is converted to ``int`` before being passed on.
    """
    numeric_party_id = int(party_id)
    extra_info = request.json
    JobController.start_job(
        job_id=job_id,
        role=role,
        party_id=numeric_party_id,
        extra_info=extra_info,
    )
    return get_json_result(retcode=0, retmsg='success')
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None) -> dict:
    """Submit a job from the initiator side and return a result dict.

    A job id is generated when none is given. The whole submission runs
    inside one ``try`` block: any ``Exception`` is logged and reported through
    the returned dict rather than raised.

    Args:
        submit_job_conf: job configuration providing ``dsl``, ``runtime_conf``
            and ``to_dict()``.
        job_id: optional job id; generated when falsy.

    Returns:
        A dict that always contains ``"job_id"``. On success it also contains
        ``"code"``, ``"message"``, ``"model_info"``, ``"logs_directory"``,
        ``"board_url"`` and the entries returned by
        ``job_utils.save_job_conf``. On failure ``"code"`` is
        ``RetCode.OPERATING_ERROR`` and ``"message"`` is the exception's
        trace string.
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(
        f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        # Work on a deep copy of the submitted runtime conf (presumably so
        # later adjustments do not touch the caller's object).
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()
        if common_job_parameters.job_type != "predict":
            # generate job model info
            conf_version = schedule_utils.get_conf_version(runtime_conf)
            if conf_version != 2:
                raise Exception(
                    "only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(
                runtime_conf["role"])
            # For non-predict jobs the job id doubles as the model version.
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ["model_id", "model_version"])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(
                job_id=job_id,
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_pipeline_model()
            train_runtime_conf = json_loads(
                pipeline_model.train_runtime_conf)
            if not model_utils.check_if_deployed(
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"],
                    model_id=common_job_parameters.model_id,
                    model_version=common_job_parameters.model_version):
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            # The submitted dsl is replaced by the model's inference dsl.
            dsl = json_loads(pipeline_model.inference_dsl)
        # dsl = ProviderManager.fill_fate_flow_provider(dsl)
        # Build the initiator's Job record; role/party are the initiator's own.
        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        job.f_initiator_role = job_initiator["role"]
        job.f_initiator_party_id = job_initiator["party_id"]
        job.f_role = job_initiator["role"]
        job.f_party_id = job_initiator["party_id"]
        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            party_id=job.f_initiator_party_id,
            dsl=dsl,
            runtime_conf=runtime_conf,
            runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)
        # The initiator must be listed among the parties of its own role.
        if job.f_initiator_party_id not in runtime_conf["role"][
                job.f_initiator_role]:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)
        # create common parameters on initiator
        JobController.create_common_job_parameters(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)
        # initiator runtime conf as template
        # NOTE: .copy() is shallow; only the top-level "job_parameters" entry
        # is replaced on the copy.
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()
        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        job.f_inheritance_status = JobInheritanceStatus.WAITING if common_job_parameters.inheritance_info else JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            inheritance_jobs = JobSaver.query_job(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                only_latest=True)
            job_utils.check_job_inheritance_parameters(
                job, inheritance_jobs, inheritance_tasks)
        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            # Mark the job failed and sync that status before reporting.
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)
        else:
            # Per role and party, collect the component names whose
            # "need_run" flag in the create-job response is exactly True.
            need_run_components = {}
            for role in response:
                need_run_components[role] = {}
                for party, res in response[role].items():
                    need_run_components[role][party] = [
                        name for name, value in response[role][party]
                        ["data"]["components"].items()
                        if value["need_run"] is True
                    ]
            if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
                # create the task holder in db to record information of all participants in the initiator for scheduling
                for role, party_ids in job.f_roles.items():
                    for party_id in party_ids:
                        # The initiator itself is skipped here.
                        if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                            continue
                        # Parties with nothing to run get no task holders.
                        if not need_run_components[role][party_id]:
                            continue
                        JobController.initialize_tasks(
                            job_id=job_id,
                            role=role,
                            party_id=party_id,
                            run_on_this_party=False,
                            initiator_role=job.f_initiator_role,
                            initiator_party_id=job.f_initiator_party_id,
                            job_parameters=common_job_parameters,
                            dsl_parser=dsl_parser,
                            components=need_run_components[role][party_id])
            job.f_status = JobStatus.WAITING
            status_code, response = FederatedScheduler.sync_job_status(
                job=job)
            if status_code != FederatedSchedulingStatusCode.SUCCESS:
                raise Exception("set job to waiting status failed")
            schedule_logger(job_id).info(
                f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
            )
            logs_directory = job_utils.get_job_log_directory(job_id)
            result = {
                "code": RetCode.SUCCESS,
                "message": "success",
                "model_info": {
                    "model_id": common_job_parameters.model_id,
                    "model_version": common_job_parameters.model_version
                },
                "logs_directory": logs_directory,
                "board_url":
                job_utils.get_board_url(job_id, job_initiator["role"],
                                        job_initiator["party_id"])
            }
            # The removed-parameter check runs on the originally submitted
            # runtime conf, not on the deep copy used above.
            warn_parameter = JobRuntimeConfigAdapter(
                submit_job_conf.runtime_conf).check_removed_parameter()
            if warn_parameter:
                result[
                    "message"] = f"[WARN]{warn_parameter} is removed,it does not take effect!"
            submit_result.update(result)
            submit_result.update(path_dict)
    except Exception as e:
        # Failures are reported through the result dict, not raised.
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result
def clean(job_id, role, party_id):
    """Clean a job through JobController, passing the request's JSON body as ``roles``."""
    roles = request.json
    JobController.clean_job(
        job_id=job_id,
        role=role,
        party_id=party_id,
        roles=roles,
    )
    return get_json_result(retcode=0, retmsg='success')
def save_pipelined_model(job_id, role, party_id):
    """Save the pipelined model for the given job, role and party via JobController."""
    target = {"job_id": job_id, "role": role, "party_id": party_id}
    JobController.save_pipelined_model(**target)
    return get_json_result(retcode=0, retmsg='success')
def align_job_args(job_id, role, party_id):
    """Align job arguments via JobController, using the request's JSON body as job info."""
    job_info = request.json
    JobController.align_job_args(
        job_info=job_info,
        role=role,
        party_id=party_id,
        job_id=job_id,
    )
    return get_json_result(retcode=0, retmsg='success')
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Prepare a job for rerun on the initiator and set the rerun signal.

    Tasks are looked up for the given component, or for all components when
    ``component_name`` equals the job's virtual component name. Tasks in
    WAITING or SUCCESS status are left alone; every other task is stopped,
    has its metrics cleaned and is recreated with an incremented version,
    with tasks initialized for every non-initiator party.

    Raises:
        RuntimeError: if the job cannot be found on the initiator.
    """
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
    )
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if not jobs:
        raise RuntimeError(
            f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
        )
    job = jobs[0]

    # Restrict the task query to one component unless the whole job is meant.
    task_filter = {
        "job_id": job_id,
        "role": initiator_role,
        "party_id": initiator_party_id,
    }
    if component_name != job_utils.job_virtual_component_name():
        task_filter["component_name"] = component_name
    tasks = JobSaver.query_task(**task_filter)

    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)

    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            # A waiting task still makes the job rerunnable; a successful
            # one on its own does not.
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
            )
            continue

        # stop old version task
        FederatedScheduler.stop_task(job=job,
                                     task=task,
                                     stop_status=TaskStatus.CANCELED)
        FederatedScheduler.clean_task(job=job,
                                      task=task,
                                      content_type="metrics")
        # create new version task
        task.f_task_version += 1
        task.f_run_pid = None
        task.f_run_ip = None
        FederatedScheduler.create_task(job=job, task=task)
        # Save the status information of all participants in the initiator for scheduling
        schedule_logger(job_id=job_id).info(
            f"create task {task.f_task_id} new version {task.f_task_version}"
        )
        for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
            for _party_id in _party_ids:
                if _role != initiator_role or _party_id != initiator_party_id:
                    JobController.initialize_tasks(
                        job_id, _role, _party_id, False,
                        job.f_initiator_role, job.f_initiator_party_id,
                        RunParameters(
                            **job.f_runtime_conf_on_party["job_parameters"]),
                        dsl_parser,
                        component_name=task.f_component_name,
                        task_version=task.f_task_version)
        schedule_logger(job_id=job_id).info(
            f"create task {task.f_task_id} new version {task.f_task_version} successfully"
        )
        job_can_rerun = True

    if not job_can_rerun:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
        return

    schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
    status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    if status:
        schedule_logger(job_id=job_id).info(
            f"job {job_id} set rerun signal successfully")
    else:
        schedule_logger(job_id=job_id).info(
            f"job {job_id} set rerun signal failed")
def query_job_input_args(job_id, role, party_id):
    """Return the job input args that JobController computes from the request's JSON body.

    ``job_id`` is accepted but not forwarded to the controller.
    """
    input_data = request.json
    job_input_args = JobController.query_job_input_args(
        input_data=input_data,
        role=role,
        party_id=party_id,
    )
    return get_json_result(retcode=0, retmsg='success', data=job_input_args)
def create_task(job_id, component_name, task_id, task_version, role, party_id):
    """Initialize a task via JobController from the request's JSON body.

    Only ``role`` and ``party_id`` are forwarded; the other parameters are
    accepted but unused here.
    """
    task_info = request.json
    JobController.initialize_task(role, party_id, task_info)
    return get_json_result(retcode=0, retmsg='success')
def update_parameters(job_id, role, party_id):
    """Apply the request's JSON body as updated parameters via JobController."""
    updated_parameters = request.json
    JobController.update_parameter(
        job_id=job_id,
        role=role,
        party_id=party_id,
        updated_parameters=updated_parameters,
    )
    return get_json_result(retcode=0, retmsg='success')
def update_job(job_id, role, party_id):
    """Update a job via JobController using the request body plus the given identifiers.

    A new dict is built from the request's JSON body; ``job_id``, ``role``
    and ``party_id`` then override any same-named entries.
    """
    job_info = dict(request.json)
    job_info["job_id"] = job_id
    job_info["role"] = role
    job_info["party_id"] = party_id
    JobController.update_job(job_info=job_info)
    return get_json_result(retcode=0, retmsg='success')
def submit(cls, job_data, job_id=None):
    """Submit a job from the initiator side and return its submission info.

    A job id is generated when none is given. Unlike a result-dict style
    API, errors in this function propagate to the caller as exceptions.

    Args:
        job_data: mapping read with ``.get`` for ``'job_dsl'`` and
            ``'job_runtime_conf'`` (each defaults to ``{}``).
        job_id: optional job id; generated when falsy.

    Returns:
        A dict with ``"job_id"``, ``"model_info"``, ``"logs_directory"`` and
        ``"board_url"``, updated with the entries returned by
        ``job_utils.save_job_conf``.

    Raises:
        Exception: when a predict job's model has not been deployed, when the
            initiator party id is not listed under the initiator role, or
            when the federated job creation does not return SUCCESS. The
            check helpers called at the top may presumably raise as well.
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(
        job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    authentication_utils.check_constraint(job_runtime_conf, job_dsl)
    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()
    if common_job_parameters.job_type != 'predict':
        # generate job model info
        common_job_parameters.model_id = model_utils.gen_model_id(
            job_runtime_conf['role'])
        # For non-predict jobs the job id doubles as the model version.
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(),
                                  ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(
            job_id=job_id,
            role=job_initiator['role'],
            party_id=job_initiator['party_id'],
            model_id=common_job_parameters.model_id,
            model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        train_runtime_conf = json_loads(
            pipeline_model['Pipeline'].train_runtime_conf)
        if not model_utils.check_if_deployed(
                role=job_initiator['role'],
                party_id=job_initiator['party_id'],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version):
            raise Exception(
                f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
            )
        # The submitted dsl is replaced by the model's inference dsl.
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
    # Build the initiator's Job record; role/party are the initiator's own.
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_role = job_initiator['role']
    job.f_party_id = job_initiator['party_id']
    path_dict = job_utils.save_job_conf(
        job_id=job_id,
        role=job.f_initiator_role,
        job_dsl=job_dsl,
        job_runtime_conf=job_runtime_conf,
        job_runtime_conf_on_party={},
        train_runtime_conf=train_runtime_conf,
        pipeline_dsl=None)
    # The initiator must be listed among the parties of its own role.
    if job.f_initiator_party_id not in job_runtime_conf['role'][
            job.f_initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(
            job.f_initiator_party_id))
        raise Exception("initiator party id error {}".format(
            job.f_initiator_party_id))
    # create common parameters on initiator
    JobController.backend_compatibility(
        job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(
        role=job.f_initiator_role,
        job_parameters=common_job_parameters,
        create_initiator_baseline=True)
    job.f_runtime_conf = conf_adapter.update_common_parameters(
        common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf,
        train_runtime_conf=job.f_train_runtime_conf)
    # initiator runtime conf as template
    # NOTE: .copy() is shallow; only the top-level "job_parameters" entry is
    # replaced on the copy.
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party[
        "job_parameters"] = common_job_parameters.to_dict()
    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                # The initiator itself is skipped here.
                if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job.f_initiator_role,
                                               job.f_initiator_party_id,
                                               common_job_parameters,
                                               dsl_parser)
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        # Mark the job failed and sync that status before raising.
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)
    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(
            job.f_job_id, common_job_parameters.model_id))
    logs_directory = job_utils.get_job_log_directory(job_id)
    submit_result = {
        "job_id": job_id,
        "model_info": {
            "model_id": common_job_parameters.model_id,
            "model_version": common_job_parameters.model_version
        },
        "logs_directory": logs_directory,
        "board_url": job_utils.get_board_url(job_id, job_initiator['role'],
                                             job_initiator['party_id'])
    }
    submit_result.update(path_dict)
    return submit_result