def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                               runtime_conf=job.f_runtime_conf,
                                                               train_runtime_conf=job.f_train_runtime_conf)
        else:
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                               runtime_conf=job_info.get('job_runtime_conf', {}),
                                                               train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"], party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                           runtime_conf=job.f_runtime_conf_on_party,
                                                           train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"],
                                        party_id=job_info["party_id"],
                                        role=job_info["role"],
                                        only_latest=True)
            for task in tasks:
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                           runtime_conf=job_info.get('job_runtime_conf', {}),
                                                           train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
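# Usage sketch for pipeline_dag_dependency (illustrative only; the job id and
# party values below are hypothetical). With a 'job_id', the DAG is resolved
# from the stored job; without one, the caller passes the dsl/conf dicts
# directly. The variant above additionally attaches a 'component_need_run'
# map keyed by component name.
dependency = pipeline_dag_dependency({
    "job_id": "20220101000000000000",  # hypothetical job id
    "role": "guest",
    "party_id": 9999,
})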
def schedule_running_job(cls, job):
    schedule_logger(job_id=job.f_job_id).info("scheduling job {}".format(job.f_job_id))
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, tasks = TaskScheduler.schedule(job=job,
                                                                dsl_parser=dsl_parser,
                                                                canceled=job.f_cancel_signal)
    tasks_status = [task.f_status for task in tasks]
    new_job_status = cls.calculate_job_status(task_scheduling_status_code=task_scheduling_status_code,
                                              tasks_status=tasks_status)
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED
    total, finished_count = cls.calculate_job_progress(tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job_id=job.f_job_id).info("Job {} status is {}, calculate by task status list: {}".format(
        job.f_job_id, new_job_status, tasks_status))
    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update with anti-weight logic
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {}".format(job.f_job_id))
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                    role=role,
                                                                                    party_id=party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"can not find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
        else:
            # stop the old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create a new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_runtime_conf["initiator"],
                                                   RunParameters(**job.f_runtime_conf["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        if EndStatus.contains(job.f_status):
            job.f_status = JobStatus.WAITING
            job.f_end_time = None
            job.f_elapsed = None
            job.f_progress = 0
            schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
            status, response = FederatedScheduler.sync_job_status(job=job)
            if status == FederatedSchedulingStatusCode.SUCCESS:
                FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
            else:
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
        else:
            # status updates may be delayed, and in a very small probability they will be executed after the rerun command
            schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, new version waiting tasks will be run")
    else:
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf,
                             role=role, party_id=party_id)
    job_parameters = RunParameters(**runtime_conf['job_parameters'])
    job_initiator = runtime_conf['initiator']
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    # save new job into db
    if role == job_initiator['role'] and party_id == job_initiator['party_id']:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    roles = job_info['roles']
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.special_role_parameters(role=role, job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, engines_info=engines_info)
    runtime_conf["job_parameters"] = job_parameters.to_dict()
    JobSaver.create_job(job_info=job_info)
    job_utils.save_job_conf(job_id=job_id,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id, role, party_id, True, job_initiator, job_parameters, dsl_parser)
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_info=job_info, is_initiator=is_initiator, dsl_parser=dsl_parser)
def task_command(cls, job, task, command, command_body=None, need_user=False):
    federated_response = {}
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    component = dsl_parser.get_component_info(component_name=task.f_component_name)
    component_parameters = component.get_role_parameters()
    for dest_role, parameters_on_partys in component_parameters.items():
        federated_response[dest_role] = {}
        for parameters_on_party in parameters_on_partys:
            dest_party_id = parameters_on_party.get('local', {}).get('party_id')
            try:
                if need_user:
                    command_body["user_id"] = job.f_user.get(dest_role, {}).get(str(dest_party_id), "")
                    schedule_logger(job_id=job.f_job_id).info(f'user:{job.f_user}, dest_role:{dest_role}, dest_party_id:{dest_party_id}')
                    schedule_logger(job_id=job.f_job_id).info(f'command_body: {command_body}')
                response = federated_api(job_id=task.f_job_id,
                                         method='POST',
                                         endpoint='/party/{}/{}/{}/{}/{}/{}/{}'.format(
                                             task.f_job_id,
                                             task.f_component_name,
                                             task.f_task_id,
                                             task.f_task_version,
                                             dest_role,
                                             dest_party_id,
                                             command),
                                         src_party_id=job.f_initiator_party_id,
                                         dest_party_id=dest_party_id,
                                         src_role=job.f_initiator_role,
                                         json_body=command_body if command_body else {},
                                         federated_mode=job_parameters["federated_mode"])
                federated_response[dest_role][dest_party_id] = response
            except Exception as e:
                federated_response[dest_role][dest_party_id] = {
                    "retcode": RetCode.FEDERATED_ERROR,
                    "retmsg": "Federated schedule error, {}".format(str(e))
                }
            if federated_response[dest_role][dest_party_id]["retcode"]:
                schedule_logger(job_id=job.f_job_id).warning(
                    "an error occurred while {} the task to role {} party {}: \n{}".format(
                        command,
                        dest_role,
                        dest_party_id,
                        federated_response[dest_role][dest_party_id]["retmsg"]))
    return cls.return_federated_response(federated_response=federated_response)
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=role, party_id=party_id)
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    work_mode = job_parameters['work_mode']
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.work_mode = work_mode
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def schedule_running_job(cls, job: Job, force_sync_status=False):
    schedule_logger(job.f_job_id).info("scheduling running job")
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(job=job,
                                                                                  dsl_parser=dsl_parser,
                                                                                  canceled=job.f_cancel_signal)
    tasks_status = {task.f_component_name: task.f_status for task in tasks}
    new_job_status = cls.calculate_job_status(task_scheduling_status_code=task_scheduling_status_code,
                                              tasks_status=tasks_status.values())
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED
    total, finished_count = cls.calculate_job_progress(tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job.f_job_id).info(f"job status is {new_job_status}, calculate by task status list: {tasks_status}")
    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update with anti-weight logic
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job has auto rerun tasks")
        cls.set_job_rerun(job_id=job.f_job_id,
                          initiator_role=job.f_initiator_role,
                          initiator_party_id=job.f_initiator_party_id,
                          tasks=auto_rerun_tasks,
                          auto=True)
    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job.f_job_id).info("finish scheduling running job")
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"can not find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
        else:
            # stop the old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create a new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_initiator_role, job.f_initiator_party_id,
                                                   RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
    else:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def get_model_alias(self):
    job_configuration = OperationClient().get_job_conf(self.model_version,
                                                       self.tracker.role,
                                                       self.tracker.party_id)
    if not job_configuration:
        raise ValueError('The job was not found.')
    job_configuration = JobConfiguration(**job_configuration)
    dsl_parser = get_job_dsl_parser(job_configuration.dsl,
                                    job_configuration.runtime_conf,
                                    train_runtime_conf=job_configuration.train_runtime_conf)
    component = dsl_parser.get_component_info(self.component_name)
    task_output_dsl = component.get_output()
    self.model_alias = task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default'
def component_output_model():
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
        job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
    try:
        model_id = runtime_conf_on_party['job_parameters']['model_id']
        model_version = runtime_conf_on_party['job_parameters']['model_version']
    except Exception as e:
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_model_configuration(
            job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
        if any([job_dsl, job_runtime_conf, train_runtime_conf]):
            adapter = JobRuntimeConfigAdapter(job_runtime_conf)
            model_id = adapter.get_common_parameters().to_dict().get('model_id')
            model_version = adapter.get_common_parameters().to_dict().get('model_version')
        else:
            stat_logger.exception(e)
            stat_logger.error(f"Can not find model info by filters: job id: {request_data.get('job_id')}, "
                              f"role: {request_data.get('role')}, party id: {request_data.get('party_id')}")
            raise Exception(f"Can not find model info by filters: job id: {request_data.get('job_id')}, "
                            f"role: {request_data.get('role')}, party id: {request_data.get('party_id')}")
    tracker = Tracker(job_id=request_data['job_id'],
                      component_name=request_data['component_name'],
                      role=request_data['role'],
                      party_id=request_data['party_id'],
                      model_id=model_id,
                      model_version=model_version)
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])
    output_model_json = {}
    # There is only one model output at the current dsl version.
    output_model = tracker.get_output_model(component.get_output()['model'][0]
                                            if component.get_output().get('model') else 'default')
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Param'):
            output_model_json = json_format.MessageToDict(buffer_object, including_default_value_fields=True)
    if output_model_json:
        component_define = tracker.get_component_define()
        this_component_model_meta = {}
        for buffer_name, buffer_object in output_model.items():
            if buffer_name.endswith('Meta'):
                this_component_model_meta['meta_data'] = json_format.MessageToDict(buffer_object,
                                                                                   including_default_value_fields=True)
        this_component_model_meta.update(component_define)
        return get_json_result(retcode=0, retmsg='success', data=output_model_json, meta=this_component_model_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def _run(self):
    result = {}
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=self.args.dsl,
                                                   runtime_conf=self.args.runtime_conf,
                                                   train_runtime_conf=self.args.train_runtime_conf,
                                                   pipeline_dsl=self.args.pipeline_dsl)
    provider = ComponentProvider(**self.args.config["provider"])
    common_task_info = self.args.config["common_task_info"]
    log_msg = f"initialize the components: {self.args.config['components']}"
    LOGGER.info(start_log(log_msg, role=self.args.role, party_id=self.args.party_id))
    for component_name in self.args.config["components"]:
        result[component_name] = {}
        task_info = {}
        task_info.update(common_task_info)
        parameters, user_specified_parameters = ProviderManager.get_component_parameters(
            dsl_parser=dsl_parser,
            component_name=component_name,
            role=self.args.role,
            party_id=self.args.party_id,
            provider=provider)
        if parameters:
            task_info = {}
            task_info.update(common_task_info)
            task_info["component_name"] = component_name
            task_info["component_module"] = parameters["module"]
            task_info["provider_info"] = provider.to_dict()
            task_info["component_parameters"] = parameters
            TaskController.create_task(role=self.args.role,
                                       party_id=self.args.party_id,
                                       run_on_this_party=common_task_info["run_on_this_party"],
                                       task_info=task_info)
            result[component_name]["need_run"] = True
        else:
            # The party does not need to run, pass
            result[component_name]["need_run"] = False
    LOGGER.info(successful_log(log_msg, role=self.args.role, party_id=self.args.party_id))
    return result
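# Shape of the result returned by _run above (component names are
# hypothetical): each initialized component maps to whether this party
# actually needs to run it.
#
#   {"reader_0": {"need_run": True}, "dataio_0": {"need_run": False}}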
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf,
                             role=role, party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
    schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
    job_parameters = RunParameters(**job_parameters)
    # save new job into db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id,
                         run_on_this_party=True,
                         initiator_role=job_info["initiator_role"],
                         initiator_party_id=job_info["initiator_party_id"],
                         job_parameters=job_parameters,
                         dsl_parser=dsl_parser)
    job_parameters = job_info['runtime_conf_on_party']['job_parameters']
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_parameters=job_parameters, roles=roles,
                               is_initiator=is_initiator, dsl_parser=dsl_parser)
    JobSaver.create_job(job_info=job_info)
def component_output_model():
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
    model_id = job_runtime_conf['job_parameters']['model_id']
    model_version = job_runtime_conf['job_parameters']['model_version']
    tracker = Tracker(job_id=request_data['job_id'],
                      component_name=request_data['component_name'],
                      role=request_data['role'],
                      party_id=request_data['party_id'],
                      model_id=model_id,
                      model_version=model_version)
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])
    output_model_json = {}
    # There is only one model output at the current dsl version.
    output_model = tracker.get_output_model(component.get_output()['model'][0]
                                            if component.get_output().get('model') else 'default')
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Param'):
            output_model_json = json_format.MessageToDict(buffer_object, including_default_value_fields=True)
    if output_model_json:
        component_define = tracker.get_component_define()
        this_component_model_meta = {}
        for buffer_name, buffer_object in output_model.items():
            if buffer_name.endswith('Meta'):
                this_component_model_meta['meta_data'] = json_format.MessageToDict(buffer_object,
                                                                                   including_default_value_fields=True)
        this_component_model_meta.update(component_define)
        return get_json_result(retcode=0, retmsg='success', data=output_model_json, meta=this_component_model_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def get_job_all_table(job):
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    _, hierarchical_structure = dsl_parser.get_dsl_hierarchical_structure()
    component_table = {}
    component_output_tables = Tracker.query_output_data_infos(job_id=job.f_job_id,
                                                              role=job.f_role,
                                                              party_id=job.f_party_id)
    for component_name_list in hierarchical_structure:
        for component_name in component_name_list:
            component_table[component_name] = {}
            component_input_table = get_component_input_table(dsl_parser, job, component_name)
            component_table[component_name]['input'] = component_input_table
            component_table[component_name]['output'] = {}
            for output_table in component_output_tables:
                if output_table.f_component_name == component_name:
                    component_table[component_name]['output'][output_table.f_data_name] = \
                        {'name': output_table.f_table_name, 'namespace': output_table.f_table_namespace}
    return component_table
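# Sketch of the structure returned by get_job_all_table (reconstructed from
# the code above; the component and table names are hypothetical):
#
#   {
#       "dataio_0": {
#           "input": <result of get_component_input_table>,
#           "output": {
#               "data": {"name": "<f_table_name>", "namespace": "<f_table_namespace>"}
#           }
#       },
#       ...
#   }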
def check_spark_dependence(cls, job):
    if not DEPENDENT_DISTRIBUTION:
        return True
    engine_name = ENGINES.get(EngineType.COMPUTING)
    schedule_logger(job.f_job_id).info(f"job engine name: {engine_name}")
    if engine_name not in [ComputingEngine.SPARK]:
        return True
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    provider_group = ProviderManager.get_job_provider_group(dsl_parser=dsl_parser)
    version_provider_info = {}
    fate_flow_version_provider_info = {}
    schedule_logger(job.f_job_id).info(f'group_info:{provider_group}')
    for group_key, group_info in provider_group.items():
        if group_info["provider"]["name"] == ComponentProviderName.FATE_FLOW.value and \
                group_info["provider"]["version"] not in fate_flow_version_provider_info:
            fate_flow_version_provider_info[group_info["provider"]["version"]] = group_info["provider"]
        if group_info["provider"]["name"] == ComponentProviderName.FATE.value and \
                group_info["provider"]["version"] not in version_provider_info:
            version_provider_info[group_info["provider"]["version"]] = group_info["provider"]
    schedule_logger(job.f_job_id).info(f'version_provider_info:{version_provider_info}')
    schedule_logger(job.f_job_id).info(f'fate_flow_version_provider_info:{fate_flow_version_provider_info}')
    if not version_provider_info:
        version_provider_info = fate_flow_version_provider_info
    check_tag, upload_tag, upload_details = cls.check_upload(job.f_job_id,
                                                             version_provider_info,
                                                             fate_flow_version_provider_info)
    if upload_tag:
        cls.upload_spark_dependence(job, upload_details)
    return check_tag
def check_component(cls, job, check_type="inheritance"):
    schedule_logger(job.f_job_id).info("component check")
    dependence_status_code, response = FederatedScheduler.check_component(job=job, check_type=check_type)
    schedule_logger(job.f_job_id).info(f"component check response: {response}")
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    component_set = set(cpn.name for cpn in dsl_parser.get_source_connect_sub_graph(
        job.f_inheritance_info.get("component_list")))
    for dest_role in response.keys():
        for party_id in response[dest_role].keys():
            component_set = component_set.intersection(set(response[dest_role][party_id].get("data")))
    if component_set != set(job.f_inheritance_info.get("component_list")):
        schedule_logger(job.f_job_id).info(f"dsl parser components:{component_set}")
        component_list = [cpn.name for cpn in dsl_parser.get_source_connect_sub_graph(list(component_set))]
        schedule_logger(job.f_job_id).info(f"parser result:{component_list}")
        command_body = {"inheritance_info": job.f_inheritance_info}
        command_body["inheritance_info"].update({"component_list": component_list})
        schedule_logger(job.f_job_id).info(f"start align job info:{command_body}")
        status_code, response = FederatedScheduler.align_args(job, command_body=command_body)
        schedule_logger(job.f_job_id).info(f"align result:{status_code}, {response}")
    schedule_logger(job.f_job_id).info("check success")
def submit(cls, job_data, job_id=None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_initiator = job_runtime_conf['initiator']
    job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
    cls.backend_compatibility(job_parameters=job_parameters)
    job_utils.check_job_runtime_conf(job_runtime_conf)
    if job_parameters.job_type != 'predict':
        # generate job model info
        job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id,
                          role=job_initiator['role'],
                          party_id=job_initiator['party_id'],
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        if not job_dsl:
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                   runtime_conf=job_runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    cls.adapt_job_parameters(job_parameters=job_parameters)
    # update runtime conf
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception("create job failed: {}".format(response))
    if job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job_runtime_conf["role"].items():
            for party_id in party_ids:
                if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False, job_initiator, job_parameters, dsl_parser)
    # push into queue
    try:
        JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
    except Exception as e:
        raise Exception(f'push job into queue failed:\n{e}')
    schedule_logger(job_id).info('submit job successfully, job id is {}, model id is {}'.format(
        job.f_job_id, job_parameters.model_id))
    board_url = "http://{}:{}{}".format(
        ServiceUtils.get_item("fateboard", "host"),
        ServiceUtils.get_item("fateboard", "port"),
        FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = job_utils.get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url
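# A minimal job_data payload accepted by submit above (sketch; the values are
# hypothetical, but the keys mirror what the function reads):
job_data = {
    "job_dsl": {"components": {}},  # component definitions go here
    "job_runtime_conf": {
        "initiator": {"role": "guest", "party_id": 9999},
        "role": {"guest": [9999], "host": [10000]},
        "job_parameters": {"work_mode": 1, "job_type": "train"},
    },
}
# Called as a classmethod on its scheduler class, submit returns:
# job_id, dsl_path, runtime_conf_path, logs_directory, model_info, board_url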
def run_task(cls):
    task_info = {}
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-v', '--task_version', required=True, type=int, help="task version")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=int, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task parameters")
        parser.add_argument('--run_ip', help="run ip", type=str)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(JOB_SERVER_HOST=args.job_server.split(':')[0],
                                      HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        task_version = args.task_version
        role = args.role
        party_id = args.party_id
        executor_pid = os.getpid()
        task_info.update({
            "job_id": job_id,
            "component_name": component_name,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
            "run_ip": args.run_ip,
            "run_pid": executor_pid
        })
        start_time = current_timestamp()
        job_conf = job_utils.get_job_conf(job_id, role)
        job_dsl = job_conf["job_dsl_path"]
        job_runtime_conf = job_conf["job_runtime_conf_path"]
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=job_conf["train_runtime_conf_path"],
                                                       pipeline_dsl=job_conf["pipeline_dsl_path"])
        party_index = job_runtime_conf["role"][role].index(party_id)
        job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser, job_runtime_conf, role, party_id)
        component = dsl_parser.get_component_info(component_name=component_name)
        component_parameters = component.get_role_parameters()
        component_parameters_on_party = component_parameters[role][party_index] if role in component_parameters else {}
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()
        component_parameters_on_party['output_data_name'] = task_output_dsl.get('data')
        task_parameters = RunParameters(**file_utils.load_json_conf(args.config))
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task_info["party_status"] = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log.LoggerFactory.set_directory(directory=task_log_dir,
                                        parent_log_dir=job_log_dir,
                                        append_to_parent_log=True,
                                        force=True)
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          component_name=component_name,
                          task_id=task_id,
                          task_version=task_version,
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version,
                          component_module_name=module_name,
                          job_parameters=job_parameters)
        tracker_client = TrackerClient(job_id=job_id, role=role, party_id=party_id,
                                       component_name=component_name,
                                       task_id=task_id,
                                       task_version=task_version,
                                       model_id=job_parameters.model_id,
                                       model_version=job_parameters.model_version,
                                       component_module_name=module_name,
                                       job_parameters=job_parameters)
        run_class_paths = component_parameters_on_party.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_info["party_status"] = TaskStatus.RUNNING
        cls.report_task_update_to_driver(task_info=task_info)
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters.work_mode,
                                  COMPUTING_ENGINE=job_parameters.computing_engine,
                                  FEDERATION_ENGINE=job_parameters.federation_engine,
                                  FEDERATED_MODE=job_parameters.federated_mode)
        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
        else:
            session_options = {}
        sess = session.Session(computing_type=job_parameters.computing_engine,
                               federation_type=job_parameters.federation_engine)
        computing_session_id = job_utils.generate_session_id(task_id, task_version, role, party_id)
        sess.init_computing(computing_session_id=computing_session_id, options=session_options)
        federation_session_id = job_utils.generate_task_version_id(task_id, task_version)
        component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
        sess.init_federation(federation_session_id=federation_session_id,
                             runtime_conf=component_parameters_on_party,
                             service_conf=job_parameters.engines_address.get(EngineType.FEDERATION, {}))
        sess.as_default()
        schedule_logger().info('Run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info("Component parameters on party {}".format(component_parameters_on_party))
        schedule_logger().info("Task input dsl {}".format(task_input_dsl))
        task_run_args = cls.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                              task_id=task_id,
                                              task_version=task_version,
                                              job_args=job_args_on_party,
                                              job_parameters=job_parameters,
                                              task_parameters=task_parameters,
                                              input_dsl=task_input_dsl)
        if module_name in {"Upload", "Download", "Reader", "Writer"}:
            task_run_args["job_parameters"] = job_parameters
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker_client)
        run_object.set_task_version_id(task_version_id=job_utils.generate_task_version_id(task_id, task_version))
        # add profile logs
        profile.profile_start()
        run_object.run(component_parameters_on_party, task_run_args)
        profile.profile_ends()
        output_data = run_object.save_data()
        if not isinstance(output_data, list):
            output_data = [output_data]
        for index in range(0, len(output_data)):
            data_name = task_output_dsl.get('data')[index] if task_output_dsl.get('data') else '{}'.format(index)
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=output_data[index],
                output_storage_engine=job_parameters.storage_engine,
                output_storage_address=job_parameters.engines_address.get(EngineType.STORAGE, {}))
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(data_name=data_name,
                                             table_namespace=persistent_table_namespace,
                                             table_name=persistent_table_name)
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task_info["party_status"] = TaskStatus.SUCCESS
    except Exception as e:
        task_info["party_status"] = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        try:
            task_info["end_time"] = current_timestamp()
            task_info["elapsed"] = task_info["end_time"] - start_time
            cls.report_task_update_to_driver(task_info=task_info)
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            schedule_logger().exception(e)
    schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id, timestamp_to_date(start_time)))
    schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
    schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id, int(task_info["elapsed"]) / 1000))
    schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
    print('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
    return task_info
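# run_task is driven from the command line; an invocation sketch built from
# the argparse flags defined above (the script path and all ids/paths are
# hypothetical):
#
#   python task_executor.py \
#       -j 20220101000000000000 -n dataio_0 \
#       -t 20220101000000000000_dataio_0 -v 0 \
#       -r guest -p 9999 \
#       -c /path/to/task_parameters.json \
#       --run_ip 127.0.0.1 --job_server 127.0.0.1:9380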
def _run_(self):
    # todo: All function calls where errors should be thrown
    args = self.args
    start_time = current_timestamp()
    try:
        LOGGER.info(f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task')
        self.report_info.update({
            "job_id": args.job_id,
            "component_name": args.component_name,
            "task_id": args.task_id,
            "task_version": args.task_version,
            "role": args.role,
            "party_id": args.party_id,
            "run_ip": args.run_ip,
            "run_pid": self.run_pid
        })
        operation_client = OperationClient()
        job_configuration = JobConfiguration(**operation_client.get_job_conf(args.job_id,
                                                                             args.role,
                                                                             args.party_id,
                                                                             args.component_name,
                                                                             args.task_id,
                                                                             args.task_version))
        task_parameters_conf = args.config
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_configuration.dsl,
                                                       runtime_conf=job_configuration.runtime_conf,
                                                       train_runtime_conf=job_configuration.train_runtime_conf,
                                                       pipeline_dsl=None)
        job_parameters = dsl_parser.get_job_parameters(job_configuration.runtime_conf)
        user_name = job_parameters.get(args.role, {}).get(args.party_id, {}).get("user", '')
        LOGGER.info(f"user name:{user_name}")
        src_user = task_parameters_conf.get("src_user")
        task_parameters = RunParameters(**task_parameters_conf)
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()
        job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser,
                                                               job_configuration.runtime_conf_on_party,
                                                               args.role,
                                                               args.party_id)
        component = dsl_parser.get_component_info(component_name=args.component_name)
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()
        kwargs = {
            'job_id': args.job_id,
            'role': args.role,
            'party_id': args.party_id,
            'component_name': args.component_name,
            'task_id': args.task_id,
            'task_version': args.task_version,
            'model_id': job_parameters.model_id,
            'model_version': job_parameters.model_version,
            'component_module_name': module_name,
            'job_parameters': job_parameters,
        }
        tracker = Tracker(**kwargs)
        tracker_client = TrackerClient(**kwargs)
        checkpoint_manager = CheckpointManager(**kwargs)
        self.report_info["party_status"] = TaskStatus.RUNNING
        self.report_task_info_to_driver()
        previous_components_parameters = tracker_client.get_model_run_parameters()
        LOGGER.info(f"previous_components_parameters:\n{json_dumps(previous_components_parameters, indent=4)}")
        component_provider, component_parameters_on_party, user_specified_parameters = \
            ProviderManager.get_component_run_info(dsl_parser=dsl_parser,
                                                   component_name=args.component_name,
                                                   role=args.role,
                                                   party_id=args.party_id,
                                                   previous_components_parameters=previous_components_parameters)
        RuntimeConfig.set_component_provider(component_provider)
        LOGGER.info(f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}")
        flow_feeded_parameters = {"output_data_name": task_output_dsl.get("data")}
        # init environment, process is shared globally
        RuntimeConfig.init_config(COMPUTING_ENGINE=job_parameters.computing_engine,
                                  FEDERATION_ENGINE=job_parameters.federation_engine,
                                  FEDERATED_MODE=job_parameters.federated_mode)
        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
            session_options["python.path"] = os.getenv("PYTHONPATH")
            session_options["python.venv"] = os.getenv("VIRTUAL_ENV")
        else:
            session_options = {}
        sess = session.Session(session_id=args.session_id)
        sess.as_global()
        sess.init_computing(computing_session_id=args.session_id, options=session_options)
        component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
        roles = job_configuration.runtime_conf["role"]
        if set(roles) == {"local"}:
            LOGGER.info("only local roles, pass init federation")
        else:
            sess.init_federation(federation_session_id=args.federation_session_id,
                                 runtime_conf=component_parameters_on_party,
                                 service_conf=job_parameters.engines_address.get(EngineType.FEDERATION, {}))
        LOGGER.info(f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task')
        LOGGER.info(f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}")
        LOGGER.info(f"task input dsl {task_input_dsl}")
        task_run_args, input_table_list = self.get_task_run_args(job_id=args.job_id, role=args.role,
                                                                 party_id=args.party_id,
                                                                 task_id=args.task_id,
                                                                 task_version=args.task_version,
                                                                 job_args=job_args_on_party,
                                                                 job_parameters=job_parameters,
                                                                 task_parameters=task_parameters,
                                                                 input_dsl=task_input_dsl)
        if module_name in {"Upload", "Download", "Reader", "Writer", "Checkpoint"}:
            task_run_args["job_parameters"] = job_parameters
        LOGGER.info(f"task input args {task_run_args}")
        need_run = component_parameters_on_party.get("ComponentParam", {}).get("need_run", True)
        provider_interface = provider_utils.get_provider_interface(provider=component_provider)
        run_object = provider_interface.get(module_name,
                                            ComponentRegistry.get_provider_components(
                                                provider_name=component_provider.name,
                                                provider_version=component_provider.version)).get_run_obj(self.args.role)
        flow_feeded_parameters.update({"table_info": input_table_list})
        cpn_input = ComponentInput(
            tracker=tracker_client,
            checkpoint_manager=checkpoint_manager,
            task_version_id=job_utils.generate_task_version_id(args.task_id, args.task_version),
            parameters=component_parameters_on_party["ComponentParam"],
            datasets=task_run_args.get("data", None),
            caches=task_run_args.get("cache", None),
            models=dict(
                model=task_run_args.get("model"),
                isometric_model=task_run_args.get("isometric_model"),
            ),
            job_parameters=job_parameters,
            roles=dict(
                role=component_parameters_on_party["role"],
                local=component_parameters_on_party["local"],
            ),
            flow_feeded_parameters=flow_feeded_parameters,
        )
        profile_log_enabled = False
        try:
            if int(os.getenv("FATE_PROFILE_LOG_ENABLED", "0")) > 0:
                profile_log_enabled = True
        except Exception as e:
            LOGGER.warning(e)
        if profile_log_enabled:
            # add profile logs
            LOGGER.info("profile logging is enabled")
            profile.profile_start()
            cpn_output = run_object.run(cpn_input)
            sess.wait_remote_all_done()
            profile.profile_ends()
        else:
            LOGGER.info("profile logging is disabled")
            cpn_output = run_object.run(cpn_input)
            sess.wait_remote_all_done()
        output_table_list = []
        LOGGER.info(f"task output data {cpn_output.data}")
        for index, data in enumerate(cpn_output.data):
            data_name = task_output_dsl.get('data')[index] if task_output_dsl.get('data') else '{}'.format(index)
            # todo: the token depends on the engine type, maybe in job parameters
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=data,
                output_storage_engine=job_parameters.storage_engine,
                token={"username": user_name})
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(data_name=data_name,
                                             table_namespace=persistent_table_namespace,
                                             table_name=persistent_table_name)
                output_table_list.append({"namespace": persistent_table_namespace, "name": persistent_table_name})
        self.log_output_data_table_tracker(args.job_id, input_table_list, output_table_list)
        # There is only one model output at the current dsl version.
        tracker_client.save_component_output_model(
            model_buffers=cpn_output.model,
            model_alias=task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default',
            user_specified_run_parameters=user_specified_parameters)
        if cpn_output.cache is not None:
            for i, cache in enumerate(cpn_output.cache):
                if cache is None:
                    continue
                name = task_output_dsl.get("cache")[i] if "cache" in task_output_dsl else str(i)
                if isinstance(cache, DataCache):
                    tracker.tracking_output_cache(cache, cache_name=name)
                elif isinstance(cache, tuple):
                    tracker.save_output_cache(cache_data=cache[0],
                                              cache_meta=cache[1],
                                              cache_name=name,
                                              output_storage_engine=job_parameters.storage_engine,
                                              output_storage_address=job_parameters.engines_address.get(
                                                  EngineType.STORAGE, {}),
                                              token={"username": user_name})
                else:
                    raise RuntimeError(f"can not support type {type(cache)} module run object output cache")
        if need_run:
            self.report_info["party_status"] = TaskStatus.SUCCESS
        else:
            self.report_info["party_status"] = TaskStatus.PASS
    except PassError as e:
        self.report_info["party_status"] = TaskStatus.PASS
    except Exception as e:
        traceback.print_exc()
        self.report_info["party_status"] = TaskStatus.FAILED
        LOGGER.exception(e)
    finally:
        try:
            self.report_info["end_time"] = current_timestamp()
            self.report_info["elapsed"] = self.report_info["end_time"] - start_time
            self.report_task_info_to_driver()
        except Exception as e:
            self.report_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            LOGGER.exception(e)
    msg = f"finish {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} with {self.report_info['party_status']}"
    LOGGER.info(msg)
    print(msg)
    return self.report_info
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf,
                             role=role, party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
    schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
    dest_user = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {}).get("user", '')
    user = {}
    src_party_id = int(job_info.get('src_party_id')) if job_info.get('src_party_id') else 0
    src_user = dsl_parser.get_job_parameters().get(job_info.get('src_role'), {}).get(src_party_id, {}).get("user", '')
    for _role, party_id_item in dsl_parser.get_job_parameters().items():
        user[_role] = {}
        for _party_id, _parameters in party_id_item.items():
            user[_role][_party_id] = _parameters.get("user", "")
    schedule_logger(job_id).info('job user:{}'.format(user))
    if USE_DATA_AUTHENTICATION:
        job_args = dsl_parser.get_args_input()
        schedule_logger(job_id).info('job args:{}'.format(job_args))
        dataset_dict = cls.get_dataset(False, role, party_id, runtime_conf.get("role"), job_args)
        dataset_list = []
        if dataset_dict.get(role, {}).get(party_id):
            for k, v in dataset_dict[role][party_id].items():
                dataset_list.append({"namespace": v.split('.')[0], "table_name": v.split('.')[1]})
        data_authentication_check(src_role=job_info.get('src_role'),
                                  src_party_id=job_info.get('src_party_id'),
                                  src_user=src_user,
                                  dest_user=dest_user,
                                  dataset_list=dataset_list)
    job_parameters = RunParameters(**job_parameters)
    # save new job into db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    job_info["user_id"] = dest_user
    job_info["src_user"] = src_user
    job_info["user"] = user
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id,
                         run_on_this_party=True,
                         initiator_role=job_info["initiator_role"],
                         initiator_party_id=job_info["initiator_party_id"],
                         job_parameters=job_parameters,
                         dsl_parser=dsl_parser)
    job_parameters = job_info['runtime_conf_on_party']['job_parameters']
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_parameters=job_parameters, roles=roles,
                               is_initiator=is_initiator, dsl_parser=dsl_parser)
    JobSaver.create_job(job_info=job_info)
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf,
                             role=role, party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters(runtime_conf)
    schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
    dest_user = job_parameters.get(role, {}).get(party_id, {}).get('user', '')
    user = {}
    src_party_id = int(job_info['src_party_id']) if job_info.get('src_party_id') else 0
    src_role = job_info.get('src_role', '')
    src_user = job_parameters.get(src_role, {}).get(src_party_id, {}).get('user', '') if src_role else ''
    for _role, party_id_item in job_parameters.items():
        user[_role] = {}
        for _party_id, _parameters in party_id_item.items():
            user[_role][_party_id] = _parameters.get("user", "")
    schedule_logger(job_id).info('job user:{}'.format(user))
    if USE_DATA_AUTHENTICATION:
        job_args = dsl_parser.get_args_input()
        schedule_logger(job_id).info('job args:{}'.format(job_args))
        dataset_dict = cls.get_dataset(False, role, party_id, runtime_conf.get("role"), job_args)
        dataset_list = []
        if dataset_dict.get(role, {}).get(party_id):
            for k, v in dataset_dict[role][party_id].items():
                dataset_list.append({"namespace": v.split('.')[0], "table_name": v.split('.')[1]})
        data_authentication_check(src_role=job_info.get('src_role'),
                                  src_party_id=job_info.get('src_party_id'),
                                  src_user=src_user,
                                  dest_user=dest_user,
                                  dataset_list=dataset_list)
    job_parameters = RunParameters(**job_parameters.get(role, {}).get(party_id, {}))
    # save new job into db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.READY
    job_info["user_id"] = dest_user
    job_info["src_user"] = src_user
    job_info["user"] = user
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.create_job_parameters_on_party(role=role, party_id=party_id, job_parameters=job_parameters)
    # update job parameters on party
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    JobSaver.create_job(job_info=job_info)
    schedule_logger(job_id).info("start initialize tasks")
    initialized_result, provider_group = cls.initialize_tasks(job_id=job_id,
                                                              role=role,
                                                              party_id=party_id,
                                                              run_on_this_party=True,
                                                              initiator_role=job_info["initiator_role"],
                                                              initiator_party_id=job_info["initiator_party_id"],
                                                              job_parameters=job_parameters,
                                                              dsl_parser=dsl_parser)
    schedule_logger(job_id).info("initialize tasks success")
    for provider_key, group_info in provider_group.items():
        for cpn in group_info["components"]:
            dsl["components"][cpn]["provider"] = provider_key
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_parameters=job_parameters, roles=roles,
                               is_initiator=is_initiator, dsl_parser=dsl_parser)
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            party_id=party_id,
                            dsl=dsl,
                            runtime_conf=runtime_conf,
                            runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    return {"components": initialized_result}
def submit(cls, job_data, job_id=None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    authentication_utils.check_constraint(job_runtime_conf, job_dsl)
    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()
    if common_job_parameters.job_type != 'predict':
        # generate job model info
        common_job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id,
                          role=job_initiator['role'],
                          party_id=job_initiator['party_id'],
                          model_id=common_job_parameters.model_id,
                          model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        if not model_utils.check_if_deployed(role=job_initiator['role'],
                                             party_id=job_initiator['party_id'],
                                             model_id=common_job_parameters.model_id,
                                             model_version=common_job_parameters.model_version):
            raise Exception(f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet.")
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_role = job_initiator['role']
    job.f_party_id = job_initiator['party_id']
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        role=job.f_initiator_role,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        job_runtime_conf_on_party={},
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)
    if job.f_initiator_party_id not in job_runtime_conf['role'][job.f_initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(job.f_initiator_party_id))
        raise Exception("initiator party id error {}".format(job.f_initiator_party_id))
    # create common parameters on initiator
    JobController.backend_compatibility(job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(role=job.f_initiator_role,
                                       job_parameters=common_job_parameters,
                                       create_initiator_baseline=True)
    job.f_runtime_conf = conf_adapter.update_common_parameters(common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    # initiator runtime conf as template
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()
    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job.f_initiator_role,
                                               job.f_initiator_party_id,
                                               common_job_parameters,
                                               dsl_parser)
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)
    schedule_logger(job_id).info('submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, common_job_parameters.model_id))
    logs_directory = job_utils.get_job_log_directory(job_id)
    submit_result = {
        "job_id": job_id,
        "model_info": {
            "model_id": common_job_parameters.model_id,
            "model_version": common_job_parameters.model_version
        },
        "logs_directory": logs_directory,
        "board_url": job_utils.get_board_url(job_id, job_initiator['role'], job_initiator['party_id'])
    }
    submit_result.update(path_dict)
    return submit_result
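# Hedged usage sketch for the submit above: the job_data layout follows the keys
# the function reads (job_dsl, job_runtime_conf with initiator/role/job_parameters).
# The DAGScheduler class name is assumed from FATE Flow convention; party ids and
# parameter values are placeholders.
job_data = {
    "job_dsl": {"components": {}},
    "job_runtime_conf": {
        "initiator": {"role": "guest", "party_id": 9999},
        "role": {"guest": [9999], "host": [10000]},
        "job_parameters": {"work_mode": 1, "job_type": "train"},
    },
}
submit_result = DAGScheduler.submit(job_data)       # class name assumed, not shown in this excerpt
print(submit_result["job_id"], submit_result["board_url"])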
def set_job_rerun(cls, job_id, initiator_role, initiator_party_id, auto, force=False,
                  tasks: typing.List[Task] = None,
                  component_name: typing.Union[str, list] = None):
    schedule_logger(job_id).info(f"try to rerun job on initiator {initiator_role} {initiator_party_id}")

    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if not jobs:
        raise RuntimeError(f"cannot find job on initiator {initiator_role} {initiator_party_id}")
    job = jobs[0]

    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    component_name, force = cls.get_rerun_component(component_name, job, dsl_parser, force)
    schedule_logger(job_id).info(f"rerun component: {component_name}")

    if tasks:
        schedule_logger(job_id).info(f"require {[task.f_component_name for task in tasks]} to rerun")
    else:
        task_query = {
            'job_id': job_id,
            'role': initiator_role,
            'party_id': initiator_party_id,
        }
        if not component_name or component_name == job_utils.job_pipeline_component_name():
            # rerun all tasks
            schedule_logger(job_id).info("require all components of pipeline to rerun")
        else:
            _require_reruns = {component_name} if isinstance(component_name, str) else set(component_name)
            _should_reruns = _require_reruns.copy()
            for _cpn in _require_reruns:
                _components = dsl_parser.get_downstream_dependent_components(_cpn)
                for _c in _components:
                    _should_reruns.add(_c.get_name())

            schedule_logger(job_id).info(f"require {_require_reruns} to rerun, "
                                         f"found {_should_reruns} need to be rerun")
            task_query['component_name'] = _should_reruns
        tasks = JobSaver.query_task(**task_query)

    job_can_rerun = any([TaskScheduler.prepare_rerun_task(job=job,
                                                          task=task,
                                                          dsl_parser=dsl_parser,
                                                          auto=auto,
                                                          force=force) for task in tasks])
    if not job_can_rerun:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id).info("job has no task to rerun")
        return False

    schedule_logger(job_id).info("job set rerun signal")
    status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    schedule_logger(job_id).info(f"job set rerun signal {'successfully' if status else 'failed'}")
    return True
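# Hedged sketch: triggering a rerun from a named component. Downstream components
# are pulled in automatically via get_downstream_dependent_components, as shown
# above. The scheduler class name, job id, and component name are all hypothetical.
DAGScheduler.set_job_rerun(job_id="202201010000000000000",   # placeholder job id
                           initiator_role="guest",
                           initiator_party_id=9999,
                           auto=False,                       # user-triggered rather than automatic rerun
                           component_name="hetero_lr_0")     # placeholder component name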
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info(f"start to save pipeline model on {role} {party_id}")
    job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                        role=role,
                                                        party_id=party_id)
    runtime_conf_on_party = job_configuration.runtime_conf_on_party
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    if job_type == 'predict':
        return
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_configuration.dsl,
                                                   runtime_conf=job_configuration.runtime_conf,
                                                   train_runtime_conf=job_configuration.train_runtime_conf)
    components_parameters = {}
    tasks = JobSaver.query_task(job_id=job_id, role=role, party_id=party_id, only_latest=True)
    for task in tasks:
        components_parameters[task.f_component_name] = task.f_component_parameters
    predict_dsl = schedule_utils.fill_inference_dsl(dsl_parser,
                                                    origin_inference_dsl=job_configuration.dsl,
                                                    components_parameters=components_parameters)

    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_configuration.dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_configuration.runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)

    tracker = Tracker(job_id=job_id,
                      role=role,
                      party_id=party_id,
                      model_id=model_id,
                      model_version=model_version,
                      job_parameters=RunParameters(**job_parameters))
    tracker.save_pipeline_model(pipeline_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info(f"save pipeline on {role} {party_id} successfully")
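# Hedged sketch: reading the saved pipeline model back. get_pipeline_model is the
# same accessor the predict branch of the submit below relies on; the ids here are
# placeholders in the model_id/model_version format generated at submit time.
tracker = Tracker(job_id="202201010000000000000",
                  role="guest",
                  party_id=9999,
                  model_id="guest-9999#host-10000#model",    # placeholder model id
                  model_version="202201010000000000000")     # placeholder model version
pipeline = tracker.get_pipeline_model()
inference_dsl = json_loads(pipeline.inference_dsl)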
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type != "predict":
            # generate job model info
            conf_version = schedule_utils.get_conf_version(runtime_conf)
            if conf_version != 2:
                raise Exception("only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(runtime_conf["role"])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(), ["model_id", "model_version"])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(job_id=job_id,
                              role=job_initiator["role"],
                              party_id=job_initiator["party_id"],
                              model_id=common_job_parameters.model_id,
                              model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_pipeline_model()
            train_runtime_conf = json_loads(pipeline_model.train_runtime_conf)
            if not model_utils.check_if_deployed(role=job_initiator["role"],
                                                 party_id=job_initiator["party_id"],
                                                 model_id=common_job_parameters.model_id,
                                                 model_version=common_job_parameters.model_version):
                raise Exception(f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet.")
            dsl = json_loads(pipeline_model.inference_dsl)
            # dsl = ProviderManager.fill_fate_flow_provider(dsl)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        job.f_initiator_role = job_initiator["role"]
        job.f_initiator_party_id = job_initiator["party_id"]
        job.f_role = job_initiator["role"]
        job.f_party_id = job_initiator["party_id"]

        path_dict = job_utils.save_job_conf(job_id=job_id,
                                            role=job.f_initiator_role,
                                            party_id=job.f_initiator_party_id,
                                            dsl=dsl,
                                            runtime_conf=runtime_conf,
                                            runtime_conf_on_party={},
                                            train_runtime_conf=train_runtime_conf,
                                            pipeline_dsl=None)

        if job.f_initiator_party_id not in runtime_conf["role"][job.f_initiator_role]:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)

        # create common parameters on initiator
        JobController.create_common_job_parameters(job_id=job.f_job_id,
                                                   initiator_role=job.f_initiator_role,
                                                   common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                       runtime_conf=job.f_runtime_conf,
                                                       train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()

        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        job.f_inheritance_status = JobInheritanceStatus.WAITING if common_job_parameters.inheritance_info else JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            inheritance_jobs = JobSaver.query_job(job_id=job.f_inheritance_info.get("job_id"),
                                                  role=job_initiator["role"],
                                                  party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(job_id=job.f_inheritance_info.get("job_id"),
                                                    role=job_initiator["role"],
                                                    party_id=job_initiator["party_id"],
                                                    only_latest=True)
            job_utils.check_job_inheritance_parameters(job, inheritance_jobs, inheritance_tasks)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)
        else:
            need_run_components = {}
            for role in response:
                need_run_components[role] = {}
                for party, res in response[role].items():
                    need_run_components[role][party] = [name for name, value in response[role][party]["data"]["components"].items()
                                                        if value["need_run"] is True]
            if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
                # create the task holder in db to record information of all participants in the initiator for scheduling
                for role, party_ids in job.f_roles.items():
                    for party_id in party_ids:
                        if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                            continue
                        if not need_run_components[role][party_id]:
                            continue
                        JobController.initialize_tasks(job_id=job_id,
                                                       role=role,
                                                       party_id=party_id,
                                                       run_on_this_party=False,
                                                       initiator_role=job.f_initiator_role,
                                                       initiator_party_id=job.f_initiator_party_id,
                                                       job_parameters=common_job_parameters,
                                                       dsl_parser=dsl_parser,
                                                       components=need_run_components[role][party_id])
            job.f_status = JobStatus.WAITING
            status_code, response = FederatedScheduler.sync_job_status(job=job)
            if status_code != FederatedSchedulingStatusCode.SUCCESS:
                raise Exception("set job to waiting status failed")

        schedule_logger(job_id).info(f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}")
        logs_directory = job_utils.get_job_log_directory(job_id)
        result = {
            "code": RetCode.SUCCESS,
            "message": "success",
            "model_info": {
                "model_id": common_job_parameters.model_id,
                "model_version": common_job_parameters.model_version
            },
            "logs_directory": logs_directory,
            "board_url": job_utils.get_board_url(job_id, job_initiator["role"], job_initiator["party_id"])
        }
        warn_parameter = JobRuntimeConfigAdapter(submit_job_conf.runtime_conf).check_removed_parameter()
        if warn_parameter:
            result["message"] = f"[WARN] {warn_parameter} is removed, it does not take effect!"
        submit_result.update(result)
        submit_result.update(path_dict)
    except Exception as e:
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result
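# Hedged sketch of a v2 runtime conf accepted by the submit above: get_conf_version
# must report version 2, which in practice means the conf carries "dsl_version": 2.
# The JobConfigurationBase constructor call and all values are assumptions for
# illustration, not taken from this excerpt.
dsl = {"components": {}}                            # placeholder job DSL
runtime_conf_v2 = {
    "dsl_version": 2,
    "initiator": {"role": "guest", "party_id": 9999},
    "role": {"guest": [9999], "host": [10000]},
    "job_parameters": {"common": {"job_type": "train", "task_parallelism": 1}},
}
submit_result = DAGScheduler.submit(JobConfigurationBase(dsl=dsl, runtime_conf=runtime_conf_v2))
if submit_result.get("code") != RetCode.SUCCESS:
    raise RuntimeError(submit_result.get("message"))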