def schedule_rerun_job(cls, job):
    """Handle a job that has been flagged for rerun.

    If the job is not in an end status, ``rerun_signal`` is called with
    ``set_or_reset=False`` and the job is handed to ``schedule_running_job``.
    If it is in an end status, its lifecycle fields are reset, the status is
    set to WAITING and synchronized through ``FederatedScheduler``; only when
    that synchronization succeeds are the rerun signal call and the sync of
    the reset fields performed.

    :param job: job record; its ``f_*`` fields are modified in place.
    :return: None
    """
    job_id = job.f_job_id

    if not EndStatus.contains(job.f_status):
        # Job has not ended: drop the signal and keep scheduling it as running.
        cls.rerun_signal(job_id=job_id, set_or_reset=False)
        cls.schedule_running_job(job)
        return

    # Job already ended: put every lifecycle field back to its pre-run value.
    job.f_status = JobStatus.WAITING
    job.f_ready_signal = False
    job.f_ready_time = None
    job.f_rerun_signal = False
    job.f_progress = 0
    job.f_end_time = None
    job.f_elapsed = None
    schedule_logger(job_id=job_id).info(
        f"job {job_id} has been finished, set waiting to rerun")

    status, _ = FederatedScheduler.sync_job_status(job=job)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job_id).info(
            f"job {job_id} set waiting to rerun failed")
        return

    cls.rerun_signal(job_id=job_id, set_or_reset=False)
    reset_fields = [
        "ready_signal", "ready_time", "rerun_signal", "progress", "end_time",
        "elapsed"
    ]
    FederatedScheduler.sync_job(job=job, update_fields=reset_fields)
    schedule_logger(job_id=job_id).info(
        f"job {job_id} set waiting to rerun successfully")
def start_task(cls, job, task):
    """Try to start ``task`` of ``job``.

    Steps, in order: apply for the task resource, mark the task RUNNING and
    persist that status, synchronize the task status, then ask
    ``FederatedScheduler`` to start the task.

    :return: ``SchedulingStatusCode.NO_RESOURCE`` when the resource request
        is refused, ``PASS`` when the status update is refused (the task is
        rolled back to WAITING and the resource returned), otherwise
        ``SUCCESS`` or ``FAILED`` depending on the federated start result.
    """
    schedule_logger(task.f_job_id).info(
        "try to start task {} {} on {} {}".format(
            task.f_task_id, task.f_task_version, task.f_role,
            task.f_party_id))

    resource_granted = ResourceManager.apply_for_task_resource(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not resource_granted:
        return SchedulingStatusCode.NO_RESOURCE

    task.f_status = TaskStatus.RUNNING
    status_saved = JobSaver.update_task_status(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not status_saved:
        # Another scheduler is already scheduling this task.
        schedule_logger(task.f_job_id).info(
            "task {} {} start on another scheduler".format(
                task.f_task_id, task.f_task_version))
        # Roll back: restore the status and hand the resource back.
        task.f_status = TaskStatus.WAITING
        ResourceManager.return_task_resource(
            task_info=task.to_human_model_dict(only_primary_with=["status"]))
        return SchedulingStatusCode.PASS

    schedule_logger(task.f_job_id).info(
        "start task {} {} on {} {}".format(
            task.f_task_id, task.f_task_version, task.f_role,
            task.f_party_id))
    FederatedScheduler.sync_task_status(job=job, task=task)
    status_code, _ = FederatedScheduler.start_task(job=job, task=task)
    started = status_code == FederatedSchedulingStatusCode.SUCCESS
    return SchedulingStatusCode.SUCCESS if started else SchedulingStatusCode.FAILED
def start_job(cls, job_id, initiator_role, initiator_party_id):
    """Start a job from its initiator party.

    Looks the job up by id, initiator role and initiator party id. When it is
    found, ``FederatedScheduler.start_job`` is invoked for the first match;
    otherwise an error is logged and nothing else happens.

    The previous implementation also assembled a ``job_info`` dict that was
    never read afterwards; that dead code has been removed.

    :param job_id: id of the job to start.
    :param initiator_role: role of the initiator party.
    :param initiator_party_id: party id of the initiator.
    :return: None
    """
    schedule_logger(job_id=job_id).info(
        "try to start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))

    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if not jobs:
        schedule_logger(job_id=job_id).error(
            "can not found job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
        return

    FederatedScheduler.start_job(job=jobs[0])
    schedule_logger(job_id=job_id).info(
        "start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))
def report_task_to_initiator(cls, task_info):
    """Report a task to the initiator when its collect type is PUSH.

    The task is looked up from ``task_info`` (keys ``task_id``,
    ``task_version``, ``role``, ``party_id``). Only the first match is
    considered; indexing it fails if the query yields nothing.

    :param task_info: mapping identifying the task.
    :return: None
    """
    matched_tasks = JobSaver.query_task(
        task_id=task_info["task_id"],
        task_version=task_info["task_version"],
        role=task_info["role"],
        party_id=task_info["party_id"])
    task = matched_tasks[0]
    collect_type = task.f_federated_status_collect_type
    if collect_type == FederatedCommunicationType.PUSH:
        FederatedScheduler.report_task_to_initiator(task=task)
def finish(cls, job, end_status):
    """Wrap up a job that reached ``end_status``.

    Stops the job on the initiator with ``end_status`` as the stop status and
    then asks ``FederatedScheduler`` to clean the job.

    :param job: the finished job record.
    :param end_status: status the job ended with.
    :return: None
    """
    job_id = job.f_job_id
    schedule_logger(job_id).info(
        f"job finished with {end_status}, do something...")
    stop_arguments = {
        "job_id": job_id,
        "role": job.f_initiator_role,
        "party_id": job.f_initiator_party_id,
        "stop_status": end_status,
    }
    cls.stop_job(**stop_arguments)
    FederatedScheduler.clean_job(job=job)
    schedule_logger(job_id).info(f"job finished with {end_status}, done")
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    """Collect the state of ``initiator_task`` from the parties and save it.

    Nothing is collected when the locally stored records of this task
    version share at most one distinct status and none of them is RUNNING.
    Otherwise ``FederatedScheduler.collect_task`` is called and every party
    response is processed:

    * retcode SUCCESS: the returned data is written via
      ``update_task_status`` and ``update_task``;
    * retcode FEDERATED_ERROR with a truthy ``set_status``: the party status
      is written as RUNNING and then as ``set_status``.

    :param job: job owning the task.
    :param initiator_task: the initiator-side task record.
    :param set_status: optional party status to force for unreachable parties.
    :return: None
    """
    party_tasks = JobSaver.query_task(
        task_id=initiator_task.f_task_id,
        task_version=initiator_task.f_task_version)
    known_statuses = {party_task.f_status for party_task in party_tasks}
    if len(known_statuses) <= 1 and TaskStatus.RUNNING not in known_statuses:
        return

    status, federated_response = FederatedScheduler.collect_task(
        job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        # A failed collect is only reported; the responses are still processed.
        schedule_logger(job_id=job.f_job_id).warning(
            f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
        )

    for role, responses_by_party in federated_response.items():
        for party_id, party_response in responses_by_party.items():
            retcode = party_response["retcode"]
            if retcode == RetCode.SUCCESS:
                reported = party_response["data"]
                JobSaver.update_task_status(task_info=reported)
                JobSaver.update_task(task_info=reported)
            elif retcode == RetCode.FEDERATED_ERROR and set_status:
                forced_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": role,
                    "party_id": party_id,
                    "party_status": TaskStatus.RUNNING
                }
                # Two writes on the same dict: RUNNING first, then the
                # requested status (presumably to satisfy the status
                # transition rules of the saver; confirm against JobSaver).
                JobSaver.update_task_status(task_info=forced_info)
                forced_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=forced_info)
def stop_job():
    """HTTP handler: stop a job on this party, then request a federated stop.

    Reads ``job_id`` and the optional ``stop_status`` (default
    ``"canceled"``) from the JSON request body.

    :return: a JSON result with retcode SUCCESS when the federated stop
        request succeeds, OPERATING_ERROR when it fails, or DATA_ERROR when
        the job cannot be found.
    """
    payload = request.json
    job_id = payload.get('job_id')
    stop_status = payload.get("stop_status", "canceled")

    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    job = jobs[0]
    # Local stop first, federated stop request afterwards.
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, _ = JobController.stop_jobs(job_id=job_id,
                                             stop_status=stop_status)
    schedule_logger(job_id).info(
        f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(f"request stop job {job} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=job, stop_status=stop_status, command_body=job.to_json())

    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        failure_message = "stop job on this party {};\nstop job failed:\n{}".format(
            kill_status, json_dumps(response, indent=4))
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg=failure_message)

    return get_json_result(
        retcode=RetCode.SUCCESS,
        retmsg=f"stop job on this party {kill_status};\nstop job on all party success")
def request_stop_jobs(cls, jobs: [Job], stop_msg, stop_status):
    """Request a federated stop for every job in ``jobs``.

    Each job is handled independently: an exception raised while stopping
    one job is logged and does not prevent the remaining jobs from being
    processed.

    The status code returned by ``FederatedScheduler.request_stop_job`` is
    written to the log. Previously the return value was discarded and
    "successfully" was logged even when the request had failed.

    :param jobs: jobs to stop; nothing happens when it is empty.
    :param stop_msg: human-readable reason, used in log messages only.
    :param stop_status: status passed on to the stop request.
    :return: None
    """
    if not len(jobs):
        return
    detect_logger().info(
        f"have {len(jobs)} should be stopped, because of {stop_msg}")
    for job in jobs:
        try:
            detect_logger(job_id=job.f_job_id).info(
                f"detector request start to stop job {job.f_job_id}, because of {stop_msg}"
            )
            status_code, _ = FederatedScheduler.request_stop_job(
                job=job, stop_status=stop_status)
            # Report the real outcome instead of assuming success.
            detect_logger(job_id=job.f_job_id).info(
                f"detector request stop job {job.f_job_id} status is {status_code}"
            )
        except Exception as e:
            # Best effort: keep going with the other jobs.
            detect_logger(job_id=job.f_job_id).exception(e)
def clean_queue():
    """HTTP handler: request cancellation of every WAITING initiator job.

    :return: a JSON result (retcode 0) whose data maps each job id to the
        status code returned by its stop request.
    """
    waiting_jobs = JobSaver.query_job(is_initiator=True,
                                      status=JobStatus.WAITING)
    stop_results = {}
    for waiting_job in waiting_jobs:
        status_code, _ = FederatedScheduler.request_stop_job(
            job=waiting_job, stop_status=JobStatus.CANCELED)
        stop_results[waiting_job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=stop_results)
def component_output_data_table():
    """HTTP handler: forward an ``output/table`` tracker command for a job.

    The JSON body is checked for ``job_id``, ``role``, ``party_id`` and
    ``component_name`` before the job is looked up.

    :return: the jsonified tracker command result, or a JSON result with
        retcode 100 when the job does not exist.
    """
    payload = request.json
    required = ['job_id', 'role', 'party_id', 'component_name']
    detect_utils.check_config(config=payload, required_arguments=required)

    matched_jobs = JobSaver.query_job(job_id=payload.get('job_id'))
    if not matched_jobs:
        return get_json_result(retcode=100, retmsg='No found job')

    command_result = FederatedScheduler.tracker_command(
        matched_jobs[0], payload, 'output/table')
    return jsonify(command_result)
def component_output_data_table():
    """HTTP handler: forward an ``output/table`` tracker command for a job.

    The job is looked up by the ``job_id`` found in the JSON body; the body
    itself is passed on unchanged as the command payload.

    :return: the jsonified tracker command result, or a JSON result with
        retcode 100 when the job does not exist.
    """
    payload = request.json
    matched_jobs = JobSaver.query_job(job_id=payload.get('job_id'))
    if not matched_jobs:
        return get_json_result(retcode=100, retmsg='No found job')

    first_job = matched_jobs[0]
    command_result = FederatedScheduler.tracker_command(
        first_job, payload, 'output/table')
    return jsonify(command_result)
def schedule_running_job(cls, job: Job, force_sync_status=False):
    """Run one scheduling round for a running job.

    Schedules the job's tasks, derives the new job status and progress from
    the task statuses, propagates whatever changed, finishes the job when it
    reached an end status and registers automatic reruns reported by the
    task scheduler.

    :param job: job record; ``f_progress`` and ``f_status`` may be updated.
    :param force_sync_status: when truthy, the job status is synchronized at
        the end of the round even if it did not change.
    :return: None
    """
    schedule_logger(job.f_job_id).info(f"scheduling running job")

    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(
        job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)

    # component name -> task status
    tasks_status = {task.f_component_name: task.f_status for task in tasks}
    new_job_status = cls.calculate_job_status(
        task_scheduling_status_code=task_scheduling_status_code,
        tasks_status=tasks_status.values())
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED

    total, finished_count = cls.calculate_job_progress(
        tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job.f_job_id).info(
        f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
    )

    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update
        # with anti-weight logic.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job,
                                        update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                # The pipelined model is saved before the end status is
                # synchronized.
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job,
                                        update_fields=["status"])

    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)

    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job have auto rerun tasks")
        cls.set_job_rerun(job_id=job.f_job_id,
                          initiator_role=job.f_initiator_role,
                          initiator_party_id=job.f_initiator_party_id,
                          tasks=auto_rerun_tasks,
                          auto=True)

    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job.f_job_id).info("finish scheduling running job")
def create_new_version_task(cls, job, task, dsl_parser, auto):
    """Replace ``task`` with a new version of itself.

    The current version is stopped with status CANCELED and its metrics are
    cleaned. The task version is then incremented (and, for automatic
    reruns, the remaining auto retries decremented), the run pid/ip are
    cleared, and the new version is created through ``FederatedScheduler``.
    Finally a task holder is initialized for every non-initiator party that
    appears in the creation response.

    :param job: job owning the task.
    :param task: task record, modified in place.
    :param dsl_parser: parser passed on to ``initialize_tasks``.
    :param auto: truthy when the rerun was triggered automatically.
    :raises Exception: when the federated task creation does not succeed.
    :return: None
    """
    # Retire the old version.
    FederatedScheduler.stop_task(job=job,
                                 task=task,
                                 stop_status=TaskStatus.CANCELED)
    FederatedScheduler.clean_task(job=job,
                                  task=task,
                                  content_type=TaskCleanResourceType.METRICS)

    # Prepare the new version.
    task.f_task_version += 1
    if auto:
        task.f_auto_retries -= 1
    task.f_run_pid = None
    task.f_run_ip = None

    # todo: FederatedScheduler.create_task and JobController.initialize_tasks
    # will create task twice
    status_code, response = FederatedScheduler.create_task(job=job, task=task)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception(f"create {task.f_task_id} new version failed")

    # Create the task holder in db to record information of all participants
    # in the initiator for scheduling.
    for role in response:
        for party_id in response[role]:
            on_initiator = (role == job.f_initiator_role
                            and party_id == job.f_initiator_party_id)
            if on_initiator:
                continue
            JobController.initialize_tasks(
                job_id=job.f_job_id,
                role=role,
                party_id=party_id,
                run_on_this_party=False,
                initiator_role=job.f_initiator_role,
                initiator_party_id=job.f_initiator_party_id,
                job_parameters=RunParameters(
                    **job.f_runtime_conf_on_party["job_parameters"]),
                dsl_parser=dsl_parser,
                components=[task.f_component_name],
                task_version=task.f_task_version,
                auto_retries=task.f_auto_retries)

    schedule_logger(job.f_job_id).info(
        f"create task {task.f_task_id} new version {task.f_task_version} successfully"
    )
def check_component(cls, job, check_type="inheritance"):
    """Check which inherited components every party can provide.

    The component set starts from the source-connected sub graph of the
    job's inherited component list and is intersected with the ``data``
    reported by each party. When the result differs from the inherited
    component list, the list stored in ``job.f_inheritance_info`` is replaced
    (in place) by the sub graph of the remaining components and the new
    inheritance info is sent through ``FederatedScheduler.align_args``.

    :param job: job record; ``f_inheritance_info`` may be modified in place.
    :param check_type: forwarded to ``FederatedScheduler.check_component``.
    :return: None
    """
    schedule_logger(job.f_job_id).info(f"component check")
    _, response = FederatedScheduler.check_component(job=job,
                                                     check_type=check_type)
    schedule_logger(
        job.f_job_id).info(f"component check response: {response}")

    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf,
        train_runtime_conf=job.f_train_runtime_conf)
    inherited_nodes = dsl_parser.get_source_connect_sub_graph(
        job.f_inheritance_info.get("component_list"))
    component_set = {cpn.name for cpn in inherited_nodes}

    # Keep only what every party reports.
    for party_responses in response.values():
        for party_response in party_responses.values():
            component_set = component_set & set(party_response.get("data"))

    if component_set != set(job.f_inheritance_info.get("component_list")):
        schedule_logger(
            job.f_job_id).info(f"dsl parser components:{component_set}")
        remaining_nodes = dsl_parser.get_source_connect_sub_graph(
            list(component_set))
        component_list = [cpn.name for cpn in remaining_nodes]
        schedule_logger(
            job.f_job_id).info(f"parser result:{component_list}")

        # The job's own inheritance info is updated, not a copy of it.
        inheritance_info = job.f_inheritance_info
        inheritance_info.update({"component_list": component_list})
        command_body = {"inheritance_info": inheritance_info}
        schedule_logger(
            job.f_job_id).info(f"start align job info:{command_body}")
        status_code, response = FederatedScheduler.align_args(
            job, command_body=command_body)
        schedule_logger(
            job.f_job_id).info(f"align result:{status_code}, {response}")

    schedule_logger(job.f_job_id).info(f"check success")
def rerun_job():
    """HTTP handler: request a rerun of the job named in the JSON body.

    The whole JSON body is forwarded as the command body of the rerun
    request.

    :return: a JSON result with retcode SUCCESS, OPERATING_ERROR (the rerun
        request failed) or DATA_ERROR (the job cannot be found).
    """
    job_id = request.json.get("job_id")
    matched_jobs = JobSaver.query_job(job_id=job_id)
    if not matched_jobs:
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    status_code, response = FederatedScheduler.request_rerun_job(
        job=matched_jobs[0], command_body=request.json)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        failure_message = "rerun job failed:\n{}".format(json_dumps(response))
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg=failure_message)

    return get_json_result(retcode=RetCode.SUCCESS,
                           retmsg="rerun job success")
def schedule_running_job(cls, job, force_sync_status=False):
    """Run one scheduling round for a running job.

    Schedules the job's tasks, derives the new job status and progress from
    the task statuses, propagates whatever changed and finishes the job when
    it reached an end status.

    :param job: job record; ``f_progress`` and ``f_status`` may be updated.
    :param force_sync_status: when truthy, the job status is synchronized at
        the end of the round even if it did not change.
    :return: None
    """
    schedule_logger(job_id=job.f_job_id).info(
        "scheduling job {}".format(job.f_job_id))

    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, tasks = TaskScheduler.schedule(
        job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)

    tasks_status = [scheduled_task.f_status for scheduled_task in tasks]
    new_job_status = cls.calculate_job_status(
        task_scheduling_status_code=task_scheduling_status_code,
        tasks_status=tasks_status)
    canceled_while_waiting = (new_job_status == JobStatus.WAITING
                              and job.f_cancel_signal)
    if canceled_while_waiting:
        new_job_status = JobStatus.CANCELED

    total, finished_count = cls.calculate_job_progress(
        tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job_id=job.f_job_id).info(
        "Job {} status is {}, calculate by task status list: {}".format(
            job.f_job_id, new_job_status, tasks_status))

    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update
        # with anti-weight logic.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job,
                                        update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                # The pipelined model is saved before the end status is
                # synchronized.
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job,
                                        update_fields=["status"])

    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)

    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job_id=job.f_job_id).info(
        "finish scheduling job {}".format(job.f_job_id))
def update_parameters(cls, job, job_parameters, component_parameters):
    """Generate updated parameters for ``job`` and push them to the parties.

    :param job: job whose parameters are updated.
    :param job_parameters: input job parameters.
    :param component_parameters: input component parameters.
    :return: ``(RetCode.SUCCESS, updated_parameters)`` when the federated
        update succeeds, otherwise ``(RetCode.OPERATING_ERROR, response)``.
        ``updated_parameters`` has the keys ``job_parameters``,
        ``component_parameters`` and ``components``.
    """
    generated = JobController.gen_updated_parameters(
        job_id=job.f_job_id,
        initiator_role=job.f_initiator_role,
        initiator_party_id=job.f_initiator_party_id,
        input_job_parameters=job_parameters,
        input_component_parameters=component_parameters)
    new_job_parameters, new_component_parameters, changed_components = generated
    schedule_logger(job.f_job_id).info(
        f"components {changed_components} parameters has been updated")

    updated_parameters = {
        "job_parameters": new_job_parameters,
        "component_parameters": new_component_parameters,
        "components": changed_components
    }
    status_code, response = FederatedScheduler.update_parameter(
        job, updated_parameters=updated_parameters)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        return RetCode.OPERATING_ERROR, response
    return RetCode.SUCCESS, updated_parameters
def stop_job(cls, job_id, role, party_id, stop_status):
    """Stop a job on all parties, acting as the initiator.

    For ``JobStatus.CANCELED`` the cancel signal is set first. The job is
    then marked with ``stop_status`` and a federated stop is issued. When the
    federated stop fails, the task state of every initiator task is collected
    with ``set_status=stop_status`` before the failure is returned.

    :return: ``(RetCode.SUCCESS, "success")`` on success,
        ``(RetCode.FEDERATED_ERROR, <json response>)`` on failure, and
        ``(RetCode.SUCCESS, "can not found job")`` when no matching initiator
        job exists.
    """
    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status}")
    jobs = JobSaver.query_job(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              is_initiator=True)
    if len(jobs) == 0:
        # An unknown job is reported with a success code.
        return RetCode.SUCCESS, "can not found job"

    if stop_status == JobStatus.CANCELED:
        schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
        set_cancel_status = cls.cancel_signal(job_id=job_id,
                                              set_or_reset=True)
        schedule_logger(job_id=job_id).info(
            f"set job {job_id} cancel signal {set_cancel_status}")

    job = jobs[0]
    job.f_status = stop_status
    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status} to all party")
    status_code, response = FederatedScheduler.stop_job(
        job=job, stop_status=stop_status)

    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job_id).info(
            f"stop job {job_id} with {stop_status} successfully")
        return RetCode.SUCCESS, "success"

    # Federated stop failed: collect every initiator task with the stop
    # status as the status to force.
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id,
                                                   role=job.f_role,
                                                   party_id=job.f_party_id)
    for initiator_task in initiator_tasks_group.values():
        TaskScheduler.collect_task_of_all_party(
            job, initiator_task=initiator_task, set_status=stop_status)
    schedule_logger(job_id=job_id).info(
        f"stop job {job_id} with {stop_status} failed, {response}")
    return RetCode.FEDERATED_ERROR, json_dumps(response)
def get_rerun_component(cls, component_name, job, dsl_parser, force):
    """Work out which components have to be rerun.

    When ``component_name`` is empty or equals the pipeline component name,
    the inputs are returned unchanged. Otherwise the parties are asked which
    components failed the "rerun" check; successful tasks that the parser
    says must be revisited are added to the requested component(s), and
    ``force`` becomes True when any party reported a component.

    :param component_name: a component name, a collection of names, or empty.
    :param job: job being rerun.
    :param dsl_parser: parser providing ``get_need_revisit_nodes``.
    :param force: incoming force flag.
    :return: ``(component_name, force)``; ``component_name`` is a set when
        the check was performed.
    """
    if not component_name or component_name == job_utils.job_pipeline_component_name():
        return component_name, force

    _, response = FederatedScheduler.check_component(job=job,
                                                     check_type="rerun")
    successful_tasks = JobSaver.query_task(job_id=job.f_job_id,
                                           party_id=job.f_party_id,
                                           role=job.f_role,
                                           status=TaskStatus.SUCCESS,
                                           only_latest=True)
    success_task_list = [task.f_component_name for task in successful_tasks]

    # Union of everything the parties reported.
    component_set = set()
    for party_responses in response.values():
        for party_response in party_responses.values():
            component_set = component_set | set(party_response.get("data"))
    schedule_logger(job.f_job_id).info(
        f"success task list: {success_task_list}, check failed component list: {list(component_set)}"
    )

    revisit_nodes = dsl_parser.get_need_revisit_nodes(success_task_list,
                                                      list(component_set))
    need_rerun = [cpn.name for cpn in revisit_nodes]
    schedule_logger(job.f_job_id).info(
        f"need rerun success component: {need_rerun}")

    if component_set:
        force = True
    # A single name is wrapped so it is not split into characters.
    requested = {component_name} if isinstance(component_name, str) else set(component_name)
    component_name = set(need_rerun).union(requested)
    return component_name, force
def schedule(cls, job, dsl_parser, canceled=False):
    """Run one scheduling round over the initiator tasks of ``job``.

    First pass: refresh the federated status of every initiator task
    (collecting from the parties when the collect type is PULL), synchronize
    changed statuses, stop tasks that just reached an end status and gather
    the tasks that are still WAITING.

    Second pass (skipped when ``canceled``): start every waiting task whose
    upstream dependent tasks are all SUCCESS. The pass stops at the first
    task that cannot get resources or fails to start.

    :param job: job being scheduled.
    :param dsl_parser: parser providing the upstream dependencies.
    :param canceled: when truthy no task is started.
    :return: ``(scheduling_status_code, tasks)`` where ``tasks`` is the
        values view of the initiator task group.
    """
    schedule_logger(job_id=job.f_job_id).info(
        "scheduling job {} tasks".format(job.f_job_id))
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id,
                                                   role=job.f_role,
                                                   party_id=job.f_party_id)

    waiting_tasks = []
    for initiator_task in initiator_tasks_group.values():
        # collect all party task party status
        collect_type = job.f_runtime_conf_on_party["job_parameters"][
            "federated_status_collect_type"]
        if collect_type == FederatedCommunicationType.PULL:
            cls.collect_task_of_all_party(job=job,
                                          initiator_task=initiator_task)

        new_task_status = cls.federated_task_status(
            job_id=initiator_task.f_job_id,
            task_id=initiator_task.f_task_id,
            task_version=initiator_task.f_task_version)
        task_status_have_update = new_task_status != initiator_task.f_status
        if task_status_have_update:
            initiator_task.f_status = new_task_status
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)

        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(
                initiator_task.f_status):
            FederatedScheduler.stop_task(job=job,
                                         task=initiator_task,
                                         stop_status=initiator_task.f_status)

    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if canceled:
        schedule_logger(job_id=job.f_job_id).info(
            "have cancel signal, pass start job {} tasks".format(
                job.f_job_id))
    else:
        for waiting_task in waiting_tasks:
            upstream_components = dsl_parser.get_upstream_dependent_components(
                component_name=waiting_task.f_component_name)
            # all() stops at the first upstream task that is not SUCCESS.
            upstream_succeeded = all(
                initiator_tasks_group[JobSaver.task_key(
                    task_id=job_utils.generate_task_id(
                        job_id=job.f_job_id,
                        component_name=component.get_name()),
                    role=job.f_role,
                    party_id=job.f_party_id)].f_status == TaskStatus.SUCCESS
                for component in upstream_components)
            if not upstream_succeeded:
                # can not start task
                continue

            # all upstream dependent tasks have been successful, can start
            # this task
            scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
            start_status = cls.start_task(job=job, task=waiting_task)
            if start_status == SchedulingStatusCode.NO_RESOURCE:
                # wait for the next round of scheduling
                schedule_logger(job_id=job.f_job_id).info(
                    f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling"
                )
                break
            if start_status == SchedulingStatusCode.FAILED:
                scheduling_status_code = SchedulingStatusCode.FAILED
                waiting_task.f_status = StatusSet.FAILED
                FederatedScheduler.sync_task_status(job, waiting_task)
                break

    schedule_logger(job_id=job.f_job_id).info(
        "finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, initiator_tasks_group.values()
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Prepare a job for rerun on the initiator.

    The tasks considered are those of ``component_name``, or all tasks of
    the job when ``component_name`` is the virtual component name. WAITING
    and SUCCESS tasks are left untouched (a WAITING task still makes the job
    rerunnable). Every other task is stopped as CANCELED, its metrics are
    cleaned and a new task version is created, including a task holder for
    each non-initiator party. When at least one task can run again the rerun
    signal is set; otherwise only the job status is synchronized.

    :raises RuntimeError: when the job cannot be found on the initiator.
    :return: None
    """
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
    )
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if not jobs:
        raise RuntimeError(
            f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
        )
    job = jobs[0]

    task_filter = {
        "job_id": job_id,
        "role": initiator_role,
        "party_id": initiator_party_id,
    }
    if component_name != job_utils.job_virtual_component_name():
        task_filter["component_name"] = component_name
    tasks = JobSaver.query_task(**task_filter)

    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)

    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
            )
            continue

        # stop old version task
        FederatedScheduler.stop_task(job=job,
                                     task=task,
                                     stop_status=TaskStatus.CANCELED)
        FederatedScheduler.clean_task(job=job,
                                      task=task,
                                      content_type="metrics")
        # create new version task
        task.f_task_version += 1
        task.f_run_pid = None
        task.f_run_ip = None
        FederatedScheduler.create_task(job=job, task=task)
        # Save the status information of all participants in the initiator
        # for scheduling
        schedule_logger(job_id=job_id).info(
            f"create task {task.f_task_id} new version {task.f_task_version}"
        )
        for role, party_ids in job.f_runtime_conf_on_party["role"].items():
            for party_id in party_ids:
                if role == initiator_role and party_id == initiator_party_id:
                    continue
                JobController.initialize_tasks(
                    job_id, role, party_id, False, job.f_initiator_role,
                    job.f_initiator_party_id,
                    RunParameters(
                        **job.f_runtime_conf_on_party["job_parameters"]),
                    dsl_parser,
                    component_name=task.f_component_name,
                    task_version=task.f_task_version)
        schedule_logger(job_id=job_id).info(
            f"create task {task.f_task_id} new version {task.f_task_version} successfully"
        )
        job_can_rerun = True

    if not job_can_rerun:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(
            job_id=job_id).info(f"job {job_id} no task to rerun")
        return

    schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
    signal_set = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    outcome = "successfully" if signal_set else "failed"
    schedule_logger(job_id=job_id).info(
        f"job {job_id} set rerun signal {outcome}")
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
    """Submit a job described by ``submit_job_conf`` as the initiator.

    The runtime conf is validated, model information is generated (for
    non-predict jobs) or loaded from the pipeline model (for predict jobs),
    the job is created on all parties through ``FederatedScheduler`` and then
    moved to WAITING.

    Any exception raised on the way is caught: the returned dict then
    carries ``RetCode.OPERATING_ERROR`` and the trace string instead of the
    success fields.

    :param submit_job_conf: job configuration providing ``dsl`` and
        ``runtime_conf``.
    :param job_id: optional job id; a new one is generated when it is falsy.
    :return: dict with ``job_id`` plus, on success, ``code``, ``message``,
        ``model_info``, ``logs_directory``, ``board_url`` and the entries
        returned by ``job_utils.save_job_conf``.
    """
    job_id = job_id or job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(
        f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        # Work on a copy so the caller's runtime conf is left untouched.
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type == "predict":
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ["model_id", "model_version"])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(
                job_id=job_id,
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_pipeline_model()
            train_runtime_conf = json_loads(
                pipeline_model.train_runtime_conf)
            deployed = model_utils.check_if_deployed(
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            if not deployed:
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            dsl = json_loads(pipeline_model.inference_dsl)
        else:
            # generate job model info
            conf_version = schedule_utils.get_conf_version(runtime_conf)
            if conf_version != 2:
                raise Exception(
                    "only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(
                runtime_conf["role"])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        # dsl = ProviderManager.fill_fate_flow_provider(dsl)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        job.f_initiator_role = job_initiator["role"]
        job.f_initiator_party_id = job_initiator["party_id"]
        job.f_role = job_initiator["role"]
        job.f_party_id = job_initiator["party_id"]

        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            party_id=job.f_initiator_party_id,
            dsl=dsl,
            runtime_conf=runtime_conf,
            runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        initiator_party_ids = runtime_conf["role"][job.f_initiator_role]
        if job.f_initiator_party_id not in initiator_party_ids:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)

        # create common parameters on initiator
        JobController.create_common_job_parameters(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()

        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        if common_job_parameters.inheritance_info:
            job.f_inheritance_status = JobInheritanceStatus.WAITING
        else:
            job.f_inheritance_status = JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            inheritance_jobs = JobSaver.query_job(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                only_latest=True)
            job_utils.check_job_inheritance_parameters(
                job, inheritance_jobs, inheritance_tasks)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        # role -> party -> names of the components flagged need_run
        need_run_components = {
            role: {
                party: [
                    name
                    for name, value in res["data"]["components"].items()
                    if value["need_run"] is True
                ]
                for party, res in party_responses.items()
            }
            for role, party_responses in response.items()
        }
        if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
            # create the task holder in db to record information of all
            # participants in the initiator for scheduling
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                        continue
                    components_to_run = need_run_components[role][party_id]
                    if not components_to_run:
                        continue
                    JobController.initialize_tasks(
                        job_id=job_id,
                        role=role,
                        party_id=party_id,
                        run_on_this_party=False,
                        initiator_role=job.f_initiator_role,
                        initiator_party_id=job.f_initiator_party_id,
                        job_parameters=common_job_parameters,
                        dsl_parser=dsl_parser,
                        components=components_to_run)

        job.f_status = JobStatus.WAITING
        status_code, response = FederatedScheduler.sync_job_status(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("set job to waiting status failed")

        schedule_logger(job_id).info(
            f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
        )
        logs_directory = job_utils.get_job_log_directory(job_id)
        board_url = job_utils.get_board_url(job_id, job_initiator["role"],
                                            job_initiator["party_id"])
        result = {
            "code": RetCode.SUCCESS,
            "message": "success",
            "model_info": {
                "model_id": common_job_parameters.model_id,
                "model_version": common_job_parameters.model_version
            },
            "logs_directory": logs_directory,
            "board_url": board_url
        }
        # The removed-parameter check runs on the conf as submitted, not on
        # the working copy.
        warn_parameter = JobRuntimeConfigAdapter(
            submit_job_conf.runtime_conf).check_removed_parameter()
        if warn_parameter:
            result[
                "message"] = f"[WARN]{warn_parameter} is removed,it does not take effect!"
        submit_result.update(result)
        submit_result.update(path_dict)
    except Exception as e:
        # Submission never raises: failures are reported in the result.
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result
def submit(cls, job_data, job_id=None):
    """Submit a job on the initiator and create it on the participating parties.

    :param job_data: mapping that may carry ``job_dsl`` and
        ``job_runtime_conf``; each defaults to ``{}`` when absent.
    :param job_id: optional job id; a new one is generated when falsy.
    :return: dict with ``job_id``, ``model_info``, ``logs_directory`` and
        ``board_url``, updated with whatever ``job_utils.save_job_conf``
        returned.
    :raises Exception: when a predict job references a model that is not
        deployed, when the initiator party id is not listed under the
        initiator role, or when ``FederatedScheduler.create_job`` does not
        report success.
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(
        job_id, job_data))

    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    authentication_utils.check_constraint(job_runtime_conf, job_dsl)

    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()

    if common_job_parameters.job_type == 'predict':
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(),
                                  ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(
            job_id=job_id,
            role=job_initiator['role'],
            party_id=job_initiator['party_id'],
            model_id=common_job_parameters.model_id,
            model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        train_runtime_conf = json_loads(
            pipeline_model['Pipeline'].train_runtime_conf)
        deployed = model_utils.check_if_deployed(
            role=job_initiator['role'],
            party_id=job_initiator['party_id'],
            model_id=common_job_parameters.model_id,
            model_version=common_job_parameters.model_version)
        if not deployed:
            raise Exception(
                f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
            )
        # The submitted dsl is replaced by the model's inference dsl.
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
    else:
        # generate job model info: the job id doubles as the model version
        common_job_parameters.model_id = model_utils.gen_model_id(
            job_runtime_conf['role'])
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}

    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    # On the initiator, the job's own role/party are the initiator's.
    job.f_role = job_initiator['role']
    job.f_party_id = job_initiator['party_id']

    path_dict = job_utils.save_job_conf(
        job_id=job_id,
        role=job.f_initiator_role,
        job_dsl=job_dsl,
        job_runtime_conf=job_runtime_conf,
        job_runtime_conf_on_party={},
        train_runtime_conf=train_runtime_conf,
        pipeline_dsl=None)

    # NOTE(review): this validation runs after the job conf has already been
    # saved above; confirm whether that ordering is intentional.
    initiator_role_parties = job_runtime_conf['role'][job.f_initiator_role]
    if job.f_initiator_party_id not in initiator_role_parties:
        schedule_logger(job_id).info("initiator party id error:{}".format(
            job.f_initiator_party_id))
        raise Exception("initiator party id error {}".format(
            job.f_initiator_party_id))

    # create common parameters on initiator
    JobController.backend_compatibility(
        job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(
        role=job.f_initiator_role,
        job_parameters=common_job_parameters,
        create_initiator_baseline=True)

    job.f_runtime_conf = conf_adapter.update_common_parameters(
        common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf,
        train_runtime_conf=job.f_train_runtime_conf)

    # initiator runtime conf as template
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party[
        "job_parameters"] = common_job_parameters.to_dict()

    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator
        # for scheduling; the initiator's own (role, party) pair is skipped.
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                is_initiator = (role == job.f_initiator_role
                                and party_id == job.f_initiator_party_id)
                if not is_initiator:
                    JobController.initialize_tasks(
                        job_id, role, party_id, False,
                        job.f_initiator_role, job.f_initiator_party_id,
                        common_job_parameters, dsl_parser)

    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        # Mark the job failed and sync that status before reporting the error.
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)

    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(
            job.f_job_id, common_job_parameters.model_id))
    logs_directory = job_utils.get_job_log_directory(job_id)
    model_info = {
        "model_id": common_job_parameters.model_id,
        "model_version": common_job_parameters.model_version
    }
    submit_result = {
        "job_id": job_id,
        "model_info": model_info,
        "logs_directory": logs_directory,
        "board_url": job_utils.get_board_url(job_id, job_initiator['role'],
                                             job_initiator['party_id'])
    }
    submit_result.update(path_dict)
    return submit_result
def schedule_waiting_jobs(cls, job):
    """Try to move one waiting job forward: cancel it, start it, or roll back.

    The ready signal is set first; if that fails the job is left alone.
    Otherwise the job is either marked canceled (cancel signal present),
    started (resources applied successfully on all parties), or has the
    resources that were granted returned. When the apply status is ERROR the
    job is additionally stopped with FAILED status. The ready signal is
    always reset before returning, including when an exception propagates.

    :param job: job record; ``f_job_id``, ``f_initiator_role``,
        ``f_initiator_party_id`` and ``f_cancel_signal`` are read here.
    :return: None
    """
    job_id = job.f_job_id
    initiator_role = job.f_initiator_role
    initiator_party_id = job.f_initiator_party_id

    if not cls.ready_signal(job_id=job_id, set_or_reset=True):
        schedule_logger(job_id).info(
            f"job {job_id} may be handled by another scheduler")
        return

    def _join_parties(party_map):
        # Render {role: [party, ...]} as "role:party,role:party,...".
        return ",".join([
            ",".join([f"{_r}:{_p}" for _p in _ps])
            for _r, _ps in party_map.items()
        ])

    try:
        if job.f_cancel_signal:
            job.f_status = JobStatus.CANCELED
            FederatedScheduler.sync_job_status(job=job)
            schedule_logger(job_id).info(
                f"job {job_id} have cancel signal")
            return

        apply_status_code, federated_response = FederatedScheduler.resource_for_job(
            job=job, operation_type=ResourceOperation.APPLY)
        if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
            cls.start_job(job_id=job_id,
                          initiator_role=initiator_role,
                          initiator_party_id=initiator_party_id)
            return

        # rollback resource: parties answering retcode 0 got their resource
        # and must give it back; every other party is reported as failed.
        rollback_party = {}
        failed_party = {}
        for dest_role, party_responses in federated_response.items():
            for dest_party_id, party_response in party_responses.items():
                if party_response["retcode"] == 0:
                    bucket = rollback_party
                else:
                    bucket = failed_party
                bucket.setdefault(dest_role, []).append(dest_party_id)

        schedule_logger(job_id).info(
            "job {} apply resource failed on {}, rollback {}".format(
                job_id,
                _join_parties(failed_party),
                _join_parties(rollback_party),
            ))

        if rollback_party:
            return_status_code, federated_response = FederatedScheduler.resource_for_job(
                job=job,
                operation_type=ResourceOperation.RETURN,
                specific_dest=rollback_party)
            if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
                schedule_logger(job_id).info(
                    f"job {job_id} return resource failed:\n{federated_response}"
                )
        else:
            schedule_logger(job_id).info(
                f"job {job_id} no party should be rollback resource")

        if apply_status_code == FederatedSchedulingStatusCode.ERROR:
            cls.stop_job(job_id=job_id,
                         role=initiator_role,
                         party_id=initiator_party_id,
                         stop_status=JobStatus.FAILED)
            schedule_logger(job_id).info(
                f"apply resource error, stop job {job_id}")
    finally:
        # Exceptions propagate unchanged; the ready signal is reset on every
        # exit path from the try block.
        update_status = cls.ready_signal(job_id=job_id, set_or_reset=False)
        schedule_logger(job_id).info(
            f"reset job {job_id} ready signal {update_status}")
def set_job_rerun(cls,
                  job_id,
                  initiator_role,
                  initiator_party_id,
                  auto,
                  force=False,
                  tasks: typing.List[Task] = None,
                  component_name: typing.Union[str, list] = None):
    """Prepare a job's tasks for rerun and raise the job's rerun signal.

    :param job_id: id of the job to rerun.
    :param initiator_role: role used to look the job up on the initiator.
    :param initiator_party_id: party id used to look the job up.
    :param auto: forwarded to ``TaskScheduler.prepare_rerun_task``.
    :param force: forwarded to ``cls.get_rerun_component``; the value that
        call returns is the one passed on to ``prepare_rerun_task``.
    :param tasks: explicit tasks to rerun; when empty or None the tasks are
        queried from ``JobSaver`` (optionally narrowed by component name).
    :param component_name: a component name or a collection of names; the
        components downstream of each one are rerun too.
    :return: False when no task could be prepared for rerun, True otherwise.
    :raises RuntimeError: when the job cannot be found on the initiator.
    """
    schedule_logger(job_id).info(
        f"try to rerun job on initiator {initiator_role} {initiator_party_id}"
    )
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if not jobs:
        raise RuntimeError(
            f"can not found job on initiator {initiator_role} {initiator_party_id}"
        )
    job = jobs[0]

    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    component_name, force = cls.get_rerun_component(
        component_name, job, dsl_parser, force)
    schedule_logger(job_id).info(f"rerun component: {component_name}")

    if tasks:
        schedule_logger(job_id).info(
            f"require {[task.f_component_name for task in tasks]} to rerun"
        )
    else:
        task_query = {
            'job_id': job_id,
            'role': initiator_role,
            'party_id': initiator_party_id,
        }
        rerun_specific = (
            component_name
            and component_name != job_utils.job_pipeline_component_name())
        if rerun_specific:
            if isinstance(component_name, str):
                _require_reruns = {component_name}
            else:
                _require_reruns = set(component_name)
            # Start from the requested components and add everything that
            # depends on them downstream.
            _should_reruns = set(_require_reruns)
            for _cpn in _require_reruns:
                _should_reruns.update(
                    _c.get_name() for _c in
                    dsl_parser.get_downstream_dependent_components(_cpn))
            schedule_logger(job_id).info(
                f"require {_require_reruns} to rerun, "
                f"and then found {_should_reruns} need be to rerun")
            task_query['component_name'] = _should_reruns
        else:
            # rerun all tasks: no component filter is added to the query
            schedule_logger(job_id).info(
                "require all component of pipeline to rerun")
        tasks = JobSaver.query_task(**task_query)

    # Build the full list before calling any(): prepare_rerun_task must be
    # invoked for every task, whereas any() over a generator would stop at
    # the first truthy result.
    rerun_flags = [
        TaskScheduler.prepare_rerun_task(
            job=job,
            task=task,
            dsl_parser=dsl_parser,
            auto=auto,
            force=force,
        ) for task in tasks
    ]
    if not any(rerun_flags):
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id).info("job no task to rerun")
        return False

    schedule_logger(job_id).info("job set rerun signal")
    status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    schedule_logger(job_id).info(
        f"job set rerun signal {'successfully' if status else 'failed'}")
    # NOTE(review): True is returned even when setting the signal failed;
    # the outcome is only logged. Confirm callers expect that.
    return True