예제 #1
0
 def schedule_rerun_job(cls, job):
     """Process a job that has been flagged for rerun.

     A job whose status is not an end status gets its rerun signal reset and
     is passed to ``cls.schedule_running_job``. A job in an end status is put
     back to ``JobStatus.WAITING`` with its ready/rerun/progress/timing
     fields cleared; the rerun signal is reset and the cleared fields are
     synced only when ``FederatedScheduler.sync_job_status`` reports success.

     Args:
         job: job record exposing the ``f_*`` attributes used below.
     """
     if not EndStatus.contains(job.f_status):
         # Not finished yet: drop the signal and keep scheduling the job.
         cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
         cls.schedule_running_job(job)
         return

     # Finished job: wind its local state back to "waiting".
     job.f_status = JobStatus.WAITING
     job.f_ready_signal = False
     job.f_ready_time = None
     job.f_rerun_signal = False
     job.f_progress = 0
     job.f_end_time = None
     job.f_elapsed = None
     schedule_logger(job_id=job.f_job_id).info(
         f"job {job.f_job_id} has been finished, set waiting to rerun")

     sync_code, _ = FederatedScheduler.sync_job_status(job=job)
     if sync_code != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).info(
             f"job {job.f_job_id} set waiting to rerun failed")
         return

     cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
     cleared_fields = [
         "ready_signal", "ready_time", "rerun_signal", "progress",
         "end_time", "elapsed"
     ]
     FederatedScheduler.sync_job(job=job, update_fields=cleared_fields)
     schedule_logger(job_id=job.f_job_id).info(
         f"job {job.f_job_id} set waiting to rerun successfully")
예제 #2
0
 def start_task(cls, job, task):
     """Apply for resources, mark ``task`` as running and start it.

     Args:
         job: job the task belongs to.
         task: task record to start; its ``f_status`` is modified here.

     Returns:
         ``SchedulingStatusCode.NO_RESOURCE`` when the resource application
         is refused; ``SchedulingStatusCode.PASS`` when the status update is
         rejected (status is rolled back to WAITING and the resource is
         returned); otherwise ``SUCCESS`` or ``FAILED`` depending on the
         result of ``FederatedScheduler.start_task``.
     """
     schedule_logger(task.f_job_id).info(
         "try to start task {} {} on {} {}".format(
             task.f_task_id, task.f_task_version, task.f_role,
             task.f_party_id))

     resource_granted = ResourceManager.apply_for_task_resource(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not resource_granted:
         return SchedulingStatusCode.NO_RESOURCE

     task.f_status = TaskStatus.RUNNING
     status_saved = JobSaver.update_task_status(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not status_saved:
         # The update was rejected: another scheduler is handling this task.
         schedule_logger(task.f_job_id).info(
             "task {} {} start on another scheduler".format(
                 task.f_task_id, task.f_task_version))
         # Undo the local status change and give the resource back.
         task.f_status = TaskStatus.WAITING
         ResourceManager.return_task_resource(
             task_info=task.to_human_model_dict(
                 only_primary_with=["status"]))
         return SchedulingStatusCode.PASS

     schedule_logger(task.f_job_id).info(
         "start task {} {} on {} {}".format(
             task.f_task_id, task.f_task_version, task.f_role,
             task.f_party_id))
     FederatedScheduler.sync_task_status(job=job, task=task)
     start_code, _ = FederatedScheduler.start_task(job=job, task=task)
     if start_code == FederatedSchedulingStatusCode.SUCCESS:
         return SchedulingStatusCode.SUCCESS
     return SchedulingStatusCode.FAILED
예제 #3
0
 def start_job(cls, job_id, initiator_role, initiator_party_id):
     """Start a job on its initiator party.

     Looks the job up by id, initiator role and initiator party id. When it
     exists, ``FederatedScheduler.start_job`` is called with it; otherwise
     an error is logged and nothing else happens.

     Args:
         job_id: id of the job to start.
         initiator_role: role of the initiator party.
         initiator_party_id: party id of the initiator.
     """
     # NOTE(review): an earlier revision built a ``job_info`` dict here
     # (status, start_time, tag) that was never read or passed on; it was
     # removed as dead code.
     schedule_logger(job_id=job_id).info(
         "try to start job {} on initiator {} {}".format(
             job_id, initiator_role, initiator_party_id))
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if not jobs:
         schedule_logger(job_id=job_id).error(
             "can not found job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
         return
     FederatedScheduler.start_job(job=jobs[0])
     schedule_logger(job_id=job_id).info(
         "start job {} on initiator {} {}".format(
             job_id, initiator_role, initiator_party_id))
예제 #4
0
 def report_task_to_initiator(cls, task_info):
     """Report a task to the initiator when its collect type is PUSH.

     Args:
         task_info: mapping providing ``"task_id"``, ``"task_version"``,
             ``"role"`` and ``"party_id"`` used to look the task up.
     """
     tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                 task_version=task_info["task_version"],
                                 role=task_info["role"],
                                 party_id=task_info["party_id"])
     if not tasks:
         # No stored task matches the key: there is nothing to report.
         # (Indexing the empty result used to raise IndexError here.)
         return
     task = tasks[0]
     if task.f_federated_status_collect_type == FederatedCommunicationType.PUSH:
         FederatedScheduler.report_task_to_initiator(task=task)
예제 #5
0
 def finish(cls, job, end_status):
     """Wrap up a finished job: stop it with ``end_status`` and clean it.

     Args:
         job: the finished job record.
         end_status: status passed to ``cls.stop_job`` as ``stop_status``.
     """
     schedule_logger(job.f_job_id).info(
         f"job finished with {end_status}, do something...")
     stop_kwargs = {
         "job_id": job.f_job_id,
         "role": job.f_initiator_role,
         "party_id": job.f_initiator_party_id,
         "stop_status": end_status,
     }
     cls.stop_job(**stop_kwargs)
     FederatedScheduler.clean_job(job=job)
     schedule_logger(job.f_job_id).info(
         f"job finished with {end_status}, done")
예제 #6
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     """Collect ``initiator_task`` from all parties and store the results.

     Nothing is collected when the locally stored copies of the task share
     at most one distinct status and none of them is RUNNING.

     Args:
         job: job the task belongs to.
         initiator_task: the initiator's record of the task to collect.
         set_status: optional party status to force for parties whose
             response carries ``RetCode.FEDERATED_ERROR``.
     """
     party_tasks = JobSaver.query_task(
         task_id=initiator_task.f_task_id,
         task_version=initiator_task.f_task_version)
     known_statuses = {party_task.f_status for party_task in party_tasks}
     if len(known_statuses) <= 1 and TaskStatus.RUNNING not in known_statuses:
         return

     collect_code, federated_response = FederatedScheduler.collect_task(
         job=job, task=initiator_task)
     if collect_code != FederatedSchedulingStatusCode.SUCCESS:
         # Only logged: the per-party responses are still processed below.
         schedule_logger(job_id=job.f_job_id).warning(
             f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
         )

     for _role, role_responses in federated_response.items():
         for _party_id, party_response in role_responses.items():
             retcode = party_response["retcode"]
             if retcode == RetCode.SUCCESS:
                 collected = party_response["data"]
                 JobSaver.update_task_status(task_info=collected)
                 JobSaver.update_task(task_info=collected)
             elif retcode == RetCode.FEDERATED_ERROR and set_status:
                 # Party answered with a federated error: write RUNNING
                 # first, then the requested status, as two updates.
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
예제 #7
0
def stop_job():
    """HTTP handler: stop a job on this party, then request a stop on all.

    Reads ``job_id`` and ``stop_status`` (default ``"canceled"``) from
    ``request.json``.

    Returns:
        A JSON result with ``RetCode.DATA_ERROR`` when the job is unknown,
        ``RetCode.SUCCESS`` when the federated stop request succeeds, and
        ``RetCode.OPERATING_ERROR`` (with the dumped response) otherwise.
    """
    payload = request.json
    job_id = payload.get('job_id')
    stop_status = payload.get("stop_status", "canceled")

    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    # Local stop first, then the request to every party.
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, _ = JobController.stop_jobs(job_id=job_id,
                                             stop_status=stop_status)
    schedule_logger(job_id).info(
        f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(
        f"request stop job {jobs[0]} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=jobs[0],
        stop_status=stop_status,
        command_body=jobs[0].to_json())

    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        failure_msg = ("stop job on this party {};\n"
                       "stop job failed:\n{}".format(
                           kill_status, json_dumps(response, indent=4)))
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg=failure_msg)
    success_msg = (f"stop job on this party {kill_status};\n"
                   f"stop job on all party success")
    return get_json_result(retcode=RetCode.SUCCESS, retmsg=success_msg)
예제 #8
0
 def request_stop_jobs(cls, jobs: [Job], stop_msg, stop_status):
     """Request a stop for each job in ``jobs``.

     A failure on one job is logged through the detector logger and does
     not prevent the remaining jobs from being processed.

     Args:
         jobs: jobs to stop; an empty collection is a no-op.
         stop_msg: reason text included in the log messages.
         stop_status: status passed to ``FederatedScheduler.request_stop_job``.
     """
     if len(jobs) == 0:
         return
     detect_logger().info(
         f"have {len(jobs)} should be stopped, because of {stop_msg}")
     for job in jobs:
         try:
             detect_logger(job_id=job.f_job_id).info(
                 f"detector request start to stop job {job.f_job_id}, because of {stop_msg}"
             )
             FederatedScheduler.request_stop_job(
                 job=job, stop_status=stop_status)
             detect_logger(job_id=job.f_job_id).info(
                 f"detector request stop job {job.f_job_id} successfully")
         except Exception as stop_error:
             # Best effort: record the failure and move on to the next job.
             detect_logger(job_id=job.f_job_id).exception(stop_error)
예제 #9
0
def clean_queue():
    """HTTP handler: request cancellation of the waiting jobs in the queue.

    Jobs are selected with ``is_initiator=True`` and
    ``status=JobStatus.WAITING``.

    Returns:
        A JSON result whose ``data`` maps each job id to the status code
        returned by ``FederatedScheduler.request_stop_job``.
    """
    waiting_jobs = JobSaver.query_job(is_initiator=True,
                                      status=JobStatus.WAITING)
    stop_codes = {}
    for waiting_job in waiting_jobs:
        stop_code, _ = FederatedScheduler.request_stop_job(
            job=waiting_job, stop_status=JobStatus.CANCELED)
        stop_codes[waiting_job.f_job_id] = stop_code
    return get_json_result(retcode=0, retmsg='success', data=stop_codes)
예제 #10
0
def component_output_data_table():
    """HTTP handler: forward an ``output/table`` tracker command for a job.

    The request body must contain ``job_id``, ``role``, ``party_id`` and
    ``component_name`` (checked via ``detect_utils.check_config``).

    Returns:
        The jsonified tracker command result, or a JSON result with
        retcode 100 when no job matches ``job_id``.
    """
    request_data = request.json
    required = ['job_id', 'role', 'party_id', 'component_name']
    detect_utils.check_config(config=request_data, required_arguments=required)

    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    command_result = FederatedScheduler.tracker_command(
        jobs[0], request_data, 'output/table')
    return jsonify(command_result)
예제 #11
0
def component_output_data_table():
    """HTTP handler: forward an ``output/table`` tracker command for a job.

    Returns:
        The jsonified tracker command result, or a JSON result with
        retcode 100 when no job matches the ``job_id`` of the request body.
    """
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    command_result = FederatedScheduler.tracker_command(
        jobs[0], request_data, 'output/table')
    return jsonify(command_result)
예제 #12
0
    def schedule_running_job(cls, job: Job, force_sync_status=False):
        """Run one scheduling round for a running job.

        Schedules the job's tasks, derives the job status and progress from
        the task statuses, syncs whichever of the two changed, finishes the
        job when it reached an end status and registers automatic reruns
        for the tasks reported by ``TaskScheduler.schedule``.

        Args:
            job: the job to schedule.
            force_sync_status: when true, the job status is synced once more
                at the end regardless of whether it changed.
        """
        schedule_logger(job.f_job_id).info(f"scheduling running job")

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        scheduling_code, rerun_candidates, tasks = TaskScheduler.schedule(
            job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)

        # component name -> task status
        tasks_status = {task.f_component_name: task.f_status for task in tasks}
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=scheduling_code,
            tasks_status=tasks_status.values())
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            # A cancelled job that would otherwise wait is reported as canceled.
            new_job_status = JobStatus.CANCELED

        total, finished_count = cls.calculate_job_progress(
            tasks_status=tasks_status)
        new_progress = float(finished_count) / total * 100
        schedule_logger(job.f_job_id).info(
            f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
        )

        if new_job_status != job.f_status or new_progress != job.f_progress:
            # Progress and status are synced separately, progress first,
            # because the two fields update with anti-weight logic.
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(
                    job=job, update_fields=["progress"])
                cls.update_job_on_initiator(
                    initiator_job=job, update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(
                    initiator_job=job, update_fields=["status"])

        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        if rerun_candidates:
            schedule_logger(job.f_job_id).info("job have auto rerun tasks")
            cls.set_job_rerun(
                job_id=job.f_job_id,
                initiator_role=job.f_initiator_role,
                initiator_party_id=job.f_initiator_party_id,
                tasks=rerun_candidates,
                auto=True)
        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job.f_job_id).info("finish scheduling running job")
예제 #13
0
 def create_new_version_task(cls, job, task, dsl_parser, auto):
     """Cancel the current version of ``task`` and create the next version.

     Args:
         job: job the task belongs to.
         task: task record; its version, retry counter and run pid/ip are
             modified in place.
         dsl_parser: parser passed on to ``JobController.initialize_tasks``.
         auto: when true, one automatic retry is consumed.

     Raises:
         Exception: when ``FederatedScheduler.create_task`` does not report
             success.
     """
     # Retire the old version: stop it and clean its metrics.
     FederatedScheduler.stop_task(
         job=job, task=task, stop_status=TaskStatus.CANCELED)
     FederatedScheduler.clean_task(
         job=job, task=task, content_type=TaskCleanResourceType.METRICS)

     # Prepare the record for the new version.
     task.f_task_version += 1
     if auto:
         task.f_auto_retries -= 1
     task.f_run_pid = None
     task.f_run_ip = None

     # todo: FederatedScheduler.create_task and JobController.initialize_tasks will create task twice
     create_code, response = FederatedScheduler.create_task(job=job,
                                                            task=task)
     if create_code != FederatedSchedulingStatusCode.SUCCESS:
         raise Exception(f"create {task.f_task_id} new version failed")

     # Create the task holder in db for every party except the initiator,
     # to record information of all participants for scheduling.
     for _role in response:
         for _party_id in response[_role]:
             is_initiator = (_role == job.f_initiator_role
                             and _party_id == job.f_initiator_party_id)
             if is_initiator:
                 continue
             JobController.initialize_tasks(
                 job_id=job.f_job_id,
                 role=_role,
                 party_id=_party_id,
                 run_on_this_party=False,
                 initiator_role=job.f_initiator_role,
                 initiator_party_id=job.f_initiator_party_id,
                 job_parameters=RunParameters(
                     **job.f_runtime_conf_on_party["job_parameters"]),
                 dsl_parser=dsl_parser,
                 components=[task.f_component_name],
                 task_version=task.f_task_version,
                 auto_retries=task.f_auto_retries)
     schedule_logger(job.f_job_id).info(
         f"create task {task.f_task_id} new version {task.f_task_version} successfully"
     )
예제 #14
0
    def check_component(cls, job, check_type="inheritance"):
        """Check which inherited components are available on every party.

        The component set derived from the job's inheritance component list
        is intersected with the ``data`` of each party's check response.
        When the result differs from the requested list, the list stored in
        ``job.f_inheritance_info`` is replaced (in place) and sent to all
        parties through ``FederatedScheduler.align_args``.

        Args:
            job: job whose ``f_inheritance_info`` is checked.
            check_type: check type forwarded to
                ``FederatedScheduler.check_component``.
        """
        schedule_logger(job.f_job_id).info(f"component check")
        _, response = FederatedScheduler.check_component(
            job=job, check_type=check_type)
        schedule_logger(job.f_job_id).info(
            f"component check response: {response}")

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)
        source_nodes = dsl_parser.get_source_connect_sub_graph(
            job.f_inheritance_info.get("component_list"))
        component_set = {cpn.name for cpn in source_nodes}

        # Keep only the components every party reported.
        for party_responses in response.values():
            for party_response in party_responses.values():
                component_set = component_set & set(
                    party_response.get("data"))

        if component_set != set(job.f_inheritance_info.get("component_list")):
            schedule_logger(job.f_job_id).info(
                f"dsl parser components:{component_set}")

            aligned_nodes = dsl_parser.get_source_connect_sub_graph(
                list(component_set))
            component_list = [cpn.name for cpn in aligned_nodes]
            schedule_logger(job.f_job_id).info(
                f"parser result:{component_list}")

            # The job's own inheritance info is updated, not a copy.
            inheritance_info = job.f_inheritance_info
            command_body = {"inheritance_info": inheritance_info}
            inheritance_info.update({"component_list": component_list})
            schedule_logger(job.f_job_id).info(
                f"start align job info:{command_body}")
            status_code, response = FederatedScheduler.align_args(
                job, command_body=command_body)
            schedule_logger(job.f_job_id).info(
                f"align result:{status_code}, {response}")
        schedule_logger(job.f_job_id).info(f"check success")
예제 #15
0
def rerun_job():
    """HTTP handler: request a rerun of the job named in the request body.

    Returns:
        A JSON result with ``RetCode.DATA_ERROR`` when the job is unknown,
        ``RetCode.SUCCESS`` when the rerun request succeeds, and
        ``RetCode.OPERATING_ERROR`` (with the dumped response) otherwise.
    """
    job_id = request.json.get("job_id")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    status_code, response = FederatedScheduler.request_rerun_job(
        job=jobs[0], command_body=request.json)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        failure_msg = "rerun job failed:\n{}".format(json_dumps(response))
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg=failure_msg)
    return get_json_result(retcode=RetCode.SUCCESS,
                           retmsg="rerun job success")
예제 #16
0
    def schedule_running_job(cls, job, force_sync_status=False):
        """Run one scheduling round for a running job.

        Schedules the job's tasks, derives a new job status and progress
        from the task statuses, syncs whichever of the two changed and
        finishes the job once it is in an end status.

        Args:
            job: the job to schedule.
            force_sync_status: when true, the job status is synced once more
                at the end regardless of whether it changed.
        """
        schedule_logger(job_id=job.f_job_id).info("scheduling job {}".format(
            job.f_job_id))

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        task_scheduling_status_code, tasks = TaskScheduler.schedule(
            job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
        tasks_status = [task.f_status for task in tasks]
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=task_scheduling_status_code,
            tasks_status=tasks_status)
        # A cancelled job that would otherwise wait is reported as canceled.
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED
        total, finished_count = cls.calculate_job_progress(
            tasks_status=tasks_status)
        # NOTE(review): this division raises ZeroDivisionError if
        # calculate_job_progress can return total == 0 -- confirm.
        new_progress = float(finished_count) / total * 100
        schedule_logger(job_id=job.f_job_id).info(
            "Job {} status is {}, calculate by task status list: {}".format(
                job.f_job_id, new_job_status, tasks_status))
        if new_job_status != job.f_status or new_progress != job.f_progress:
            # Make sure to update separately, because these two fields update with anti-weight logic
            # Progress is only pushed when its integer part exceeds the stored value.
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                # The pipelined model is saved before an end status is synced.
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job.f_job_id).info(
            "finish scheduling job {}".format(job.f_job_id))
예제 #17
0
 def update_parameters(cls, job, job_parameters, component_parameters):
     """Generate updated parameters for ``job`` and send them to all parties.

     Args:
         job: job whose parameters are updated.
         job_parameters: input job parameters.
         component_parameters: input component parameters.

     Returns:
         ``(RetCode.SUCCESS, updated_parameters)`` when the federated update
         succeeds, otherwise ``(RetCode.OPERATING_ERROR, response)``.
     """
     generated = JobController.gen_updated_parameters(
         job_id=job.f_job_id,
         initiator_role=job.f_initiator_role,
         initiator_party_id=job.f_initiator_party_id,
         input_job_parameters=job_parameters,
         input_component_parameters=component_parameters)
     updated_job_parameters, updated_component_parameters, updated_components = generated
     schedule_logger(job.f_job_id).info(
         f"components {updated_components} parameters has been updated")

     updated_parameters = dict(
         job_parameters=updated_job_parameters,
         component_parameters=updated_component_parameters,
         components=updated_components)
     update_code, response = FederatedScheduler.update_parameter(
         job, updated_parameters=updated_parameters)
     if update_code != FederatedSchedulingStatusCode.SUCCESS:
         return RetCode.OPERATING_ERROR, response
     return RetCode.SUCCESS, updated_parameters
예제 #18
0
 def stop_job(cls, job_id, role, party_id, stop_status):
     """Stop a job on all parties with the given status.

     Args:
         job_id: id of the job to stop.
         role: role used to look the job up.
         party_id: party id used to look the job up.
         stop_status: status to stop the job with; ``JobStatus.CANCELED``
             additionally sets the job's cancel signal.

     Returns:
         ``(RetCode.SUCCESS, "success")`` when the federated stop succeeds,
         ``(RetCode.FEDERATED_ERROR, <dumped response>)`` when it fails, and
         ``(RetCode.SUCCESS, "can not found job")`` when no job matches.
     """
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status}")
     jobs = JobSaver.query_job(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               is_initiator=True)
     if len(jobs) == 0:
         return RetCode.SUCCESS, "can not found job"

     if stop_status == JobStatus.CANCELED:
         schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
         set_cancel_status = cls.cancel_signal(job_id=job_id,
                                               set_or_reset=True)
         schedule_logger(job_id=job_id).info(
             f"set job {job_id} cancel signal {set_cancel_status}")

     job = jobs[0]
     job.f_status = stop_status
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status} to all party")
     status_code, response = FederatedScheduler.stop_job(
         job=jobs[0], stop_status=stop_status)
     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job_id).info(
             f"stop job {job_id} with {stop_status} successfully")
         return RetCode.SUCCESS, "success"

     # The federated stop failed: collect every task from all parties,
     # passing the stop status as the status to force.
     initiator_tasks_group = JobSaver.get_tasks_asc(
         job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
     for initiator_task in initiator_tasks_group.values():
         TaskScheduler.collect_task_of_all_party(
             job, initiator_task=initiator_task, set_status=stop_status)
     schedule_logger(job_id=job_id).info(
         f"stop job {job_id} with {stop_status} failed, {response}")
     return RetCode.FEDERATED_ERROR, json_dumps(response)
예제 #19
0
 def get_rerun_component(cls, component_name, job, dsl_parser, force):
     """Work out which components have to be rerun.

     When ``component_name`` is empty or equals the pipeline component
     name, the inputs are returned unchanged. Otherwise the requested
     component(s) are merged with the successful components that
     ``dsl_parser.get_need_revisit_nodes`` reports for the components
     returned by the "rerun" check.

     Args:
         component_name: a component name (``str``) or an iterable of names.
         job: job being rerun.
         dsl_parser: parser providing ``get_need_revisit_nodes``.
         force: incoming force flag; set to ``True`` when the check returned
             any component.

     Returns:
         Tuple ``(component_name, force)``; ``component_name`` is a set in
         the merged case.
     """
     if not component_name or component_name == job_utils.job_pipeline_component_name():
         return component_name, force

     _, response = FederatedScheduler.check_component(
         job=job, check_type="rerun")
     success_tasks = JobSaver.query_task(job_id=job.f_job_id,
                                         party_id=job.f_party_id,
                                         role=job.f_role,
                                         status=TaskStatus.SUCCESS,
                                         only_latest=True)
     success_task_list = [task.f_component_name for task in success_tasks]

     # Union of the components reported by every party.
     component_set = set()
     for party_responses in response.values():
         for party_response in party_responses.values():
             component_set = component_set | set(party_response.get("data"))
     schedule_logger(job.f_job_id).info(
         f"success task list: {success_task_list}, check failed component list: {list(component_set)}"
     )

     revisit_nodes = dsl_parser.get_need_revisit_nodes(
         success_task_list, list(component_set))
     need_rerun = [cpn.name for cpn in revisit_nodes]
     schedule_logger(job.f_job_id).info(
         f"need rerun success component: {need_rerun}")

     if component_set:
         force = True
     if isinstance(component_name, str):
         requested = {component_name}
     else:
         requested = set(component_name)
     return set(need_rerun) | requested, force
예제 #20
0
    def schedule(cls, job, dsl_parser, canceled=False):
        """Refresh the status of the job's tasks and start the runnable ones.

        Args:
            job: job whose tasks are scheduled.
            dsl_parser: parser used to look up upstream dependencies.
            canceled: when true, no waiting task is started.

        Returns:
            Tuple ``(scheduling_status_code, tasks)`` where ``tasks`` is the
            values view of the initiator task group.
        """
        schedule_logger(job_id=job.f_job_id).info(
            "scheduling job {} tasks".format(job.f_job_id))
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id,
                                                       role=job.f_role,
                                                       party_id=job.f_party_id)
        waiting_tasks = []
        # Pass 1: bring every initiator task's status up to date.
        for initiator_task in initiator_tasks_group.values():
            # collect all party task party status
            if job.f_runtime_conf_on_party["job_parameters"][
                    "federated_status_collect_type"] == FederatedCommunicationType.PULL:
                cls.collect_task_of_all_party(job=job,
                                              initiator_task=initiator_task)
            new_task_status = cls.federated_task_status(
                job_id=initiator_task.f_job_id,
                task_id=initiator_task.f_task_id,
                task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                FederatedScheduler.sync_task_status(job=job,
                                                    task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            # A task that has just moved into an end status is stopped with that status.
            elif task_status_have_update and EndStatus.contains(
                    initiator_task.f_status):
                FederatedScheduler.stop_task(
                    job=job,
                    task=initiator_task,
                    stop_status=initiator_task.f_status)

        # Pass 2: start waiting tasks whose upstream tasks all succeeded.
        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                for component in dsl_parser.get_upstream_dependent_components(
                        component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[JobSaver.task_key(
                        task_id=job_utils.generate_task_id(
                            job_id=job.f_job_id,
                            component_name=component.get_name()),
                        role=job.f_role,
                        party_id=job.f_party_id)]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        # (this break skips the for-else below and moves on to the next waiting task)
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    # The breaks below sit in the else clause of the inner loop,
                    # so they leave the outer loop over waiting_tasks.
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job_id=job.f_job_id).info(
                            f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling"
                        )
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        waiting_task.f_status = StatusSet.FAILED
                        FederatedScheduler.sync_task_status(job, waiting_task)
                        break
        else:
            schedule_logger(job_id=job.f_job_id).info(
                "have cancel signal, pass start job {} tasks".format(
                    job.f_job_id))
        schedule_logger(job_id=job.f_job_id).info(
            "finish scheduling job {} tasks".format(job.f_job_id))
        return scheduling_status_code, initiator_tasks_group.values()
예제 #21
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id,
               component_name):
     """Prepare a job for rerun by creating new versions of its tasks.

     Tasks that are WAITING or SUCCESS are left alone. Every other task is
     stopped, its metrics are cleaned and a new task version is created on
     all parties. The rerun signal is set when at least one task is waiting
     or was recreated; otherwise the job status is just synced.

     Args:
         job_id: id of the job to rerun.
         initiator_role: role of the initiator party.
         initiator_party_id: party id of the initiator.
         component_name: component to rerun; the virtual component name
             selects all tasks of the job.

     Raises:
         RuntimeError: when the job cannot be found on the initiator.
     """
     schedule_logger(job_id=job_id).info(
         f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
     )
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if not jobs:
         raise RuntimeError(
             f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
         )
     job = jobs[0]

     # Restrict the task query to one component unless the virtual
     # component name was given.
     task_filters = {
         "job_id": job_id,
         "role": initiator_role,
         "party_id": initiator_party_id,
     }
     if component_name != job_utils.job_virtual_component_name():
         task_filters["component_name"] = component_name
     tasks = JobSaver.query_task(**task_filters)

     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(
         dsl=job.f_dsl,
         runtime_conf=job.f_runtime_conf_on_party,
         train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(
                 f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
             )
             continue

         # stop old version task
         FederatedScheduler.stop_task(
             job=job, task=task, stop_status=TaskStatus.CANCELED)
         FederatedScheduler.clean_task(
             job=job, task=task, content_type="metrics")
         # create new version task
         task.f_task_version += 1
         task.f_run_pid = None
         task.f_run_ip = None
         FederatedScheduler.create_task(job=job, task=task)
         # Save the status information of all participants in the initiator for scheduling
         schedule_logger(job_id=job_id).info(
             f"create task {task.f_task_id} new version {task.f_task_version}"
         )
         party_map = job.f_runtime_conf_on_party["role"]
         for _role, _party_ids in party_map.items():
             for _party_id in _party_ids:
                 if _role == initiator_role and _party_id == initiator_party_id:
                     continue
                 job_parameters = RunParameters(
                     **job.f_runtime_conf_on_party["job_parameters"])
                 JobController.initialize_tasks(
                     job_id,
                     _role,
                     _party_id,
                     False,
                     job.f_initiator_role,
                     job.f_initiator_party_id,
                     job_parameters,
                     dsl_parser,
                     component_name=task.f_component_name,
                     task_version=task.f_task_version)
         schedule_logger(job_id=job_id).info(
             f"create task {task.f_task_id} new version {task.f_task_version} successfully"
         )
         job_can_rerun = True

     if not job_can_rerun:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(
             job_id=job_id).info(f"job {job_id} no task to rerun")
         return

     schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
     signal_set = cls.rerun_signal(job_id=job_id, set_or_reset=True)
     if signal_set:
         schedule_logger(job_id=job_id).info(
             f"job {job_id} set rerun signal successfully")
     else:
         schedule_logger(job_id=job_id).info(
             f"job {job_id} set rerun signal failed")
예제 #22
0
    def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
        """Submit a job from the initiator and report the outcome as a dict.

        Args:
            submit_job_conf: Job configuration; its ``dsl`` and
                ``runtime_conf`` attributes and ``to_dict()`` are used here.
            job_id: Job id to use. When falsy, a new one is generated with
                ``job_utils.generate_job_id()``.

        Returns:
            dict: Always contains ``"job_id"``. On success it additionally
            carries ``"code"`` (``RetCode.SUCCESS``), ``"message"``,
            ``"model_info"``, ``"logs_directory"``, ``"board_url"`` and the
            entries returned by ``job_utils.save_job_conf``. If anything
            inside the ``try`` block raises, ``"code"`` is
            ``RetCode.OPERATING_ERROR`` and ``"message"`` holds the formatted
            exception; the exception is logged and not re-raised.
        """
        if not job_id:
            job_id = job_utils.generate_job_id()
        # Seeded with the job id so the failure path below still reports it.
        submit_result = {"job_id": job_id}
        schedule_logger(job_id).info(
            f"submit job, body {submit_job_conf.to_dict()}")
        try:
            dsl = submit_job_conf.dsl
            # Deep copy: later changes to runtime_conf do not touch
            # submit_job_conf.runtime_conf.
            runtime_conf = deepcopy(submit_job_conf.runtime_conf)
            job_utils.check_job_runtime_conf(runtime_conf)
            authentication_utils.check_constraint(runtime_conf, dsl)
            job_initiator = runtime_conf["initiator"]
            conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
            common_job_parameters = conf_adapter.get_common_parameters()

            if common_job_parameters.job_type != "predict":
                # generate job model info
                conf_version = schedule_utils.get_conf_version(runtime_conf)
                if conf_version != 2:
                    raise Exception(
                        "only the v2 version runtime conf is supported")
                common_job_parameters.model_id = model_utils.gen_model_id(
                    runtime_conf["role"])
                # Non-predict jobs use the job id as the model version.
                common_job_parameters.model_version = job_id
                train_runtime_conf = {}
            else:
                # check predict job parameters
                detect_utils.check_config(common_job_parameters.to_dict(),
                                          ["model_id", "model_version"])
                # get inference dsl from pipeline model as job dsl
                tracker = Tracker(
                    job_id=job_id,
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"],
                    model_id=common_job_parameters.model_id,
                    model_version=common_job_parameters.model_version)
                pipeline_model = tracker.get_pipeline_model()
                train_runtime_conf = json_loads(
                    pipeline_model.train_runtime_conf)
                # The deployment check runs only after the pipeline model has
                # been loaded; an undeployed model aborts the submission.
                if not model_utils.check_if_deployed(
                        role=job_initiator["role"],
                        party_id=job_initiator["party_id"],
                        model_id=common_job_parameters.model_id,
                        model_version=common_job_parameters.model_version):
                    raise Exception(
                        f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                    )
                # The submitted dsl is replaced by the stored inference dsl.
                dsl = json_loads(pipeline_model.inference_dsl)
            # dsl = ProviderManager.fill_fate_flow_provider(dsl)

            # Populate the Job record; role/party_id are the initiator's own.
            job = Job()
            job.f_job_id = job_id
            job.f_dsl = dsl
            job.f_train_runtime_conf = train_runtime_conf
            job.f_roles = runtime_conf["role"]
            job.f_initiator_role = job_initiator["role"]
            job.f_initiator_party_id = job_initiator["party_id"]
            job.f_role = job_initiator["role"]
            job.f_party_id = job_initiator["party_id"]

            # NOTE: the conf is saved before the initiator membership check
            # below, so a rejected submission has already gone through
            # save_job_conf.
            path_dict = job_utils.save_job_conf(
                job_id=job_id,
                role=job.f_initiator_role,
                party_id=job.f_initiator_party_id,
                dsl=dsl,
                runtime_conf=runtime_conf,
                runtime_conf_on_party={},
                train_runtime_conf=train_runtime_conf,
                pipeline_dsl=None)

            # The initiator must be listed among the parties of its own role.
            if job.f_initiator_party_id not in runtime_conf["role"][
                    job.f_initiator_role]:
                msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
                schedule_logger(job_id).info(msg)
                raise Exception(msg)

            # create common parameters on initiator
            JobController.create_common_job_parameters(
                job_id=job.f_job_id,
                initiator_role=job.f_initiator_role,
                common_job_parameters=common_job_parameters)
            job.f_runtime_conf = conf_adapter.update_common_parameters(
                common_parameters=common_job_parameters)
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job.f_dsl,
                runtime_conf=job.f_runtime_conf,
                train_runtime_conf=job.f_train_runtime_conf)

            # initiator runtime conf as template
            # (shallow copy; only the top-level "job_parameters" entry is
            # replaced on the copy)
            job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
            job.f_runtime_conf_on_party[
                "job_parameters"] = common_job_parameters.to_dict()

            # inherit job
            # Status is WAITING when inheritance info is present, PASS otherwise.
            job.f_inheritance_info = common_job_parameters.inheritance_info
            job.f_inheritance_status = JobInheritanceStatus.WAITING if common_job_parameters.inheritance_info else JobInheritanceStatus.PASS
            if job.f_inheritance_info:
                # Look up the inherited job and its latest tasks on the
                # initiator, then validate them against this job.
                inheritance_jobs = JobSaver.query_job(
                    job_id=job.f_inheritance_info.get("job_id"),
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"])
                inheritance_tasks = JobSaver.query_task(
                    job_id=job.f_inheritance_info.get("job_id"),
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"],
                    only_latest=True)
                job_utils.check_job_inheritance_parameters(
                    job, inheritance_jobs, inheritance_tasks)

            status_code, response = FederatedScheduler.create_job(job=job)
            if status_code != FederatedSchedulingStatusCode.SUCCESS:
                # Mark the job failed and sync that status before raising.
                job.f_status = JobStatus.FAILED
                job.f_tag = "submit_failed"
                FederatedScheduler.sync_job_status(job=job)
                raise Exception("create job failed", response)
            else:
                # Per role and party, collect the names of the components
                # whose "need_run" flag is True in the create_job response.
                need_run_components = {}
                for role in response:
                    need_run_components[role] = {}
                    for party, res in response[role].items():
                        need_run_components[role][party] = [
                            name for name, value in response[role][party]
                            ["data"]["components"].items()
                            if value["need_run"] is True
                        ]
                if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
                    # create the task holder in db to record information of all participants in the initiator for scheduling
                    for role, party_ids in job.f_roles.items():
                        for party_id in party_ids:
                            # The initiator's own role/party is skipped here.
                            if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                                continue
                            # Parties with no component to run get no tasks.
                            if not need_run_components[role][party_id]:
                                continue
                            JobController.initialize_tasks(
                                job_id=job_id,
                                role=role,
                                party_id=party_id,
                                run_on_this_party=False,
                                initiator_role=job.f_initiator_role,
                                initiator_party_id=job.f_initiator_party_id,
                                job_parameters=common_job_parameters,
                                dsl_parser=dsl_parser,
                                components=need_run_components[role][party_id])
                # Move the job to WAITING and sync that status.
                job.f_status = JobStatus.WAITING
                status_code, response = FederatedScheduler.sync_job_status(
                    job=job)
                if status_code != FederatedSchedulingStatusCode.SUCCESS:
                    raise Exception("set job to waiting status failed")

            schedule_logger(job_id).info(
                f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
            )
            logs_directory = job_utils.get_job_log_directory(job_id)
            result = {
                "code":
                RetCode.SUCCESS,
                "message":
                "success",
                "model_info": {
                    "model_id": common_job_parameters.model_id,
                    "model_version": common_job_parameters.model_version
                },
                "logs_directory":
                logs_directory,
                "board_url":
                job_utils.get_board_url(job_id, job_initiator["role"],
                                        job_initiator["party_id"])
            }
            # The removed-parameter check runs on the caller's original
            # runtime_conf, not on the working copy used above.
            warn_parameter = JobRuntimeConfigAdapter(
                submit_job_conf.runtime_conf).check_removed_parameter()
            if warn_parameter:
                result[
                    "message"] = f"[WARN]{warn_parameter} is removed,it does not take effect!"
            submit_result.update(result)
            submit_result.update(path_dict)
        except Exception as e:
            # Failures are reported through the returned dict instead of
            # propagating to the caller.
            submit_result["code"] = RetCode.OPERATING_ERROR
            submit_result["message"] = exception_to_trace_string(e)
            schedule_logger(job_id).exception(e)
        return submit_result
예제 #23
0
    def submit(cls, job_data, job_id=None):
        """Build a Job from a raw submission payload and create it on the parties.

        Args:
            job_data: Mapping read with ``.get`` for ``'job_dsl'`` and
                ``'job_runtime_conf'`` (each defaults to ``{}``).
            job_id: Job id to use; when falsy one is generated with
                ``job_utils.generate_job_id()``.

        Returns:
            dict: ``job_id``, ``model_info``, ``logs_directory`` and
            ``board_url``, updated with whatever ``job_utils.save_job_conf``
            returned.

        Raises:
            Exception: if a predict job references a model that has not been
                deployed, if the initiator party id is not listed under its
                role, or if ``FederatedScheduler.create_job`` does not report
                success. Errors from the helpers called here propagate as-is.
        """
        job_id = job_id or job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(
            job_id, job_data))

        dsl = job_data.get('job_dsl', {})
        runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)

        initiator = runtime_conf['initiator']
        adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = adapter.get_common_parameters()

        if common_job_parameters.job_type == 'predict':
            # A predict job must name the model it runs against.
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ['model_id', 'model_version'])
            # The job dsl is taken from the inference dsl stored with the
            # pipeline model instead of the submitted one.
            model_tracker = Tracker(
                job_id=job_id,
                role=initiator['role'],
                party_id=initiator['party_id'],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version,
            )
            pipeline = model_tracker.get_output_model('pipeline')['Pipeline']
            train_runtime_conf = json_loads(pipeline.train_runtime_conf)
            deployed = model_utils.check_if_deployed(
                role=initiator['role'],
                party_id=initiator['party_id'],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version,
            )
            if not deployed:
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            dsl = json_loads(pipeline.inference_dsl)
        else:
            # Any other job type gets fresh model info: the id is derived
            # from the roles and the version is the job id.
            common_job_parameters.model_id = model_utils.gen_model_id(
                runtime_conf['role'])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf['role']
        job.f_work_mode = common_job_parameters.work_mode
        # The job record is owned by the initiator, so both the initiator
        # fields and the role/party fields carry the initiator's identity.
        job.f_initiator_role = initiator['role']
        job.f_initiator_party_id = initiator['party_id']
        job.f_role = initiator['role']
        job.f_party_id = initiator['party_id']

        saved_paths = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            job_dsl=dsl,
            job_runtime_conf=runtime_conf,
            job_runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None,
        )

        # The initiator has to appear among the parties of its own role.
        parties_of_initiator_role = runtime_conf['role'][job.f_initiator_role]
        if job.f_initiator_party_id not in parties_of_initiator_role:
            schedule_logger(job_id).info("initiator party id error:{}".format(
                job.f_initiator_party_id))
            raise Exception("initiator party id error {}".format(
                job.f_initiator_party_id))

        # Common parameters are prepared on the initiator first.
        JobController.backend_compatibility(
            job_parameters=common_job_parameters)
        JobController.adapt_job_parameters(
            role=job.f_initiator_role,
            job_parameters=common_job_parameters,
            create_initiator_baseline=True,
        )

        job.f_runtime_conf = adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf,
        )

        # The initiator's runtime conf serves as the per-party template.
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()

        if common_job_parameters.work_mode == WorkMode.CLUSTER:
            # Keep status records for every other participant on the
            # initiator so it can schedule them.
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    is_initiator = (role == job.f_initiator_role
                                    and party_id == job.f_initiator_party_id)
                    if not is_initiator:
                        JobController.initialize_tasks(
                            job_id, role, party_id, False,
                            job.f_initiator_role, job.f_initiator_party_id,
                            common_job_parameters, parser)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            # Record the failure and sync it before giving up.
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(
                job.f_job_id, common_job_parameters.model_id))
        logs_directory = job_utils.get_job_log_directory(job_id)
        model_info = {
            "model_id": common_job_parameters.model_id,
            "model_version": common_job_parameters.model_version,
        }
        board_url = job_utils.get_board_url(job_id, initiator['role'],
                                            initiator['party_id'])
        outcome = {
            "job_id": job_id,
            "model_info": model_info,
            "logs_directory": logs_directory,
            "board_url": board_url,
        }
        outcome.update(saved_paths)
        return outcome
예제 #24
0
 def schedule_waiting_jobs(cls, job):
     """Try to move one waiting job into execution.

     The job is first claimed through ``cls.ready_signal(set_or_reset=True)``;
     if that returns a falsy value the job is left alone (another scheduler
     may be handling it). Once claimed:

     * a job carrying a cancel signal is set to ``JobStatus.CANCELED`` and
       that status is synced;
     * otherwise resources are applied for on all parties. On success the
       job is started. On failure, parties whose response ``retcode`` is 0
       have their resources returned, and if the apply status is
       ``FederatedSchedulingStatusCode.ERROR`` the job is stopped with
       ``JobStatus.FAILED``.

     The ready signal is reset in a ``finally`` clause, so it is released on
     every path after the claim, including when an exception propagates.

     Args:
         job: Job record; ``f_job_id``, ``f_initiator_role``,
             ``f_initiator_party_id`` and ``f_cancel_signal`` are read and
             ``f_status`` may be written.

     Returns:
         None.
     """
     job_id = job.f_job_id
     initiator_role = job.f_initiator_role
     initiator_party_id = job.f_initiator_party_id

     if not cls.ready_signal(job_id=job_id, set_or_reset=True):
         schedule_logger(job_id).info(
             f"job {job_id} may be handled by another scheduler")
         return

     def _describe(parties):
         # Render {role: [party_id, ...]} as "role:party_id,role:party_id".
         return ",".join(
             ",".join(f"{_r}:{_p}" for _p in _ps)
             for _r, _ps in parties.items())

     # Plain try/finally: exceptions propagate untouched, and the ready
     # signal is still reset.
     try:
         if job.f_cancel_signal:
             job.f_status = JobStatus.CANCELED
             FederatedScheduler.sync_job_status(job=job)
             schedule_logger(job_id).info(
                 f"job {job_id} have cancel signal")
             return

         apply_status_code, federated_response = FederatedScheduler.resource_for_job(
             job=job, operation_type=ResourceOperation.APPLY)
         if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
             cls.start_job(job_id=job_id,
                           initiator_role=initiator_role,
                           initiator_party_id=initiator_party_id)
             return

         # Applying failed somewhere: split the parties into those that did
         # get resources (retcode 0, to be rolled back) and those that did not.
         rollback_party = {}
         failed_party = {}
         for dest_role, party_responses in federated_response.items():
             for dest_party_id, party_response in party_responses.items():
                 if party_response["retcode"] == 0:
                     bucket = rollback_party
                 else:
                     bucket = failed_party
                 bucket.setdefault(dest_role, []).append(dest_party_id)
         schedule_logger(job_id).info(
             "job {} apply resource failed on {}, rollback {}".format(
                 job_id,
                 _describe(failed_party),
                 _describe(rollback_party),
             ))

         if rollback_party:
             return_status_code, federated_response = FederatedScheduler.resource_for_job(
                 job=job,
                 operation_type=ResourceOperation.RETURN,
                 specific_dest=rollback_party)
             if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
                 schedule_logger(job_id).info(
                     f"job {job_id} return resource failed:\n{federated_response}"
                 )
         else:
             schedule_logger(job_id).info(
                 f"job {job_id} no party should be rollback resource")

         # Only an ERROR apply status stops the job; any other non-success
         # status leaves the job as it is.
         if apply_status_code == FederatedSchedulingStatusCode.ERROR:
             cls.stop_job(job_id=job_id,
                          role=initiator_role,
                          party_id=initiator_party_id,
                          stop_status=JobStatus.FAILED)
             schedule_logger(job_id).info(
                 f"apply resource error, stop job {job_id}")
     finally:
         update_status = cls.ready_signal(job_id=job_id, set_or_reset=False)
         schedule_logger(job_id).info(
             f"reset job {job_id} ready signal {update_status}")
예제 #25
0
    def set_job_rerun(cls,
                      job_id,
                      initiator_role,
                      initiator_party_id,
                      auto,
                      force=False,
                      tasks: typing.List[Task] = None,
                      component_name: typing.Union[str, list] = None):
        """Prepare the tasks of a job for rerun and raise the job's rerun signal.

        Args:
            job_id: Id of the job to rerun.
            initiator_role: Role of the initiator whose job record is used.
            initiator_party_id: Party id of that initiator.
            auto: Forwarded to ``TaskScheduler.prepare_rerun_task``.
            force: Forwarded likewise, after ``cls.get_rerun_component`` has
                had the chance to replace it.
            tasks: Explicit tasks to rerun. When empty or ``None`` the tasks
                are queried from ``JobSaver`` on the initiator.
            component_name: One component name or a list of names. Components
                downstream of them are added to the selection. A falsy value
                or the pipeline component name selects every task.

        Returns:
            bool: ``True`` if at least one task was prepared for rerun (the
            rerun signal is then set, whether or not setting it succeeded),
            ``False`` if no task could be rerun.

        Raises:
            RuntimeError: if the job cannot be found on the initiator.
        """
        schedule_logger(job_id).info(
            f"try to rerun job on initiator {initiator_role} {initiator_party_id}"
        )

        matching_jobs = JobSaver.query_job(job_id=job_id,
                                           role=initiator_role,
                                           party_id=initiator_party_id)
        if not matching_jobs:
            raise RuntimeError(
                f"can not found job on initiator {initiator_role} {initiator_party_id}"
            )
        job = matching_jobs[0]

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        component_name, force = cls.get_rerun_component(
            component_name, job, dsl_parser, force)
        schedule_logger(job_id).info(f"rerun component: {component_name}")

        if not tasks:
            # No explicit task list: select the tasks from the initiator's
            # records.
            task_query = dict(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)

            rerun_whole_pipeline = (
                not component_name
                or component_name == job_utils.job_pipeline_component_name())
            if rerun_whole_pipeline:
                schedule_logger(job_id).info(
                    "require all component of pipeline to rerun")
            else:
                if isinstance(component_name, str):
                    _require_reruns = {component_name}
                else:
                    _require_reruns = set(component_name)

                # Everything downstream of a requested component is added
                # to the selection as well.
                _should_reruns = _require_reruns.copy()
                for _cpn in _require_reruns:
                    _should_reruns.update(
                        _c.get_name() for _c in
                        dsl_parser.get_downstream_dependent_components(_cpn))

                schedule_logger(job_id).info(
                    f"require {_require_reruns} to rerun, "
                    f"and then found {_should_reruns} need be to rerun")
                task_query['component_name'] = _should_reruns

            tasks = JobSaver.query_task(**task_query)
        else:
            schedule_logger(job_id).info(
                f"require {[task.f_component_name for task in tasks]} to rerun"
            )

        # Every task must go through prepare_rerun_task, so there is no
        # early exit once one of them reports it can be rerun.
        job_can_rerun = False
        for task in tasks:
            prepared = TaskScheduler.prepare_rerun_task(
                job=job,
                task=task,
                dsl_parser=dsl_parser,
                auto=auto,
                force=force,
            )
            if prepared:
                job_can_rerun = True

        if job_can_rerun:
            schedule_logger(job_id).info("job set rerun signal")
            status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
            schedule_logger(job_id).info(
                f"job set rerun signal {'successfully' if status else 'failed'}")
            return True

        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id).info("job no task to rerun")
        return False