예제 #1
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed, and in a very small probability they will be executed after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
예제 #2
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf_on_party,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
예제 #3
0
    def submit(cls, job_data, job_id=None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(
            job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_job_runtime_conf(job_runtime_conf)

        job_initiator = job_runtime_conf['initiator']
        conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type != 'predict':
            # generate job model info
            common_job_parameters.model_id = model_utils.gen_model_id(
                job_runtime_conf['role'])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(
                job_id=job_id,
                role=job_initiator['role'],
                party_id=job_initiator['party_id'],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_output_model('pipeline')
            if not job_dsl:
                job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(
                pipeline_model['Pipeline'].train_runtime_conf)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = job_dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = job_runtime_conf['role']
        job.f_work_mode = common_job_parameters.work_mode
        job.f_initiator_role = job_initiator['role']
        job.f_initiator_party_id = job_initiator['party_id']

        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            job_dsl=job_dsl,
            job_runtime_conf=job_runtime_conf,
            job_runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        if job.f_initiator_party_id not in job_runtime_conf['role'][
                job.f_initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(
                job.f_initiator_party_id))
            raise Exception("initiator party id error {}".format(
                job.f_initiator_party_id))

        # create common parameters on initiator
        JobController.backend_compatibility(
            job_parameters=common_job_parameters)
        JobController.adapt_job_parameters(
            role=job.f_initiator_role,
            job_parameters=common_job_parameters,
            create_initiator_baseline=True)

        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()

        if common_job_parameters.work_mode == WorkMode.CLUSTER:
            # Save the status information of all participants in the initiator for scheduling
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, role, party_id,
                                                   False, job.f_initiator_role,
                                                   job.f_initiator_party_id,
                                                   common_job_parameters,
                                                   dsl_parser)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(
                job.f_job_id, common_job_parameters.model_id))
        board_url = "http://{}:{}{}".format(
            ServiceUtils.get_item("fateboard", "host"),
            ServiceUtils.get_item("fateboard", "port"),
            FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id,
                                                  job_initiator['role'],
                                                  job_initiator['party_id'])
        logs_directory = job_utils.get_job_log_directory(job_id)
        submit_result = {
            "job_id": job_id,
            "model_info": {
                "model_id": common_job_parameters.model_id,
                "model_version": common_job_parameters.model_version
            },
            "logs_directory": logs_directory,
            "board_url": board_url
        }
        submit_result.update(path_dict)
        return submit_result
예제 #4
0
    def submit(cls, job_data, job_id=None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_initiator = job_runtime_conf['initiator']
        job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
        cls.backend_compatibility(job_parameters=job_parameters)

        job_utils.check_job_runtime_conf(job_runtime_conf)
        if job_parameters.job_type != 'predict':
            # generate job model info
            job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
            job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                              model_id=job_parameters.model_id, model_version=job_parameters.model_version)
            pipeline_model = tracker.get_output_model('pipeline')
            if not job_dsl:
                job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)

        path_dict = job_utils.save_job_conf(job_id=job_id,
                                            job_dsl=job_dsl,
                                            job_runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf,
                                            pipeline_dsl=None)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = job_dsl
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = job_runtime_conf['role']
        job.f_work_mode = job_parameters.work_mode
        job.f_initiator_role = job_initiator['role']
        job.f_initiator_party_id = job_initiator['party_id']

        initiator_role = job_initiator['role']
        initiator_party_id = job_initiator['party_id']
        if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
            raise Exception("initiator party id error {}".format(initiator_party_id))

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=train_runtime_conf)

        cls.adapt_job_parameters(job_parameters=job_parameters)

        # update runtime conf
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("create job failed: {}".format(response))

        if job_parameters.work_mode == WorkMode.CLUSTER:
            # Save the status information of all participants in the initiator for scheduling
            for role, party_ids in job_runtime_conf["role"].items():
                for party_id in party_ids:
                    if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                        continue
                    JobController.initialize_tasks(job_id, role, party_id, False, job_initiator, job_parameters, dsl_parser)

        # push into queue
        try:
            JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
        except Exception as e:
            raise Exception(f'push job into queue failed:\n{e}')

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id))
        board_url = "http://{}:{}{}".format(
            ServiceUtils.get_item("fateboard", "host"),
            ServiceUtils.get_item("fateboard", "port"),
            FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
        logs_directory = job_utils.get_job_log_directory(job_id)
        return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
               {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url