Example #1
0
    def schedule_running_job(cls, job: Job, force_sync_status=False):
        """Run one scheduling round for a RUNNING job.

        Schedules the job's tasks, recomputes the job status and progress
        from the task statuses, propagates any change to all parties and to
        the initiator, and triggers finish / auto-rerun handling.

        :param job: the initiator-side Job record being scheduled
        :param force_sync_status: when True, always broadcast the job status
            to all parties at the end, even if nothing changed
        """
        schedule_logger(job.f_job_id).info(f"scheduling running job")

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(
            job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
        tasks_status = {task.f_component_name: task.f_status for task in tasks}
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=task_scheduling_status_code,
            tasks_status=tasks_status.values())
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED
        total, finished_count = cls.calculate_job_progress(
            tasks_status=tasks_status)
        # guard against ZeroDivisionError when the job has no tasks at all
        new_progress = float(finished_count) / total * 100 if total else 0.0
        schedule_logger(job.f_job_id).info(
            f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
        )
        if new_job_status != job.f_status or new_progress != job.f_progress:
            # Make sure to update separately, because these two fields update with anti-weight logic
            # progress only ever moves forward; the int() comparison ignores
            # sub-percent changes so tiny increments do not trigger a sync
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    # persist the pipeline model before announcing the end status
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        if auto_rerun_tasks:
            schedule_logger(job.f_job_id).info("job have auto rerun tasks")
            cls.set_job_rerun(job_id=job.f_job_id,
                              initiator_role=job.f_initiator_role,
                              initiator_party_id=job.f_initiator_party_id,
                              tasks=auto_rerun_tasks,
                              auto=True)
        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job.f_job_id).info("finish scheduling running job")
Example #2
0
 def update_job_status(cls, job_info):
     """Persist a job status update; release the job's resources on end status.

     :param job_info: dict describing the job status change
         (must contain job_id/role/party_id when the status is terminal)
     :return: truthy when the status was actually updated
     """
     updated = JobSaver.update_job_status(job_info=job_info)
     if updated and EndStatus.contains(job_info.get("status")):
         # the job reached a terminal status: give its resources back
         ResourceManager.return_job_resource(
             job_id=job_info["job_id"],
             role=job_info["role"],
             party_id=job_info["party_id"])
     return updated
Example #3
0
 def schedule_rerun_job(cls, job):
     """Handle a job flagged for rerun.

     A finished job is rolled back to WAITING (and the reset is synced to
     all parties); a job still in progress just has its rerun signal
     cleared and keeps being scheduled.

     :param job: the initiator-side Job record
     """
     if not EndStatus.contains(job.f_status):
         # job is still in progress: clear the signal and schedule on
         cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
         cls.schedule_running_job(job)
         return
     # job already ended: roll its state back so it can run again
     job.f_status = JobStatus.WAITING
     job.f_ready_signal = False
     job.f_ready_time = None
     job.f_rerun_signal = False
     job.f_progress = 0
     job.f_end_time = None
     job.f_elapsed = None
     schedule_logger(job.f_job_id).info(
         f"job has been finished, set waiting to rerun")
     status, response = FederatedScheduler.sync_job_status(job=job)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(
             job.f_job_id).info(f"job set waiting to rerun failed")
         return
     cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
     reset_fields = [
         "ready_signal", "ready_time", "rerun_signal", "progress",
         "end_time", "elapsed"
     ]
     FederatedScheduler.sync_job(job=job, update_fields=reset_fields)
     schedule_logger(job.f_job_id).info(
         f"job set waiting to rerun successfully")
Example #4
0
 def update_status(cls, entity_model: DataBaseModel, entity_info: dict):
     """Conditionally update the status field(s) of a Job/Task record.

     Builds an optimistic-locking update: a status field is only written
     when the old -> new transition passes the model's state transition
     rules, and the update filters pin the row to its current status so a
     concurrent change turns this update into a no-op.

     :param entity_model: the model class to update (e.g. Job or Task)
     :param entity_info: new field values, keyed without the "f_" prefix
     :return: result of cls.execute_update
     :raises Exception: when no record matches the model's primary keys
     """
     query_filters = []
     primary_keys = entity_model.get_primary_keys_name()
     for p_k in primary_keys:
         # model attrs are named "f_xxx" while entity_info uses plain "xxx";
         # lstrip("f").lstrip("_") strips the "f_" prefix character-wise
         query_filters.append(operator.attrgetter(p_k)(entity_model) == entity_info[p_k.lstrip("f").lstrip("_")])
     objs = entity_model.select().where(*query_filters)
     if objs:
         obj = objs[0]
     else:
         raise Exception(f"can not found the {entity_model.__name__} record to update")
     update_filters = query_filters[:]
     update_info = {"job_id": entity_info["job_id"]}
     for status_field in cls.STATUS_FIELDS:
         if entity_info.get(status_field) and hasattr(entity_model, f"f_{status_field}"):
             if status_field in ["status", "party_status"]:
                 update_info[status_field] = entity_info[status_field]
                 old_status = getattr(obj, f"f_{status_field}")
                 new_status = update_info[status_field]
                 if_pass = False
                 if isinstance(obj, Task):
                     if TaskStatus.StateTransitionRule.if_pass(src_status=old_status, dest_status=new_status):
                         if_pass = True
                 elif isinstance(obj, Job):
                     if JobStatus.StateTransitionRule.if_pass(src_status=old_status, dest_status=new_status):
                         if_pass = True
                     # moving a job to a failure-like end status is only allowed
                     # while no rerun signal is pending on the row
                     if EndStatus.contains(new_status) and new_status not in {JobStatus.SUCCESS, JobStatus.CANCELED}:
                         update_filters.append(Job.f_rerun_signal == False)
                 if if_pass:
                     # optimistic lock: the row must still hold old_status
                     update_filters.append(operator.attrgetter(f"f_{status_field}")(type(obj)) == old_status)
                 else:
                     # not allow update status
                     update_info.pop(status_field)
     return cls.execute_update(old_obj=obj, model=entity_model, update_info=update_info, update_filters=update_filters)
Example #5
0
 def calculate_job_progress(cls, tasks_status):
     """Count the job's tasks and how many have reached an end status.

     :param tasks_status: mapping of component name -> task status
     :return: (total task count, finished task count)
     """
     total = len(tasks_status)
     finished_count = sum(
         1 for status in tasks_status.values() if EndStatus.contains(status))
     return total, finished_count
Example #6
0
 def update_task_status(cls, task_info):
     """Persist a task status update and, when the task ends, return its
     resources and clean up its intermediate tables. The task is always
     reported back to the initiator afterwards.

     :param task_info: dict describing the task status change
     :return: truthy when the status was actually updated
     """
     updated = JobSaver.update_task_status(task_info=task_info)
     if updated and EndStatus.contains(task_info.get("status")):
         ResourceManager.return_task_resource(task_info=task_info)
         # identifying keys forwarded verbatim to the cleaner
         clean_keys = ("job_id", "task_id", "task_version", "role", "party_id")
         cls.clean_task(content_type=TaskCleanResourceType.TABLE,
                        **{k: task_info[k] for k in clean_keys})
     cls.report_task_to_initiator(task_info=task_info)
     return updated
Example #7
0
 def update_job_status(cls, job_info):
     """Update a Job record's status; when the job ends, tag the record.

     :param job_info: dict describing the job status change
     :return: truthy when the status was actually updated
     """
     job_id = job_info["job_id"]
     schedule_logger(job_id).info(
         "try to update job status to {}".format(job_info.get("status")))
     updated = cls.update_status(Job, job_info)
     if not updated:
         schedule_logger(job_id).warning(
             "update job status does not take effect")
         return updated
     schedule_logger(job_id).info("update job status successfully")
     if EndStatus.contains(job_info.get("status")):
         # only update tag: carry over just the identifying fields + tag
         tag_info = {k: job_info[k]
                     for k in ("job_id", "role", "party_id", "tag")
                     if k in job_info}
         if not tag_info.get("tag"):
             tag_info["tag"] = "job_end"
         cls.update_entity_table(Job, tag_info)
     return updated
Example #8
0
 def detect_resource_record(cls):
     """Detect jobs that have held resources too long and recycle them.

     Selects jobs still flagged as holding resources more than 10 minutes
     after applying, whose status is WAITING or terminal: terminal jobs
     get their resources returned directly, while WAITING jobs are asked
     to stop with TIMEOUT. Errors are logged and never propagated.
     """
     detect_logger().info('start detect resource recycle')
     try:
         filter_status = EndStatus.status_list()
         filter_status.append(JobStatus.WAITING)
         # resources held longer than 10 minutes (timestamps in milliseconds)
         jobs = Job.select().where(
             Job.f_resource_in_use == True,
             current_timestamp() - Job.f_apply_resource_time >
             10 * 60 * 1000, Job.f_status << filter_status)
         stop_jobs = set()
         for job in jobs:
             if job.f_status == JobStatus.WAITING:
                 # still waiting after 10 minutes: request a timeout stop
                 stop_jobs.add(job)
             else:
                 # job already ended: return its resources best-effort
                 try:
                     detect_logger(job_id=job.f_job_id).info(
                         f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                     )
                     flag = ResourceManager.return_job_resource(
                         job_id=job.f_job_id,
                         role=job.f_role,
                         party_id=job.f_party_id)
                     if flag:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                         )
                     else:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                         )
                 except Exception as e:
                     # keep scanning remaining jobs even if one fails
                     detect_logger(job_id=job.f_job_id).exception(e)
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="start timeout",
                               stop_status=JobStatus.TIMEOUT)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info('finish detect resource recycle')
Example #9
0
    def test_tracking(self):
        """End-to-end tracking test: submit a job via the HTTP API, poll it
        until it reaches an end status, assert success, then exercise the
        tracking endpoints for each component output kind.
        """
        # load the job DSL and runtime conf fixtures, patching in the test party ids
        with open(
                os.path.join(get_fate_flow_python_directory(), self.dsl_path),
                'r') as f:
            dsl_data = json.load(f)
        with open(
                os.path.join(get_fate_flow_python_directory(),
                             self.config_path), 'r') as f:
            config_data = json.load(f)
            config_data["initiator"]["party_id"] = self.guest_party_id
            config_data["role"] = {
                "guest": [self.guest_party_id],
                "host": [self.host_party_id],
                "arbiter": [self.host_party_id]
            }
        response = requests.post("/".join([self.server_url, 'job', 'submit']),
                                 json={
                                     'job_dsl': dsl_data,
                                     'job_runtime_conf': config_data
                                 })
        self.assertTrue(response.status_code in [200, 201])
        self.assertTrue(int(response.json()['retcode']) == 0)
        job_id = response.json()['jobId']
        job_info = {'f_status': 'running'}
        # poll the job status until it ends (bounded to 60 rounds)
        for i in range(60):
            response = requests.post("/".join(
                [self.server_url, 'job', 'query']),
                                     json={
                                         'job_id': job_id,
                                         'role': 'guest'
                                     })
            self.assertTrue(response.status_code in [200, 201])
            job_info = response.json()['data'][0]
            if EndStatus.contains(job_info['f_status']):
                break
            time.sleep(self.sleep_time)
            print('waiting job run success, the job has been running for {}s'.
                  format((i + 1) * self.sleep_time))
        self.assertTrue(job_info['f_status'] == JobStatus.SUCCESS)
        # record the successful job info for later test runs
        os.makedirs(self.success_job_dir, exist_ok=True)
        with open(os.path.join(self.success_job_dir, job_id), 'w') as fw:
            json.dump(job_info, fw)
        self.assertTrue(
            os.path.exists(os.path.join(self.success_job_dir, job_id)))

        # test_component_parameters
        test_component(self, 'component/parameters')

        # test_component_metric_all
        test_component(self, 'component/metric/all')

        # test_component_metrics
        test_component(self, 'component/metrics')

        # test_component_output_model
        test_component(self, 'component/output/model')

        # test_component_output_data
        test_component(self, 'component/output/data')

        # test_component_output_data_download
        test_component(self, 'component/output/data/download')

        # test_job_data_view
        test_component(self, 'job/data_view')
Example #10
0
    def run_do(self):
        """One full scheduling pass over all job categories.

        Processes, in order: waiting jobs (FIFO, at most one per pass),
        running jobs, ready jobs, rerun-flagged jobs, and recently ended
        jobs that may still need a status re-sync. Each job is isolated in
        its own try/except so one failure never blocks the rest.
        """
        # --- waiting jobs: FIFO, only the oldest one per pass ---
        schedule_logger().info("start schedule waiting jobs")
        jobs = JobSaver.query_job(is_initiator=True,
                                  status=JobStatus.WAITING,
                                  order_by="create_time",
                                  reverse=False)
        schedule_logger().info(f"have {len(jobs)} waiting jobs")
        if len(jobs):
            # FIFO
            job = jobs[0]
            schedule_logger().info(f"schedule waiting job {job.f_job_id}")
            try:
                self.schedule_waiting_jobs(job=job)
            except Exception as e:
                schedule_logger(job.f_job_id).exception(e)
                schedule_logger(
                    job.f_job_id).error(f"schedule waiting job failed")
        schedule_logger().info("schedule waiting jobs finished")

        # --- running jobs: advance every one of them ---
        schedule_logger().info("start schedule running jobs")
        jobs = JobSaver.query_job(is_initiator=True,
                                  status=JobStatus.RUNNING,
                                  order_by="create_time",
                                  reverse=False)
        schedule_logger().info(f"have {len(jobs)} running jobs")
        for job in jobs:
            schedule_logger().info(f"schedule running job {job.f_job_id}")
            try:
                self.schedule_running_job(job=job)
            except Exception as e:
                schedule_logger(job.f_job_id).exception(e)
                schedule_logger(job.f_job_id).error(f"schedule job failed")
        schedule_logger().info("schedule running jobs finished")

        # some ready job exit before start
        schedule_logger().info("start schedule ready jobs")
        jobs = JobSaver.query_job(is_initiator=True,
                                  ready_signal=True,
                                  order_by="create_time",
                                  reverse=False)
        schedule_logger().info(f"have {len(jobs)} ready jobs")
        for job in jobs:
            schedule_logger().info(f"schedule ready job {job.f_job_id}")
            try:
                self.schedule_ready_job(job=job)
            except Exception as e:
                schedule_logger(job.f_job_id).exception(e)
                schedule_logger(
                    job.f_job_id).error(f"schedule ready job failed:\n{e}")
        schedule_logger().info("schedule ready jobs finished")

        # --- jobs flagged for rerun ---
        schedule_logger().info("start schedule rerun jobs")
        jobs = JobSaver.query_job(is_initiator=True,
                                  rerun_signal=True,
                                  order_by="create_time",
                                  reverse=False)
        schedule_logger().info(f"have {len(jobs)} rerun jobs")
        for job in jobs:
            schedule_logger().info(f"schedule rerun job {job.f_job_id}")
            try:
                self.schedule_rerun_job(job=job)
            except Exception as e:
                schedule_logger(job.f_job_id).exception(e)
                schedule_logger(job.f_job_id).error(f"schedule job failed")
        schedule_logger().info("schedule rerun jobs finished")

        # --- recently ended jobs: re-sync their status a bounded number of times ---
        schedule_logger().info(
            "start schedule end status jobs to update status")
        jobs = JobSaver.query_job(
            is_initiator=True,
            status=set(EndStatus.status_list()),
            end_time=[
                current_timestamp() -
                JobDefaultConfig.end_status_job_scheduling_time_limit,
                current_timestamp()
            ])
        schedule_logger().info(f"have {len(jobs)} end status jobs")
        for job in jobs:
            schedule_logger().info(f"schedule end status job {job.f_job_id}")
            try:
                # end_scheduling_updates enforces a cap on re-scheduling attempts
                update_status = self.end_scheduling_updates(
                    job_id=job.f_job_id)
                if update_status:
                    schedule_logger(job.f_job_id).info(
                        f"try update status by scheduling like running job")
                else:
                    schedule_logger(job.f_job_id).info(
                        f"the number of updates has been exceeded")
                    continue
                self.schedule_running_job(job=job, force_sync_status=True)
            except Exception as e:
                schedule_logger(job.f_job_id).exception(e)
                schedule_logger(job.f_job_id).error(f"schedule job failed")
        schedule_logger().info("schedule end status jobs finished")
Example #11
0
    def schedule(cls, job, dsl_parser, canceled=False):
        """Schedule the tasks of one job for one round.

        Refreshes every task's federated status, stops tasks that just
        ended, collects tasks eligible for auto-rerun, and (unless
        canceled) starts waiting tasks whose upstream dependencies have
        all succeeded.

        :param job: the initiator-side Job record
        :param dsl_parser: parsed job DSL, used for component dependencies
        :param canceled: when True, no new tasks are started
        :return: (scheduling status code, auto-rerun task list, all tasks)
        """
        schedule_logger(job.f_job_id).info("scheduling job tasks")
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id,
                                                       role=job.f_role,
                                                       party_id=job.f_party_id)
        waiting_tasks = []
        auto_rerun_tasks = []
        for initiator_task in initiator_tasks_group.values():
            if job.f_runtime_conf_on_party["job_parameters"][
                    "federated_status_collect_type"] == FederatedCommunicationType.PULL:
                # collect all parties task party status and store it in the database now
                cls.collect_task_of_all_party(job=job,
                                              initiator_task=initiator_task)
            else:
                # all parties report task party status and store it in the initiator database when federated_status_collect_type is push
                pass
            # get all parties party task status and calculate
            new_task_status = cls.get_federated_task_status(
                job_id=initiator_task.f_job_id,
                task_id=initiator_task.f_task_id,
                task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                # status changed since last round: persist and broadcast it
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                FederatedScheduler.sync_task_status(job=job,
                                                    task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(
                    initiator_task.f_status):
                # task just reached an end status: stop it everywhere
                FederatedScheduler.stop_task(
                    job=job,
                    task=initiator_task,
                    stop_status=initiator_task.f_status)
                if not canceled and AutoRerunStatus.contains(
                        initiator_task.f_status):
                    # retriable failure: rerun only while retries remain
                    if initiator_task.f_auto_retries > 0:
                        auto_rerun_tasks.append(initiator_task)
                        schedule_logger(job.f_job_id).info(
                            f"task {initiator_task.f_task_id} {initiator_task.f_status} will be retried"
                        )
                    else:
                        schedule_logger(job.f_job_id).info(
                            f"task {initiator_task.f_task_id} {initiator_task.f_status} has no retry count"
                        )

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                for component in dsl_parser.get_upstream_dependent_components(
                        component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[JobSaver.task_key(
                        task_id=job_utils.generate_task_id(
                            job_id=job.f_job_id,
                            component_name=component.get_name()),
                        role=job.f_role,
                        party_id=job.f_party_id)]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job.f_job_id).info(
                            f"task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling"
                        )
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        waiting_task.f_status = StatusSet.FAILED
                        FederatedScheduler.sync_task_status(job, waiting_task)
                        break
        else:
            schedule_logger(
                job.f_job_id).info("have cancel signal, pass start job tasks")
        schedule_logger(job.f_job_id).info("finish scheduling job tasks")
        return scheduling_status_code, auto_rerun_tasks, initiator_tasks_group.values(
        )