def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
    """Stop every job record matching ``job_id`` on this party.

    When both ``role`` and ``party_id`` are given only that record is
    targeted; otherwise all records of the job are stopped.  Returns a
    tuple ``(overall_ok, details)`` where ``overall_ok`` is True only if
    every individual stop succeeded.
    """
    if role and party_id:
        matched_jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
    else:
        matched_jobs = JobSaver.query_job(job_id=job_id)
    overall_ok = True
    details = {}
    for matched_job in matched_jobs:
        ok, detail = cls.stop_job(job=matched_job, stop_status=stop_status)
        overall_ok &= ok
        # NOTE(review): keyed by the shared job_id, so when several records
        # exist (one per role/party) later details overwrite earlier ones —
        # confirm this is intended.
        details[job_id] = detail
    return overall_ok, details
def stop_job():
    """HTTP handler: stop a job on this party, then request all parties stop it."""
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR, retmsg="can not found job")
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, kill_details = JobController.stop_jobs(job_id=job_id, stop_status=stop_status)
    schedule_logger(job_id).info(f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(f"request stop job {jobs[0]} to {stop_status}")
    # fan the stop request out to every participating party
    status_code, response = FederatedScheduler.request_stop_job(
        job=jobs[0], stop_status=stop_status, command_body=jobs[0].to_json())
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(
            retcode=RetCode.SUCCESS,
            retmsg=f"stop job on this party {kill_status};\n"
                   f"stop job on all party success")
    return get_json_result(
        retcode=RetCode.OPERATING_ERROR,
        retmsg="stop job on this party {};\n"
               "stop job failed:\n{}".format(kill_status, json_dumps(response, indent=4)))
def pipeline_dag_dependency(job_info):
    """Build the DAG dependency view of a pipeline job.

    With a ``job_id`` the stored DSL/conf of that job is parsed and the
    per-component ``need_run`` flags of the latest tasks are attached;
    otherwise the DSL/conf supplied in ``job_info`` is parsed directly.
    Raises on validation or query failure (after logging).
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            found_job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=found_job.f_dsl,
                runtime_conf=found_job.f_runtime_conf_on_party,
                train_runtime_conf=found_job.f_train_runtime_conf)
            latest_tasks = JobSaver.query_task(job_id=job_info["job_id"],
                                               party_id=job_info["party_id"],
                                               role=job_info["role"],
                                               only_latest=True)
            for task in latest_tasks:
                component_need_run[task.f_component_name] = task.f_component_parameters.get(
                    "ComponentParam", {}).get("need_run", True)
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
def clean_task(cls, job_id, task_id, task_version, role, party_id, content_type: TaskCleanResourceType):
    """Clean one category of task resources (metrics or output tables).

    Returns True only when exactly one cleanup ran and it reported success.
    """
    results = set()
    if content_type == TaskCleanResourceType.METRICS:
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          task_id=task_id, task_version=task_version)
        results.add(tracker.clean_metrics())
    elif content_type == TaskCleanResourceType.TABLE:
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            run_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                              task_id=task_id, task_version=task_version,
                              job_parameters=run_parameters)
            results.add(tracker.clean_task(job.f_runtime_conf_on_party))
    # equivalent to: len(results) == 1 and True in results
    return results == {True}
def start_job(cls, job_id, initiator_role, initiator_party_id):
    """Ask all parties (via FederatedScheduler) to start the given job,
    looked up by its initiator role/party."""
    schedule_logger(job_id=job_id).info(
        "try to start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))
    # Status payload for the WAITING -> RUNNING transition.
    # NOTE(review): job_info is built but never used below — it looks like it
    # was meant to be sent with FederatedScheduler.start_job (e.g. as a
    # command body) or persisted; confirm against the scheduler's callers.
    job_info = {}
    job_info["job_id"] = job_id
    job_info["role"] = initiator_role
    job_info["party_id"] = initiator_party_id
    job_info["status"] = JobStatus.RUNNING
    job_info["party_status"] = JobStatus.RUNNING
    job_info["start_time"] = current_timestamp()
    job_info["tag"] = 'end_waiting'
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role,
                              party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
        FederatedScheduler.start_job(job=job)
        schedule_logger(job_id=job_id).info(
            "start job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
    else:
        schedule_logger(job_id=job_id).error(
            "can not found job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
def get_job_table_list():
    """HTTP handler: list all tables used by the first job matching the query."""
    matched = JobSaver.query_job(**request.json)
    if not matched:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(matched[0])
    return get_json_result(data=tables)
def clean_queue():
    """HTTP handler: cancel every waiting job initiated here; report per-job status codes."""
    waiting_jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for waiting_job in waiting_jobs:
        code, _response = FederatedScheduler.request_stop_job(
            job=waiting_job, stop_status=JobStatus.CANCELED)
        clean_status[waiting_job.f_job_id] = code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
def get_job_table_list():
    """HTTP handler: validate the query args, then list all tables of the matching job."""
    detect_utils.check_config(config=request.json,
                              required_arguments=['job_id', 'role', 'party_id'])
    matched = JobSaver.query_job(**request.json)
    if not matched:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(matched[0])
    return get_json_result(data=tables)
def query_job():
    """HTTP handler: query job records by the filters given in the request body."""
    matched = JobSaver.query_job(**request.json)
    if not matched:
        return get_json_result(retcode=0, retmsg='no job could be found', data=[])
    payload = [record.to_json() for record in matched]
    return get_json_result(retcode=0, retmsg='success', data=payload)
def component_output_data_table():
    """HTTP handler: resolve a component's output data table through the job tracker."""
    request_data = request.json
    detect_utils.check_config(config=request_data,
                              required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    return jsonify(FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table'))
def detect_running_task(cls):
    """Detector: verify each RUNNING task still has a live executor via its
    computing engine; jobs whose task process has vanished (confirmed by a
    second status check) are stopped as FAILED."""
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        running_tasks = JobSaver.query_task(
            party_status=TaskStatus.RUNNING, only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            # NOTE(review): a task is skipped only when it has no engine conf
            # AND runs on another host AND is not run on this party — confirm
            # `and` (rather than `or`) is the intended filter here.
            if not task.f_engine_conf and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST and not task.f_run_on_this_party:
                continue
            count += 1
            try:
                process_exist = build_engine(
                    task.f_engine_conf.get("computing_engine")).is_alive(
                    task)
                if not process_exist:
                    msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                    detect_logger(job_id=task.f_job_id).info(
                        f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist"
                    )
                    # grace period, then re-query to avoid racing a normal stop
                    time.sleep(3)
                    _tasks = JobSaver.query_task(
                        task_id=task.f_task_id,
                        task_version=task.f_task_version,
                        role=task.f_role,
                        party_id=task.f_party_id)
                    if _tasks:
                        if _tasks[0].f_party_status == TaskStatus.RUNNING:
                            # still RUNNING with a dead process -> stop the job
                            stop_job_ids.add(task.f_job_id)
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has been checked twice, try to stop job"
                            )
                        else:
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again"
                            )
                    else:
                        detect_logger(task.f_job_id).warning(
                            f"{msg} can not found on db")
            except Exception as e:
                # per-task isolation: one failing check must not abort the sweep
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info(
                'start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="task executor process abort",
                              stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def component_output_data_table():
    """HTTP handler: resolve a component's output data table through the job tracker."""
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    return jsonify(FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table'))
def check_dependence(job_id, role, party_id):
    """HTTP handler: check whether a job's runtime dependencies are installed.

    Returns success when the dependence check passes, otherwise RUNNING with
    a hint that dependencies are being installed.
    """
    jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
    if not jobs:
        # Fix: previously `query_job(...)[0]` raised an unhandled IndexError
        # (HTTP 500) when the job did not exist; return a data error instead.
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg=f"can not found job {job_id}")
    status = DependenceManager.check_job_dependence(jobs[0])
    if status:
        return get_json_result(retcode=0, retmsg='success')
    else:
        return get_json_result(
            retcode=RetCode.RUNNING,
            retmsg=f"check for job {job_id} dependence failed, "
            f"dependencies are being installed automatically, it may take a few minutes"
        )
def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
    """Propagate the listed fields from the initiator's job record to every
    party's record of the same job (status first, then the other fields)."""
    party_records = JobSaver.query_job(job_id=initiator_job.f_job_id)
    if not party_records:
        raise Exception("Failed to update job status on initiator")
    job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
    for field in update_fields:
        job_info[field] = getattr(initiator_job, f"f_{field}")
    for record in party_records:
        job_info["role"] = record.f_role
        job_info["party_id"] = record.f_party_id
        JobSaver.update_job_status(job_info=job_info)
        JobSaver.update_job(job_info=job_info)
def update_parameters():
    """HTTP handler: update job and/or component parameters of an initiator job."""
    job_info = request.json
    component_parameters = job_info.pop("component_parameters", None)
    job_parameters = job_info.pop("job_parameters", None)
    # only the initiator's record may be edited
    job_info["is_initiator"] = True
    jobs = JobSaver.query_job(**job_info)
    if not jobs:
        return get_json_result(
            retcode=RetCode.DATA_ERROR,
            retmsg=log_utils.failed_log(f"query job by {job_info}"))
    retcode, retdata = DAGScheduler.update_parameters(jobs[0], job_parameters, component_parameters)
    return get_json_result(retcode=retcode, data=retdata)
def get_url():
    """HTTP handler: return the FATE Board URL for each matching job record."""
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'),
                              role=request_data.get('role'),
                              party_id=request_data.get('party_id'))
    if not jobs:
        return get_json_result(retcode=101, retmsg='no found job')
    board_urls = [
        job_utils.get_board_url(job.f_job_id, job.f_role, job.f_party_id)
        for job in jobs
    ]
    return get_json_result(data={'board_url': board_urls})
def _run(self):
    """Reload the inherited job for this worker; on failure mark its
    inheritance status FAILED and log the error."""
    target_job = JobSaver.query_job(job_id=self.args.job_id,
                                    role=self.args.role,
                                    party_id=self.args.party_id)[0]
    try:
        JobController.job_reload(target_job)
    except Exception as e:
        traceback.print_exc()
        JobSaver.update_job(job_info={
            "job_id": target_job.f_job_id,
            "role": target_job.f_role,
            "party_id": target_job.f_party_id,
            "inheritance_status": JobInheritanceStatus.FAILED,
        })
        LOGGER.exception(e)
def rerun_job():
    """HTTP handler: forward a rerun request for a job to the federated scheduler."""
    job_id = request.json.get("job_id")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        return get_json_result(retcode=RetCode.DATA_ERROR, retmsg="can not found job")
    status_code, response = FederatedScheduler.request_rerun_job(
        job=jobs[0], command_body=request.json)
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(retcode=RetCode.SUCCESS, retmsg="rerun job success")
    return get_json_result(retcode=RetCode.OPERATING_ERROR,
                           retmsg="rerun job failed:\n{}".format(json_dumps(response)))
def update_job():
    """HTTP handler: update a job record's description from the request's notes."""
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'],
                              party_id=job_info['party_id'],
                              role=job_info['role'])
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    JobSaver.update_job(job_info={
        'description': job_info.get('notes', ''),
        'job_id': job_info['job_id'],
        'role': job_info['role'],
        'party_id': job_info['party_id'],
    })
    return get_json_result(retcode=0, retmsg='success')
def query_resource(cls, resource_in_use=True, engine_name=None):
    """Report the cores/memory held by jobs currently using resources, plus
    the registration info of the (default: configured computing) engine."""
    if not engine_name:
        engine_name = ENGINES.get(EngineType.COMPUTING)
    in_use_jobs = JobSaver.query_job(resource_in_use=resource_in_use)
    used = [{
        "job_id": job.f_job_id,
        "role": job.f_role,
        "party_id": job.f_party_id,
        "core": job.f_cores,
        "memory": job.f_memory,
    } for job in in_use_jobs]
    engine_info = cls.get_engine_registration_info(
        engine_type=EngineType.COMPUTING, engine_name=engine_name)
    return used, engine_info.to_dict() if engine_info else {}
def pipeline_dag_dependency(job_info):
    """Build the DAG dependency view of a pipeline job, scoped to the
    caller's role and party.

    With a ``job_id`` the stored DSL/conf of that job is parsed; otherwise
    the DSL/conf supplied in ``job_info`` is used directly.  Raises on
    validation or query failure (after logging).
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            found_job = jobs[0]
            parser = schedule_utils.get_job_dsl_parser(
                dsl=found_job.f_dsl,
                runtime_conf=found_job.f_runtime_conf_on_party,
                train_runtime_conf=found_job.f_train_runtime_conf)
        else:
            parser = schedule_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return parser.get_dependency(role=job_info["role"],
                                     party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
def return_resource(cls, job_id):
    """Return engine resources held by every record of ``job_id``.

    Raises when no record exists.  For each record, resources are returned
    only if the record is marked in use; the per-record outcome is reported.
    """
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        raise Exception(f'no found job {job_id}')
    results = []
    for job in jobs:
        entry = {
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "resource_in_use": job.f_resource_in_use,
            "resource_return_status": False,
        }
        if job.f_resource_in_use:
            entry["resource_return_status"] = cls.return_job_resource(
                job.f_job_id, job.f_role, job.f_party_id)
        results.append(entry)
    return results
def detect_running_job(cls):
    """Detector: find initiator jobs stuck in RUNNING past their timeout
    and request they be stopped with TIMEOUT status."""
    detect_logger().info('start detect running job')
    try:
        timeout_jobs = set()
        for running_job in JobSaver.query_job(status=JobStatus.RUNNING, is_initiator=True):
            try:
                if job_utils.check_job_is_timeout(running_job):
                    timeout_jobs.add(running_job)
            except Exception as e:
                # one bad record must not abort the sweep
                detect_logger(job_id=running_job.f_job_id).exception(e)
        cls.request_stop_jobs(jobs=timeout_jobs,
                              stop_msg="running timeout",
                              stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect running job')
def job_config():
    """HTTP handler: return a job's DSL, runtime conf, train conf and model info."""
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    job = jobs[0]
    adapter = JobRuntimeConfigAdapter(job.f_runtime_conf)
    job_parameters = adapter.get_common_parameters().to_dict()
    response_data = {
        'job_id': job.f_job_id,
        'dsl': job.f_dsl,
        'runtime_conf': job.f_runtime_conf,
        'train_runtime_conf': job.f_train_runtime_conf,
        'model_info': {
            'model_id': job_parameters.get('model_id'),
            'model_version': job_parameters.get('model_version'),
        },
    }
    return get_json_result(retcode=0, retmsg='success', data=response_data)
def output_reload(cls, job, source_tasks: dict, target_tasks: dict):
    """Copy a source job's outputs onto ``job`` (job inheritance).

    Reloads, in order: models/checkpoints, output-data info, caches,
    summaries and metrics — using one (source, target) tracker pair per
    task key from load_task_tracker.
    """
    # model reload
    schedule_logger(job.f_job_id).info("start reload model")
    # NOTE(review): assumes the inherited source job still exists; an empty
    # query result would raise IndexError here — confirm upstream guarantees.
    source_job = JobSaver.query_job(
        job_id=job.f_inheritance_info.get("job_id"))[0]
    cls.output_model_reload(job, source_job)
    cls.checkpoint_reload(job, source_job)
    schedule_logger(job.f_job_id).info("start reload data")
    source_tracker_dict = cls.load_task_tracker(source_tasks)
    target_tracker_dict = cls.load_task_tracker(target_tasks)
    for key, source_tracker in source_tracker_dict.items():
        target_tracker = target_tracker_dict[key]
        table_infos = source_tracker.get_output_data_info()
        # data reload: only the table *references* are re-logged, not the data itself
        schedule_logger(job.f_job_id).info(f"table infos:{table_infos}")
        for table in table_infos:
            target_tracker.log_output_data_info(
                data_name=table.f_data_name,
                table_namespace=table.f_table_namespace,
                table_name=table.f_table_name)
        # cache reload
        schedule_logger(job.f_job_id).info("start reload cache")
        cache_list = source_tracker.query_output_cache_record()
        for cache in cache_list:
            schedule_logger(job.f_job_id).info(
                f"start reload cache name: {cache.f_cache_name}")
            target_tracker.tracking_output_cache(
                cache.f_cache, cache_name=cache.f_cache_name)
        # summary reload
        schedule_logger(job.f_job_id).info("start reload summary")
        target_tracker.reload_summary(source_tracker=source_tracker)
        # metric reload
        schedule_logger(job.f_job_id).info("start reload metric")
        target_tracker.reload_metric(source_tracker=source_tracker)
    schedule_logger(job.f_job_id).info("reload output success")
def detect_running_task(cls):
    """Detector: confirm each locally-run RUNNING task still has a live
    executor process (by pid); jobs whose process has died are stopped as
    CANCELED."""
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        # only tasks run by this server, on this party
        running_tasks = JobSaver.query_task(
            party_status=TaskStatus.RUNNING,
            run_on_this_party=True,
            run_ip=RuntimeConfig.JOB_SERVER_HOST,
            only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            count += 1
            try:
                # int(None) raises TypeError when f_run_pid is unset; caught below
                process_exist = job_utils.check_job_process(
                    int(task.f_run_pid))
                if not process_exist:
                    detect_logger(job_id=task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'
                        .format(task.f_job_id, task.f_task_id,
                                task.f_task_version, task.f_role,
                                task.f_party_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                # per-task isolation: one failing check must not abort the sweep
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info(
                'start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="task executor process abort",
                              stop_status=JobStatus.CANCELED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def stop_job(cls, job_id, role, party_id, stop_status):
    """Scheduler-side stop: set the cancel signal (for CANCELED), ask all
    parties to stop via FederatedScheduler, and on federated failure fall
    back to collecting task status from every party.

    Returns a (RetCode, message) tuple; a missing job is treated as success.
    """
    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status}")
    # only the initiator's record drives scheduling decisions
    jobs = JobSaver.query_job(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              is_initiator=True)
    if len(jobs) > 0:
        if stop_status == JobStatus.CANCELED:
            schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
            set_cancel_status = cls.cancel_signal(job_id=job_id,
                                                  set_or_reset=True)
            schedule_logger(job_id=job_id).info(
                f"set job {job_id} cancel signal {set_cancel_status}")
        job = jobs[0]
        # in-memory only: the target status travels with the stop request
        job.f_status = stop_status
        schedule_logger(job_id=job_id).info(
            f"request stop job {job_id} with {stop_status} to all party")
        status_code, response = FederatedScheduler.stop_job(
            job=jobs[0], stop_status=stop_status)
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            schedule_logger(job_id=job_id).info(
                f"stop job {job_id} with {stop_status} successfully")
            return RetCode.SUCCESS, "success"
        else:
            # federated stop failed: pull each task's status from all
            # parties so the initiator's view converges anyway
            initiator_tasks_group = JobSaver.get_tasks_asc(
                job_id=job.f_job_id,
                role=job.f_role,
                party_id=job.f_party_id)
            for initiator_task in initiator_tasks_group.values():
                TaskScheduler.collect_task_of_all_party(
                    job,
                    initiator_task=initiator_task,
                    set_status=stop_status)
            schedule_logger(job_id=job_id).info(
                f"stop job {job_id} with {stop_status} failed, {response}")
            return RetCode.FEDERATED_ERROR, json_dumps(response)
    else:
        return RetCode.SUCCESS, "can not found job"
def component_rerun_check(job_id, role, party_id):
    """HTTP handler: list which components of a job would be rerun.

    Returns the component check result, or an error payload when the job
    does not exist.
    """
    jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
    if not jobs:
        # Fix: previously `query_job(...)[0]` raised an unhandled IndexError
        # (HTTP 500) when the job did not exist; report a lookup failure
        # instead, matching the other endpoints' convention.
        return get_json_result(retcode=101, retmsg='find job failed')
    component_list = DependenceManager.component_check(jobs[0], check_type="rerun")
    return get_json_result(data=component_list)
def run_do(self):
    """One scheduler cycle: process waiting, running, ready, rerun and
    recently-ended jobs, in that order.  Every phase wraps each job in its
    own try/except so one failing job cannot break the cycle.
    """
    # --- waiting jobs: at most ONE is scheduled per cycle (FIFO) ---
    schedule_logger().info("start schedule waiting jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=JobStatus.WAITING,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} waiting jobs")
    if len(jobs):
        # FIFO
        job = jobs[0]
        schedule_logger().info(f"schedule waiting job {job.f_job_id}")
        try:
            self.schedule_waiting_jobs(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(
                f"schedule waiting job {job.f_job_id} failed")
    schedule_logger().info("schedule waiting jobs finished")
    # --- running jobs: advance every one ---
    schedule_logger().info("start schedule running jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=JobStatus.RUNNING,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} running jobs")
    for job in jobs:
        schedule_logger().info(f"schedule running job {job.f_job_id}")
        try:
            self.schedule_running_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule running jobs finished")
    # some ready job exit before start
    schedule_logger().info("start schedule ready jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              ready_signal=True,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} ready jobs")
    for job in jobs:
        schedule_logger().info(f"schedule ready job {job.f_job_id}")
        try:
            self.schedule_ready_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(
                f"schedule ready job {job.f_job_id} failed:\n{e}")
    schedule_logger().info("schedule ready jobs finished")
    # --- jobs flagged for rerun ---
    schedule_logger().info("start schedule rerun jobs")
    jobs = JobSaver.query_job(is_initiator=True,
                              rerun_signal=True,
                              order_by="create_time",
                              reverse=False)
    schedule_logger().info(f"have {len(jobs)} rerun jobs")
    for job in jobs:
        schedule_logger().info(f"schedule rerun job {job.f_job_id}")
        try:
            self.schedule_rerun_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule rerun jobs finished")
    # --- recently ended jobs: keep re-syncing status within a time window,
    #     bounded by end_scheduling_updates to avoid endless retries ---
    schedule_logger().info(
        "start schedule end status jobs to update status")
    jobs = JobSaver.query_job(is_initiator=True,
                              status=set(EndStatus.status_list()),
                              end_time=[
                                  current_timestamp() -
                                  END_STATUS_JOB_SCHEDULING_TIME_LIMIT,
                                  current_timestamp()
                              ])
    schedule_logger().info(f"have {len(jobs)} end status jobs")
    for job in jobs:
        schedule_logger().info(f"schedule end status job {job.f_job_id}")
        try:
            update_status = self.end_scheduling_updates(
                job_id=job.f_job_id)
            if not update_status:
                schedule_logger(job.f_job_id).info(
                    f"the number of updates has been exceeded")
                continue
            self.schedule_running_job(job=job, force_sync_status=True)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(
                job.f_job_id).error(f"schedule job {job.f_job_id} failed")
    schedule_logger().info("schedule end status jobs finished")
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Rerun a job's tasks on the initiator.

    WAITING tasks simply flag the job for rescheduling; SUCCESS tasks are
    skipped; any other status gets its old task version stopped and
    cleaned, then a new version created on every party.  Finally the
    job's rerun signal is set (or, with nothing to rerun, the job status
    is re-synced).  Raises RuntimeError when the job is not found.
    """
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
    )
    jobs = JobSaver.query_job(job_id=job_id,
                              role=initiator_role,
                              party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(
            f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
        )
    # the virtual component name means "rerun the whole job";
    # otherwise restrict the query to the requested component
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=initiator_role,
                                    party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=initiator_role,
                                    party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
            )
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job,
                                         task=task,
                                         stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job,
                                          task=task,
                                          content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version}"
            )
            # mirror the new task version on every other role/party
            for _role, _party_ids in job.f_runtime_conf_on_party[
                    "role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(
                        job_id, _role, _party_id, False,
                        job.f_initiator_role, job.f_initiator_party_id,
                        RunParameters(
                            **job.f_runtime_conf_on_party["job_parameters"]),
                        dsl_parser,
                        component_name=task.f_component_name,
                        task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version} successfully"
            )
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(
            job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(
                f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(
                f"job {job_id} set rerun signal failed")
    else:
        # nothing to rerun: just push current status to all parties
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(
            job_id=job_id).info(f"job {job_id} no task to rerun")