def start_inheriting_job(cls, job):
    """Mark *job* as running inheritance and launch the job-inheritance subprocess.

    Flags the job's inheritance_status as RUNNING in the DB, then spawns a
    separate python process running the module that defines JobInherit; the
    subprocess logs under the job's "job_inheritance" log directory.

    :param job: job model instance (provides f_job_id / f_role / f_party_id)
    """
    JobSaver.update_job(job_info={
        "job_id": job.f_job_id,
        "role": job.f_role,
        "party_id": job.f_party_id,
        "inheritance_status": JobInheritanceStatus.RUNNING,
    })
    conf_dir = job_utils.get_job_directory(job_id=job.f_job_id)
    os.makedirs(conf_dir, exist_ok=True)
    process_cmd = [
        sys.executable or 'python3',
        # run the module file that defines JobInherit as a standalone script
        sys.modules[JobInherit.__module__].__file__,
        '--job_id', job.f_job_id,
        '--role', job.f_role,
        '--party_id', job.f_party_id,
    ]
    log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job.f_job_id),
                           "job_inheritance")
    # fire-and-forget: the process handle was previously bound to an unused
    # local `p`; nothing here waits on or inspects the subprocess
    process_utils.run_subprocess(job_id=job.f_job_id,
                                 config_dir=conf_dir,
                                 process_cmd=process_cmd,
                                 log_dir=log_dir,
                                 process_name="job_inheritance")
def pipeline_dag_dependency(job_info):
    """Return the DAG dependency structure for a job.

    When job_info carries a job_id, the stored DSL/conf of that job is parsed
    and each latest task's "need_run" flag is collected; otherwise the DSL and
    confs supplied directly in job_info are parsed.

    :param job_info: dict; requires "party_id" and "role", optionally "job_id"
                     or inline "job_dsl"/"job_runtime_conf"/"job_train_runtime_conf"
    :return: dependency dict from the DSL parser, with a "component_need_run" key
    :raises Exception: when the referenced job cannot be found, or on any
                       parsing error (logged before re-raising)
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job.f_dsl,
                runtime_conf=job.f_runtime_conf_on_party,
                train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"],
                                        party_id=job_info["party_id"],
                                        role=job_info["role"],
                                        only_latest=True)
            for task in tasks:
                # need_run defaults to True when not explicitly configured
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        # bare raise re-raises the active exception without resetting the
        # traceback (was `raise e`)
        raise
def update_parameter(cls, job_id, role, party_id, updated_parameters: dict):
    """Apply updated job and/or component parameters to the stored job configuration.

    Updates both runtime_conf and runtime_conf_on_party, re-creates the party
    job parameters when job parameters change, then persists the record.

    :param updated_parameters: dict with optional "job_parameters" and
                               "component_parameters" keys
    """
    job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                        role=role,
                                                        party_id=party_id)
    new_job_params = updated_parameters.get("job_parameters")
    new_component_params = updated_parameters.get("component_parameters")
    if new_job_params:
        job_configuration.runtime_conf["job_parameters"] = new_job_params
        run_params = RunParameters(**new_job_params["common"])
        cls.create_job_parameters_on_party(role=role,
                                           party_id=party_id,
                                           job_parameters=run_params)
        job_configuration.runtime_conf_on_party["job_parameters"] = run_params.to_dict()
    if new_component_params:
        job_configuration.runtime_conf["component_parameters"] = new_component_params
        job_configuration.runtime_conf_on_party["component_parameters"] = new_component_params
    JobSaver.update_job({
        "job_id": job_id,
        "role": role,
        "party_id": party_id,
        "runtime_conf": job_configuration.runtime_conf,
        "runtime_conf_on_party": job_configuration.runtime_conf_on_party,
    })
def detect_running_task(cls):
    """Detect RUNNING tasks whose executor process has died and stop their jobs.

    For each running task this server is responsible for, asks the computing
    engine whether the worker process is still alive; if not, re-checks the
    DB status after a short delay and, when the task is still RUNNING, queues
    the whole job to be stopped as FAILED.
    """
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        running_tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING, only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            # Skip tasks we are not responsible for.
            # NOTE(review): this `and`-chain skips only when ALL three hold
            # (no engine conf, run elsewhere, not run on this party) — confirm
            # this is the intended filter rather than an `or`-chain.
            if not task.f_engine_conf and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST and not task.f_run_on_this_party:
                continue
            count += 1
            try:
                process_exist = build_engine(task.f_engine_conf.get("computing_engine")).is_alive(task)
                if not process_exist:
                    msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                    detect_logger(job_id=task.f_job_id).info(
                        f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
                    # wait briefly, then re-read the DB: the status may have
                    # been updated while the process was exiting normally
                    time.sleep(3)
                    _tasks = JobSaver.query_task(task_id=task.f_task_id,
                                                 task_version=task.f_task_version,
                                                 role=task.f_role,
                                                 party_id=task.f_party_id)
                    if _tasks:
                        if _tasks[0].f_party_status == TaskStatus.RUNNING:
                            # still RUNNING after the second check -> process abort
                            stop_job_ids.add(task.f_job_id)
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has been checked twice, try to stop job")
                        else:
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again")
                    else:
                        detect_logger(task.f_job_id).warning(
                            f"{msg} can not found on db")
            except Exception as e:
                # keep scanning the remaining tasks even if one check fails
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))
            stop_jobs = set()
            for job_id in stop_job_ids:
                jobs = JobSaver.query_job(job_id=job_id)
                if jobs:
                    stop_jobs.add(jobs[0])
            cls.request_stop_jobs(jobs=stop_jobs,
                                  stop_msg="task executor process abort",
                                  stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    """Collect the status of *initiator_task* from every party and persist it.

    Skips collection when all parties already agree on a single non-RUNNING
    status. On a federated error for a party, optionally forces that party's
    status to *set_status* (via an intermediate RUNNING transition).
    """
    all_party_tasks = JobSaver.query_task(task_id=initiator_task.f_task_id,
                                          task_version=initiator_task.f_task_version)
    statuses = {t.f_status for t in all_party_tasks}
    # nothing to do: a single shared status that is not RUNNING
    if len(statuses) <= 1 and TaskStatus.RUNNING not in statuses:
        return
    status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(
            f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
    for _role, party_responses in federated_response.items():
        for _party_id, party_response in party_responses.items():
            if party_response["retcode"] == RetCode.SUCCESS:
                JobSaver.update_task_status(task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
            elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                tmp_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING,
                }
                # two-step transition: first RUNNING, then the forced status
                JobSaver.update_task_status(task_info=tmp_task_info)
                tmp_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=tmp_task_info)
def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
    """Propagate selected fields of the initiator's job record to every local party record.

    :param initiator_job: the initiator's Job model carrying the new values
    :param update_fields: field names (without the "f_" prefix) to copy over
    :raises Exception: when no local job records exist for this job_id
    """
    jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
    if not jobs:
        raise Exception("Failed to update job status on initiator")
    job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
    for field in update_fields:
        job_info[field] = getattr(initiator_job, f"f_{field}")
    # write the same payload once per (role, party_id) record
    for job in jobs:
        job_info["role"] = job.f_role
        job_info["party_id"] = job.f_party_id
        JobSaver.update_job_status(job_info=job_info)
        JobSaver.update_job(job_info=job_info)
def report_task(job_id, component_name, task_id, task_version, role, party_id):
    """Persist a task report posted by a party; URL path values win over body values."""
    task_info = {
        **request.json,
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    JobSaver.update_task(task_info=task_info)
    # a party_status in the payload also drives a status-table update
    if task_info.get("party_status"):
        JobSaver.update_status(Task, task_info)
    return get_json_result(retcode=0, retmsg='success')
def upload_history():
    """List successful upload tasks run on this party, newest first, optionally limited."""
    request_data = request.json
    query_kwargs = {
        "component_name": 'upload_0',
        "status": StatusSet.SUCCESS,
        "run_on_this_party": True,
    }
    if request_data.get('job_id'):
        query_kwargs["job_id"] = request_data.get('job_id')
    tasks = JobSaver.query_task(**query_kwargs)
    limit = request_data.get('limit')
    # reverse to newest-first; with a limit, keep only the newest `limit` tasks
    if not limit:
        tasks = tasks[::-1]
    else:
        tasks = tasks[-1:-limit - 1:-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    data = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=data)
def create_task(cls, role, party_id, run_on_this_party, task_info):
    """Fill in bookkeeping fields for a new task and persist it.

    Mutates *task_info* in place: sets role/party, WAITING statuses,
    creation timestamp, and generates task_id/task_version when absent.
    """
    task_info.update({
        "role": role,
        "party_id": party_id,
        "status": TaskStatus.WAITING,
        "party_status": TaskStatus.WAITING,
        "create_time": base_utils.current_timestamp(),
        "run_on_this_party": run_on_this_party,
    })
    if "task_id" not in task_info:
        # derive a deterministic id from the job and component
        task_info["task_id"] = job_utils.generate_task_id(
            job_id=task_info["job_id"],
            component_name=task_info["component_name"])
    task_info.setdefault("task_version", 0)
    JobSaver.create_task(task_info=task_info)
def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
    """Stop every job record matching *job_id* (optionally narrowed to one role/party).

    :return: (overall_success, details) where overall_success is the AND of
             every individual stop result
    """
    query = {"job_id": job_id}
    if role and party_id:
        query["role"] = role
        query["party_id"] = party_id
    jobs = JobSaver.query_job(**query)
    kill_status = True
    kill_details = {}
    for job in jobs:
        job_killed, job_details = cls.stop_job(job=job, stop_status=stop_status)
        kill_status = kill_status & job_killed
        # NOTE(review): keyed by job_id, so when several role/party records
        # match, each iteration overwrites the previous details — confirm
        # whether per-party keys were intended.
        kill_details[job_id] = job_details
    return kill_status, kill_details
def status_reload(cls, job, source_tasks, target_tasks):
    """Copy statuses from source tasks onto target tasks, then mark the job's
    inheritance as successful."""
    schedule_logger(job.f_job_id).info("start reload status")
    # copy each source task's state onto its matching target task
    for key, source_task in source_tasks.items():
        JobSaver.reload_task(source_task, target_tasks[key])
    # record that the inheritance finished successfully
    JobSaver.update_job(job_info={
        "job_id": job.f_job_id,
        "role": job.f_role,
        "party_id": job.f_party_id,
        "inheritance_status": JobInheritanceStatus.SUCCESS,
    })
    schedule_logger(job.f_job_id).info("reload status success")
def start_job(cls, job_id, initiator_role, initiator_party_id):
    """Kick off a job on its initiator via the federated scheduler.

    Looks up the job record for (job_id, initiator_role, initiator_party_id)
    and issues a federated start request; logs an error when no record exists.
    """
    schedule_logger(job_id=job_id).info(
        "try to start job {} on initiator {} {}".format(job_id, initiator_role, initiator_party_id))
    # NOTE(review): a job_info dict (status/party_status=RUNNING, start_time,
    # tag='end_waiting') used to be built here but was never passed to any
    # call — removed as dead code. Confirm the RUNNING transition is
    # persisted downstream by FederatedScheduler.start_job.
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        FederatedScheduler.start_job(job=jobs[0])
        schedule_logger(job_id=job_id).info(
            "start job {} on initiator {} {}".format(job_id, initiator_role, initiator_party_id))
    else:
        schedule_logger(job_id=job_id).error(
            "can not found job {} on initiator {} {}".format(job_id, initiator_role, initiator_party_id))
def _run(self):
    """Entry point of the job-inheritance subprocess.

    Loads the job addressed by the CLI args and runs the reload; on any
    failure the job's inheritance_status is flipped to FAILED and the error
    is printed and logged.
    """
    job = JobSaver.query_job(job_id=self.args.job_id,
                             role=self.args.role,
                             party_id=self.args.party_id)[0]
    try:
        JobController.job_reload(job)
    except Exception as e:
        traceback.print_exc()
        failure_info = {
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "inheritance_status": JobInheritanceStatus.FAILED,
        }
        JobSaver.update_job(job_info=failure_info)
        LOGGER.exception(e)
def start_task(cls, job, task):
    """Try to start one task: apply for resources, claim it, and dispatch it.

    Returns a SchedulingStatusCode:
      * NO_RESOURCE       - resource application failed
      * PASS              - another scheduler claimed the task first
      * SUCCESS / FAILED  - result of the federated start request
    """
    schedule_logger(task.f_job_id).info(
        "try to start task {} {} on {} {}".format(task.f_task_id,
                                                  task.f_task_version,
                                                  task.f_role,
                                                  task.f_party_id))
    apply_status = ResourceManager.apply_for_task_resource(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not apply_status:
        return SchedulingStatusCode.NO_RESOURCE
    task.f_status = TaskStatus.RUNNING
    # the status write doubles as a claim: it fails when another scheduler
    # already moved the task out of WAITING
    update_status = JobSaver.update_task_status(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not update_status:
        # Another scheduler scheduling the task
        schedule_logger(task.f_job_id).info(
            "task {} {} start on another scheduler".format(
                task.f_task_id, task.f_task_version))
        # Rollback: restore WAITING and return the resources applied above
        task.f_status = TaskStatus.WAITING
        ResourceManager.return_task_resource(
            task_info=task.to_human_model_dict(
                only_primary_with=["status"]))
        return SchedulingStatusCode.PASS
    schedule_logger(task.f_job_id).info("start task {} {} on {} {}".format(
        task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
    # broadcast the RUNNING status before asking all parties to start
    FederatedScheduler.sync_task_status(job=job, task=task)
    status_code, response = FederatedScheduler.start_task(job=job, task=task)
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return SchedulingStatusCode.SUCCESS
    else:
        return SchedulingStatusCode.FAILED
def component_output_data_download():
    """Stream a component's output data to the client as a tar.gz archive.

    Expects job_id / component_name / role / party_id in the JSON body;
    optional "limit" (row cap, -1 = unlimited) and "head" (include header).
    """
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(
            f'no found task, please check if the parameters are correct:{request_data}')
    # load the component provider's dependencies before touching its output
    import_component_output_depend(tasks[0].f_provider_info)
    try:
        output_tables_meta = get_component_output_tables_meta(task_data=request_data)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
    limit = request_data.get('limit', -1)
    if not output_tables_meta:
        return error_response(response_code=210, retmsg='no data')
    if limit == 0:
        return error_response(response_code=210, retmsg='limit is 0')
    tar_file_name = (f"job_{request_data['job_id']}_{request_data['component_name']}"
                     f"_{request_data['role']}_{request_data['party_id']}_output_data.tar.gz")
    return TableStorage.send_table(output_tables_meta,
                                   tar_file_name,
                                   limit=limit,
                                   need_head=request_data.get("head", True))
def update_job_status(cls, job_info):
    """Persist a job status change; release its resources once it reaches an end status.

    :param job_info: dict with job_id/role/party_id and the new "status"
    :return: the update result from JobSaver.update_job_status
    """
    updated = JobSaver.update_job_status(job_info=job_info)
    if updated and EndStatus.contains(job_info.get("status")):
        # terminal status reached: give back the resources the job held
        ResourceManager.return_job_resource(job_id=job_info["job_id"],
                                            role=job_info["role"],
                                            party_id=job_info["party_id"])
    return updated
def stop_job():
    """Stop a job on this party, then request all other parties to stop it too."""
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
    # local stop first, then federate the stop request
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, kill_details = JobController.stop_jobs(job_id=job_id,
                                                        stop_status=stop_status)
    schedule_logger(job_id).info(f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(f"request stop job {jobs[0]} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=jobs[0], stop_status=stop_status, command_body=jobs[0].to_json())
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(
            retcode=RetCode.SUCCESS,
            retmsg=f"stop job on this party {kill_status};\n"
                   f"stop job on all party success")
    return get_json_result(
        retcode=RetCode.OPERATING_ERROR,
        retmsg="stop job on this party {};\n"
               "stop job failed:\n{}".format(kill_status,
                                             json_dumps(response, indent=4)))
def update_job(cls, job_info):
    """Persist *job_info* to the local database.

    :param job_info: fields to update, including the job's primary keys
    :return: the result of JobSaver.update_job (update success indicator)
    """
    result = JobSaver.update_job(job_info=job_info)
    return result
def clean_task(cls, job_id, task_id, task_version, role, party_id, content_type: TaskCleanResourceType):
    """Clean a task's metrics or tables depending on *content_type*.

    :return: True only when exactly one cleanup ran and it reported success
    """
    outcomes = set()
    if content_type == TaskCleanResourceType.METRICS:
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          task_id=task_id, task_version=task_version)
        outcomes.add(tracker.clean_metrics())
    elif content_type == TaskCleanResourceType.TABLE:
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            job_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                              task_id=task_id, task_version=task_version,
                              job_parameters=job_parameters)
            outcomes.add(tracker.clean_task(job.f_runtime_conf_on_party))
    # equivalent to: exactly one result recorded and that result is True
    return outcomes == {True}
def query_task():
    """Query task records by arbitrary filter fields in the JSON body."""
    tasks = JobSaver.query_task(**request.json)
    if tasks:
        return get_json_result(retcode=0, retmsg='success',
                               data=[task.to_json() for task in tasks])
    return get_json_result(retcode=101, retmsg='find task failed')
def update_job():
    """Update a job's description ("notes") for a given job/role/party."""
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'],
                              party_id=job_info['party_id'],
                              role=job_info['role'])
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    JobSaver.update_job(job_info={
        'description': job_info.get('notes', ''),
        'job_id': job_info['job_id'],
        'role': job_info['role'],
        'party_id': job_info['party_id'],
    })
    return get_json_result(retcode=0, retmsg='success')
def report_task_to_initiator(cls, task_info):
    """Forward a task's status to the job initiator when it uses PUSH collection.

    Looks up the local task record identified by *task_info*; if it is
    configured for PUSH federated status collection, pushes it to the
    initiator. Does nothing when no matching record exists (previously an
    unguarded `tasks[0]` raised IndexError in that case).
    """
    tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                task_version=task_info["task_version"],
                                role=task_info["role"],
                                party_id=task_info["party_id"])
    if not tasks:
        # no matching local record: nothing to report
        return
    task = tasks[0]
    if task.f_federated_status_collect_type == FederatedCommunicationType.PUSH:
        FederatedScheduler.report_task_to_initiator(task=task)
def get_job_table_list():
    """List all tables used by the first job matching the JSON query."""
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(jobs[0])
    return get_json_result(data=tables)
def clean_queue():
    """Cancel every waiting job initiated locally and report per-job status codes."""
    waiting_jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for job in waiting_jobs:
        # response body is not needed, only the federated status code
        status_code, _ = FederatedScheduler.request_stop_job(
            job=job, stop_status=JobStatus.CANCELED)
        clean_status[job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
def query_job():
    """Query job records; an empty result is still a success with empty data."""
    matched = JobSaver.query_job(**request.json)
    data = [job.to_json() for job in matched]
    if data:
        return get_json_result(retcode=0, retmsg='success', data=data)
    return get_json_result(retcode=0, retmsg='no job could be found', data=[])
def get_job_table_list():
    """List all tables used by a job; requires job_id, role and party_id."""
    detect_utils.check_config(config=request.json,
                              required_arguments=['job_id', 'role', 'party_id'])
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(jobs[0])
    return get_json_result(data=tables)
def component_output_data_table():
    """Forward an output-table tracker query for a job to the federated scheduler."""
    request_data = request.json
    detect_utils.check_config(config=request_data,
                              required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    return jsonify(FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table'))
def component_output_data_table():
    """Forward an output-table tracker query for a job to the federated scheduler."""
    request_data = request.json
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    return jsonify(FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table'))
def load_tasks(cls, component_list, job_id, role, party_id):
    """Return the latest task for each requested component name.

    Builds a single name -> task index instead of the previous nested
    O(len(component_list) * len(tasks)) scan. With duplicate component names
    in the query result the last task wins, exactly as before, and the result
    keeps component_list's order with unmatched names omitted.

    :param component_list: component names to look up
    :return: dict mapping component name -> task record
    """
    tasks = JobSaver.query_task(job_id=job_id, role=role,
                                party_id=party_id, only_latest=True)
    tasks_by_name = {task.f_component_name: task for task in tasks}
    return {cpn: tasks_by_name[cpn] for cpn in component_list if cpn in tasks_by_name}
def check_dependence(job_id, role, party_id):
    """Report whether a job's runtime dependencies are ready to use."""
    job = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)[0]
    if DependenceManager.check_job_dependence(job):
        return get_json_result(retcode=0, retmsg='success')
    return get_json_result(
        retcode=RetCode.RUNNING,
        retmsg=f"check for job {job_id} dependence failed, "
        f"dependencies are being installed automatically, it may take a few minutes")