def query_task(cls, only_latest=True, reverse=None, order_by=None, **kwargs):
    """Query Task records matching the given field filters.

    :param only_latest: if True, collapse the result to the latest task per
        group via ``cls.get_latest_tasks``
    :param reverse: None applies no explicit ordering; True sorts descending,
        False ascending, on ``order_by``
    :param order_by: field name without the ``f_`` prefix; falls back to
        ``create_time`` when missing or not a Task column
    :param kwargs: filter values; each key ``k`` matches column ``f_k``,
        unknown keys are silently ignored
    :return: list of Task records
    """
    filters = []
    for f_n, f_v in kwargs.items():
        attr_name = 'f_%s' % f_n
        # Only filter on attributes that actually exist on the Task model;
        # reuse attr_name instead of rebuilding the format string.
        if hasattr(Task, attr_name):
            filters.append(getattr(Task, attr_name) == f_v)
    if filters:
        tasks = Task.select().where(*filters)
    else:
        tasks = Task.select()
    if reverse is not None:
        if not order_by or not hasattr(Task, f"f_{order_by}"):
            order_by = "create_time"
        # Explicit is-True / is-False checks preserved: a truthy non-bool
        # `reverse` intentionally applies no ordering, as before.
        if reverse is True:
            tasks = tasks.order_by(getattr(Task, f"f_{order_by}").desc())
        elif reverse is False:
            tasks = tasks.order_by(getattr(Task, f"f_{order_by}").asc())
    if only_latest:
        tasks_group = cls.get_latest_tasks(tasks=tasks)
        return list(tasks_group.values())
    return list(tasks)
def query_task(**kwargs):
    """Return every Task record whose columns match the keyword filters.

    Each keyword ``k`` is matched against column ``f_k``; keywords that do
    not correspond to a Task column are ignored.
    """
    with DB.connection_context():
        conditions = []
        for name, value in kwargs.items():
            field = 'f_%s' % name
            if hasattr(Task, field):
                conditions.append(operator.attrgetter('f_%s' % name)(Task) == value)
        query = Task.select()
        if conditions:
            query = query.where(*conditions)
        return list(query)
def report_task_to_initiator(cls, task: Task) -> bool:
    """Report this party's task state to the job initiator over federated RPC.

    Skips reporting when this party is the initiator itself; otherwise retries
    up to DEFAULT_FEDERATED_COMMAND_TRYS times before giving up.

    :param task: local Task record whose status should be reported
    :return: True when the initiator acknowledged the report, False otherwise
    """
    # NOTE(review): the guard uses `and` — a party sharing the initiator's
    # party_id but holding a different role will NOT report; confirm intended.
    if task.f_role != task.f_initiator_role and task.f_party_id != task.f_initiator_party_id:
        exception = None
        for t in range(DEFAULT_FEDERATED_COMMAND_TRYS):
            try:
                response = federated_api(
                    job_id=task.f_job_id,
                    method='POST',
                    endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                        task.f_job_id,
                        task.f_component_name,
                        task.f_task_id,
                        task.f_task_version,
                        task.f_role,
                        task.f_party_id),
                    src_party_id=task.f_party_id,
                    dest_party_id=task.f_initiator_party_id,
                    src_role=task.f_role,
                    json_body=task.to_human_model_dict(
                        only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                    federated_mode=task.f_federated_mode)
            except Exception as e:
                # Transport-level failure: remember the error and retry.
                exception = e
                continue
            if response["retcode"] != RetCode.SUCCESS:
                # Remote side rejected the report: remember and retry.
                exception = Exception(response["retmsg"])
            else:
                return True
        else:
            # for-else: the loop never breaks, so this runs exactly when all
            # attempts failed (every success path returned above).
            schedule_logger(job_id=task.f_job_id).error(
                f"report task to initiator error: {exception}")
            return False
    else:
        return False
def query_task(cls, only_latest=True, reverse=None, order_by=None, **kwargs) -> typing.List[Task]:
    """Fetch tasks via the model-level query helper.

    When ``only_latest`` is True, the result is reduced to the latest task
    per group via ``cls.get_latest_tasks``.
    """
    matched = Task.query(reverse=reverse, order_by=order_by, **kwargs)
    if not only_latest:
        return matched
    latest = cls.get_latest_tasks(tasks=matched)
    return list(latest.values())
def check_task(cls, job_id, role, party_id, components: list):
    """Return True when every listed component has a task row for this job/role/party."""
    found = Task.select().where(
        Task.f_job_id == job_id,
        Task.f_role == role,
        Task.f_party_id == party_id,
        Task.f_component_name << components,
    )
    # Non-empty result whose size equals the component list means full coverage.
    return bool(found) and len(found) == len(components)
def save_worker_info(cls, task: Task, worker_name: WorkerName, worker_id, **kwargs):
    """Persist a WorkerInfo row seeded from the task's fields plus keyword overrides.

    :param task: task whose matching, non-timestamp fields seed the record
    :param worker_name: WorkerName enum member stored as its ``value``
    :param worker_id: identifier stored in ``f_worker_id``
    :param kwargs: extra values; each key ``k`` maps to column ``f_k``
    :raises Exception: when the insert does not report exactly one saved row
    """
    info = WorkerInfo()
    skip_fields = auto_date_timestamp_db_field()
    # Copy every shared, non-timestamp, non-None field from the task.
    for field, value in task.to_dict().items():
        if value is not None and field not in skip_fields and hasattr(info, field):
            setattr(info, field, value)
    info.f_create_time = current_timestamp()
    info.f_worker_name = worker_name.value
    info.f_worker_id = worker_id
    # Explicit keyword overrides win over the copied task fields.
    for key, value in kwargs.items():
        field = f"f_{key}"
        if value is not None and hasattr(info, field):
            setattr(info, field, value)
    if info.save(force_insert=True) != 1:
        raise Exception("save worker info failed")
def run_task():
    """Task executor process entry point.

    Parses CLI arguments, loads the task configuration, initializes the
    compute session and federation, runs the component, and always syncs the
    final task status back to the initiator before exiting.
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--processors_per_node', help="processors_per_node", type=int)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
        RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        executor_pid = os.getpid()
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        component_parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        task_parameters = task_config['task_parameters']
        module_name = task_config['module_name']
        TaskExecutor.monkey_patch()
    except Exception as e:
        # Bootstrap failure: nothing has run yet, mark FAILED and bail out
        # (status is NOT synced in this path — the finally below is skipped).
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                              parent_log_dir=job_log_dir,
                                              append_to_parent_log=True,
                                              force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           component_module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = executor_pid
        # CodePath is '/'-separated: everything up to the last two segments is
        # the package, next-to-last is the module file, last is the class name.
        run_class_paths = component_parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task.f_status = TaskStatus.RUNNING
        # Tell the initiator the task has started before doing any real work.
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                      party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                  BACKEND=job_parameters.get('backend', 0))
        # Eggroll-only tuning knob: per-node processor count for the session.
        if args.processors_per_node and args.processors_per_node > 0 and RuntimeConfig.BACKEND == Backend.EGGROLL:
            session_options = {"eggroll.session.processors.per.node": args.processors_per_node}
        else:
            session_options = {}
        session.init(job_id=job_utils.generate_session_id(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE,
                     backend=RuntimeConfig.BACKEND,
                     options=session_options)
        federation.init(job_id=task_id, runtime_conf=component_parameters)
        schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info(component_parameters)
        schedule_logger().info(task_input_dsl)
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       task_id=task_id,
                                                       job_args=job_args,
                                                       job_parameters=job_parameters,
                                                       task_parameters=task_parameters,
                                                       input_dsl=task_input_dsl,
                                                       if_save_as_task_input_data=job_parameters.get("save_as_task_input_data", SAVE_AS_TASK_INPUT_DATA_SWITCH)
                                                       )
        # Instantiate the component class resolved from CodePath and run it.
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        run_object.run(component_parameters, task_run_args)
        output_data = run_object.save_data()
        tracker.save_output_data_table(output_data, task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model, task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.COMPLETE
    except Exception as e:
        task.f_status = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        # Always attempt a final status sync; report FAILED downstream if the
        # sync itself does not succeed.
        sync_success = False
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_end_time)))
        schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id, int(task.f_elapsed)/1000))
        schedule_logger().info(
            'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                   task.f_status if sync_success else TaskStatus.FAILED))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                     task.f_status if sync_success else TaskStatus.FAILED))
def get_tasks_asc(cls, job_id, role, party_id):
    """Return the latest tasks for a job/role/party, queried oldest-first."""
    ordered = (Task.select()
               .where(Task.f_job_id == job_id,
                      Task.f_role == role,
                      Task.f_party_id == party_id)
               .order_by(Task.f_create_time.asc()))
    return cls.get_latest_tasks(tasks=ordered)
def run_task():
    """Task executor entry point (legacy variant).

    Differs from the newer executor: storage/federation are initialized up
    front, component parameters come from ``task_config['parameters']``, and
    outputs are saved only when the output DSL declares them.
    """
    # NOTE(review): schedule_logger is used here as a logger object, not
    # called with a job id as in the newer variant — confirm the import.
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger.info('enter task process')
        schedule_logger.info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(
                HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        parameters = task_config['parameters']
        module_name = task_config['module_name']
    except Exception as e:
        # Bootstrap failure: mark FAILED and exit without syncing status.
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
        storage.init_storage(job_id=task_id, work_mode=RuntimeConfig.WORK_MODE)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(
            job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                              parent_log_dir=job_log_dir,
                                              append_to_parent_log=True,
                                              force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        # CodePath is '/'-separated: leading segments form the package, the
        # next-to-last is the module file, the last is the class name.
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(
            run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
            '.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(
            job_id=job_id, role=role, party_id=party_id,
            job_parameters=job_parameters, job_args=job_args,
            input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package),
                             run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        # Report RUNNING to the initiator before executing the component.
        TaskExecutor.sync_task_status(job_id=job_id,
                                      component_name=component_name,
                                      task_id=task_id, role=role,
                                      party_id=party_id,
                                      initiator_party_id=job_initiator.get(
                                          'party_id', None),
                                      task_info=task.to_json())
        schedule_logger.info('run {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id))
        schedule_logger.info(parameters)
        schedule_logger.info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        # Save outputs only when the output DSL actually declares them.
        if task_output_dsl:
            if task_output_dsl.get('data', []):
                output_data = run_object.save_data()
                tracker.save_output_data_table(
                    output_data, task_output_dsl.get('data')[0])
            if task_output_dsl.get('model', []):
                output_model = run_object.export_model()
                # There is only one model output at the current dsl version.
                tracker.save_output_model(output_model,
                                          task_output_dsl['model'][0])
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        # Best-effort final status sync; failures are logged but not re-raised.
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(
                job_id=job_id, component_name=component_name,
                task_id=task_id, role=role, party_id=party_id,
                initiator_party_id=job_initiator.get('party_id', None),
                task_info=task.to_json())
        except Exception as e:
            schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name,
                                                     task_id, role, party_id,
                                                     task.f_status))
def save_task(self, role, party_id, task_info):
    """Insert or update the Task row keyed by (job, component, task, role, party).

    :param role: role of the party the task belongs to
    :param party_id: party id the task belongs to
    :param task_info: mapping of Task column names (``f_*``) to new values;
        key fields and values equal to the column default are skipped
    :return: the saved Task record
    """
    with DB.connection_context():
        tasks = Task.select().where(Task.f_job_id == self.job_id,
                                    Task.f_component_name == self.component_name,
                                    Task.f_task_id == self.task_id,
                                    Task.f_role == role,
                                    Task.f_party_id == party_id)
        is_insert = True
        if tasks:
            task = tasks[0]
            is_insert = False
        else:
            task = Task()
            task.f_create_time = current_timestamp()
        task.f_job_id = self.job_id
        task.f_component_name = self.component_name
        task.f_task_id = self.task_id
        task.f_role = role
        task.f_party_id = party_id
        if 'f_status' in task_info:
            if task.f_status in [TaskStatus.COMPLETE, TaskStatus.FAILED]:
                # Termination status cannot be updated
                # TODO:
                pass
        for k, v in task_info.items():
            try:
                if k in ['f_job_id', 'f_component_name', 'f_task_id',
                         'f_role', 'f_party_id'] or v == getattr(Task, k).default:
                    continue
            except Exception:
                # k is not a declared Task column (getattr raised), or the
                # default comparison failed — deliberately fall through and
                # set the attribute anyway. Was a bare `except:`, which also
                # trapped SystemExit/KeyboardInterrupt; narrowed to Exception.
                pass
            setattr(task, k, v)
        if is_insert:
            task.save(force_insert=True)
        else:
            task.save()
        return task
def list_task(limit):
    """Return tasks newest-first, truncated to `limit` rows when limit is positive."""
    query = Task.select().order_by(Task.f_create_time.desc())
    if limit > 0:
        query = query.limit(limit)
    return list(query)
def get_tasks_asc(cls, job_id, role, party_id):
    """Return the latest tasks for a job/role/party, queried in ascending create-time order."""
    matched = Task.query(order_by="create_time", reverse=False,
                         job_id=job_id, role=role, party_id=party_id)
    return cls.get_latest_tasks(tasks=matched)