def __init__(self, job_id: str, role: str, party_id: int, model_id: str = None, model_version: str = None, component_name: str = None, component_module_name: str = None, task_id: str = None):
    """Track one party's runtime data tables and model artifacts for a job component.

    When no component is given, falls back to the virtual 'pipeline'
    component so job-level tracking still has a namespace.
    """
    self.job_id = job_id
    self.role = role
    self.party_id = party_id
    # default to the job-level virtual pipeline component
    self.component_name = component_name or 'pipeline'
    self.module_name = component_module_name or 'Pipeline'
    self.task_id = task_id or job_utils.generate_task_id(job_id=self.job_id, component_name=self.component_name)
    # job-level namespace is the component namespace minus the component segment
    base_parts = ['fate_flow', 'tracking', 'data', self.job_id, self.role, str(self.party_id)]
    self.job_table_namespace = '_'.join(base_parts)
    self.table_namespace = '_'.join(base_parts + [self.component_name])
    self.model_id = model_id
    self.party_model_id = model_utils.gen_party_model_id(model_id=model_id, role=role, party_id=party_id)
    self.model_version = model_version
    # a pipelined model is only bound when both identifier and version are known
    if self.party_model_id and self.model_version:
        self.pipelined_model = pipelined_model.PipelinedModel(model_id=self.party_model_id, model_version=self.model_version)
    else:
        self.pipelined_model = None
def check_task_status(job_id, component, interval=1):
    """Poll every party's task status for *component* until it finishes.

    Returns True when every party reports 'success', False when any
    party reports 'failed' or an unexpected error occurs while polling;
    otherwise sleeps *interval* seconds and polls again.
    """
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component.get_name())
    while True:
        try:
            collected = set()
            for role, all_party_parameters in component.get_role_parameters().items():
                for party_parameters in all_party_parameters:
                    party_id = party_parameters.get('local', {}).get('party_id')
                    tasks = query_task(job_id=job_id, task_id=task_id, role=role, party_id=party_id)
                    # a missing task record means that party has not started yet
                    status = tasks[0].f_status if tasks else 'notRunning'
                    schedule_logger.info('job {} component {} run on {} {} status is {}'.format(job_id, component.get_name(), role, party_id, status))
                    collected.add(status)
            if 'failed' in collected:
                return False
            if collected == {'success'}:
                return True
            time.sleep(interval)
        except Exception as e:
            schedule_logger.exception(e)
            return False
def create_task(cls, role, party_id, run_on_this_party, task_info):
    """Fill the common task fields into *task_info* and persist it.

    Caller-supplied 'task_id'/'task_version' keys are kept; otherwise a
    task id is derived from the job id and component name, and the
    version starts at 0.
    """
    task_info.update({
        "role": role,
        "party_id": party_id,
        "status": TaskStatus.WAITING,
        "party_status": TaskStatus.WAITING,
        "create_time": base_utils.current_timestamp(),
        "run_on_this_party": run_on_this_party,
    })
    # only generate identifiers the caller did not provide
    if "task_id" not in task_info:
        task_info["task_id"] = job_utils.generate_task_id(job_id=task_info["job_id"], component_name=task_info["component_name"])
    if "task_version" not in task_info:
        task_info["task_version"] = 0
    JobSaver.create_task(task_info=task_info)
def schedule(cls, job, dsl_parser, canceled=False):
    """Run one scheduling round for *job*'s tasks on the initiator.

    Refreshes every task's federated status, syncs/stops tasks whose
    status changed, then tries to start each WAITING task whose upstream
    dependencies have all succeeded (unless *canceled*).

    Returns (scheduling_status_code, all initiator tasks).
    """
    schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    for initiator_task in initiator_tasks_group.values():
        # collect all party task party status
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            # PULL mode: the initiator actively queries the parties; only
            # re-collect when statuses disagree or a task is still running
            tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
            if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                cls.collect_task_of_all_party(job=job, task=initiator_task)
        new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
        task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            # propagate the new status to all parties
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            # task just reached an end status: tell every party to stop it
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[
                    JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                      role=job.f_role,
                                      party_id=job.f_party_id
                                      )
                ]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # can not start task
                    break
            else:
                # all upstream dependent tasks have been successful, can start this task
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # wait for the next round of scheduling
                    schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    break
    else:
        schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, initiator_tasks_group.values()
def create_task(cls, role, party_id, run_on_this_party, task_info):
    """Populate *task_info* with defaults, persist it, and — for a task
    that runs on this party — snapshot the job conf it will use.
    """
    task_info.update({
        "role": role,
        "party_id": str(party_id),
        "status": TaskStatus.WAITING,
        "party_status": TaskStatus.WAITING,
        "create_time": base_utils.current_timestamp(),
        "run_on_this_party": run_on_this_party,
    })
    # treat both a missing key and an explicit None as "not supplied"
    if task_info.get("task_id") is None:
        task_info["task_id"] = job_utils.generate_task_id(job_id=task_info["job_id"], component_name=task_info["component_name"])
    if task_info.get("task_version") is None:
        task_info["task_version"] = 0
    created = JobSaver.create_task(task_info=task_info)
    # only a locally-executing task needs its job conf saved alongside it
    if created and run_on_this_party:
        job_utils.save_task_using_job_conf(created)
def run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component):
    """Dispatch *component* to every party, wait for it, then recurse into
    its downstream components.

    Returns True when this component and every downstream component
    succeed, False otherwise.
    """
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    module_name = component.get_module()
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component_name)
    schedule_logger.info('job {} run component {}'.format(job_id, component_name))
    for role, partys_parameters in parameters.items():
        for party_index in range(len(partys_parameters)):
            party_parameters = partys_parameters[party_index]
            # job args are per-role, per-party; absent roles get no args
            if role in job_args:
                party_job_args = job_args[role][party_index]['args']
            else:
                party_job_args = {}
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            # fire-and-forget: each party starts its own task runner
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/{}/{}/run'.format(API_VERSION, job_id, component_name, task_id, role, dest_party_id),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=dest_party_id,
                          json_body={'job_parameters': job_parameters,
                                     'job_initiator': job_initiator,
                                     'job_args': party_job_args,
                                     'parameters': party_parameters,
                                     'module_name': module_name,
                                     'input': component.get_input(),
                                     'output': component.get_output()},
                          work_mode=job_parameters['work_mode'])
    # block until every party's task reaches an end status
    component_task_status = TaskScheduler.check_task_status(job_id=job_id, component=component)
    if component_task_status:
        task_success = True
    else:
        task_success = False
    schedule_logger.info('job {} component {} run {}'.format(job_id, component_name, 'success' if task_success else 'failed'))
    # update progress
    TaskScheduler.sync_job_status(job_id=job_id,
                                  roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  job_info=job_utils.update_job_progress(job_id=job_id, dag=dag, current_task_id=task_id).to_json())
    if task_success:
        next_components = dag.get_next_components(component_name)
        schedule_logger.info('job {} component {} next components is {}'.format(job_id, component_name, [next_component.get_name() for next_component in next_components]))
        for next_component in next_components:
            try:
                schedule_logger.info('job {} check component {} dependencies status'.format(job_id, next_component.get_name()))
                # a downstream component may have several upstreams; only run
                # it once they have all completed
                dependencies_status = TaskScheduler.check_dependencies(job_id=job_id, dag=dag, component=next_component)
                schedule_logger.info('job {} component {} dependencies status is {}'.format(job_id, next_component.get_name(), dependencies_status))
                if dependencies_status:
                    # depth-first recursion through the DAG
                    run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, next_component)
                else:
                    run_status = False
            except Exception as e:
                schedule_logger.info(e)
                run_status = False
            if not run_status:
                return False
        return True
    else:
        return False
def run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component):
    """Dispatch *component* to every party, wait for it, then recurse into
    its downstream components.

    Returns True when this component and every downstream component
    succeed, False otherwise. On failure the job is stopped with FAILED,
    or TIMEOUT when the status check returned no result at all.

    Raises:
        Exception: when a party rejects the run as not authorized.
    """
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    module_name = component.get_module()
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component_name)
    schedule_logger(job_id).info('job {} run component {}'.format(job_id, component_name))
    for role, partys_parameters in parameters.items():
        for party_index in range(len(partys_parameters)):
            party_parameters = partys_parameters[party_index]
            # job args are per-role, per-party; absent roles get no args
            if role in job_args:
                party_job_args = job_args[role][party_index]['args']
            else:
                party_job_args = {}
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            # each party starts its own task runner; job_server tells it
            # where to report back
            response = federated_api(job_id=job_id, method='POST',
                                     endpoint='/{}/schedule/{}/{}/{}/{}/{}/run'.format(API_VERSION, job_id, component_name, task_id, role, dest_party_id),
                                     src_party_id=job_initiator['party_id'],
                                     dest_party_id=dest_party_id,
                                     src_role=job_initiator['role'],
                                     json_body={'job_parameters': job_parameters,
                                                'job_initiator': job_initiator,
                                                'job_args': party_job_args,
                                                'parameters': party_parameters,
                                                'module_name': module_name,
                                                'input': component.get_input(),
                                                'output': component.get_output(),
                                                'job_server': {'ip': get_lan_ip(), 'http_port': RuntimeConfig.HTTP_PORT}},
                                     work_mode=job_parameters['work_mode'])
            if response['retcode']:
                # authorization failures are fatal; other non-zero codes are
                # left to the status check below
                if 'not authorized' in response['retmsg']:
                    raise Exception('run component {} not authorized'.format(component_name))
    # block until every party's task reaches an end status, and make sure
    # the job itself is still alive
    component_task_status = TaskScheduler.check_task_status(job_id=job_id, component=component)
    job_status = TaskScheduler.check_job_status(job_id)
    if component_task_status and job_status:
        task_success = True
    else:
        task_success = False
    schedule_logger(job_id).info(
        'job {} component {} run {}'.format(job_id, component_name, 'success' if task_success else 'failed'))
    # update progress
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  initiator_role=job_initiator['role'],
                                  job_info=job_utils.update_job_progress(job_id=job_id, dag=dag, current_task_id=task_id).to_json())
    # release this component's resources regardless of outcome
    TaskScheduler.stop(job_id=job_id, component_name=component_name)
    if task_success:
        next_components = dag.get_next_components(component_name)
        schedule_logger(job_id).info('job {} component {} next components is {}'.format(job_id, component_name,
                                                                                       [next_component.get_name() for next_component in next_components]))
        for next_component in next_components:
            try:
                schedule_logger(job_id).info(
                    'job {} check component {} dependencies status'.format(job_id, next_component.get_name()))
                # a downstream component may have several upstreams; only run
                # it once they have all completed and the job is still alive
                dependencies_status = TaskScheduler.check_dependencies(job_id=job_id, dag=dag, component=next_component)
                job_status = TaskScheduler.check_job_status(job_id)
                schedule_logger(job_id).info(
                    'job {} component {} dependencies status is {}, job status is {}'.format(job_id, next_component.get_name(),
                                                                                             dependencies_status, job_status))
                if dependencies_status and job_status:
                    # depth-first recursion through the DAG
                    run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, next_component)
                else:
                    run_status = False
            except Exception as e:
                schedule_logger(job_id).exception(e)
                run_status = False
            if not run_status:
                return False
        return True
    else:
        # fix: compare to None with `is`, not `==` (PEP 8); a None status
        # means the check itself timed out rather than reporting failure
        if component_task_status is None:
            end_status = JobStatus.TIMEOUT
        else:
            end_status = JobStatus.FAILED
        TaskScheduler.stop(job_id=job_id, end_status=end_status)
        return False
def schedule(cls, job, dsl_parser, canceled=False):
    """Run one scheduling round for *job*'s tasks on the initiator.

    Refreshes every task's federated status, syncs/stops tasks whose
    status changed, queues ended tasks eligible for auto-rerun, then
    tries to start each WAITING task whose upstream dependencies have
    all succeeded (unless *canceled*).

    Returns (scheduling_status_code, auto_rerun_tasks, all initiator tasks).
    """
    schedule_logger(job.f_job_id).info("scheduling job tasks")
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    auto_rerun_tasks = []
    for initiator_task in initiator_tasks_group.values():
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            # collect all parties task party status and store it in the database now
            cls.collect_task_of_all_party(job=job, initiator_task=initiator_task)
        else:
            # all parties report task party status and store it in the initiator database when federated_status_collect_type is push
            pass
        # get all parties party task status and calculate
        new_task_status = cls.get_federated_task_status(job_id=initiator_task.f_job_id,
                                                        task_id=initiator_task.f_task_id,
                                                        task_version=initiator_task.f_task_version)
    task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            # propagate the new status to all parties
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            # task just reached an end status: tell every party to stop it
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)
            # rerunnable end statuses are queued while retries remain
            if not canceled and AutoRerunStatus.contains(initiator_task.f_status):
                if initiator_task.f_auto_retries > 0:
                    auto_rerun_tasks.append(initiator_task)
                    schedule_logger(job.f_job_id).info(f"task {initiator_task.f_task_id} {initiator_task.f_status} will be retried")
                else:
                    schedule_logger(job.f_job_id).info(f"task {initiator_task.f_task_id} {initiator_task.f_status} has no retry count")
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id,
                                                                                                            component_name=component.get_name()),
                                                                         role=job.f_role,
                                                                         party_id=job.f_party_id)]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # can not start task
                    break
            else:
                # all upstream dependent tasks have been successful, can start this task
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # wait for the next round of scheduling
                    schedule_logger(job.f_job_id).info(f"task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    # starting the task failed outright: mark and sync FAILED
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    waiting_task.f_status = StatusSet.FAILED
                    FederatedScheduler.sync_task_status(job, waiting_task)
                    break
    else:
        schedule_logger(job.f_job_id).info("have cancel signal, pass start job tasks")
    schedule_logger(job.f_job_id).info("finish scheduling job tasks")
    return scheduling_status_code, auto_rerun_tasks, initiator_tasks_group.values()