예제 #1
0
 def __init__(self,
              job_id: str,
              role: str,
              party_id: int,
              model_id: str = None,
              model_version: str = None,
              component_name: str = None,
              component_module_name: str = None,
              task_id: str = None):
     """Bind tracking context for one job/component on one party.

     Defaults to the job-level 'pipeline' pseudo component when no
     component is given, derives a task id when none is supplied, and
     precomputes the tracking table namespaces.
     """
     self.job_id = job_id
     self.role = role
     self.party_id = party_id
     # fall back to the job-level pseudo component / module
     self.component_name = component_name or 'pipeline'
     self.module_name = component_module_name or 'Pipeline'
     self.task_id = task_id or job_utils.generate_task_id(
         job_id=self.job_id, component_name=self.component_name)
     # shared prefix for both tracking namespaces
     namespace_parts = [
         'fate_flow', 'tracking', 'data', self.job_id, self.role,
         str(self.party_id)
     ]
     self.table_namespace = '_'.join(namespace_parts + [self.component_name])
     self.job_table_namespace = '_'.join(namespace_parts)
     self.model_id = model_id
     self.party_model_id = model_utils.gen_party_model_id(model_id=model_id,
                                                          role=role,
                                                          party_id=party_id)
     self.model_version = model_version
     # a pipelined model handle exists only when both id and version are known
     self.pipelined_model = (
         pipelined_model.PipelinedModel(model_id=self.party_model_id,
                                        model_version=self.model_version)
         if self.party_model_id and self.model_version else None)
예제 #2
0
 def check_task_status(job_id, component, interval=1):
     """Poll every party's task status for *component* until it settles.

     Returns True when all parties report 'success', False when any party
     reports 'failed' (or an exception occurs while polling); otherwise
     sleeps *interval* seconds and polls again.
     """
     task_id = job_utils.generate_task_id(
         job_id=job_id, component_name=component.get_name())
     while True:
         try:
             seen_statuses = set()
             for role_name, parties_parameters in component.get_role_parameters().items():
                 for party_parameters in parties_parameters:
                     party_id = party_parameters.get('local',
                                                     {}).get('party_id')
                     matched_tasks = query_task(job_id=job_id,
                                                task_id=task_id,
                                                role=role_name,
                                                party_id=party_id)
                     # a party with no task record yet counts as not running
                     task_status = matched_tasks[0].f_status if matched_tasks else 'notRunning'
                     schedule_logger.info(
                         'job {} component {} run on {} {} status is {}'.
                         format(job_id, component.get_name(), role_name,
                                party_id, task_status))
                     seen_statuses.add(task_status)
             if 'failed' in seen_statuses:
                 return False
             if seen_statuses == {'success'}:
                 return True
             time.sleep(interval)
         except Exception as e:
             schedule_logger.exception(e)
             return False
예제 #3
0
 def create_task(cls, role, party_id, run_on_this_party, task_info):
     """Populate *task_info* with party/status/timing fields and persist it.

     :param role: role this task runs as; written into task_info.
     :param party_id: party the task belongs to; written into task_info.
     :param run_on_this_party: whether this party actually executes the task.
     :param task_info: mutable dict describing the task; mutated in place
         and then handed to JobSaver.create_task.
     """
     task_info["role"] = role
     task_info["party_id"] = party_id
     task_info["status"] = TaskStatus.WAITING
     task_info["party_status"] = TaskStatus.WAITING
     task_info["create_time"] = base_utils.current_timestamp()
     task_info["run_on_this_party"] = run_on_this_party
     # Explicit None checks (not "key not in dict"): a caller that passed
     # task_id=None or task_version=None would otherwise keep the None and
     # never get a generated value.
     if task_info.get("task_id") is None:
         task_info["task_id"] = job_utils.generate_task_id(job_id=task_info["job_id"], component_name=task_info["component_name"])
     if task_info.get("task_version") is None:
         task_info["task_version"] = 0
     JobSaver.create_task(task_info=task_info)
예제 #4
0
    def schedule(cls, job, dsl_parser, canceled=False):
        """Run one scheduling round over this job's tasks on the initiator.

        Refreshes each task's federated status, syncs/stops tasks whose
        status changed, and (unless canceled) starts WAITING tasks whose
        upstream dependencies have all succeeded.

        :param job: job record; f_job_id/f_role/f_party_id and
            f_runtime_conf_on_party are read here.
        :param dsl_parser: resolves a component's upstream dependencies.
        :param canceled: when True, no new task is started this round.
        :return: (scheduling_status_code, all initiator task records)
        """
        schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
        waiting_tasks = []
        for initiator_task in initiator_tasks_group.values():
            # collect all party task party status
            # PULL mode: the initiator actively fetches each party's status
            # instead of waiting for parties to push it.
            if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
                tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
                tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
                # only re-collect when parties disagree or something is still running
                if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                    cls.collect_task_of_all_party(job=job, task=initiator_task)
            new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                # broadcast the refreshed status to every participant
                FederatedScheduler.sync_task_status(job=job, task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
                # task just reached a terminal status: have every party stop it
                FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                # start only when every upstream dependency succeeded
                for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[
                        JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                          role=job.f_role,
                                          party_id=job.f_party_id
                                          )
                    ]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        break
        else:
            schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
        schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
        return scheduling_status_code, initiator_tasks_group.values()
예제 #5
0
    def create_task(cls, role, party_id, run_on_this_party, task_info):
        """Fill *task_info* with party/status/timing fields, persist it, and
        save the job conf for tasks that run on this party.
        """
        task_info.update({
            "role": role,
            "party_id": str(party_id),
            "status": TaskStatus.WAITING,
            "party_status": TaskStatus.WAITING,
            "create_time": base_utils.current_timestamp(),
            "run_on_this_party": run_on_this_party,
        })
        # generate defaults only when the caller supplied nothing (or None)
        if task_info.get("task_id") is None:
            task_info["task_id"] = job_utils.generate_task_id(
                job_id=task_info["job_id"],
                component_name=task_info["component_name"])
        if task_info.get("task_version") is None:
            task_info["task_version"] = 0

        created_task = JobSaver.create_task(task_info=task_info)
        if created_task and run_on_this_party:
            job_utils.save_task_using_job_conf(created_task)
예제 #6
0
    def run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                      job_args, dag, component):
        """Run one DAG component on every party, then recurse into successors.

        Fires the /run endpoint on each destination party, waits for all
        parties' task status, syncs job progress, and depth-first runs each
        next component whose dependencies are satisfied.

        :return: True when this component and all downstream components
            succeed, False otherwise.
        """
        parameters = component.get_role_parameters()
        component_name = component.get_name()
        module_name = component.get_module()
        task_id = job_utils.generate_task_id(job_id=job_id,
                                             component_name=component_name)
        schedule_logger.info('job {} run component {}'.format(
            job_id, component_name))
        for role, partys_parameters in parameters.items():
            for party_index in range(len(partys_parameters)):
                party_parameters = partys_parameters[party_index]
                # job args are only defined for roles present in job_args
                if role in job_args:
                    party_job_args = job_args[role][party_index]['args']
                else:
                    party_job_args = {}
                dest_party_id = party_parameters.get('local',
                                                     {}).get('party_id')

                # ask the destination party to start this task
                federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/job/{}/{}/{}/{}/{}/run'.format(
                                  API_VERSION, job_id, component_name, task_id,
                                  role, dest_party_id),
                              src_party_id=job_initiator['party_id'],
                              dest_party_id=dest_party_id,
                              json_body={
                                  'job_parameters': job_parameters,
                                  'job_initiator': job_initiator,
                                  'job_args': party_job_args,
                                  'parameters': party_parameters,
                                  'module_name': module_name,
                                  'input': component.get_input(),
                                  'output': component.get_output()
                              },
                              work_mode=job_parameters['work_mode'])
        # blocks until every party's task reaches a terminal status
        component_task_status = TaskScheduler.check_task_status(
            job_id=job_id, component=component)
        if component_task_status:
            task_success = True
        else:
            task_success = False
        schedule_logger.info('job {} component {} run {}'.format(
            job_id, component_name, 'success' if task_success else 'failed'))
        # update progress
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job_utils.update_job_progress(
                job_id=job_id, dag=dag, current_task_id=task_id).to_json())
        if task_success:
            next_components = dag.get_next_components(component_name)
            schedule_logger.info(
                'job {} component {} next components is {}'.format(
                    job_id, component_name, [
                        next_component.get_name()
                        for next_component in next_components
                    ]))
            for next_component in next_components:
                try:
                    schedule_logger.info(
                        'job {} check component {} dependencies status'.format(
                            job_id, next_component.get_name()))
                    dependencies_status = TaskScheduler.check_dependencies(
                        job_id=job_id, dag=dag, component=next_component)
                    schedule_logger.info(
                        'job {} component {} dependencies status is {}'.format(
                            job_id, next_component.get_name(),
                            dependencies_status))
                    if dependencies_status:
                        # depth-first recursion into the ready successor
                        run_status = TaskScheduler.run_component(
                            job_id, job_runtime_conf, job_parameters,
                            job_initiator, job_args, dag, next_component)
                    else:
                        run_status = False
                except Exception as e:
                    schedule_logger.info(e)
                    run_status = False
                # any failed successor fails the whole branch
                if not run_status:
                    return False
            return True
        else:
            return False
예제 #7
0
    def run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component):
        """Run one DAG component on every party, then recurse into successors.

        Fires the /run endpoint on each destination party (raising on an
        authorization failure), waits for all parties' task status plus the
        overall job status, syncs progress, and depth-first runs each next
        component whose dependencies are satisfied. On failure the job is
        stopped with TIMEOUT (status check returned None) or FAILED.

        :return: True when this component and all downstream components
            succeed, False otherwise.
        :raises Exception: when a party rejects the run as not authorized.
        """
        parameters = component.get_role_parameters()
        component_name = component.get_name()
        module_name = component.get_module()
        task_id = job_utils.generate_task_id(job_id=job_id, component_name=component_name)
        schedule_logger(job_id).info('job {} run component {}'.format(job_id, component_name))
        for role, partys_parameters in parameters.items():
            for party_index in range(len(partys_parameters)):
                party_parameters = partys_parameters[party_index]
                # job args are only defined for roles present in job_args
                if role in job_args:
                    party_job_args = job_args[role][party_index]['args']
                else:
                    party_job_args = {}
                dest_party_id = party_parameters.get('local', {}).get('party_id')

                # ask the destination party to start this task
                response = federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/schedule/{}/{}/{}/{}/{}/run'.format(
                                  API_VERSION,
                                  job_id,
                                  component_name,
                                  task_id,
                                  role,
                                  dest_party_id),
                              src_party_id=job_initiator['party_id'],
                              dest_party_id=dest_party_id,
                              src_role=job_initiator['role'],
                              json_body={'job_parameters': job_parameters,
                                         'job_initiator': job_initiator,
                                         'job_args': party_job_args,
                                         'parameters': party_parameters,
                                         'module_name': module_name,
                                         'input': component.get_input(),
                                         'output': component.get_output(),
                                         'job_server': {'ip': get_lan_ip(), 'http_port': RuntimeConfig.HTTP_PORT}},
                              work_mode=job_parameters['work_mode'])
                # non-zero retcode signals an error on the remote party
                if response['retcode']:
                    if 'not authorized' in response['retmsg']:
                        raise Exception('run component {} not authorized'.format(component_name))
        # blocks until every party's task reaches a terminal status
        component_task_status = TaskScheduler.check_task_status(job_id=job_id, component=component)
        job_status = TaskScheduler.check_job_status(job_id)
        # success requires both the component tasks and the job to be healthy
        task_success = bool(component_task_status and job_status)
        schedule_logger(job_id).info(
            'job {} component {} run {}'.format(job_id, component_name, 'success' if task_success else 'failed'))
        # update progress
        TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job_utils.update_job_progress(job_id=job_id, dag=dag,
                                                                             current_task_id=task_id).to_json())
        TaskScheduler.stop(job_id=job_id, component_name=component_name)
        if task_success:
            next_components = dag.get_next_components(component_name)
            schedule_logger(job_id).info('job {} component {} next components is {}'.format(job_id, component_name,
                                                                                    [next_component.get_name() for
                                                                                     next_component in
                                                                                     next_components]))
            for next_component in next_components:
                try:
                    schedule_logger(job_id).info(
                        'job {} check component {} dependencies status'.format(job_id, next_component.get_name()))
                    dependencies_status = TaskScheduler.check_dependencies(job_id=job_id, dag=dag,
                                                                           component=next_component)
                    job_status = TaskScheduler.check_job_status(job_id)
                    schedule_logger(job_id).info(
                        'job {} component {} dependencies status is {}, job status is {}'.format(job_id, next_component.get_name(),
                                                                               dependencies_status, job_status))
                    if dependencies_status and job_status:
                        # depth-first recursion into the ready successor
                        run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters,
                                                                 job_initiator, job_args, dag,
                                                                 next_component)
                    else:
                        run_status = False
                except Exception as e:
                    schedule_logger(job_id).exception(e)
                    run_status = False
                # any failed successor fails the whole branch
                if not run_status:
                    return False
            return True
        else:
            # None from the status check means the wait timed out rather
            # than a party reporting failure. Use "is None" (PEP 8), not
            # "== None", for the identity test.
            if component_task_status is None:
                end_status = JobStatus.TIMEOUT
            else:
                end_status = JobStatus.FAILED
            TaskScheduler.stop(job_id=job_id, end_status=end_status)
            return False
예제 #8
0
    def schedule(cls, job, dsl_parser, canceled=False):
        """Run one scheduling round over this job's tasks on the initiator.

        Refreshes each task's federated status, stops tasks that reached a
        terminal status (queueing eligible ones for auto-rerun), and unless
        canceled starts WAITING tasks whose upstream dependencies all
        succeeded.

        :param job: job record; f_job_id/f_role/f_party_id and
            f_runtime_conf_on_party are read here.
        :param dsl_parser: resolves a component's upstream dependencies.
        :param canceled: when True, no new task is started this round.
        :return: (scheduling_status_code, auto_rerun_tasks,
            all initiator task records)
        """
        schedule_logger(job.f_job_id).info("scheduling job tasks")
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id,
                                                       role=job.f_role,
                                                       party_id=job.f_party_id)
        waiting_tasks = []
        auto_rerun_tasks = []
        for initiator_task in initiator_tasks_group.values():
            if job.f_runtime_conf_on_party["job_parameters"][
                    "federated_status_collect_type"] == FederatedCommunicationType.PULL:
                # collect all parties task party status and store it in the database now
                cls.collect_task_of_all_party(job=job,
                                              initiator_task=initiator_task)
            else:
                # all parties report task party status and store it in the initiator database when federated_status_collect_type is push
                pass
            # get all parties party task status and calculate
            new_task_status = cls.get_federated_task_status(
                job_id=initiator_task.f_job_id,
                task_id=initiator_task.f_task_id,
                task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                # broadcast the refreshed status to every participant
                FederatedScheduler.sync_task_status(job=job,
                                                    task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(
                    initiator_task.f_status):
                # task just reached a terminal status: have every party stop it
                FederatedScheduler.stop_task(
                    job=job,
                    task=initiator_task,
                    stop_status=initiator_task.f_status)
                # queue for auto-rerun when the terminal status is retryable
                # and the task still has retries left
                if not canceled and AutoRerunStatus.contains(
                        initiator_task.f_status):
                    if initiator_task.f_auto_retries > 0:
                        auto_rerun_tasks.append(initiator_task)
                        schedule_logger(job.f_job_id).info(
                            f"task {initiator_task.f_task_id} {initiator_task.f_status} will be retried"
                        )
                    else:
                        schedule_logger(job.f_job_id).info(
                            f"task {initiator_task.f_task_id} {initiator_task.f_status} has no retry count"
                        )

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                # start only when every upstream dependency succeeded
                for component in dsl_parser.get_upstream_dependent_components(
                        component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[JobSaver.task_key(
                        task_id=job_utils.generate_task_id(
                            job_id=job.f_job_id,
                            component_name=component.get_name()),
                        role=job.f_role,
                        party_id=job.f_party_id)]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job.f_job_id).info(
                            f"task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling"
                        )
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        waiting_task.f_status = StatusSet.FAILED
                        FederatedScheduler.sync_task_status(job, waiting_task)
                        break
        else:
            schedule_logger(
                job.f_job_id).info("have cancel signal, pass start job tasks")
        schedule_logger(job.f_job_id).info("finish scheduling job tasks")
        return scheduling_status_code, auto_rerun_tasks, initiator_tasks_group.values(
        )