Example #1
 def detect_running_task(cls):
     detect_logger().info('start to detect running task...')
     count = 0
     try:
         running_tasks = JobSaver.query_task(
             party_status=TaskStatus.RUNNING, only_latest=False)
         stop_job_ids = set()
         for task in running_tasks:
             if not task.f_engine_conf or task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST or not task.f_run_on_this_party:
                 continue
             count += 1
             try:
                 process_exist = build_engine(task.f_engine_conf.get("computing_engine")).is_alive(task)
                 if not process_exist:
                     msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                     detect_logger(job_id=task.f_job_id).info(
                         f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist"
                     )
                     time.sleep(3)
                     _tasks = JobSaver.query_task(
                         task_id=task.f_task_id,
                         task_version=task.f_task_version,
                         role=task.f_role,
                         party_id=task.f_party_id)
                     if _tasks:
                         if _tasks[0].f_party_status == TaskStatus.RUNNING:
                             stop_job_ids.add(task.f_job_id)
                             detect_logger(task.f_job_id).info(
                                 f"{msg} party status has been checked twice, try to stop job"
                             )
                         else:
                             detect_logger(task.f_job_id).info(
                                 f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again"
                             )
                     else:
                         detect_logger(task.f_job_id).warning(
                             f"{msg} can not found on db")
             except Exception as e:
                 detect_logger(job_id=task.f_job_id).exception(e)
         if stop_job_ids:
             detect_logger().info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         stop_jobs = set()
         for job_id in stop_job_ids:
             jobs = JobSaver.query_job(job_id=job_id)
             if jobs:
                 stop_jobs.add(jobs[0])
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.FAILED)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info(f"finish detect {count} running task")
Example #2
def upload_history():
    request_data = request.json
    if request_data.get('job_id'):
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS, job_id=request_data.get('job_id'), run_on_this_party=True)
    else:
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS, run_on_this_party=True)
    limit = request_data.get('limit')
    if not limit:
        tasks = tasks[-1::-1]
    else:
        tasks = tasks[-1:-limit - 1:-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    data = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=data)
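
The two slices differ only in how much history they keep: with no limit the whole task list is reversed (newest first); with a limit, the last `limit` tasks are taken in reverse. A quick demonstration of the slice semantics:

    tasks = list(range(10))                      # oldest .. newest
    limit = 3
    assert tasks[-1::-1] == tasks[::-1]          # full reverse, newest first
    assert tasks[-1:-limit - 1:-1] == [9, 8, 7]  # last `limit` items, newest first
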
Example #3
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     tasks_on_all_party = JobSaver.query_task(
         task_id=initiator_task.f_task_id,
         task_version=initiator_task.f_task_version)
     tasks_status_on_all = {task.f_status for task in tasks_on_all_party}
     if len(tasks_status_on_all) <= 1 and TaskStatus.RUNNING not in tasks_status_on_all:
         return
     status, federated_response = FederatedScheduler.collect_task(
         job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(
             f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
         )
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             if party_response["retcode"] == RetCode.SUCCESS:
                 JobSaver.update_task_status(
                     task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
             elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
Example #4
def query_task():
    tasks = JobSaver.query_task(**request.json)
    if not tasks:
        return get_json_result(retcode=101, retmsg='find task failed')
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=[task.to_json() for task in tasks])
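
Because the handler forwards request.json directly into JobSaver.query_task, any queryable task field can be posted as a filter. A client-side sketch (the route path and port are assumptions for illustration, not taken from the snippet):

    import requests  # third-party HTTP client, assumed installed

    # hypothetical route and port; check the fate_flow app for the real ones
    resp = requests.post(
        "http://127.0.0.1:9380/v1/job/task/query",
        json={"job_id": "202201010000000000000", "role": "guest"},
    )
    print(resp.json()["retmsg"])
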
Example #5
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                           runtime_conf=job.f_runtime_conf_on_party,
                                                           train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"], only_latest=True)
            for task in tasks:
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                           runtime_conf=job_info.get('job_runtime_conf', {}),
                                                           train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
Example #6
def component_output_data_download():
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(
            f'task not found, please check if the parameters are correct: {request_data}'
        )
    import_component_output_depend(tasks[0].f_provider_info)
    try:
        output_tables_meta = get_component_output_tables_meta(
            task_data=request_data)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
    limit = request_data.get('limit', -1)
    if not output_tables_meta:
        return error_response(response_code=210, retmsg='no data')
    if limit == 0:
        return error_response(response_code=210, retmsg='limit is 0')
    tar_file_name = 'job_{}_{}_{}_{}_output_data.tar.gz'.format(
        request_data['job_id'], request_data['component_name'],
        request_data['role'], request_data['party_id'])
    return TableStorage.send_table(output_tables_meta,
                                   tar_file_name,
                                   limit=limit,
                                   need_head=request_data.get("head", True))
Example #7
 def report_task_to_initiator(cls, task_info):
     tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                 task_version=task_info["task_version"],
                                 role=task_info["role"],
                                 party_id=task_info["party_id"])
     if tasks and tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
         FederatedScheduler.report_task_to_initiator(task=tasks[0])
Example #8
 def load_tasks(cls, component_list, job_id, role, party_id):
     tasks = JobSaver.query_task(job_id=job_id,
                                 role=role,
                                 party_id=party_id,
                                 only_latest=True)
     task_dict = {}
     for cpn in component_list:
         for task in tasks:
             if cpn == task.f_component_name:
                 task_dict[cpn] = task
     return task_dict
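
Since only_latest=True yields at most one task per component, the nested loop can be collapsed into a single pass without changing the result. A behavior-equivalent sketch on stand-in data:

    from collections import namedtuple

    Task = namedtuple("Task", "f_component_name")
    tasks = [Task("reader_0"), Task("hetero_lr_0")]
    component_list = ["reader_0", "hetero_lr_0", "evaluation_0"]

    wanted = set(component_list)
    task_dict = {t.f_component_name: t for t in tasks if t.f_component_name in wanted}
    assert set(task_dict) == {"reader_0", "hetero_lr_0"}
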
Example #9
def component_output_data():
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(
            f'task not found, please check if the parameters are correct: {request_data}'
        )
    import_component_output_depend(tasks[0].f_provider_info)
    output_tables_meta = get_component_output_tables_meta(
        task_data=request_data)
    if not output_tables_meta:
        return get_json_result(retcode=0, retmsg='no data', data=[])
    output_data_list = []
    headers = []
    totals = []
    data_names = []
    for output_name, output_table_meta in output_tables_meta.items():
        output_data = []
        is_str = False
        if output_table_meta:
            for k, v in output_table_meta.get_part_of_data():
                data_line, is_str, extend_header = feature_utils.get_component_output_data_line(
                    src_key=k,
                    src_value=v,
                    schema=output_table_meta.get_schema())
                output_data.append(data_line)
            total = output_table_meta.get_count()
            output_data_list.append(output_data)
            data_names.append(output_name)
            totals.append(total)
        if output_data:
            header = get_component_output_data_schema(
                output_table_meta=output_table_meta,
                is_str=is_str,
                extend_header=extend_header)
            headers.append(header)
        else:
            headers.append(None)
    if len(output_data_list) == 1 and not output_data_list[0]:
        return get_json_result(retcode=0, retmsg='no data', data=[])
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=output_data_list,
                           meta={
                               'header': headers,
                               'total': totals,
                               'names': data_names
                           })
Example #10
def component_parameters():
    request_data = request.json
    check_request_parameters(request_data)
    tasks = JobSaver.query_task(only_latest=True, **request_data)
    if not tasks:
        return get_json_result(retcode=101, retmsg='cannot find this task')
    parameters = tasks[0].f_component_parameters
    output_parameters = {}
    output_parameters['module'] = parameters.get('module', '')
    for p_k, p_v in parameters.items():
        if p_k.endswith('Param'):
            output_parameters[p_k] = p_v
    return get_json_result(retcode=0, retmsg='success', data=output_parameters)
Example #11
 def collect_task(cls, job_id, component_name, task_id, task_version, role,
                  party_id):
     tasks = JobSaver.query_task(job_id=job_id,
                                 component_name=component_name,
                                 task_id=task_id,
                                 task_version=task_version,
                                 role=role,
                                 party_id=party_id)
     if tasks:
         return tasks[0].to_human_model_dict(
             only_primary_with=cls.INITIATOR_COLLECT_FIELDS)
     else:
         return None
Example #12
 def start_clean_job(cls, **kwargs):
     tasks = JobSaver.query_task(**kwargs)
     if tasks:
         for task in tasks:
             try:
                 # clean session
                 stat_logger.info('start {} {} {} {} session stop'.format(
                     task.f_job_id, task.f_role, task.f_party_id,
                     task.f_component_name))
                 start_session_stop(task)
                 stat_logger.info('stop {} {} {} {} session success'.format(
                     task.f_job_id, task.f_role, task.f_party_id,
                     task.f_component_name))
             except Exception:
                 # best-effort: a session-stop failure must not block table or metric cleanup
                 pass
             try:
                 # clean data table
                 JobClean.clean_table(job_id=task.f_job_id,
                                      role=task.f_role,
                                      party_id=task.f_party_id,
                                      component_name=task.f_component_name)
             except Exception as e:
                 stat_logger.info(
                     'delete {} {} {} {} data table failed'.format(
                         task.f_job_id, task.f_role, task.f_party_id,
                         task.f_component_name))
                 stat_logger.exception(e)
             try:
                 # clean metric data
                 stat_logger.info(
                     'start delete {} {} {} {} metric data'.format(
                         task.f_job_id, task.f_role, task.f_party_id,
                         task.f_component_name))
                 delete_metric_data({
                     'job_id': task.f_job_id,
                     'role': task.f_role,
                     'party_id': task.f_party_id,
                     'component_name': task.f_component_name
                 })
                 stat_logger.info(
                     'delete {} {} {} {} metric data success'.format(
                         task.f_job_id, task.f_role, task.f_party_id,
                         task.f_component_name))
             except Exception as e:
                 stat_logger.info(
                     'delete {} {} {} {} metric data failed'.format(
                         task.f_job_id, task.f_role, task.f_party_id,
                         task.f_component_name))
                 stat_logger.exception(e)
     else:
         raise Exception('no task found')
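
Each cleanup step above sits in its own try block so that a session-stop failure cannot block table cleanup, and a table failure cannot block metric cleanup. The same isolation pattern, reduced to a self-contained sketch:

    def run_all(steps):
        """Run every cleanup step; collect failures instead of aborting."""
        failures = []
        for name, step in steps:
            try:
                step()
            except Exception as e:  # isolate: one failure must not stop the rest
                failures.append((name, e))
        return failures

    def fail():
        raise RuntimeError("boom")

    failures = run_all([("session", lambda: None), ("table", fail)])
    assert [name for name, _ in failures] == ["table"]
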
Example #13
def stop_task(job_id, component_name, task_id, task_version, role, party_id,
              stop_status):
    tasks = JobSaver.query_task(job_id=job_id,
                                task_id=task_id,
                                task_version=task_version,
                                role=role,
                                party_id=int(party_id))
    kill_status = True
    for task in tasks:
        kill_status = kill_status & TaskController.stop_task(
            task=task, stop_status=stop_status)
    return get_json_result(
        retcode=RetCode.SUCCESS if kill_status else RetCode.EXCEPTION_ERROR,
        retmsg='success' if kill_status else 'failed')
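
The bitwise & accumulation is deliberate: every task gets a stop attempt even after one fails. The same intent written with all() requires a list, since a bare generator would short-circuit and skip the remaining stop calls:

    def stop(ok):
        print("stop called")
        return ok

    results = [stop(ok) for ok in (True, False, True)]  # all three calls run
    kill_status = all(results)  # False, but no stop call was skipped
    # by contrast, all(stop(ok) for ok in ...) stops iterating at the first False
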
Example #14
 def task_command(cls,
                  job: Job,
                  task: Task,
                  command,
                  command_body=None,
                  parallel=False,
                  need_user=False):
     msg = f"execute federated task {task.f_component_name} command({command})"
     federated_response = {}
     job_parameters = job.f_runtime_conf_on_party["job_parameters"]
     tasks = JobSaver.query_task(task_id=task.f_task_id, only_latest=True)
     threads = []
     for task in tasks:
         dest_role, dest_party_id = task.f_role, task.f_party_id
         federated_response[dest_role] = federated_response.get(
             dest_role, {})
         endpoint = f"/party/{task.f_job_id}/{task.f_component_name}/{task.f_task_id}/{task.f_task_version}/{dest_role}/{dest_party_id}/{command}"
         if need_user:
             command_body["user_id"] = job.f_user.get(dest_role, {}).get(
                 str(dest_party_id), "")
             schedule_logger(job.f_job_id).info(
                 f'user:{job.f_user}, dest_role:{dest_role}, dest_party_id:{dest_party_id}'
             )
             schedule_logger(
                 job.f_job_id).info(f'command_body: {command_body}')
         args = (job.f_job_id, job.f_role, job.f_party_id, dest_role,
                 dest_party_id, endpoint, command_body,
                 job_parameters["federated_mode"], federated_response)
         if parallel:
             t = threading.Thread(target=cls.federated_command, args=args)
             threads.append(t)
             t.start()
         else:
             cls.federated_command(*args)
     for thread in threads:
         thread.join()
     status_code, response = cls.return_federated_response(
         federated_response=federated_response)
     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job.f_job_id).info(successful_log(msg))
     elif status_code == FederatedSchedulingStatusCode.NOT_EFFECTIVE:
         schedule_logger(job.f_job_id).warning(warning_log(msg))
     elif status_code == FederatedSchedulingStatusCode.ERROR:
         schedule_logger(job.f_job_id).critical(
             failed_log(msg, detail=response))
     else:
         schedule_logger(job.f_job_id).error(
             failed_log(msg, detail=response))
     return status_code, response
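
The parallel branch is a plain fan-out/join: one thread per destination party, all joined before the responses are aggregated. A self-contained sketch of the pattern, with a stub standing in for federated_command:

    import threading

    def federated_command_stub(dest, responses):
        responses[dest] = {"retcode": 0}  # stands in for the real RPC

    responses, threads = {}, []
    for dest in ["guest:9999", "host:10000"]:
        t = threading.Thread(target=federated_command_stub, args=(dest, responses))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()  # wait for every party to answer before aggregating
    print(responses)
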
Example #15
 def stop_job(cls, job, stop_status):
     tasks = JobSaver.query_task(job_id=job.f_job_id,
                                 role=job.f_role,
                                 party_id=job.f_party_id,
                                 reverse=True)
     kill_status = True
     kill_details = {}
     for task in tasks:
         kill_task_status = TaskController.stop_task(
             task=task, stop_status=stop_status)
         kill_status = kill_status & kill_task_status
         kill_details[task.f_task_id] = 'success' if kill_task_status else 'failed'
     if kill_status:
         job_info = job.to_human_model_dict(only_primary_with=["status"])
         job_info["status"] = stop_status
         JobController.update_job_status(job_info)
     return kill_status, kill_details
Example #16
 def detect_running_task(cls):
     detect_logger().info('start to detect running task...')
     count = 0
     try:
         running_tasks = JobSaver.query_task(
             party_status=TaskStatus.RUNNING,
             run_on_this_party=True,
             run_ip=RuntimeConfig.JOB_SERVER_HOST,
             only_latest=False)
         stop_job_ids = set()
         for task in running_tasks:
             count += 1
             try:
                 process_exist = job_utils.check_job_process(
                     int(task.f_run_pid))
                 if not process_exist:
                     detect_logger(job_id=task.f_job_id).info(
                         'job {} task {} {} on {} {} process {} does not exist'
                         .format(task.f_job_id, task.f_task_id,
                                 task.f_task_version, task.f_role,
                                 task.f_party_id, task.f_run_pid))
                     stop_job_ids.add(task.f_job_id)
             except Exception as e:
                 detect_logger(job_id=task.f_job_id).exception(e)
         if stop_job_ids:
             detect_logger().info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         stop_jobs = set()
         for job_id in stop_job_ids:
             jobs = JobSaver.query_job(job_id=job_id)
             if jobs:
                 stop_jobs.add(jobs[0])
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.CANCELED)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info(f"finish detect {count} running task")
Example #17
 def federated_task_status(cls, job_id, task_id, task_version):
     tasks_on_all_party = JobSaver.query_task(task_id=task_id,
                                              task_version=task_version)
     status_flag = 0
     # idmapping role status can only be ignored if all non-idmapping roles success
     for task in tasks_on_all_party:
         if 'idmapping' not in task.f_role and task.f_party_status != TaskStatus.SUCCESS:
             status_flag = 1
             break
     if status_flag:
         tasks_party_status = [
             task.f_party_status for task in tasks_on_all_party
         ]
     else:
         tasks_party_status = [
             task.f_party_status for task in tasks_on_all_party
             if 'idmapping' not in task.f_role
         ]
     status = cls.calculate_multi_party_task_status(tasks_party_status)
     schedule_logger(job_id=job_id).info(
         "job {} task {} {} status is {}, calculate by task party status list: {}"
         .format(job_id, task_id, task_version, status, tasks_party_status))
     return status
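
calculate_multi_party_task_status is not shown here. A plausible reduction, assuming the usual precedence where any failure-like status wins, then RUNNING, and SUCCESS only if unanimous (an assumption for illustration, not the snippet's code):

    def calculate_status(party_statuses):
        statuses = {s.lower() for s in party_statuses}
        for bad in ("canceled", "timeout", "failed"):  # assumed precedence
            if bad in statuses:
                return bad
        if statuses == {"success"}:
            return "success"
        return "running"  # mixed or still in progress

    assert calculate_status(["success", "running"]) == "running"
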
Example #18
 def get_rerun_component(cls, component_name, job, dsl_parser, force):
     if component_name and component_name != job_utils.job_pipeline_component_name():
         dependence_status_code, response = FederatedScheduler.check_component(
             job=job, check_type="rerun")
         success_task_list = [
             task.f_component_name
             for task in JobSaver.query_task(job_id=job.f_job_id,
                                             party_id=job.f_party_id,
                                             role=job.f_role,
                                             status=TaskStatus.SUCCESS,
                                             only_latest=True)
         ]
         component_set = set()
         for dest_role in response.keys():
             for party_id in response[dest_role].keys():
                 component_set = component_set.union(
                     set(response[dest_role][party_id].get("data")))
         schedule_logger(job.f_job_id).info(
             f"success task list: {success_task_list}, check failed component list: {list(component_set)}"
         )
         need_rerun = [
             cpn.name for cpn in dsl_parser.get_need_revisit_nodes(
                 success_task_list, list(component_set))
         ]
         schedule_logger(job.f_job_id).info(
             f"need rerun success component: {need_rerun}")
         if component_set:
             force = True
         if isinstance(component_name, str):
             component_name = set(need_rerun).union({component_name})
         else:
             component_name = set(need_rerun).union(set(component_name))
     return component_name, force
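
Deciding what to rerun amounts to taking the components that failed the dependence check and pulling in everything downstream of them, which is what dsl_parser.get_need_revisit_nodes resolves against the DSL graph. The closure itself, sketched over a hypothetical dependency map:

    from collections import deque

    downstream = {  # component -> direct dependents (made-up graph)
        "reader_0": ["intersect_0"],
        "intersect_0": ["hetero_lr_0"],
        "hetero_lr_0": [],
    }

    def closure(roots):
        seen, queue = set(roots), deque(roots)
        while queue:
            for nxt in downstream[queue.popleft()]:
                if nxt not in seen:
                    seen.add(nxt)
                    queue.append(nxt)
        return seen

    assert closure({"intersect_0"}) == {"intersect_0", "hetero_lr_0"}
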
Example #19
 def component_check(cls, job, check_type="inheritance"):
     if check_type == "rerun":
         task_list = JobSaver.query_task(job_id=job.f_job_id,
                                         party_id=job.f_party_id,
                                         role=job.f_role,
                                         status=TaskStatus.SUCCESS,
                                         only_latest=True)
         tasks = {}
         for task in task_list:
             tasks[task.f_component_name] = task
     else:
         tasks = JobController.load_tasks(
             component_list=job.f_inheritance_info.get(
                 "component_list", []),
             job_id=job.f_inheritance_info.get("job_id"),
             role=job.f_role,
             party_id=job.f_party_id)
     tracker_dict = JobController.load_task_tracker(tasks)
     missing_dependence_component_list = []
     # data dependence
     for tracker in tracker_dict.values():
         table_infos = tracker.get_output_data_info()
         for table in table_infos:
             table_meta = storage.StorageTableMeta(
                 name=table.f_table_name, namespace=table.f_table_namespace)
             if not table_meta:
                 missing_dependence_component_list.append(
                     tracker.component_name)
                 continue
     if check_type == "rerun":
         return missing_dependence_component_list
     elif check_type == "inheritance":
         # reload component list
         return list(
             set(job.f_inheritance_info.get("component_list", [])) -
             set(missing_dependence_component_list))
Example #20
 def rerun_job(cls, job_id, initiator_role, initiator_party_id,
               component_name):
     schedule_logger(job_id=job_id).info(
         f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
     )
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(
             f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
         )
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id,
                                     role=initiator_role,
                                     party_id=initiator_party_id,
                                     component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id,
                                     role=initiator_role,
                                     party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(
         dsl=job.f_dsl,
         runtime_conf=job.f_runtime_conf_on_party,
         train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(
                 f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
             )
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job,
                                          task=task,
                                          stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job,
                                           task=task,
                                           content_type="metrics")
             # create new version task
             task.f_task_version += 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(
                 f"create task {task.f_task_id} new version {task.f_task_version}"
             )
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(
                         job_id,
                         _role,
                         _party_id,
                         False,
                         job.f_initiator_role,
                         job.f_initiator_party_id,
                         RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                         dsl_parser,
                         component_name=task.f_component_name,
                         task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(
                 f"create task {task.f_task_id} new version {task.f_task_version} successfully"
             )
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(
             job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(
                 f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(
                 f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(
             job_id=job_id).info(f"job {job_id} no task to rerun")
Example #21
    def save_pipelined_model(cls, job_id, role, party_id):
        schedule_logger(job_id).info(
            f"start to save pipeline model on {role} {party_id}")
        job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                            role=role,
                                                            party_id=party_id)
        runtime_conf_on_party = job_configuration.runtime_conf_on_party
        job_parameters = runtime_conf_on_party.get('job_parameters', {})
        if role in job_parameters.get("assistant_role", []):
            return
        model_id = job_parameters['model_id']
        model_version = job_parameters['model_version']
        job_type = job_parameters.get('job_type', '')
        roles = runtime_conf_on_party['role']
        initiator_role = runtime_conf_on_party['initiator']['role']
        initiator_party_id = runtime_conf_on_party['initiator']['party_id']
        if job_type == 'predict':
            return
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job_configuration.dsl,
            runtime_conf=job_configuration.runtime_conf,
            train_runtime_conf=job_configuration.train_runtime_conf)

        components_parameters = {}
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=role,
                                    party_id=party_id,
                                    only_latest=True)
        for task in tasks:
            components_parameters[task.f_component_name] = task.f_component_parameters
        predict_dsl = schedule_utils.fill_inference_dsl(
            dsl_parser,
            origin_inference_dsl=job_configuration.dsl,
            components_parameters=components_parameters)

        pipeline = pipeline_pb2.Pipeline()
        pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
        pipeline.train_dsl = json_dumps(job_configuration.dsl, byte=True)
        pipeline.train_runtime_conf = json_dumps(
            job_configuration.runtime_conf, byte=True)
        pipeline.fate_version = RuntimeConfig.get_env("FATE")
        pipeline.model_id = model_id
        pipeline.model_version = model_version

        pipeline.parent = True
        pipeline.loaded_times = 0
        pipeline.roles = json_dumps(roles, byte=True)
        pipeline.initiator_role = initiator_role
        pipeline.initiator_party_id = initiator_party_id
        pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party,
                                                    byte=True)
        pipeline.parent_info = json_dumps({}, byte=True)

        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          model_id=model_id,
                          model_version=model_version,
                          job_parameters=RunParameters(**job_parameters))
        tracker.save_pipeline_model(pipeline_buffer_object=pipeline)
        if role != 'local':
            tracker.save_machine_learning_model_info()
        schedule_logger(job_id).info(
            f"save pipeline on {role} {party_id} successfully")
Example #22
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
        PrivilegeAuth.authentication_component(
            job_dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess"
        )
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        is_failed = False
        try:
            task = JobSaver.query_task(task_id=task_id,
                                       task_version=task_version,
                                       role=role,
                                       party_id=party_id)[0]
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            run_parameters_dict["src_user"] = kwargs.get("src_user")
            run_parameters = RunParameters(**run_parameters_dict)

            config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                      component_name, task_id,
                                                      task_version)
            os.makedirs(config_dir, exist_ok=True)

            run_parameters_path = os.path.join(config_dir,
                                               'task_parameters.json')
            with open(run_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            schedule_logger(job_id).info(
                f"use computing engine {run_parameters.computing_engine}")
            task_info["engine_conf"] = {
                "computing_engine": run_parameters.computing_engine
            }
            backend_engine = build_engine(run_parameters.computing_engine)
            run_info = backend_engine.run(
                task=task,
                run_parameters=run_parameters,
                run_parameters_path=run_parameters_path,
                config_dir=config_dir,
                log_dir=job_utils.get_job_log_directory(
                    job_id, role, party_id, component_name),
                cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                    component_name),
                user_name=kwargs.get("user_id"))
            task_info.update(run_info)
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
            is_failed = True
        finally:
            try:
                cls.update_task(task_info=task_info)
                task_info["party_status"] = TaskStatus.RUNNING
                cls.update_task_status(task_info=task_info)
                if is_failed:
                    task_info["party_status"] = TaskStatus.FAILED
                    cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                "task {} {} on {} {} executor subprocess start {}".format(
                    task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example #23
    def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        submit_result = {"job_id": job_id}
        schedule_logger(job_id).info(
            f"submit job, body {submit_job_conf.to_dict()}")
        try:
            dsl = submit_job_conf.dsl
            runtime_conf = deepcopy(submit_job_conf.runtime_conf)
            job_utils.check_job_runtime_conf(runtime_conf)
            authentication_utils.check_constraint(runtime_conf, dsl)
            job_initiator = runtime_conf["initiator"]
            conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
            common_job_parameters = conf_adapter.get_common_parameters()

            if common_job_parameters.job_type != "predict":
                # generate job model info
                conf_version = schedule_utils.get_conf_version(runtime_conf)
                if conf_version != 2:
                    raise Exception(
                        "only the v2 version runtime conf is supported")
                common_job_parameters.model_id = model_utils.gen_model_id(
                    runtime_conf["role"])
                common_job_parameters.model_version = job_id
                train_runtime_conf = {}
            else:
                # check predict job parameters
                detect_utils.check_config(common_job_parameters.to_dict(),
                                          ["model_id", "model_version"])
                # get inference dsl from pipeline model as job dsl
                tracker = Tracker(
                    job_id=job_id,
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"],
                    model_id=common_job_parameters.model_id,
                    model_version=common_job_parameters.model_version)
                pipeline_model = tracker.get_pipeline_model()
                train_runtime_conf = json_loads(
                    pipeline_model.train_runtime_conf)
                if not model_utils.check_if_deployed(
                        role=job_initiator["role"],
                        party_id=job_initiator["party_id"],
                        model_id=common_job_parameters.model_id,
                        model_version=common_job_parameters.model_version):
                    raise Exception(
                        f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                    )
                dsl = json_loads(pipeline_model.inference_dsl)
            # dsl = ProviderManager.fill_fate_flow_provider(dsl)

            job = Job()
            job.f_job_id = job_id
            job.f_dsl = dsl
            job.f_train_runtime_conf = train_runtime_conf
            job.f_roles = runtime_conf["role"]
            job.f_initiator_role = job_initiator["role"]
            job.f_initiator_party_id = job_initiator["party_id"]
            job.f_role = job_initiator["role"]
            job.f_party_id = job_initiator["party_id"]

            path_dict = job_utils.save_job_conf(
                job_id=job_id,
                role=job.f_initiator_role,
                party_id=job.f_initiator_party_id,
                dsl=dsl,
                runtime_conf=runtime_conf,
                runtime_conf_on_party={},
                train_runtime_conf=train_runtime_conf,
                pipeline_dsl=None)

            if job.f_initiator_party_id not in runtime_conf["role"][job.f_initiator_role]:
                msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
                schedule_logger(job_id).info(msg)
                raise Exception(msg)

            # create common parameters on initiator
            JobController.create_common_job_parameters(
                job_id=job.f_job_id,
                initiator_role=job.f_initiator_role,
                common_job_parameters=common_job_parameters)
            job.f_runtime_conf = conf_adapter.update_common_parameters(
                common_parameters=common_job_parameters)
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job.f_dsl,
                runtime_conf=job.f_runtime_conf,
                train_runtime_conf=job.f_train_runtime_conf)

            # initiator runtime conf as template
            job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
            job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()

            # inherit job
            job.f_inheritance_info = common_job_parameters.inheritance_info
            job.f_inheritance_status = JobInheritanceStatus.WAITING if common_job_parameters.inheritance_info else JobInheritanceStatus.PASS
            if job.f_inheritance_info:
                inheritance_jobs = JobSaver.query_job(
                    job_id=job.f_inheritance_info.get("job_id"),
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"])
                inheritance_tasks = JobSaver.query_task(
                    job_id=job.f_inheritance_info.get("job_id"),
                    role=job_initiator["role"],
                    party_id=job_initiator["party_id"],
                    only_latest=True)
                job_utils.check_job_inheritance_parameters(
                    job, inheritance_jobs, inheritance_tasks)

            status_code, response = FederatedScheduler.create_job(job=job)
            if status_code != FederatedSchedulingStatusCode.SUCCESS:
                job.f_status = JobStatus.FAILED
                job.f_tag = "submit_failed"
                FederatedScheduler.sync_job_status(job=job)
                raise Exception("create job failed", response)
            else:
                need_run_components = {}
                for role in response:
                    need_run_components[role] = {}
                    for party, res in response[role].items():
                        need_run_components[role][party] = [
                            name for name, value in response[role][party]
                            ["data"]["components"].items()
                            if value["need_run"] is True
                        ]
                if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
                    # create the task holder in db to record information of all participants in the initiator for scheduling
                    for role, party_ids in job.f_roles.items():
                        for party_id in party_ids:
                            if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                                continue
                            if not need_run_components[role][party_id]:
                                continue
                            JobController.initialize_tasks(
                                job_id=job_id,
                                role=role,
                                party_id=party_id,
                                run_on_this_party=False,
                                initiator_role=job.f_initiator_role,
                                initiator_party_id=job.f_initiator_party_id,
                                job_parameters=common_job_parameters,
                                dsl_parser=dsl_parser,
                                components=need_run_components[role][party_id])
                job.f_status = JobStatus.WAITING
                status_code, response = FederatedScheduler.sync_job_status(
                    job=job)
                if status_code != FederatedSchedulingStatusCode.SUCCESS:
                    raise Exception("set job to waiting status failed")

            schedule_logger(job_id).info(
                f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
            )
            logs_directory = job_utils.get_job_log_directory(job_id)
            result = {
                "code": RetCode.SUCCESS,
                "message": "success",
                "model_info": {
                    "model_id": common_job_parameters.model_id,
                    "model_version": common_job_parameters.model_version
                },
                "logs_directory": logs_directory,
                "board_url": job_utils.get_board_url(job_id, job_initiator["role"],
                                                     job_initiator["party_id"])
            }
            warn_parameter = JobRuntimeConfigAdapter(
                submit_job_conf.runtime_conf).check_removed_parameter()
            if warn_parameter:
                result["message"] = f"[WARN] {warn_parameter} is removed, it does not take effect!"
            submit_result.update(result)
            submit_result.update(path_dict)
        except Exception as e:
            submit_result["code"] = RetCode.OPERATING_ERROR
            submit_result["message"] = exception_to_trace_string(e)
            schedule_logger(job_id).exception(e)
        return submit_result
Example #24
    def set_job_rerun(cls,
                      job_id,
                      initiator_role,
                      initiator_party_id,
                      auto,
                      force=False,
                      tasks: typing.List[Task] = None,
                      component_name: typing.Union[str, list] = None):
        schedule_logger(job_id).info(
            f"try to rerun job on initiator {initiator_role} {initiator_party_id}"
        )

        jobs = JobSaver.query_job(job_id=job_id,
                                  role=initiator_role,
                                  party_id=initiator_party_id)
        if not jobs:
            raise RuntimeError(
                f"can not found job on initiator {initiator_role} {initiator_party_id}"
            )
        job = jobs[0]

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        component_name, force = cls.get_rerun_component(
            component_name, job, dsl_parser, force)
        schedule_logger(job_id).info(f"rerun component: {component_name}")

        if tasks:
            schedule_logger(job_id).info(
                f"require {[task.f_component_name for task in tasks]} to rerun"
            )
        else:
            task_query = {
                'job_id': job_id,
                'role': initiator_role,
                'party_id': initiator_party_id,
            }

            if not component_name or component_name == job_utils.job_pipeline_component_name():
                # rerun all tasks
                schedule_logger(job_id).info(
                    "require all component of pipeline to rerun")
            else:
                _require_reruns = {component_name} if isinstance(
                    component_name, str) else set(component_name)
                _should_reruns = _require_reruns.copy()
                for _cpn in _require_reruns:
                    _components = dsl_parser.get_downstream_dependent_components(
                        _cpn)
                    for _c in _components:
                        _should_reruns.add(_c.get_name())

                schedule_logger(job_id).info(
                    f"require {_require_reruns} to rerun, "
                    f"and then found {_should_reruns} need be to rerun")
                task_query['component_name'] = _should_reruns

            tasks = JobSaver.query_task(**task_query)

        job_can_rerun = any([
            TaskScheduler.prepare_rerun_task(
                job=job,
                task=task,
                dsl_parser=dsl_parser,
                auto=auto,
                force=force,
            ) for task in tasks
        ])
        if not job_can_rerun:
            FederatedScheduler.sync_job_status(job=job)
            schedule_logger(job_id).info("job no task to rerun")
            return False

        schedule_logger(job_id).info("job set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        schedule_logger(job_id).info(
            f"job set rerun signal {'successfully' if status else 'failed'}")
        return True
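
As in Example #13, the list inside any([...]) matters: it forces prepare_rerun_task to run for every task before the truth test, whereas a bare generator would stop at the first task that can rerun. A quick demonstration:

    calls = []

    def prepare(task):
        calls.append(task)
        return True  # pretend every task can be rerun

    any([prepare(t) for t in ("t1", "t2", "t3")])
    assert calls == ["t1", "t2", "t3"]  # list comprehension: all prepared

    calls.clear()
    any(prepare(t) for t in ("t1", "t2", "t3"))
    assert calls == ["t1"]              # generator: short-circuits
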