Example #1
def clean_job(job_id):
    try:
        logger.info('ready to clean job {}'.format(job_id))
        eggroll.cleanup('*', namespace=job_id, persistent=False)
        logger.info('clean job {} done'.format(job_id))
    except Exception as e:
        logger.exception(e)
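A minimal usage sketch (the job id below is hypothetical): clean_job is meant to run once a job terminates, dropping every non-persistent eggroll table in the namespace named after the job.

# hypothetical job id; all non-persistent tables in its namespace are dropped
clean_job(job_id='20190801120000123456')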
Example #2
 def run_job(self, job_id, config):
     default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
     setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
     _job_dir = get_job_directory(job_id=job_id)
     os.makedirs(_job_dir, exist_ok=True)
     ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
     logger.info('job_id {} parameters overridden, job dir {}'.format(job_id, _job_dir))
     channel, stub = get_proxy_data_channel()
     for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
         runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
         _role = runtime_conf['local']['role']
         _party_id = runtime_conf['local']['party_id']
         _method = 'POST'
         _module = runtime_conf['module']
         _url = '/workflow/{}/{}/{}'.format(job_id, _module, _role)
         _packet = wrap_grpc_packet(runtime_conf, _method, _url, _party_id, job_id)
         logger.info(
             'Starting workflow job_id:{} party_id:{} role:{} method:{} url:{}'.format(job_id, _party_id,
                                                                                       _role, _method,
                                                                                       _url))
         try:
             _return = stub.unaryCall(_packet)
             logger.info("Grpc unary response: {}".format(_return))
         except grpc.RpcError as e:
             msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(job_id,
                                                                                                    _party_id,
                                                                                                    _role, _method,
                                                                                                    _url)
             logger.exception(msg)
             return get_json_result(-101, 'UnaryCall submit to remote manager failed')
Example #3
 def run_do(self):
     logger.info("{} jobs are running.".format(running_job_amount()))
     try:
         if running_job_amount() < MAX_CONCURRENT_JOB_RUN:
             wait_jobs = get_job_from_queue(status="waiting", limit=1)
             if wait_jobs:
                 wait_job = wait_jobs[0]
                 run_job_id = wait_job.job_id
                 try:
                     run_job_success = self.run_job(job_id=run_job_id,
                                                    config=json.loads(
                                                        wait_job.config))
                 except Exception as e:
                     run_job_success = False
                     logger.exception(e)
                 if run_job_success:
                     update_job_queue(job_id=run_job_id,
                                      role=wait_job.role,
                                      party_id=wait_job.party_id,
                                      save_data={"status": "ready"})
                 else:
                     pop_from_job_queue(job_id=run_job_id)
         logger.info("finished checking waiting jobs.")
         self.check_job()
     except Exception as e:
         logger.exception(e)
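run_do reads like the body of a periodic scheduler: it drains at most one waiting job per tick while respecting MAX_CONCURRENT_JOB_RUN. A minimal sketch of how such a method could be driven on an interval, assuming no cron base class is available (the driver function and the 5-second period are assumptions):

import threading

def schedule(scheduler, interval_s=5):
    # hypothetical driver: invoke run_do every interval_s seconds
    scheduler.run_do()
    threading.Timer(interval_s, schedule, args=(scheduler, interval_s)).start()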
Example #4
def stop_workflow(job_id, role, party_id):
    _job_dir = get_job_directory(job_id)
    task_pid_path = os.path.join(_job_dir, 'pids')
    if os.path.isdir(task_pid_path):
        for pid_file in os.listdir(task_pid_path):
            try:
                if not pid_file.endswith('.pid'):
                    continue
                with open(os.path.join(task_pid_path, pid_file), 'r') as f:
                    pids = f.read().split('\n')
                    for pid in pids:
                        try:
                            if len(pid) == 0:
                                continue
                            logger.debug(
                                "terminating process pid:{} {}".format(
                                    pid, pid_file))
                            p = psutil.Process(int(pid))
                            for child in p.children(recursive=True):
                                child.kill()
                            p.kill()
                        except psutil.NoSuchProcess:
                            continue
            except Exception as e:
                logger.exception("failed to kill processes listed in {}".format(pid_file))
                continue
        set_job_failed(job_id=job_id, role=role, party_id=party_id)
        pop_from_job_queue(job_id=job_id)
        clean_job(job_id=job_id)
    return get_json_result(job_id=job_id)
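The nested loop above kills child processes before their parent so none of them are orphaned mid-iteration. The same psutil pattern as a standalone sketch (the function name is hypothetical):

import psutil

def kill_process_tree(pid):
    # kill children first, then the parent; tolerate races where a
    # process has already exited
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass
    try:
        parent.kill()
    except psutil.NoSuchProcess:
        pass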
Example #5
def import_offline_feature():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    try:
        if not request_data.get("jobId"):
            return get_json_result(status=2, msg="no job id")
        job_id = request_data.get("jobId")
        job_data = query_job_by_id(job_id=job_id)
        if not job_data:
            return get_json_result(status=3,
                                   msg="cannot find this job id: %s" %
                                   request_data.get("jobId", ""))
        response = GetFeature.import_data(request_data,
                                          json.loads(job_data[0]["config"]))
        if response.get("status", 1) == 0:
            update_job_by_id(job_id=job_id,
                             update_data={
                                 "status": "success",
                                 "end_date": datetime.datetime.now()
                             })
            return get_json_result()
        else:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" %
                                   response.get("msg", ""))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1,
                               msg="request offline feature error: %s" % e)
Example #6
def do_load_model():
    request_data = request.json
    try:
        request_data["servings"] = server_conf.get("servers", {}).get("servings", [])
        publish_model.load_model(config_data=request_data)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
Example #7
def import_id():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info",
                                        table_name_space,
                                        partition=10,
                                        create_if_missing=True,
                                        error_if_exist=False)
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id,
                                   table_name_space,
                                   partition=50,
                                   create_if_missing=True,
                                   error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get("total") \
                and (request_data.get("total") - request_data.get("rangeEnd") == 1):
            # end
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({
                        "salt": request_data.get("salt"),
                        "saltMethod": request_data.get("saltMethod")
                    }))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                logger.info(
                    "import id success, dtable name is {}, namespace is {}".format(
                        data_id, table_name_space))

                # TODO: destroying the DTable should be guarded by a lock
                old_data_table = eggroll.table(old_data_id,
                                               table_name_space,
                                               partition=50,
                                               create_if_missing=True,
                                               error_if_exist=False)
                old_data_table.destroy()
                id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(
                    2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
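The handler above exercises the eggroll DTable API: table(...) to open or create, put/get for key-value access, count, and destroy. A minimal standalone sketch of the same calls (the import path and table names are assumptions):

from arch.api import eggroll  # import path is an assumption

demo = eggroll.table("demo", "id_library", partition=10,
                     create_if_missing=True, error_if_exist=False)
demo.put("k", "v")
value = demo.get("k")
total = demo.count()
demo.destroy()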
Example #8
def query_model_version_history():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        history = version_history(data_table_namespace=config.get("namespace"))
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="query model version history error: %s" % e)
Example #9
def publish_model_online():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        if not config.get('servings'):
            # get my party all servings
            config['servings'] = SERVINGS
        publish_model.publish_online(config_data=config)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="publish model error: %s" % e)
Example #10
def federated_api(job_id, method, url, party_id, json_body=None, overall_timeout=DEFAULT_GRPC_OVERALL_TIMEOUT):
    # avoid a mutable default argument; treat a missing body as an empty dict
    _packet = wrap_grpc_packet(json_body or {}, method, url, party_id, job_id, overall_timeout=overall_timeout)
    try:
        channel, stub = get_proxy_data_channel()
        _return = stub.unaryCall(_packet)
        logger.info("grpc unary response: {}".format(_return))
        channel.close()
        return 0, _return.body.value
    except grpc.RpcError as e:
        logger.exception(e)
        return 101, 'rpc error'
    except Exception as e:
        logger.exception(e)
        return 102, str(e)
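A minimal usage sketch of federated_api (the job id, url, and party id are hypothetical); the first element of the returned pair is a status code, with 0 meaning the remote call succeeded:

status, body = federated_api(job_id='hypothetical_job',
                             method='POST',
                             url='/workflow/hypothetical_job/some_module/guest',
                             party_id=10000,
                             json_body={'status': 'failed'})
if status != 0:
    logger.warning('federated call failed with code {}: {}'.format(status, body))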
Example #11
def request_offline_feature():
    request_data = request.json
    try:
        job_id = uuid.uuid1().hex
        response = GetFeature.request(job_id, request_data)
        if response.get("status", 1) == 0:
            job_data = dict()
            job_data.update(request_data)
            job_data["begin_date"] = datetime.datetime.now()
            job_data["status"] = "running"
            job_data["config"] = json.dumps(request_data)
            save_job_info(job_id=job_id, **job_data)
            return get_json_result()
        else:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" %
                                   response.get("msg", ""))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1,
                               msg="request offline feature error: %s" % e)
Example #12
def stop_job(job_id):
    _job_dir = get_job_directory(job_id)
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _url = '/workflow/{}'.format(job_id)
        _method = 'DELETE'
        _packet = wrap_grpc_packet({}, _method, _url, _party_id, job_id)
        channel, stub = get_proxy_data_channel()
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to stop workflow'.format(job_id,
                                                                                                  _party_id,
                                                                                                  _role, _method,
                                                                                                  _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall stop request to remote manager failed')
    return get_json_result()
Example #13
def load_model():
    config = file_utils.load_json_conf(request.json.get("config_path"))
    _job_id = generate_job_id()
    channel, stub = get_proxy_data_channel()
    for _party_id in config.get("party_ids"):
        config['my_party_id'] = _party_id
        _method = 'POST'
        _url = '/model/load/do'
        _packet = wrap_grpc_packet(config, _method, _url, _party_id, _job_id)
        logger.info(
            'Starting load model job_id:{} party_id:{} method:{} url:{}'.format(_job_id, _party_id, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} method:{} url:{} Failed to start load model'.format(_job_id,
                                                                                             _party_id,
                                                                                             _method,
                                                                                             _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    return get_json_result()
Example #14
def stop_workflow(job_id, role, party_id):
    _job_dir = get_job_directory(job_id)
    task_pid_path = os.path.join(_job_dir, 'pids')
    if os.path.isdir(task_pid_path):
        for pid_file in os.listdir(task_pid_path):
            try:
                if not pid_file.endswith('.pid'):
                    continue
                with open(os.path.join(task_pid_path, pid_file), 'r') as f:
                    pids = f.read().split('\n')
                    for pid in pids:
                        try:
                            if len(pid) == 0:
                                continue
                            logger.debug(
                                "terminating process pid:{} {}".format(
                                    pid, pid_file))
                            p = psutil.Process(int(pid))
                            for child in p.children(recursive=True):
                                child.kill()
                            p.kill()
                        except psutil.NoSuchProcess:
                            continue
            except Exception as e:
                logger.exception("failed to kill processes listed in {}".format(pid_file))
                continue
        federated_api(job_id=job_id,
                      method='POST',
                      url='/job/jobStatus/{}/{}/{}'.format(
                          job_id, role, party_id),
                      party_id=party_id,
                      json_body={
                          'status': 'failed',
                          'stopJob': True
                      })
        clean_job(job_id=job_id)
    return get_json_result(job_id=job_id)
Example #15
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    if module == "upload":
        if not os.path.isabs(request_config.get("file", "")):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(), request_config["file"])
    try:
        request_config["work_mode"] = request_config.get('work_mode', WORK_MODE)
        table_name, namespace = dtable_utils.get_table_info(config=request_config, create=(module == 'upload'))
        if not table_name or not namespace:
            return get_json_result(status=102, msg='no table name and namespace')
        request_config['table_name'] = table_name
        request_config['namespace'] = namespace
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=request_config.get('local', {}).get("role"),
                                          party_id=request_config.get('local', {}).get("party_id", PARTY_ID))
        file_utils.dump_json_conf(request_config, conf_file_path)
        if module == "download":
            progs = ["python3",
                     os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
                     "-j", _job_id,
                     "-c", conf_file_path
                     ]
        else:
            progs = ["python3",
                     os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
                     "-c", conf_file_path
                     ]
        p = run_subprocess(job_dir=_job_dir, job_role=data_func, progs=progs)
        return get_json_result(job_id=_job_id, data={'pid': p.pid, 'table_name': request_config['table_name'], 'namespace': request_config['namespace']})
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=-104, msg="failed", job_id=_job_id)
Example #16
def internal_server_error(e):
    logger.exception(e)
    return get_json_result(status=100, msg=str(e))
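internal_server_error has the signature of a Flask error handler. A minimal registration sketch, assuming the module exposes a Flask application object (the name app is an assumption):

# hypothetical app object; route unhandled 500 errors to the JSON handler
app.register_error_handler(500, internal_server_error)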