def run_do(self):
    """Scheduler tick: dispatch at most one waiting job if there is capacity."""
    # Log current load before deciding whether to launch another job.
    logger.info("{} job are running.".format(running_job_amount()))
    if running_job_amount() >= MAX_CONCURRENT_JOB_RUN:
        return
    waiting = get_job_from_queue(status="waiting", limit=1)
    if not waiting:
        return
    job = waiting[0]
    # Mark the entry as dispatched, then actually start it.
    update_job_queue(job_id=job.get("job_id"), update_data={"status": "ready"})
    self.run_job(job.get("job_id"), json.loads(job.get("config")))
def run_do(self):
    """Scheduler tick: launch one waiting job if capacity allows, then re-check jobs.

    Any unexpected error is logged and swallowed so the scheduler loop keeps running.
    """
    logger.info("{} job are running.".format(running_job_amount()))
    try:
        if running_job_amount() < MAX_CONCURRENT_JOB_RUN:
            # Take at most one waiting job per tick.
            wait_jobs = get_job_from_queue(status="waiting", limit=1)
            if wait_jobs:
                wait_job = wait_jobs[0]
                run_job_id = wait_job.job_id
                try:
                    # wait_job.config is stored as a JSON string in the queue record.
                    run_job_success = self.run_job(job_id=run_job_id,
                                                   config=json.loads(wait_job.config))
                except Exception as e:
                    run_job_success = False
                    logger.exception(e)
                if run_job_success:
                    # Dispatch succeeded: mark the queue entry as ready.
                    update_job_queue(job_id=run_job_id,
                                     role=wait_job.role,
                                     party_id=wait_job.party_id,
                                     save_data={"status": "ready"})
                else:
                    # Dispatch failed: drop the entry so it does not block the queue.
                    pop_from_job_queue(job_id=run_job_id)
        logger.info("check waiting jobs done.")
        self.check_job()
    except Exception as e:
        logger.exception(e)
def clean_job(job_id):
    """Best-effort cleanup of all non-persistent eggroll tables in the job's namespace."""
    try:
        logger.info('ready clean job {}'.format(job_id))
        # wipe every table under this job's namespace; failures are only logged
        eggroll.cleanup('*', namespace=job_id, persistent=False)
        logger.info('send clean job {}'.format(job_id))
    except Exception as exc:
        logger.exception(exc)
def run_subprocess(job_dir, job_role, progs):
    """Spawn *progs* as a detached child process and record its pid.

    stdout/stderr are redirected to <job_dir>/<job_role>/std.log and the
    child's pid is written to <job_dir>/pids/<job_role>.pid.

    Returns the subprocess.Popen object.
    """
    logger.info('Starting progs: {}'.format(progs))
    std_dir = os.path.join(job_dir, job_role)
    # Fix: exist_ok avoids the racy exists()-then-makedirs pattern.
    os.makedirs(std_dir, exist_ok=True)
    task_pid_path = os.path.join(job_dir, 'pids')
    if os.name == 'nt':
        # hide the console window on Windows
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
    else:
        startupinfo = None
    # Fix: close the parent's copy of the log fd after Popen — the child
    # inherits its own duplicate, so this only stops the parent leaking fds.
    with open(os.path.join(std_dir, 'std.log'), 'w') as std_log:
        p = subprocess.Popen(progs, stdout=std_log, stderr=std_log,
                             startupinfo=startupinfo)
    os.makedirs(task_pid_path, exist_ok=True)
    with open(os.path.join(task_pid_path, job_role + ".pid"), 'w') as f:
        f.truncate()
        f.write(str(p.pid) + "\n")
        f.flush()
    return p
def submit_job():
    """HTTP handler: queue the posted job config under a freshly generated job id."""
    body = request.json
    job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(job_id, body))
    try:
        push_into_job_queue(job_id=job_id, config=body)
    except Exception as e:
        return get_json_result(1, "failed, error: {}".format(e))
    return get_json_result(0, "success, job_id {}".format(job_id))
def import_id():
    """HTTP handler: import a batch of ids into the id library.

    Ids arrive in ranges (rangeStart/rangeEnd/total). The first range allocates
    a temporary table; the final range validates the total count and atomically
    switches the active table, destroying the previous one.
    """
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info", table_name_space, partition=10,
                                        create_if_missing=True, error_if_exist=False)
        # Fix: request_data is a dict and id_library_info is an eggroll table —
        # both expose .get(), not .request(); the originals raised AttributeError.
        if request_data.get("rangeStart") == 0:
            # first range of a new upload: allocate a fresh temporary table id
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id, table_name_space, partition=50,
                                   create_if_missing=True, error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get("total") and \
                (request_data.get("total") - request_data.get("rangeEnd") == 1):
            # end: this was the last range — validate and switch
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({"salt": request_data.get("salt"),
                                "saltMethod": request_data.get("saltMethod")}))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                # Fix: logger.info with "{}" placeholders needs .format(), not
                # positional args (those are %-style lazy args and were ignored).
                logger.info("import id success, dtable name is {}, namespace is {}".format(
                    data_id, table_name_space))
                # TODO: destroy DTable, should be use a lock
                if old_data_id:
                    # guard: on the very first import there is no previous table
                    old_data_table = eggroll.table(old_data_id, table_name_space, partition=50,
                                                   create_if_missing=True, error_if_exist=False)
                    old_data_table.destroy()
                    id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
def download_data(data_func):
    """HTTP handler: start a background download/upload subprocess.

    data_func: "download" or "upload"; selects the helper module and CLI args.
    """
    _data = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, _data))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    _download_module = os.path.join(file_utils.get_project_base_directory(),
                                    "arch/api/utils/download.py")
    _upload_module = os.path.join(file_utils.get_project_base_directory(),
                                  "arch/api/utils/upload.py")
    if data_func == "download":
        _module = _download_module
    else:
        _module = _upload_module
    try:
        if data_func == "download":
            # download additionally receives the job id on the command line
            progs = ["python3", _module,
                     "-j", _job_id,
                     "-c", os.path.abspath(_data.get("config_path"))]
        else:
            progs = ["python3", _module,
                     "-c", os.path.abspath(_data.get("config_path"))]
        logger.info('Starting progs: {}'.format(progs))
        task_pid_path = os.path.join(_job_dir, 'pids')
        if os.name == 'nt':
            # hide the console window on Windows
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
        else:
            startupinfo = None
        # Fix: close the parent's copy of the log fd (the child keeps its own),
        # which previously leaked on every request.
        with open(os.path.join(_job_dir, 'std.log'), 'w') as std_log:
            p = subprocess.Popen(progs, stdout=std_log, stderr=std_log,
                                 startupinfo=startupinfo)
        os.makedirs(task_pid_path, exist_ok=True)
        with open(os.path.join(task_pid_path, data_func + ".pid"), 'w') as f:
            f.truncate()
            f.write(str(p.pid) + "\n")
            f.flush()
        return get_json_result(0, "success, job_id {}".format(_job_id))
    except Exception as e:
        # Fix: was a bare print(e); route through the logger like the rest of the file.
        logger.exception(e)
        return get_json_result(-104, "failed, job_id {}".format(_job_id))
def load_model(config_data):
    """Ask every configured serving instance to load the listed models."""
    for serving in config_data.get('servings'):
        with grpc.insecure_channel(serving) as channel:
            stub = model_service_pb2_grpc.ModelServiceStub(channel)
            request = model_service_pb2.PublishRequest()
            request.myPartyId = config_data.get("my_party_id")
            # one entry per party: table name + namespace of its model
            for party_id, model in config_data.get("models").items():
                party_key = int(party_id)
                request.models[party_key].name = model["name"]
                request.models[party_key].namespace = model["namespace"]
            response = stub.publishLoad(request)
            logger.info(
                "party_id: {}, serving server: {}, load status: {}".format(
                    request.myPartyId, serving, response.statusCode))
def federated_api(job_id, method, url, party_id, json_body=None,
                  overall_timeout=DEFAULT_GRPC_OVERALL_TIMEOUT):
    """Send a federated HTTP-over-gRPC call to *party_id* via the proxy.

    Returns a (status, body) tuple: (0, response_body) on success,
    (101, 'rpc error') on a gRPC failure, (102, str(e)) on any other error.
    """
    # Fix: mutable default argument ({}) was shared across calls.
    if json_body is None:
        json_body = {}
    _packet = wrap_grpc_packet(json_body, method, url, party_id, job_id,
                               overall_timeout=overall_timeout)
    channel = None
    try:
        channel, stub = get_proxy_data_channel()
        _return = stub.unaryCall(_packet)
        logger.info("grpc unary response: {}".format(_return))
        return 0, _return.body.value
    except grpc.RpcError as e:
        logger.exception(e)
        return 101, 'rpc error'
    except Exception as e:
        logger.exception(e)
        return 102, str(e)
    finally:
        # Fix: the channel leaked whenever an exception fired before close().
        if channel is not None:
            channel.close()
def load_model(config_data):
    """Ask each serving instance to load per-role, per-party model tables."""
    logger.info(config_data)
    for serving in config_data.get('servings'):
        with grpc.insecure_channel(serving) as channel:
            stub = model_service_pb2_grpc.ModelServiceStub(channel)
            load_model_request = model_service_pb2.PublishRequest()
            # declare every party of every role
            for role_name, role_partys in config_data.get("role").items():
                for _party_id in role_partys:
                    load_model_request.role[role_name].partyId.append(_party_id)
            # attach table name + namespace for each role/party model
            for role_name, role_model_config in config_data.get("model").items():
                for _party_id, role_party_model_config in role_model_config.items():
                    model_info = load_model_request.model[role_name].roleModelInfo[_party_id]
                    model_info.tableName = role_party_model_config['table_name']
                    model_info.namespace = role_party_model_config['namespace']
            logger.info('request serving: {} load model'.format(serving))
            load_model_request.local.role = config_data.get('local').get('role')
            load_model_request.local.partyId = config_data.get('local').get('party_id')
            # Fix: removed stray debug print() of the request; it is logged below.
            logger.info(load_model_request)
            response = stub.publishLoad(load_model_request)
            logger.info('{} {} load model status: {}'.format(
                load_model_request.local.role,
                load_model_request.local.partyId,
                response.statusCode))
def start_workflow(job_id, module, role):
    """HTTP handler: persist the posted runtime conf and launch the workflow process.

    Writes runtime_conf.json under the job directory, spawns the workflow script
    given by CodePath, records its pid, then saves job info and marks the queue
    entry ready.
    """
    _data = request.json
    _job_dir = get_job_directory(job_id)
    _party_id = str(_data['local']['party_id'])
    _method = _data['WorkFlowParam']['method']
    conf_path_dir = os.path.join(_job_dir, _method, module, role, _party_id)
    os.makedirs(conf_path_dir, exist_ok=True)
    conf_file_path = os.path.join(conf_path_dir, 'runtime_conf.json')
    # persist the request body so the child process (and stop_job) can read it
    with open(conf_file_path, 'w+') as f:
        f.truncate()
        f.write(json.dumps(_data, indent=4))
        f.flush()
    if os.name == 'nt':
        # hide the console window on Windows
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
    else:
        startupinfo = None
    task_pid_path = os.path.join(_job_dir, 'pids')
    # NOTE(review): std_log is handed to the child but the parent's copy is
    # never closed — possible fd leak; confirm and consider closing after Popen.
    std_log = open(os.path.join(_job_dir, role + '.std.log'), 'w')
    progs = [
        "python3",
        os.path.join(file_utils.get_project_base_directory(), _data['CodePath']),
        "-j", job_id,
        "-c", os.path.abspath(conf_file_path)
    ]
    logger.info('Starting progs: {}'.format(" ".join(progs)))
    p = subprocess.Popen(progs, stdout=std_log, stderr=std_log, startupinfo=startupinfo)
    os.makedirs(task_pid_path, exist_ok=True)
    # record the child pid so stop/kill endpoints can find it later
    with open(os.path.join(task_pid_path, role + ".pid"), 'w') as f:
        f.truncate()
        f.write(str(p.pid) + "\n")
        f.flush()
    job_data = dict()
    job_data["begin_date"] = datetime.datetime.now()
    job_data["status"] = "ready"
    # re-read the conf we just wrote and fold it into the job record
    with open(conf_file_path) as fr:
        config = json.load(fr)
    job_data.update(config)
    job_data["my_role"] = config.get("local", {}).get("role")
    save_job_info(job_id=job_id, **job_data)
    update_job_queue(job_id=job_id, update_data={"status": "ready"})
    return get_json_result(msg="success, pid is %s" % p.pid)
def run_job(self, job_id, config):
    """Override runtime parameters for *job_id* and dispatch the workflow to every party.

    Returns True when every party accepted the start request, else False.
    """
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    # Fix: the original passed `config` where `job_id` belonged in this log line.
    logger.info('job_id {} parameters overrode {}'.format(job_id, _job_dir))
    run_job_success = True
    job_param = dict()
    job_param['job_id'] = job_id
    job_param['initiator'] = PARTY_ID
    # one runtime_conf.json per role/party was generated by the override step
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'),
                                       recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        runtime_conf['JobParam'] = job_param
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _module = runtime_conf['module']
        st, msg = federated_api(job_id=job_id, method='POST',
                                url='/workflow/{}/{}/{}'.format(job_id, _module, _role),
                                party_id=_party_id,
                                json_body=runtime_conf)
        if st == 0:
            save_job_info(job_id=job_id, role=_role, party_id=_party_id,
                          save_info={"status": "ready", "initiator": PARTY_ID},
                          create=True)
        else:
            # keep contacting the remaining parties, but report overall failure
            run_job_success = False
    logger.info("run job done")
    return run_job_success
def stop_job(job_id):
    """Broadcast a DELETE /workflow request to every party participating in the job."""
    _job_dir = get_job_directory(job_id)
    all_party = set()
    conf_pattern = os.path.join(_job_dir, '**', 'runtime_conf.json')
    # collect the (role, party_id) pairs from every generated runtime conf
    for runtime_conf_path in glob.glob(conf_pattern, recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        for _role, _party_ids in runtime_conf['role'].items():
            for _party_id in _party_ids:
                all_party.add((_role, _party_id))
    logger.info('start send stop job to {}'.format(','.join(
        [i[0] for i in all_party])))
    _method = 'DELETE'
    for _role, _party_id in all_party:
        federated_api(job_id=job_id,
                      method=_method,
                      url='/workflow/{}/{}/{}'.format(job_id, _role, _party_id),
                      party_id=_party_id)
    return get_json_result(job_id=job_id)
def stop_job(job_id):
    """Send DELETE /workflow/<job_id> to each local party found under the job dir.

    Returns an error json result on the first failed unary call, otherwise an
    empty success result.
    """
    _job_dir = get_job_directory(job_id)
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'),
                                       recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _url = '/workflow/{}'.format(job_id)
        _method = 'DELETE'
        _packet = wrap_grpc_packet({}, _method, _url, _party_id, job_id)
        channel, stub = get_proxy_data_channel()
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(
                job_id, _party_id, _role, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall stop to remote manager failed')
        finally:
            # Fix: the proxy channel was leaked on every iteration; always release it.
            channel.close()
    return get_json_result()
def run_job(self, job_id, config):
    """Override runtime parameters and start the workflow on every party via the proxy.

    Returns an error json result on the first failed unary call; otherwise
    returns None after all parties have been contacted (original behavior).
    """
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    # Fix: the original passed `config` where `job_id` belonged in this log line.
    logger.info('job_id {} parameters overrode {}'.format(job_id, _job_dir))
    channel, stub = get_proxy_data_channel()
    try:
        for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'),
                                           recursive=True):
            runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
            _role = runtime_conf['local']['role']
            _party_id = runtime_conf['local']['party_id']
            _method = 'POST'
            _module = runtime_conf['module']
            _url = '/workflow/{}/{}/{}'.format(job_id, _module, _role)
            _packet = wrap_grpc_packet(runtime_conf, _method, _url, _party_id, job_id)
            logger.info(
                'Starting workflow job_id:{} party_id:{} role:{} method:{} url:{}'.format(
                    job_id, _party_id, _role, _method, _url))
            try:
                _return = stub.unaryCall(_packet)
                logger.info("Grpc unary response: {}".format(_return))
            except grpc.RpcError as e:
                msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(
                    job_id, _party_id, _role, _method, _url)
                logger.exception(msg)
                return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    finally:
        # Fix: the proxy channel was never closed; release it on every exit path.
        channel.close()
def load_model():
    """HTTP handler: fan out a 'load model' request to every party in the config file.

    The same config is re-sent per party with my_party_id rewritten each time.
    Returns an error json result on the first failed unary call.
    """
    config = file_utils.load_json_conf(request.json.get("config_path"))
    _job_id = generate_job_id()
    # NOTE(review): this channel is never closed on any path — confirm whether
    # get_proxy_data_channel expects the caller to close it (other callers do).
    channel, stub = get_proxy_data_channel()
    for _party_id in config.get("party_ids"):
        # each party receives the config stamped with its own id
        config['my_party_id'] = _party_id
        _method = 'POST'
        _url = '/model/load/do'
        _packet = wrap_grpc_packet(config, _method, _url, _party_id, _job_id)
        logger.info(
            'Starting load model job_id:{} party_id:{} method:{} url:{}'.format(
                _job_id, _party_id, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} method:{} url:{} Failed to start load model'.format(
                _job_id, _party_id, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    return get_json_result()
def publish_online(config_data):
    """Publish this party's model table info online to every serving instance.

    Only the local role's entry in config_data['model'] is published; the table
    name/namespace are derived via dtable_utils from scene/role/party.
    """
    _role = config_data.get('local').get('role')
    _party_id = config_data.get('local').get('party_id')
    for serving in config_data.get('servings'):
        with grpc.insecure_channel(serving) as channel:
            stub = model_service_pb2_grpc.ModelServiceStub(channel)
            publish_model_request = model_service_pb2.PublishRequest()
            # declare every party of every role
            for role_name, role_party in config_data.get("role").items():
                publish_model_request.role[role_name].partyId.extend(role_party)
            for role_name, role_model_config in config_data.get("model").items():
                if role_name != _role:
                    # only our own role's model is published online
                    continue
                if role_model_config.get(str(_party_id)):
                    # build the table-lookup config for the local party's model
                    table_config = copy.deepcopy(role_model_config.get(str(_party_id)))
                    table_config['scene_id'] = config_data.get('scene_id')
                    table_config['local'] = {'role': _role, 'party_id': _party_id}
                    table_config['role'] = config_data.get('role')
                    table_config['data_type'] = 'model'
                    table_config['gen_table_info'] = True
                    table_name, namespace = dtable_utils.get_table_info(config=table_config)
                    publish_model_request.model[_role].roleModelInfo[_party_id].tableName = table_name
                    publish_model_request.model[_role].roleModelInfo[_party_id].namespace = namespace
            publish_model_request.local.role = _role
            publish_model_request.local.partyId = _party_id
            logger.info(publish_model_request)
            response = stub.publishOnline(publish_model_request)
            logger.info(response)
def download_upload(data_func):
    """HTTP handler: resolve the dtable for an upload/download request and
    launch the matching helper module as a background subprocess.

    data_func: "upload" or "download"; also used as the job role for logs/pids.
    """
    request_config = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    if module == "upload":
        # relative upload paths are resolved against the project base directory
        if not os.path.isabs(request_config.get("file", "")):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                                  request_config["file"])
    try:
        request_config["work_mode"] = request_config.get('work_mode', WORK_MODE)
        # uploads may create the destination table; downloads must not
        table_name, namespace = dtable_utils.get_table_info(
            config=request_config,
            create=(True if module == 'upload' else False))
        if not table_name or not namespace:
            return get_json_result(status=102, msg='no table name and namespace')
        request_config['table_name'] = table_name
        request_config['namespace'] = namespace
        conf_file_path = new_runtime_conf(
            job_dir=_job_dir, method=data_func, module=module,
            role=request_config.get('local', {}).get("role"),
            party_id=request_config.get('local', {}).get("party_id", PARTY_ID))
        file_utils.dump_json_conf(request_config, conf_file_path)
        if module == "download":
            # download additionally receives the job id on the command line
            progs = ["python3",
                     os.path.join(file_utils.get_project_base_directory(),
                                  JOB_MODULE_CONF[module]["module_path"]),
                     "-j", _job_id,
                     "-c", conf_file_path
                     ]
        else:
            progs = ["python3",
                     os.path.join(file_utils.get_project_base_directory(),
                                  JOB_MODULE_CONF[module]["module_path"]),
                     "-c", conf_file_path
                     ]
        p = run_subprocess(job_dir=_job_dir, job_role=data_func, progs=progs)
        return get_json_result(job_id=_job_id,
                               data={'pid': p.pid,
                                     'table_name': request_config['table_name'],
                                     'namespace': request_config['namespace']})
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=-104, msg="failed", job_id=_job_id)
def update_job(job_id, role, party_id):
    """HTTP handler: record a party's new job status and drive follow-up actions.

    If this manager initiated the job it decides whether the whole job should
    be stopped; otherwise it forwards the status update to the initiator once
    (the 'initiatorUpdate' flag prevents ping-pong).
    """
    request_data = request.json
    logger.info('job_id:{} role:{} party_id:{} status:{}'.format(
        job_id, role, party_id, request_data.get('status')))
    job_info = save_job_info(job_id=job_id, role=role, party_id=party_id,
                             save_info={"status": request_data.get("status")})
    if not job_info:
        # nothing was saved: this job was not started via this Task Manager
        logger.info('job_id {} may not be started by the Task Manager.'.format(job_id))
        return get_json_result(job_id=job_id, status=101,
                               msg='this task may not be started by the Task Manager.')
    update_job_queue(job_id=job_id, role=role, party_id=party_id,
                     save_data={"status": request_data.get("status")})
    if request_data.get("status") in ["success", "failed", "deleted"]:
        # terminal status: drop the queue entry either way
        pop_from_job_queue(job_id=job_id)
    if is_job_initiator(job_info.initiator, PARTY_ID):
        # I am job initiator
        logger.info('i am job {} initiator'.format(job_id))
        # check job status
        jobs = query_job_by_id(job_id=job_id)
        job_status = set([job.status for job in jobs])
        do_stop_job = False
        if 'failed' in job_status or 'deleted' in job_status:
            # any party failing/deleting aborts the whole federated job
            do_stop_job = True
        elif len(job_status) == 1 and 'success' in job_status:
            # every party finished successfully
            do_stop_job = True
        if do_stop_job:
            stop_job(job_id=job_id)
    else:
        # send job status to initiator
        if not request_data.get('initiatorUpdate', False):
            # mark as forwarded so the initiator does not echo it back
            request_data['initiatorUpdate'] = True
            federated_api(job_id=job_id, method='POST',
                          url='/job/jobStatus/{}/{}/{}'.format(job_id, role, party_id),
                          party_id=job_info.initiator,
                          json_body=request_data)
    return get_json_result(job_id=job_id)
def submit_workflow_job():
    """HTTP handler: generate a job id for the posted workflow config and queue it."""
    workflow_config = request.json
    new_job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(new_job_id, workflow_config))
    push_into_job_queue(job_id=new_job_id, config=workflow_config)
    return get_json_result(job_id=new_job_id)
def load_model(config_data):
    """Ask each serving instance to load all parties' model tables.

    Three passes per serving: (1) register parties and resolve the local
    party's model table (its auto-generated name is assumed shared by all
    parties), (2) resolve the other parties' tables falling back to that
    name, (3) pick the local role/party for the request's `local` field.
    """
    # common fields used to derive every table name via dtable_utils
    default_table_config = dict()
    default_table_config['scene_id'] = config_data.get('scene_id')
    default_table_config['role'] = config_data.get('role')
    default_table_config['data_type'] = 'model'
    default_table_config['gen_table_info'] = True
    logger.info(config_data)
    for serving in config_data.get('servings'):
        with grpc.insecure_channel(serving) as channel:
            stub = model_service_pb2_grpc.ModelServiceStub(channel)
            load_model_request = model_service_pb2.PublishRequest()
            model_table_name = ''
            for role_name, role_party in config_data.get("role").items():
                for _party_id in role_party:
                    load_model_request.role[role_name].partyId.append(_party_id)
                    if _party_id == PARTY_ID:
                        # get model table name
                        # the model table names automatically generated by all parties are the same
                        local_party_model_config = config_data.get('model').get(
                            role_name, {}).get(str(_party_id))
                        if local_party_model_config:
                            table_config = copy.deepcopy(default_table_config)
                            table_config.update(local_party_model_config)
                            table_config['local'] = {'role': role_name, 'party_id': PARTY_ID}
                            table_name, namespace = dtable_utils.get_table_info(config=table_config)
                            # remembered as the fallback name for other parties below
                            model_table_name = table_name
                            load_model_request.model[role_name].roleModelInfo[
                                int(_party_id)].tableName = table_name
                            load_model_request.model[role_name].roleModelInfo[
                                int(_party_id)].namespace = namespace
            logger.info('load another party model')
            for role_name, role_model_config in config_data.get("model").items():
                for _party_id, role_party_model_config in role_model_config.items():
                    # skip our own party (handled above) and bail if no fallback name exists
                    if _party_id == str(PARTY_ID) or not model_table_name:
                        continue
                    table_config = copy.deepcopy(default_table_config)
                    table_config['local'] = {'role': role_name, 'party_id': _party_id}
                    table_config.update(role_party_model_config)
                    # explicit table_name wins; otherwise reuse the local party's name
                    table_config['table_name'] = table_config['table_name'] if table_config.get(
                        'table_name') else model_table_name
                    table_name, namespace = dtable_utils.get_table_info(config=table_config)
                    load_model_request.model[role_name].roleModelInfo[
                        int(_party_id)].tableName = table_name
                    load_model_request.model[role_name].roleModelInfo[
                        int(_party_id)].namespace = namespace
            logger.info('request serving: {} load model'.format(serving))
            # pick the first non-arbiter role that contains our party id as 'local'
            for role_name, role_party in config_data.get("role").items():
                if role_name == 'arbiter':
                    continue
                for _party_id in role_party:
                    if _party_id == PARTY_ID:
                        load_model_request.local.role = role_name
                        load_model_request.local.partyId = _party_id
            logger.info(load_model_request)
            response = stub.publishLoad(load_model_request)
            # note: role_name/_party_id here are the last values of the loop above
            logger.info('{} {} load model status: {}'.format(
                role_name, _party_id, response.statusCode))