def insert_data_to_db(self, metric_namespace: str, metric_name: str, data_type: int, kv, job_level=False):
    with DB.connection_context():
        try:
            tracking_metric = TrackingMetric.model(table_index=self.job_id)
            tracking_metric.f_job_id = self.job_id
            tracking_metric.f_component_name = self.component_name if not job_level else 'dag'
            tracking_metric.f_task_id = self.task_id
            tracking_metric.f_role = self.role
            tracking_metric.f_party_id = self.party_id
            tracking_metric.f_metric_namespace = metric_namespace
            tracking_metric.f_metric_name = metric_name
            tracking_metric.f_type = data_type
            default_db_source = tracking_metric.to_json()
            tracking_metric_data_source = []
            for k, v in kv:
                db_source = default_db_source.copy()
                db_source['f_key'] = serialize_b64(k)
                db_source['f_value'] = serialize_b64(v)
                db_source['f_create_time'] = current_timestamp()
                tracking_metric_data_source.append(db_source)
            self.bulk_insert_model_data(TrackingMetric.model(table_index=self.get_table_index()),
                                        tracking_metric_data_source)
        except Exception as e:
            stat_logger.exception(e)
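# Hedged usage sketch for insert_data_to_db; the Tracker constructor arguments
# below are assumptions inferred from the fields the method populates, not a
# confirmed API. kv is unpacked as (key, value) pairs, so any such iterable works.
tracker = Tracker(job_id="202204010000", component_name="hetero_lr_0",
                  task_id="202204010000_hetero_lr_0", role="guest", party_id=9999)
tracker.insert_data_to_db(metric_namespace="train",
                          metric_name="loss",
                          data_type=1,
                          kv=[(0, 0.69), (1, 0.42), (2, 0.31)])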
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                           runtime_conf=job.f_runtime_conf_on_party,
                                                           train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"], party_id=job_info["party_id"],
                                        role=job_info["role"], only_latest=True)
            for task in tasks:
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                           runtime_conf=job_info.get('job_runtime_conf', {}),
                                                           train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
def get_component_summary():
    request_data = request.json
    try:
        required_params = ["job_id", "component_name", "role", "party_id"]
        detect_utils.check_config(request_data, required_params)
        tracker = Tracker(job_id=request_data["job_id"],
                          component_name=request_data["component_name"],
                          role=request_data["role"],
                          party_id=request_data["party_id"],
                          task_id=request_data.get("task_id", None),
                          task_version=request_data.get("task_version", None))
        summary = tracker.read_summary_from_db()
        if summary:
            if request_data.get("filename"):
                temp_filepath = os.path.join(TEMP_DIRECTORY, request_data.get("filename"))
                with open(temp_filepath, "w") as fout:
                    fout.write(json.dumps(summary, indent=4))
                return send_file(open(temp_filepath, "rb"), as_attachment=True,
                                 attachment_filename=request_data.get("filename"))
            else:
                return get_json_result(data=summary)
        return error_response(210, "No component summary found, please check if arguments are specified correctly.")
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
def load_model():
    request_config = request.json
    _job_id = generate_job_id()
    initiator_party_id = request_config['initiator']['party_id']
    initiator_role = request_config['initiator']['role']
    publish_model.generate_publish_model_info(request_config)
    load_status = True
    load_status_info = {}
    load_status_msg = 'success'
    for role_name, role_partys in request_config.get("role").items():
        if role_name == 'arbiter':
            continue
        load_status_info[role_name] = load_status_info.get(role_name, {})
        for _party_id in role_partys:
            request_config['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                response = federated_api(job_id=_job_id,
                                         method='POST',
                                         endpoint='/{}/model/load/do'.format(API_VERSION),
                                         src_party_id=initiator_party_id,
                                         dest_party_id=_party_id,
                                         src_role=initiator_role,
                                         json_body=request_config,
                                         work_mode=request_config['job_parameters']['work_mode'])
                load_status_info[role_name][_party_id] = response['retcode']
            except Exception as e:
                stat_logger.exception(e)
                load_status = False
                load_status_msg = 'failed'
                load_status_info[role_name][_party_id] = 100
    return get_json_result(job_id=_job_id, retcode=(0 if load_status else 101),
                           retmsg=load_status_msg, data=load_status_info)
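# Sketch of the request_config shape load_model expects, inferred from the
# lookups above (hypothetical values; 'arbiter' entries are skipped on dispatch):
request_config = {
    'initiator': {'role': 'guest', 'party_id': 9999},
    'role': {'guest': [9999], 'host': [10000], 'arbiter': [10000]},
    'job_parameters': {'work_mode': 1,
                       'model_id': 'guest-9999#host-10000#model',
                       'model_version': '202204010000'},
}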
def dsl_generator():
    data = request.json
    cpn_str = data.get("cpn_str", "")
    try:
        if not cpn_str:
            raise Exception("Component list should not be empty.")
        if isinstance(cpn_str, list):
            cpn_list = cpn_str
        else:
            # the original combined check `(cpn_str.find("/") and cpn_str.find("\\")) != -1`
            # missed strings containing '/' but not '\\'; test each separator explicitly
            if cpn_str.find("/") != -1 or cpn_str.find("\\") != -1:
                raise Exception("Component list string should not contain '/' or '\\'.")
            cpn_str = cpn_str.replace(" ", "").replace("\n", "").strip(",[]")
            cpn_list = cpn_str.split(",")
        train_dsl = json_loads(data.get("train_dsl"))
        parser = schedule_utils.get_dsl_parser_by_version(data.get("version", "2"))
        predict_dsl = parser.deploy_component(cpn_list, train_dsl)
        if data.get("filename"):
            os.makedirs(TEMP_DIRECTORY, exist_ok=True)
            temp_filepath = os.path.join(TEMP_DIRECTORY, data.get("filename"))
            with open(temp_filepath, "w") as fout:
                fout.write(json.dumps(predict_dsl, indent=4))
            return send_file(open(temp_filepath, 'rb'), as_attachment=True,
                             attachment_filename=data.get("filename"))
        return get_json_result(data=predict_dsl)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, "DSL generating failed. For more details, "
                                   "please check logs/fate_flow/fate_flow_stat.log.")
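# Illustration of the component-list normalization above (plain Python, no FATE
# dependency): surrounding brackets, whitespace, and newlines are stripped
# before splitting on commas.
cpn_str = " [dataio_0, hetero_lr_0]\n"
cpn_str = cpn_str.replace(" ", "").replace("\n", "").strip(",[]")
cpn_list = cpn_str.split(",")   # ['dataio_0', 'hetero_lr_0']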
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                               runtime_conf=job.f_runtime_conf,
                                                               train_runtime_conf=job.f_train_runtime_conf)
        else:
            job_dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                               runtime_conf=job_info.get('job_runtime_conf', {}),
                                                               train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"], party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
def do_load_model():
    request_data = request.json
    adapter_servings_config(request_data)
    retcode, retmsg = publish_model.load_model(config_data=request_data)
    try:
        if not retcode:
            with DB.connection_context():
                model = MLModel.get_or_none(MLModel.f_role == request_data.get("local").get("role"),
                                            MLModel.f_party_id == request_data.get("local").get("party_id"),
                                            MLModel.f_model_id == request_data.get("job_parameters").get("model_id"),
                                            MLModel.f_model_version == request_data.get("job_parameters").get("model_version"))
                if model:
                    model.f_loaded_times += 1
                    model.save()
    except Exception as modify_err:
        stat_logger.exception(modify_err)
    try:
        party_model_id = gen_party_model_id(role=request_data.get("local").get("role"),
                                            party_id=request_data.get("local").get("party_id"),
                                            model_id=request_data.get("job_parameters").get("model_id"))
        src_model_path = os.path.join(file_utils.get_project_base_directory(), 'model_local_cache',
                                      party_model_id, request_data.get("job_parameters").get("model_version"))
        dst_model_path = os.path.join(file_utils.get_project_base_directory(), 'loaded_model_backup',
                                      party_model_id, request_data.get("job_parameters").get("model_version"))
        if not os.path.exists(dst_model_path):
            shutil.copytree(src=src_model_path, dst=dst_model_path)
    except Exception as copy_err:
        stat_logger.exception(copy_err)
    operation_record(request_data, "load", "success" if not retcode else "failed")
    return get_json_result(retcode=retcode, retmsg=retmsg)
def _wrapper(*args, **kwargs):
    if RuntimeConfig.PROCESS_ROLE in [ProcessRole.SERVER]:
        for i in range(3):
            try:
                stat_logger.info("detect session {} by table {} {}".format(
                    session.get_session_id(), DETECT_TABLE[0], DETECT_TABLE[1]))
                stat_logger.info("start count table {} {}".format(DETECT_TABLE[0], DETECT_TABLE[1]))
                count = session.table(namespace=DETECT_TABLE[0], name=DETECT_TABLE[1]).count()
                stat_logger.info("table {} {} count is {}".format(DETECT_TABLE[0], DETECT_TABLE[1], count))
                if count != DETECT_TABLE[2]:
                    raise Exception("session {} count error".format(session.get_session_id()))
                stat_logger.info("session {} is ok".format(session.get_session_id()))
                break
            except Exception as e:
                stat_logger.exception(e)
                stat_logger.info("start init new session")
                try:
                    clean_server_used_session()
                    init_session_for_flow_server()
                except Exception as e:
                    stat_logger.exception(e)
                    stat_logger.info("init new session failed.")
        else:
            # for-else: all three attempts ended without a successful break
            stat_logger.error("init new session failed.")
    else:
        # if in executor, pass; TODO: detect and restore the session in executor
        pass
    return func(*args, **kwargs)
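# _wrapper above is the inner function of a decorator; a minimal sketch of the
# assumed enclosing structure, using the standard functools idiom (the names
# session_detect/_out_wrapper are illustrative, not confirmed):
import functools

def session_detect():
    def _out_wrapper(func):
        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            ...  # session-detection body shown above
            return func(*args, **kwargs)
        return _wrapper
    return _out_wrapper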
def put_event(self, event):
    try:
        self.queue.put(event)
        stat_logger.info('put event into in-process queue successfully: {}'.format(event))
    except Exception as e:
        stat_logger.exception(e)
        stat_logger.error('put event into in-process queue failed')
def component_output_data_download():
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(f'no task found, please check if the parameters are correct: {request_data}')
    import_component_output_depend(tasks[0].f_provider_info)
    try:
        output_tables_meta = get_component_output_tables_meta(task_data=request_data)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
    limit = request_data.get('limit', -1)
    if not output_tables_meta:
        return error_response(response_code=210, retmsg='no data')
    if limit == 0:
        return error_response(response_code=210, retmsg='limit is 0')
    tar_file_name = 'job_{}_{}_{}_{}_output_data.tar.gz'.format(request_data['job_id'],
                                                                request_data['component_name'],
                                                                request_data['role'],
                                                                request_data['party_id'])
    return TableStorage.send_table(output_tables_meta, tar_file_name, limit=limit,
                                   need_head=request_data.get("head", True))
def do_load_model():
    request_data = request.json
    request_data['servings'] = RuntimeConfig.SERVICE_DB.get_urls('servings')

    role = request_data['local']['role']
    party_id = request_data['local']['party_id']
    model_id = request_data['job_parameters']['model_id']
    model_version = request_data['job_parameters']['model_version']
    party_model_id = model_utils.gen_party_model_id(model_id, role, party_id)

    if get_base_config('enable_model_store', False):
        pipeline_model = pipelined_model.PipelinedModel(party_model_id, model_version)
        component_parameters = {
            'model_id': party_model_id,
            'model_version': model_version,
            'store_address': ServiceRegistry.MODEL_STORE_ADDRESS,
        }
        model_storage = get_model_storage(component_parameters)
        # keep the local cache and the model store in sync, whichever side has the model
        if pipeline_model.exists() and not model_storage.exists(**component_parameters):
            stat_logger.info(f'Uploading {pipeline_model.model_path} to model storage.')
            model_storage.store(**component_parameters)
        elif not pipeline_model.exists() and model_storage.exists(**component_parameters):
            stat_logger.info(f'Downloading {pipeline_model.model_path} from model storage.')
            model_storage.restore(**component_parameters)

    if not model_utils.check_if_deployed(role, party_id, model_id, model_version):
        return get_json_result(retcode=100,
                               retmsg="Only deployed models can be loaded. "
                                      "Please deploy the model before loading.")

    retcode, retmsg = publish_model.load_model(request_data)
    try:
        if not retcode:
            with DB.connection_context():
                model = MLModel.get_or_none(MLModel.f_role == request_data["local"]["role"],
                                            MLModel.f_party_id == request_data["local"]["party_id"],
                                            MLModel.f_model_id == request_data["job_parameters"]["model_id"],
                                            MLModel.f_model_version == request_data["job_parameters"]["model_version"])
                if model:
                    model.f_loaded_times += 1
                    model.save()
    except Exception as modify_err:
        stat_logger.exception(modify_err)

    operation_record(request_data, "load", "success" if not retcode else "failed")
    return get_json_result(retcode=retcode, retmsg=retmsg)
def del_event(self, event):
    try:
        ret = self.dell(event)
        # this is the in-process ListQueue, not the redis queue; log accordingly
        stat_logger.info('delete event from queue {}: {}'.format('successfully' if ret else 'failed', event))
    except Exception as e:
        stat_logger.error('delete event from queue failed')
        stat_logger.exception(e)
        raise Exception('{} not in ListQueue'.format(event))
def put_event(self, event):
    try:
        conn = self.get_conn()
        ret = conn.lpush(self.queue_name, json.dumps(event))
        stat_logger.info('put event into redis queue {}: {}'.format('successfully' if ret else 'failed', event))
    except Exception as e:
        stat_logger.exception(e)
        stat_logger.error('put event into redis queue failed')
def server_error_response(e):
    stat_logger.exception(e)
    if len(e.args) > 1:
        return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg=str(e.args[0]), data=e.args[1])
    return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg=str(e))
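# server_error_response matches Flask's error-handler signature, so it can be
# wired up as a catch-all handler; a minimal sketch assuming a Flask app object:
from flask import Flask

app = Flask(__name__)
app.register_error_handler(Exception, server_error_response)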
def nodes_unquote(nodes):
    urls = [parse.unquote(node) for node in nodes]
    servers = []
    for url in urls:
        try:
            servers.append(url.split('/')[2])
        except Exception as e:
            stat_logger.exception(e)
    return servers
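# Worked example of nodes_unquote: index 2 of the '/'-split URL is the
# network-location part ("host:port").
from urllib import parse

node = 'http%3A%2F%2F127.0.0.1%3A9380%2Fv1'
url = parse.unquote(node)    # 'http://127.0.0.1:9380/v1'
server = url.split('/')[2]   # '127.0.0.1:9380'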
def query_model_info_from_file(model_id=None, model_version=None, role=None, party_id=None,
                               query_filters=None, to_dict=False, **kwargs):
    res = {} if to_dict else []
    model_dir = os.path.join(get_fate_flow_directory(), 'model_local_cache')
    glob_dir = (f"{model_dir}{os.sep}"
                f"{role if role else '*'}#{party_id if party_id else '*'}#{model_id if model_id else '*'}"
                f"{os.sep}{model_version if model_version else '*'}")
    stat_logger.info(f'glob model dir: {glob_dir}')
    for fp in glob.glob(glob_dir):
        pipeline_model = PipelinedModel(model_id=fp.split(os.path.sep)[-2],
                                        model_version=fp.split(os.path.sep)[-1])
        model_info = gather_model_info_data(pipeline_model, query_filters=query_filters)
        if not model_info:
            continue
        # cache directory names follow the pattern <role>#<party_id>#<model_id>;
        # split on os.path.sep rather than '/' so this also works on Windows
        _role, _party_id = fp.split(os.path.sep)[-2].split('#')[:2]
        model_info["f_role"] = _role
        model_info["f_party_id"] = _party_id
        if isinstance(res, dict):
            res[fp] = model_info
        else:
            res.append(model_info)
        if kwargs.get('save'):
            try:
                insert_info = gather_model_info_data(pipeline_model).copy()
                insert_info['role'] = _role
                insert_info['party_id'] = _party_id
                insert_info['job_id'] = insert_info.get('f_model_version')
                insert_info['size'] = pipeline_model.calculate_model_file_size()
                if compare_version(insert_info['f_fate_version'], '1.5.1') == 'lt':
                    insert_info['roles'] = insert_info.get('f_train_runtime_conf', {}).get('role', {})
                    insert_info['initiator_role'] = insert_info.get('f_train_runtime_conf', {}).get('initiator', {}).get('role')
                    insert_info['initiator_party_id'] = insert_info.get('f_train_runtime_conf', {}).get('initiator', {}).get('party_id')
                save_model_info(insert_info)
            except Exception as e:
                stat_logger.exception(e)
    if res:
        return 0, 'Query model info from local model success.', res
    return 100, 'Query model info failed, cannot find model from local model files.', res
def drop_metric_data_mode(model):
    try:
        drop_sql = 'drop table t_tracking_metric_{}'.format(model)
        DB.execute_sql(drop_sql)
        stat_logger.info(drop_sql)
        return drop_sql
    except Exception as e:
        stat_logger.exception(e)
        raise e
def get_event(self):
    try:
        event = self.queue.get(block=True)
        stat_logger.info('get event from in-process queue successfully: {}'.format(event))
        return event
    except Exception as e:
        stat_logger.exception(e)
        stat_logger.error('get event from in-process queue failed')
        return None
def put_event(self, event, status=None, job_id=None):
    try:
        self.put(event)
        stat_logger.info('put event into in-process queue successfully: {}'.format(event))
    except Exception as e:
        stat_logger.error('put event into in-process queue failed')
        stat_logger.exception(e)
        raise e
def put_event(self, event, status=None, job_id=''):
    try:
        is_failed = self.put(item=event, status=status, job_id=job_id)
        stat_logger.info('put event into queue successfully: {}'.format(event))
        return is_failed
    except Exception as e:
        stat_logger.error('put event into queue failed')
        stat_logger.exception(e)
        raise e
def get_event(self, status=None, end_status=None):
    try:
        event = self.get(block=True, status=status, end_status=end_status)
        stat_logger.info('get event from queue successfully: {}, status {}'.format(event, status))
        return event
    except Exception as e:
        stat_logger.error('get event from queue failed')
        stat_logger.exception(e)
        return None
def del_event(self, event):
    try:
        conn = self.get_conn()
        ret = conn.lrem(self.queue_name, 1, json.dumps(event))
        stat_logger.info('delete event from redis queue {}: {}'.format('successfully' if ret else 'failed', event))
        if not ret:
            raise Exception('job not in redis queue')
    except Exception as e:
        stat_logger.error('delete event from redis queue failed')
        stat_logger.exception(e)
        raise Exception('delete event from redis queue failed')
def get_event(self):
    try:
        conn = self.get_conn()
        content = conn.brpop([self.queue_name])
        event = self.parse_event(content[1])
        stat_logger.info('get event from redis queue: {}'.format(event))
        return event
    except Exception as e:
        stat_logger.exception(e)
        stat_logger.error('get event from redis queue failed')
        return None
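# Hypothetical round trip over the redis-backed queue methods above; the
# constructor arguments are assumptions, not a confirmed signature.
queue = RedisQueue(queue_name='fate_flow_job_queue', host='127.0.0.1', port=6379)
queue.put_event({'job_id': '202204010000'})   # LPUSH onto the list
event = queue.get_event()                     # BRPOP blocks until an event arrives
queue.del_event(event)                        # LREM; raises if already consumed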
def start_clean_job(**kwargs):
    tasks = query_task(**kwargs)
    if not tasks:
        raise Exception('no task found')
    for task in tasks:
        task_info = get_task_info(task.f_job_id, task.f_role, task.f_party_id, task.f_component_name)
        try:
            # clean session
            stat_logger.info('start {} {} {} {} session stop'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            start_session_stop(task)
            stat_logger.info('stop {} {} {} {} session success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception:
            # a failed session stop should not block the rest of the cleanup
            pass
        try:
            # clean data table
            stat_logger.info('start delete {} {} {} {} data table'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            data_views = query_data_view(**task_info)
            if data_views:
                delete_table(data_views)
            stat_logger.info('delete {} {} {} {} data table success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            stat_logger.info('delete {} {} {} {} data table failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
        try:
            # clean metric data
            stat_logger.info('start delete {} {} {} {} metric data'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            delete_metric_data(task_info)
            stat_logger.info('delete {} {} {} {} metric data success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            stat_logger.info('delete {} {} {} {} metric data failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
def pipeline_dag_dependency(job_id):
    try:
        jobs = job_utils.query_job(job_id=job_id)
        if not jobs:
            raise Exception('query job {} failed'.format(job_id))
        job = jobs[0]
        job_dsl_parser = job_utils.get_job_dsl_parser(dsl=json_loads(job.f_dsl),
                                                      runtime_conf=json_loads(job.f_runtime_conf),
                                                      train_runtime_conf=json_loads(job.f_train_runtime_conf))
        return job_dsl_parser.get_dependency()
    except Exception as e:
        stat_logger.exception(e)
        raise e
def bulk_insert_model_data(self, model, data_source):
    with DB.connection_context():
        try:
            DB.create_tables([model])
            # the local database (SQLite) bounds the number of host variables per
            # statement, so use much smaller batches than against MySQL
            batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
            for i in range(0, len(data_source), batch_size):
                with DB.atomic():
                    model.insert_many(data_source[i:i + batch_size]).execute()
            return len(data_source)
        except Exception as e:
            stat_logger.exception(e)
            return 0
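# Standalone sketch of the same batching pattern with plain peewee (assumed
# equivalent to what bulk_insert_model_data does through FATE's DB wrapper).
from peewee import SqliteDatabase, Model, CharField

db = SqliteDatabase(':memory:')

class Metric(Model):
    f_key = CharField()

    class Meta:
        database = db

db.create_tables([Metric])
rows = [{'f_key': str(i)} for i in range(1000)]
batch_size = 50   # small batches keep SQLite under its bound-variable limit
for i in range(0, len(rows), batch_size):
    with db.atomic():
        Metric.insert_many(rows[i:i + batch_size]).execute()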
def start_clean_job(cls, **kwargs):
    tasks = JobSaver.query_task(**kwargs)
    if not tasks:
        raise Exception('no task found')
    for task in tasks:
        try:
            # clean session
            stat_logger.info('start {} {} {} {} session stop'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            start_session_stop(task)
            stat_logger.info('stop {} {} {} {} session success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception:
            # a failed session stop should not block the rest of the cleanup
            pass
        try:
            # clean data table
            JobClean.clean_table(job_id=task.f_job_id, role=task.f_role,
                                 party_id=task.f_party_id, component_name=task.f_component_name)
        except Exception as e:
            stat_logger.info('delete {} {} {} {} data table failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
        try:
            # clean metric data
            stat_logger.info('start delete {} {} {} {} metric data'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            delete_metric_data({'job_id': task.f_job_id,
                                'role': task.f_role,
                                'party_id': task.f_party_id,
                                'component_name': task.f_component_name})
            stat_logger.info('delete {} {} {} {} metric data success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            stat_logger.info('delete {} {} {} {} metric data failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
def parse_proto_object(proto_object, proto_object_serialized_bytes):
    try:
        proto_object.ParseFromString(proto_object_serialized_bytes)
        stat_logger.info('parse {} proto object normal'.format(type(proto_object).__name__))
    except Exception as e1:
        try:
            # the buffer may be a DefaultEmptyFillMessage placeholder; if it
            # parses as one, reset the target object to its default state
            fill_message = default_empty_fill_pb2.DefaultEmptyFillMessage()
            fill_message.ParseFromString(proto_object_serialized_bytes)
            proto_object.ParseFromString(bytes())
            stat_logger.info('parse {} proto object with default values'.format(type(proto_object).__name__))
        except Exception as e2:
            stat_logger.exception(e2)
            raise e1
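# Hypothetical call to parse_proto_object; the generated module path and file
# name are assumptions for illustration.
from federatedml.protobuf.generated import lr_model_param_pb2

with open('lr_model_param.pb', 'rb') as f:
    buffer_bytes = f.read()
param = lr_model_param_pb2.LRModelParam()
parse_proto_object(param, buffer_bytes)  # falls back to defaults on empty-fill buffers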
def delete_metric_data_from_db(metric_info):
    try:
        job_id = metric_info.pop('job_id')
        delete_sql = 'delete from t_tracking_metric_{} where f_job_id="{}"'.format(job_id[:8], job_id)
        for k, v in metric_info.items():
            if hasattr(TrackingMetric, "f_" + k):
                delete_sql += ' and f_{}="{}"'.format(k, v)
        DB.execute_sql(delete_sql)
        stat_logger.info(delete_sql)
        return delete_sql
    except Exception as e:
        stat_logger.exception(e)
        raise e
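# Worked example of the SQL assembled above (hypothetical inputs); the table
# suffix is the first eight characters of the job id:
metric_info = {'job_id': '202204010000', 'role': 'guest'}
# -> delete from t_tracking_metric_20220401 where f_job_id="202204010000" and f_role="guest"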
def table_delete():
    request_data = request.json
    table_name = request_data.get('table_name')
    namespace = request_data.get('namespace')
    with storage.Session.build(name=table_name, namespace=namespace) as storage_session:
        table = storage_session.get_table()
        if table:
            table.destroy()
            data = {'table_name': table_name, 'namespace': namespace}
            try:
                table.close()
            except Exception as e:
                stat_logger.exception(e)
            return get_json_result(data=data)
        else:
            return get_json_result(retcode=101, retmsg='table not found')
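# Hypothetical client call to the table-delete endpoint; the URL path is an
# assumption for illustration.
import requests

resp = requests.post('http://127.0.0.1:9380/v1/table/delete',
                     json={'table_name': 'breast_hetero_guest', 'namespace': 'experiment'})
print(resp.json())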