def get_component_summary():
    request_data = request.json
    try:
        required_params = ["job_id", "component_name", "role", "party_id"]
        detect_utils.check_config(request_data, required_params)
        tracker = Tracker(job_id=request_data["job_id"],
                          component_name=request_data["component_name"],
                          role=request_data["role"],
                          party_id=request_data["party_id"],
                          task_id=request_data.get("task_id", None),
                          task_version=request_data.get("task_version", None))
        summary = tracker.read_summary_from_db()
        if summary:
            if request_data.get("filename"):
                temp_filepath = os.path.join(TEMP_DIRECTORY, request_data.get("filename"))
                with open(temp_filepath, "w") as fout:
                    fout.write(json.dumps(summary, indent=4))
                return send_file(open(temp_filepath, "rb"), as_attachment=True,
                                 attachment_filename=request_data.get("filename"))
            else:
                return get_json_result(data=summary)
        return error_response(210, "No component summary found, please check if arguments are specified correctly.")
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
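# A minimal client-side sketch of calling the component summary endpoint above. The
# route path below is a placeholder (the route decorator is not shown here); the JSON
# body mirrors the required_params checked by detect_utils.check_config, and the values
# are examples only.
import requests

def fetch_component_summary_sketch(base_url="http://127.0.0.1:9380"):
    payload = {
        "job_id": "202101010000000000000",
        "component_name": "hetero_lr_0",
        "role": "guest",
        "party_id": 9999,
    }
    # assumed endpoint path; adjust to the actual registered route
    response = requests.post(f"{base_url}/v1/tracking/component/summary/download", json=payload)
    return response.json()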
def job_view():
    request_data = request.json
    check_request_parameters(request_data)
    job_tracker = Tracker(job_id=request_data['job_id'], role=request_data['role'],
                          party_id=request_data['party_id'])
    job_view_data = job_tracker.get_job_view()
    if job_view_data:
        job_metric_list = job_tracker.get_metric_list(job_level=True)
        job_view_data['model_summary'] = {}
        for metric_namespace, namespace_metrics in job_metric_list.items():
            job_view_data['model_summary'][metric_namespace] = \
                job_view_data['model_summary'].get(metric_namespace, {})
            for metric_name in namespace_metrics:
                job_view_data['model_summary'][metric_namespace][metric_name] = \
                    job_view_data['model_summary'][metric_namespace].get(metric_name, {})
                for metric_data in job_tracker.get_job_metric_data(metric_namespace=metric_namespace,
                                                                   metric_name=metric_name):
                    job_view_data['model_summary'][metric_namespace][metric_name][metric_data.key] = metric_data.value
        return get_json_result(retcode=0, retmsg='success', data=job_view_data)
    else:
        return get_json_result(retcode=101, retmsg='error')
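# Illustrative nesting of job_view_data['model_summary'] as built by the loops above:
# metric namespace -> metric name -> {metric_data.key: metric_data.value}. Names and
# values here are placeholders, not taken from a real job.
example_model_summary = {
    "train": {
        "hetero_lr_0": {"auc": 0.87, "ks": 0.52},
    },
}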
def component_metric_all():
    request_data = request.json
    check_request_parameters(request_data)
    tracker = Tracker(job_id=request_data['job_id'], component_name=request_data['component_name'],
                      role=request_data['role'], party_id=request_data['party_id'])
    metrics = tracker.get_metric_list()
    all_metric_data = {}
    if metrics:
        for metric_namespace, metric_names in metrics.items():
            all_metric_data[metric_namespace] = all_metric_data.get(metric_namespace, {})
            for metric_name in metric_names:
                all_metric_data[metric_namespace][metric_name] = \
                    all_metric_data[metric_namespace].get(metric_name, {})
                metric_data, metric_meta = get_metric_all_data(tracker=tracker,
                                                               metric_namespace=metric_namespace,
                                                               metric_name=metric_name)
                all_metric_data[metric_namespace][metric_name]['data'] = metric_data
                all_metric_data[metric_namespace][metric_name]['meta'] = metric_meta
        return get_json_result(retcode=0, retmsg='success', data=all_metric_data)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def component_metrics():
    request_data = request.json
    check_request_parameters(request_data)
    tracker = Tracker(job_id=request_data['job_id'], component_name=request_data['component_name'],
                      role=request_data['role'], party_id=request_data['party_id'])
    metrics = tracker.get_metric_list()
    if metrics:
        return get_json_result(retcode=0, retmsg='success', data=metrics)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def save_component_summary(job_id: str, component_name: str, task_version: int,
                           task_id: str, role: str, party_id: int):
    request_data = request.json
    tracker = Tracker(job_id=job_id, component_name=component_name, task_id=task_id,
                      task_version=task_version, role=role, party_id=party_id)
    summary_data = request_data['summary']
    tracker.insert_summary_into_db(summary_data)
    return get_json_result()
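# Illustrative request body for the endpoint above: only the 'summary' field is read
# from the JSON payload, while the identifying fields (job_id, component_name, ...)
# come from the view function's arguments, presumably bound from the route path. The
# summary contents below are placeholders.
example_save_summary_body = {
    "summary": {
        "loss_history": [0.69, 0.41, 0.33],
        "is_converged": True,
    }
}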
def get_component_output_tables_meta(task_data):
    check_request_parameters(task_data)
    tracker = Tracker(job_id=task_data['job_id'], component_name=task_data['component_name'],
                      role=task_data['role'], party_id=task_data['party_id'])
    job_dsl_parser = schedule_utils.get_job_dsl_parser_by_job_id(job_id=task_data['job_id'])
    if not job_dsl_parser:
        raise Exception('cannot get dag parser, please check if the parameters are correct')
    component = job_dsl_parser.get_component_info(task_data['component_name'])
    if not component:
        raise Exception('cannot find component, please check if the parameters are correct')
    output_data_table_infos = tracker.get_output_data_info()
    output_tables_meta = tracker.get_output_data_table(output_data_infos=output_data_table_infos)
    return output_tables_meta
def save_output_data_info(job_id, component_name, task_version, task_id, role, party_id):
    request_data = request.json
    tracker = Tracker(job_id=job_id, component_name=component_name, task_id=task_id,
                      task_version=task_version, role=role, party_id=party_id)
    tracker.insert_output_data_info_into_db(data_name=request_data["data_name"],
                                            table_namespace=request_data["table_namespace"],
                                            table_name=request_data["table_name"])
    return get_json_result()
def clean_table(cls, job_id, role, party_id, component_name):
    # clean data table
    stat_logger.info('start delete {} {} {} {} data table'.format(job_id, role, party_id, component_name))
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id, component_name=component_name)
    output_data_table_infos = tracker.get_output_data_info()
    if output_data_table_infos:
        delete_tables_by_table_infos(output_data_table_infos)
        stat_logger.info('delete {} {} {} {} data table success'.format(job_id, role, party_id, component_name))
def read_output_data_info(job_id, component_name, task_version, task_id, role, party_id):
    request_data = request.json
    tracker = Tracker(job_id=job_id, component_name=component_name, task_id=task_id,
                      task_version=task_version, role=role, party_id=party_id)
    output_data_infos = tracker.read_output_data_info_from_db(data_name=request_data["data_name"])
    response_data = []
    for output_data_info in output_data_infos:
        response_data.append(output_data_info.to_human_model_dict())
    return get_json_result(data=response_data)
def save_metric_meta(job_id, component_name, task_version, task_id, role, party_id):
    request_data = request.json
    tracker = Tracker(job_id=job_id, component_name=component_name, task_id=task_id,
                      task_version=task_version, role=role, party_id=party_id)
    metric_meta = deserialize_b64(request_data['metric_meta'])
    tracker.save_metric_meta(metric_namespace=request_data['metric_namespace'],
                             metric_name=request_data['metric_name'],
                             metric_meta=metric_meta,
                             job_level=request_data['job_level'])
    return get_json_result()
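# Client-side counterpart sketch for the deserialize_b64 call above. This assumes the
# serialize_b64 / deserialize_b64 pair is a base64-wrapped pickle round trip, which
# matches the utility naming but should be verified against the actual implementation;
# the *_sketch names are placeholders, not the real utilities.
import base64
import pickle

def serialize_b64_sketch(obj) -> str:
    # pickle the object, then base64-encode it for transport inside a JSON payload
    return base64.b64encode(pickle.dumps(obj)).decode("utf-8")

def deserialize_b64_sketch(data: str):
    # reverse of the above: base64-decode, then unpickle
    return pickle.loads(base64.b64decode(data))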
def __init__(self, job_id: str, role: str, party_id: int,
             model_id: str = None, model_version: str = None,
             component_name: str = None, component_module_name: str = None,
             task_id: str = None, task_version: int = None,
             job_parameters: RunParameters = None):
    self.job_id = job_id
    self.role = role
    self.party_id = party_id
    self.model_id = model_id
    self.model_version = model_version
    self.component_name = component_name if component_name else 'pipeline'
    self.module_name = component_module_name if component_module_name else 'Pipeline'
    self.task_id = task_id
    self.task_version = task_version
    self.job_parameters = job_parameters
    self.job_tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                               component_name=component_name,
                               task_id=task_id, task_version=task_version,
                               model_id=model_id, model_version=model_version,
                               job_parameters=job_parameters)
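# Illustrative job-level use of the constructor above. The enclosing class name is not
# shown here, so JobLevelTracker below is a stand-in, and the id values are placeholders.
# With component_name omitted, the instance falls back to the 'pipeline' component and
# the 'Pipeline' module, per the defaults above.
job_level_tracker = JobLevelTracker(job_id="202101010000000000000", role="guest", party_id=9999)
assert job_level_tracker.component_name == 'pipeline'
assert job_level_tracker.module_name == 'Pipeline'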
def component_output_model():
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
        job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
    try:
        model_id = runtime_conf_on_party['job_parameters']['model_id']
        model_version = runtime_conf_on_party['job_parameters']['model_version']
    except Exception as e:
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_model_configuration(
            job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
        if any([job_dsl, job_runtime_conf, train_runtime_conf]):
            adapter = JobRuntimeConfigAdapter(job_runtime_conf)
            model_id = adapter.get_common_parameters().to_dict().get('model_id')
            model_version = adapter.get_common_parameters().to_dict().get('model_version')
        else:
            stat_logger.exception(e)
            stat_logger.error(f"Cannot find model info by filters: job id: {request_data.get('job_id')}, "
                              f"role: {request_data.get('role')}, party id: {request_data.get('party_id')}")
            raise Exception(f"Cannot find model info by filters: job id: {request_data.get('job_id')}, "
                            f"role: {request_data.get('role')}, party id: {request_data.get('party_id')}")
    tracker = Tracker(job_id=request_data['job_id'], component_name=request_data['component_name'],
                      role=request_data['role'], party_id=request_data['party_id'],
                      model_id=model_id, model_version=model_version)
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])
    output_model_json = {}
    # There is only one model output at the current dsl version.
    output_model = tracker.get_output_model(
        component.get_output()['model'][0] if component.get_output().get('model') else 'default')
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Param'):
            output_model_json = json_format.MessageToDict(buffer_object, including_default_value_fields=True)
    if output_model_json:
        component_define = tracker.get_component_define()
        this_component_model_meta = {}
        for buffer_name, buffer_object in output_model.items():
            if buffer_name.endswith('Meta'):
                this_component_model_meta['meta_data'] = json_format.MessageToDict(
                    buffer_object, including_default_value_fields=True)
        this_component_model_meta.update(component_define)
        return get_json_result(retcode=0, retmsg='success', data=output_model_json, meta=this_component_model_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def query_component_output_data_info():
    output_data_infos = Tracker.query_output_data_infos(**request.json)
    if not output_data_infos:
        return get_json_result(retcode=101, retmsg='find data view failed')
    return get_json_result(retcode=0, retmsg='success',
                           data=[output_data_info.to_json() for output_data_info in output_data_infos])
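# Illustrative filter set for Tracker.query_output_data_infos as used above: the view
# passes request.json through as keyword arguments, so the accepted keys mirror the
# tracking output-data-info fields (job_id / role / party_id are used this way elsewhere
# in this section; component_name is an assumed additional filter, values are placeholders).
example_output_data_info_query = {
    "job_id": "202101010000000000000",
    "role": "guest",
    "party_id": 9999,
    "component_name": "dataio_0",
}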
def component_output_model():
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
    model_id = job_runtime_conf['job_parameters']['model_id']
    model_version = job_runtime_conf['job_parameters']['model_version']
    tracker = Tracker(job_id=request_data['job_id'], component_name=request_data['component_name'],
                      role=request_data['role'], party_id=request_data['party_id'],
                      model_id=model_id, model_version=model_version)
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])
    output_model_json = {}
    # There is only one model output at the current dsl version.
    output_model = tracker.get_output_model(
        component.get_output()['model'][0] if component.get_output().get('model') else 'default')
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Param'):
            output_model_json = json_format.MessageToDict(buffer_object, including_default_value_fields=True)
    if output_model_json:
        component_define = tracker.get_component_define()
        this_component_model_meta = {}
        for buffer_name, buffer_object in output_model.items():
            if buffer_name.endswith('Meta'):
                this_component_model_meta['meta_data'] = json_format.MessageToDict(
                    buffer_object, including_default_value_fields=True)
        this_component_model_meta.update(component_define)
        return get_json_result(retcode=0, retmsg='success', data=output_model_json, meta=this_component_model_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def component_metric_data():
    request_data = request.json
    check_request_parameters(request_data)
    tracker = Tracker(job_id=request_data['job_id'], component_name=request_data['component_name'],
                      role=request_data['role'], party_id=request_data['party_id'])
    metric_data, metric_meta = get_metric_all_data(tracker=tracker,
                                                   metric_namespace=request_data['metric_namespace'],
                                                   metric_name=request_data['metric_name'])
    if metric_data or metric_meta:
        return get_json_result(retcode=0, retmsg='success', data=metric_data, meta=metric_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data=[], meta={})
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=role, party_id=party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def initialize_job_tracker(cls, job_id, role, party_id, job_parameters, roles, is_initiator, dsl_parser):
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=job_parameters["model_id"],
                      model_version=job_parameters["model_version"])
    if job_parameters.get("job_type", "") != "predict":
        tracker.init_pipelined_model()
    partner = {}
    show_role = {}
    for _role, _role_party in roles.items():
        if is_initiator or _role == role:
            show_role[_role] = show_role.get(_role, [])
            for _party_id in _role_party:
                if is_initiator or _party_id == party_id:
                    show_role[_role].append(_party_id)
        if _role != role:
            partner[_role] = partner.get(_role, [])
            partner[_role].extend(_role_party)
        else:
            for _party_id in _role_party:
                if _party_id != party_id:
                    partner[_role] = partner.get(_role, [])
                    partner[_role].append(_party_id)
    job_args = dsl_parser.get_args_input()
    dataset = cls.get_dataset(is_initiator, role, party_id, roles, job_args)
    tracker.log_job_view({'partner': partner, 'dataset': dataset, 'roles': show_role})
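# Illustrative outcome of the partner / show_role bookkeeping above for a non-initiator
# guest party (placeholder ids). With roles={"guest": [9999], "host": [10000], "arbiter": [10000]},
# role="guest", party_id=9999 and is_initiator=False, the loops yield roughly:
example_show_role = {"guest": [9999]}
example_partner = {"host": [10000], "arbiter": [10000]}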
def clean_task(cls, job_id, task_id, task_version, role, party_id, content_type):
    status = set()
    if content_type == "metrics":
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          task_id=task_id, task_version=task_version)
        status.add(tracker.clean_metrics())
    elif content_type == "table":
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            job_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                              task_id=task_id, task_version=task_version,
                              job_parameters=job_parameters)
            status.add(tracker.clean_task(job.f_runtime_conf_on_party))
    if len(status) == 1 and True in status:
        return True
    else:
        return False
def component_output_data_table():
    output_data_infos = Tracker.query_output_data_infos(**request.json)
    if output_data_infos:
        return get_json_result(retcode=0, retmsg='success',
                               data=[{'table_name': output_data_info.f_table_name,
                                      'table_namespace': output_data_info.f_table_namespace,
                                      'data_name': output_data_info.f_data_name}
                                     for output_data_info in output_data_infos])
    else:
        return get_json_result(retcode=100,
                               retmsg='no table found, please check if the parameters are correct')
def get_job_all_table(job):
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    _, hierarchical_structure = dsl_parser.get_dsl_hierarchical_structure()
    component_table = {}
    component_output_tables = Tracker.query_output_data_infos(job_id=job.f_job_id,
                                                              role=job.f_role,
                                                              party_id=job.f_party_id)
    for component_name_list in hierarchical_structure:
        for component_name in component_name_list:
            component_table[component_name] = {}
            component_input_table = get_component_input_table(dsl_parser, job, component_name)
            component_table[component_name]['input'] = component_input_table
            component_table[component_name]['output'] = {}
            for output_table in component_output_tables:
                if output_table.f_component_name == component_name:
                    component_table[component_name]['output'][output_table.f_data_name] = \
                        {'name': output_table.f_table_name, 'namespace': output_table.f_table_namespace}
    return component_table
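# Illustrative shape of the component_table mapping built above (component, table and
# data names are placeholders):
example_component_table = {
    "dataio_0": {
        "input": [],       # whatever get_component_input_table returns for the component
        "output": {
            "data": {      # keyed by f_data_name
                "name": "example_table_name",
                "namespace": "example_table_namespace",
            },
        },
    },
}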
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=role, party_id=party_id)
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    work_mode = job_parameters['work_mode']
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.work_mode = work_mode
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def get_task_run_args(cls, job_id, role, party_id, task_id, task_version,
                      job_args, job_parameters: RunParameters, task_parameters: RunParameters,
                      input_dsl, filter_type=None, filter_attr=None, get_input_table=False):
    task_run_args = {}
    input_table = {}
    if 'idmapping' in role:
        return {}
    for input_type, input_detail in input_dsl.items():
        if filter_type and input_type not in filter_type:
            continue
        if input_type == 'data':
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for data_type, data_list in input_detail.items():
                data_dict = {}
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    data_dict[data_key_item[0]] = {data_type: []}
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    search_component_name, search_data_name = data_key_item[0], data_key_item[1]
                    storage_table_meta = None
                    if search_component_name == 'args':
                        if job_args.get('data', {}).get(search_data_name).get('namespace', '') and \
                                job_args.get('data', {}).get(search_data_name).get('name', ''):
                            storage_table_meta = storage.StorageTableMeta(
                                name=job_args['data'][search_data_name]['name'],
                                namespace=job_args['data'][search_data_name]['namespace'])
                    else:
                        tracker_client = TrackerClient(job_id=job_id, role=role, party_id=party_id,
                                                       component_name=search_component_name)
                        upstream_output_table_infos_json = tracker_client.get_output_data_info(
                            data_name=search_data_name)
                        if upstream_output_table_infos_json:
                            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                                              component_name=search_component_name)
                            upstream_output_table_infos = []
                            for _ in upstream_output_table_infos_json:
                                upstream_output_table_infos.append(fill_db_model_object(
                                    Tracker.get_dynamic_db_model(TrackingOutputDataInfo, job_id)(), _))
                            output_tables_meta = tracker.get_output_data_table(
                                output_data_infos=upstream_output_table_infos)
                            if output_tables_meta:
                                storage_table_meta = output_tables_meta.get(search_data_name, None)
                    args_from_component = this_type_args[search_component_name] = this_type_args.get(
                        search_component_name, {})
                    if get_input_table and storage_table_meta:
                        input_table[data_key] = {'namespace': storage_table_meta.get_namespace(),
                                                 'name': storage_table_meta.get_name()}
                        computing_table = None
                    elif storage_table_meta:
                        LOGGER.info(f"load computing table use {task_parameters.computing_partitions}")
                        computing_table = session.get_latest_opened().computing.load(
                            storage_table_meta.get_address(),
                            schema=storage_table_meta.get_schema(),
                            partitions=task_parameters.computing_partitions)
                    else:
                        computing_table = None
                    if not computing_table or not filter_attr or not filter_attr.get("data", None):
                        data_dict[search_component_name][data_type].append(computing_table)
                        args_from_component[data_type] = data_dict[search_component_name][data_type]
                    else:
                        args_from_component[data_type] = dict(
                            [(a, getattr(computing_table, "get_{}".format(a))()) for a in filter_attr["data"]])
        elif input_type in ['model', 'isometric_model']:
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for dsl_model_key in input_detail:
                dsl_model_key_items = dsl_model_key.split('.')
                if len(dsl_model_key_items) == 2:
                    search_component_name, search_model_alias = dsl_model_key_items[0], dsl_model_key_items[1]
                elif len(dsl_model_key_items) == 3 and dsl_model_key_items[0] == 'pipeline':
                    search_component_name, search_model_alias = dsl_model_key_items[1], dsl_model_key_items[2]
                else:
                    raise Exception('get input {} failed'.format(input_type))
                models = Tracker(job_id=job_id, role=role, party_id=party_id,
                                 component_name=search_component_name,
                                 model_id=job_parameters.model_id,
                                 model_version=job_parameters.model_version).get_output_model(
                    model_alias=search_model_alias)
                this_type_args[search_component_name] = models
    if get_input_table:
        return input_table
    return task_run_args
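# Illustrative shape of the task_run_args dict assembled above (component names are
# placeholders; None stands in for loaded computing table objects, and the empty dict
# stands in for the protobuf objects returned by get_output_model):
example_task_run_args = {
    "data": {
        "dataio_0": {        # upstream component parsed from a dsl key like "dataio_0.data"
            "data": [None],  # list of loaded computing tables, or filtered attrs when filter_attr is set
        },
    },
    "model": {
        "hetero_lr_0": {},   # model objects keyed by the upstream component name
    },
}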
def submit(cls, job_data, job_id=None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()
    if common_job_parameters.job_type != 'predict':
        # generate job model info
        common_job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                          model_id=common_job_parameters.model_id,
                          model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        if not job_dsl:
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        role=job.f_initiator_role,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        job_runtime_conf_on_party={},
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)
    if job.f_initiator_party_id not in job_runtime_conf['role'][job.f_initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(job.f_initiator_party_id))
        raise Exception("initiator party id error {}".format(job.f_initiator_party_id))
    # create common parameters on initiator
    JobController.backend_compatibility(job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(role=job.f_initiator_role,
                                       job_parameters=common_job_parameters,
                                       create_initiator_baseline=True)
    job.f_runtime_conf = conf_adapter.update_common_parameters(common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    # initiator runtime conf as template
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()
    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job.f_initiator_role, job.f_initiator_party_id,
                                               common_job_parameters, dsl_parser)
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)
    schedule_logger(job_id).info('submit job successfully, job id is {}, model id is {}'.format(
        job.f_job_id, common_job_parameters.model_id))
    board_url = "http://{}:{}{}".format(
        ServiceUtils.get_item("fateboard", "host"),
        ServiceUtils.get_item("fateboard", "port"),
        FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = job_utils.get_job_log_directory(job_id)
    submit_result = {
        "job_id": job_id,
        "model_info": {"model_id": common_job_parameters.model_id,
                       "model_version": common_job_parameters.model_version},
        "logs_directory": logs_directory,
        "board_url": board_url
    }
    submit_result.update(path_dict)
    return submit_result
def run_task(cls):
    task_info = {}
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-v', '--task_version', required=True, type=int, help="task version")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=int, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task parameters")
        parser.add_argument('--run_ip', help="run ip", type=str)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(JOB_SERVER_HOST=args.job_server.split(':')[0],
                                      HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        task_version = args.task_version
        role = args.role
        party_id = args.party_id
        executor_pid = os.getpid()
        task_info.update({
            "job_id": job_id,
            "component_name": component_name,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
            "run_ip": args.run_ip,
            "run_pid": executor_pid
        })
        start_time = current_timestamp()
        job_conf = job_utils.get_job_conf(job_id, role)
        job_dsl = job_conf["job_dsl_path"]
        job_runtime_conf = job_conf["job_runtime_conf_path"]
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=job_conf["train_runtime_conf_path"],
                                                       pipeline_dsl=job_conf["pipeline_dsl_path"])
        party_index = job_runtime_conf["role"][role].index(party_id)
        job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser, job_runtime_conf, role, party_id)
        component = dsl_parser.get_component_info(component_name=component_name)
        component_parameters = component.get_role_parameters()
        component_parameters_on_party = component_parameters[role][party_index] \
            if role in component_parameters else {}
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()
        component_parameters_on_party['output_data_name'] = task_output_dsl.get('data')
        task_parameters = RunParameters(**file_utils.load_json_conf(args.config))
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task_info["party_status"] = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                        append_to_parent_log=True, force=True)
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          component_name=component_name,
                          task_id=task_id,
                          task_version=task_version,
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version,
                          component_module_name=module_name,
                          job_parameters=job_parameters)
        tracker_client = TrackerClient(job_id=job_id, role=role, party_id=party_id,
                                       component_name=component_name,
                                       task_id=task_id,
                                       task_version=task_version,
                                       model_id=job_parameters.model_id,
                                       model_version=job_parameters.model_version,
                                       component_module_name=module_name,
                                       job_parameters=job_parameters)
        run_class_paths = component_parameters_on_party.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_info["party_status"] = TaskStatus.RUNNING
        cls.report_task_update_to_driver(task_info=task_info)
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters.work_mode,
                                  COMPUTING_ENGINE=job_parameters.computing_engine,
                                  FEDERATION_ENGINE=job_parameters.federation_engine,
                                  FEDERATED_MODE=job_parameters.federated_mode)
        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
        else:
            session_options = {}
        sess = session.Session(computing_type=job_parameters.computing_engine,
                               federation_type=job_parameters.federation_engine)
        computing_session_id = job_utils.generate_session_id(task_id, task_version, role, party_id)
        sess.init_computing(computing_session_id=computing_session_id, options=session_options)
        federation_session_id = job_utils.generate_task_version_id(task_id, task_version)
        component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
        sess.init_federation(federation_session_id=federation_session_id,
                             runtime_conf=component_parameters_on_party,
                             service_conf=job_parameters.engines_address.get(EngineType.FEDERATION, {}))
        sess.as_default()
        schedule_logger().info('Run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info("Component parameters on party {}".format(component_parameters_on_party))
        schedule_logger().info("Task input dsl {}".format(task_input_dsl))
        task_run_args = cls.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                              task_id=task_id,
                                              task_version=task_version,
                                              job_args=job_args_on_party,
                                              job_parameters=job_parameters,
                                              task_parameters=task_parameters,
                                              input_dsl=task_input_dsl)
        if module_name in {"Upload", "Download", "Reader", "Writer"}:
            task_run_args["job_parameters"] = job_parameters
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker_client)
        run_object.set_task_version_id(task_version_id=job_utils.generate_task_version_id(task_id, task_version))
        # add profile logs
        profile.profile_start()
        run_object.run(component_parameters_on_party, task_run_args)
        profile.profile_ends()
        output_data = run_object.save_data()
        if not isinstance(output_data, list):
            output_data = [output_data]
        for index in range(0, len(output_data)):
            data_name = task_output_dsl.get('data')[index] if task_output_dsl.get('data') else '{}'.format(index)
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=output_data[index],
                output_storage_engine=job_parameters.storage_engine,
                output_storage_address=job_parameters.engines_address.get(EngineType.STORAGE, {}))
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(data_name=data_name,
                                             table_namespace=persistent_table_namespace,
                                             table_name=persistent_table_name)
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task_info["party_status"] = TaskStatus.SUCCESS
    except Exception as e:
        task_info["party_status"] = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        try:
            task_info["end_time"] = current_timestamp()
            task_info["elapsed"] = task_info["end_time"] - start_time
            cls.report_task_update_to_driver(task_info=task_info)
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            schedule_logger().exception(e)
    schedule_logger().info('task {} {} {} start time: {}'.format(
        task_id, role, party_id, timestamp_to_date(start_time)))
    schedule_logger().info('task {} {} {} end time: {}'.format(
        task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
    schedule_logger().info('task {} {} {} takes {}s'.format(
        task_id, role, party_id, int(task_info["elapsed"]) / 1000))
    schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
    print('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
    return task_info
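# Example invocation of the executor entry point above, mirroring the argparse flags it
# defines. The script name and file paths are assumptions; the flags themselves come
# straight from the parser, and the id values are placeholders.
#
#   python task_executor.py \
#       -j 202101010000000000000 -n hetero_lr_0 \
#       -t 202101010000000000000_hetero_lr_0 -v 0 \
#       -r guest -p 9999 \
#       -c /path/to/task_parameters.json \
#       --run_ip 127.0.0.1 --job_server 127.0.0.1:9380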
def submit(cls, job_data, job_id=None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_initiator = job_runtime_conf['initiator']
    job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
    cls.backend_compatibility(job_parameters=job_parameters)
    job_utils.check_job_runtime_conf(job_runtime_conf)
    if job_parameters.job_type != 'predict':
        # generate job model info
        job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                          model_id=job_parameters.model_id, model_version=job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        if not job_dsl:
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                   runtime_conf=job_runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    cls.adapt_job_parameters(job_parameters=job_parameters)
    # update runtime conf
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception("create job failed: {}".format(response))
    if job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job_runtime_conf["role"].items():
            for party_id in party_ids:
                if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False, job_initiator,
                                               job_parameters, dsl_parser)
    # push into queue
    try:
        JobQueue.create_event(job_id=job_id, initiator_role=initiator_role,
                              initiator_party_id=initiator_party_id)
    except Exception as e:
        raise Exception(f'push job into queue failed:\n{e}')
    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id))
    board_url = "http://{}:{}{}".format(
        ServiceUtils.get_item("fateboard", "host"),
        ServiceUtils.get_item("fateboard", "port"),
        FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = job_utils.get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url