def eval(args):
    model_file = os.path.join(args.model_path, args.model_filename)
    params_file = os.path.join(args.model_path, args.params_filename)
    config = paddle_infer.Config(model_file, params_file)
    config.enable_mkldnn()

    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])

    val_dataset = dataset.ImageNetDataset(data_dir=args.data_dir, mode='val')
    eval_loader = paddle.io.DataLoader(
        val_dataset, batch_size=args.batch_size, drop_last=True)

    cost_time = 0.
    total_num = 0.
    correct_1_num = 0
    correct_5_num = 0
    for batch_id, data in enumerate(eval_loader()):
        img_np = np.array([tensor.numpy() for tensor in data[0]])
        label_np = np.array([tensor.numpy() for tensor in data[1]])

        input_handle.reshape(img_np.shape)
        input_handle.copy_from_cpu(img_np)

        t1 = time.time()
        predictor.run()
        t2 = time.time()
        cost_time += (t2 - t1)

        output_data = output_handle.copy_to_cpu()

        for i in range(len(label_np)):
            label = label_np[i][0]
            result = output_data[i, :]
            index = result.argsort()
            total_num += 1
            if index[-1] == label:
                correct_1_num += 1
            if label in index[-5:]:
                correct_5_num += 1

        if batch_id % 10 == 0:
            acc1 = correct_1_num / total_num
            acc5 = correct_5_num / total_num
            avg_time = cost_time / total_num
            print(
                "batch_id {}, acc1 {:.3f}, acc5 {:.3f}, avg time {:.5f} sec/img".
                format(batch_id, acc1, acc5, avg_time))

        if args.test_samples > 0 and \
                (batch_id + 1) * args.batch_size >= args.test_samples:
            break

    acc1 = correct_1_num / total_num
    acc5 = correct_5_num / total_num
    print("End test: test_acc1 {:.3f}, test_acc5 {:.5f}".format(acc1, acc5))
def create_paddle_predictor(self, args, inference_model_dir=None):
    if inference_model_dir is None:
        inference_model_dir = args.inference_model_dir
    params_file = os.path.join(inference_model_dir, "inference.pdiparams")
    model_file = os.path.join(inference_model_dir, "inference.pdmodel")
    config = Config(model_file, params_file)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
    else:
        config.disable_gpu()
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()
    config.set_cpu_math_library_num_threads(args.cpu_num_threads)

    if args.enable_profile:
        config.enable_profile()
    config.disable_glog_info()
    config.switch_ir_optim(args.ir_optim)  # default true
    if args.use_tensorrt:
        config.enable_tensorrt_engine(
            precision_mode=Config.Precision.Half
            if args.use_fp16 else Config.Precision.Float32,
            max_batch_size=args.batch_size,
            min_subgraph_size=30)

    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    predictor = create_predictor(config)

    return predictor, config
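# Minimal usage sketch (not part of the original snippet): because the config
# above disables the feed/fetch ops, input must go through zero-copy handles.
# The 1x3x224x224 shape is an assumption for a typical image classification model.
def run_paddle_predictor_example(predictor):
    import numpy as np
    fake_input = np.random.rand(1, 3, 224, 224).astype("float32")
    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.reshape(fake_input.shape)
    input_handle.copy_from_cpu(fake_input)
    predictor.run()
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return output_handle.copy_to_cpu()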
def __init__(self, cfg, name='BMN'):
    name = name.upper()
    self.name = name
    model_file = cfg[name]['model_file']
    params_file = cfg[name]['params_file']
    gpu_mem = cfg[name]['gpu_mem']
    device_id = cfg[name]['device_id']
    self.nms_thread = cfg[name]['nms_thread']
    self.min_pred_score = cfg[name]['score_thread']
    self.min_frame_thread = cfg['COMMON']['fps']

    # model init
    config = Config(model_file, params_file)
    config.enable_use_gpu(gpu_mem, device_id)
    config.switch_ir_optim(True)  # default true
    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    self.predictor = create_predictor(config)

    input_names = self.predictor.get_input_names()
    self.input_tensor = self.predictor.get_input_handle(input_names[0])

    output_names = self.predictor.get_output_names()
    self.output1_tensor = self.predictor.get_output_handle(output_names[0])
    self.output2_tensor = self.predictor.get_output_handle(output_names[1])
    self.output3_tensor = self.predictor.get_output_handle(output_names[2])
def init_predictor(args):
    if args.model_dir != "":
        config = Config(args.model_dir)
    else:
        config = Config(args.model_file, args.params_file)

    config.enable_memory_optim()
    if args.tune:
        config.collect_shape_range_info(shape_file)
    if args.use_gpu:
        config.enable_use_gpu(1000, 0)
        if args.use_trt:
            # in dynamic shape mode, max_batch_size will be ignored.
            config.enable_tensorrt_engine(
                workspace_size=1 << 30,
                max_batch_size=1,
                min_subgraph_size=5,
                precision_mode=PrecisionType.Float32,
                use_static=False,
                use_calib_mode=False)
            if args.tuned_dynamic_shape:
                config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
    else:
        # If MKL-DNN is not enabled, you can still set the BLAS thread num here.
        # The thread num should not be greater than the number of cores in the CPU.
        config.set_cpu_math_library_num_threads(4)
        config.enable_mkldnn()

    predictor = create_predictor(config)
    return predictor
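# Hedged sketch (not in the original) of the two-pass dynamic-shape workflow the
# flags above imply; `shape_file` is assumed to be a module-level path and
# `feed_random_batch` is a hypothetical helper that pushes one batch through.
def tune_then_run(args, feed_random_batch):
    # pass 1: collect min/max/opt shape ranges into shape_file
    args.tune, args.use_trt, args.tuned_dynamic_shape = True, False, False
    feed_random_batch(init_predictor(args))
    # pass 2: build TensorRT engines using the tuned shape ranges
    args.tune, args.use_trt, args.tuned_dynamic_shape = False, True, True
    feed_random_batch(init_predictor(args))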
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("cannot find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        logger.info("cannot find model file path {}".format(model_file_path))
        sys.exit(0)
    if not os.path.exists(params_file_path):
        logger.info("cannot find params file path {}".format(params_file_path))
        sys.exit(0)

    config = inference.Config(model_file_path, params_file_path)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=inference.PrecisionType.Half
                if args.use_fp16 else inference.PrecisionType.Float32,
                max_batch_size=args.max_batch_size)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(6)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()
            # TODO LDOUBLEV: fix mkldnn bug when batch_size > 1
            # config.set_mkldnn_op({'conv2d', 'depthwise_conv2d', 'pool2d', 'batch_norm'})
            args.rec_batch_num = 1

    # enable memory optim
    config.enable_memory_optim()
    config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    config.switch_use_feed_fetch_ops(False)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors
def get_model(self):
    # Download and unzip model
    URL = self.args.url
    model_name = self.args.model_name
    file_name = self.args.file_name

    # Save model in temporary directory and load model into memory
    with tempfile.TemporaryDirectory() as tmpdirname:
        os.system("wget -P {} {}".format(tmpdirname, URL))
        os.system("tar -zvxf {0}/{1} -C {0}".format(tmpdirname, file_name))
        with open("{}/{}/inference.pdmodel".format(tmpdirname, model_name),
                  "rb") as f:
            model = f.read()
        with open("{}/{}/inference.pdiparams".format(tmpdirname, model_name),
                  "rb") as f:
            params = f.read()

    # acquire input names
    paddle_config = self.create_inference_config(ir_optim=False)
    paddle_config.set_model_buffer(model, len(model), params, len(params))
    predictor = paddle_infer.create_predictor(paddle_config)
    self.input_names = predictor.get_input_names()
    return model, params
def create_predictor(cls, args, config=None):
    if config is None:
        config = inference.Config(
            os.path.join(args.inference_model_dir, "transformer.pdmodel"),
            os.path.join(args.inference_model_dir, "transformer.pdiparams"))
        if args.use_gpu:
            config.enable_use_gpu(100, 0)
        elif args.use_xpu:
            config.enable_xpu(100)
        else:
            # CPU
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
        # Use ZeroCopy.
        config.switch_use_feed_fetch_ops(False)

    predictor = inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    # output handles must come from get_output_handle, not get_input_handle
    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles)
def __init__(self, cfg, name='PPTSM'):
    name = name.upper()
    self.name = name
    model_file = cfg[name]['model_file']
    params_file = cfg[name]['params_file']
    gpu_mem = cfg[name]['gpu_mem']
    device_id = cfg[name]['device_id']

    # model init
    config = Config(model_file, params_file)
    config.enable_use_gpu(gpu_mem, device_id)
    config.switch_ir_optim(True)  # default true
    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    self.predictor = create_predictor(config)

    input_names = self.predictor.get_input_names()
    self.input_tensor = self.predictor.get_input_handle(input_names[0])

    output_names = self.predictor.get_output_names()
    print("output_names = ", output_names)
    # self.output_tensor = self.predictor.get_output_handle(output_names[1])
    self.output_tensor = self.predictor.get_output_handle(output_names[0])
def get_truth_val_by_inference(self):
    try:
        import paddle.inference as paddle_infer
    except:
        # when paddle is not installed, directly return
        return
    data = np.array([
        0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584,
        0.6283, 0.4919, 0.1856, 0.0795, -0.0332
    ]).astype("float32")[np.newaxis, :]
    input_dict = {"x": data}

    pd_config = paddle_infer.Config("uci_housing_model/")
    pd_config.disable_gpu()
    pd_config.switch_ir_optim(False)

    predictor = paddle_infer.create_predictor(pd_config)

    input_names = predictor.get_input_names()
    for i, input_name in enumerate(input_names):
        input_handle = predictor.get_input_handle(input_name)
        input_handle.copy_from_cpu(input_dict[input_name])

    predictor.run()

    output_data_dict = {}
    output_names = predictor.get_output_names()
    for _, output_data_name in enumerate(output_names):
        output_handle = predictor.get_output_handle(output_data_name)
        output_data = output_handle.copy_to_cpu()
        output_data_dict[output_data_name] = output_data
    # convert to the same format of Serving output
    output_data_dict["prob"] = output_data_dict["fc_0.tmp_1"]
    del output_data_dict["fc_0.tmp_1"]
    self.truth_val = output_data_dict
def paddle_inference(args):
    import paddle.inference as paddle_infer

    config = paddle_infer.Config(args.model_file, args.params_file)
    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])

    img = cv2.imread(args.image_path)
    # normalize to mean 0.5, std 0.5
    img = (img - 127.5) * 0.00784313725
    # BGR2RGB
    img = img[:, :, ::-1]
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, 0)
    img = img.astype('float32')

    input_handle.copy_from_cpu(img)
    predictor.run()

    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])
    output_data = output_handle.copy_to_cpu()
    print('paddle inference result: ', output_data.shape)
def create_paddle_predictor(args):
    config = Config(args.model_file, args.params_file)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
    else:
        config.disable_gpu()
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()

    # config.disable_glog_info()
    config.switch_ir_optim(args.ir_optim)  # default true
    if args.use_tensorrt:
        config.enable_tensorrt_engine(
            precision_mode=Config.Precision.Half
            if args.use_fp16 else Config.Precision.Float32,
            max_batch_size=args.batch_size)

    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    predictor = create_predictor(config)

    return predictor
def __init__(self, cfg, name='ACTION'):
    name = name.upper()
    self.name = name
    model_file = cfg[name]['model_file']
    params_file = cfg[name]['params_file']
    gpu_mem = cfg[name]['gpu_mem']
    device_id = cfg[name]['device_id']
    self.topk = cfg[name]['topk']
    self.frame_offset = cfg[name]['nms_offset']
    self.nms_thread = cfg[name]['nms_thread']
    self.cls_thread = cfg[name]['classify_score_thread']
    self.iou_thread = cfg[name]['iou_score_thread']

    self.label_map_file = cfg['COMMON']['label_dic']
    self.fps = cfg['COMMON']['fps']
    self.nms_id = 5

    # model init
    config = Config(model_file, params_file)
    config.enable_use_gpu(gpu_mem, device_id)
    config.switch_ir_optim(True)  # default true
    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    self.predictor = create_predictor(config)

    input_names = self.predictor.get_input_names()
    self.input1_tensor = self.predictor.get_input_handle(input_names[0])
    # self.input2_tensor = self.predictor.get_input_handle(input_names[1])

    output_names = self.predictor.get_output_names()
    self.output1_tensor = self.predictor.get_output_handle(output_names[0])
    self.output2_tensor = self.predictor.get_output_handle(output_names[1])
def __init__(self, model_path, param_path, use_gpu=False):
    model_path, param_path = self.check_param(model_path, param_path)
    try:
        config = paddle_infer.Config(model_path, param_path)
    except Exception:
        # raise instead of silently constructing the exception
        raise ValueError(
            "The model and parameters do not match; please check whether "
            "the correct model and parameter files were loaded")
    if not use_gpu:
        config.enable_mkldnn()
        # TODO: paddle.fluid is being deprecated; find another way to do this check
        # if paddle.fluid.core.supports_bfloat16():
        #     config.enable_mkldnn_bfloat16()
        config.switch_ir_optim(True)
        config.set_cpu_math_library_num_threads(10)
    else:
        config.enable_use_gpu(500, 0)
        config.delete_pass("conv_elementwise_add_act_fuse_pass")
        config.delete_pass("conv_elementwise_add2_act_fuse_pass")
        config.delete_pass("conv_elementwise_add_fuse_pass")
        config.switch_ir_optim()
    config.enable_memory_optim()
    # use_tensorrt = False
    # TODO: enabling TensorRT currently raises errors on Linux and Windows
    # if use_tensorrt:
    #     config.enable_tensorrt_engine(
    #         workspace_size=1 << 30,
    #         precision_mode=paddle_infer.PrecisionType.Float32,
    #         max_batch_size=1,
    #         min_subgraph_size=5,
    #         use_static=False,
    #         use_calib_mode=False,
    #     )
    self.model = paddle_infer.create_predictor(config)
def load_predictor(self, model_file, params_file):
    config = Config(model_file, params_file)
    if self.predictor_config["use_gpu"]:
        config.enable_use_gpu(200, 0)
        config.switch_ir_optim(True)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(
            self.predictor_config["cpu_threads"])
        if self.predictor_config["enable_mkldnn"]:
            try:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            except Exception as e:
                logging.error(
                    "The current environment does not support `mkldnn`, so disable mkldnn."
                )
    config.disable_glog_info()
    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    predictor = create_predictor(config)
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    return predictor, input_names, output_names
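# Minimal usage sketch (an assumption, not part of the original class): feed one
# numpy batch through the first input/output names returned by load_predictor.
def run_loaded_predictor_example(predictor, input_names, output_names, batch):
    input_handle = predictor.get_input_handle(input_names[0])
    input_handle.reshape(batch.shape)
    input_handle.copy_from_cpu(batch)
    predictor.run()
    output_handle = predictor.get_output_handle(output_names[0])
    return output_handle.copy_to_cpu()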
def load(self) -> bool:
    def get_model_files(ext: str) -> str:
        file_list = []
        for filename in os.listdir(model_path):
            if filename.endswith(ext):
                file_list.append(filename)
        if len(file_list) == 0:
            raise Exception("Missing {} model file".format(ext))
        if len(file_list) > 1:
            raise Exception("More than one {} model file".format(ext))
        return os.path.join(model_path, file_list[0])

    model_path = kserve.Storage.download(self.model_dir)
    config = inference.Config(
        get_model_files('.pdmodel'), get_model_files('.pdiparams'))
    # TODO: add GPU support
    config.disable_gpu()

    self.predictor = inference.create_predictor(config)
    # TODO: add support for multiple input_names/output_names
    input_names = self.predictor.get_input_names()
    self.input_tensor = self.predictor.get_input_handle(input_names[0])
    output_names = self.predictor.get_output_names()
    self.output_tensor = self.predictor.get_output_handle(output_names[0])
    self.ready = True
    return self.ready
def eval(self):
    '''
    create the model predictor from the model config
    '''
    # create the predictor
    self.predictor = create_predictor(self.config)

    # get the model's input and output names
    self.input_names = self.predictor.get_input_names()
    self.output_names = self.predictor.get_output_names()

    # get the number of input and output nodes
    self.input_num = len(self.input_names)
    self.output_num = len(self.output_names)

    # get the input handles
    self.input_handles = []
    for input_name in self.input_names:
        self.input_handles.append(
            self.predictor.get_input_handle(input_name))
    # get the output handles
    self.output_handles = []
    for output_name in self.output_names:
        self.output_handles.append(
            self.predictor.get_output_handle(output_name))
def test_static_save_and_run_inference_predictor(self):
    paddle.enable_static()
    np_data = np.random.random((1, 1, 28, 28)).astype("float32")
    np_label = np.random.random((1, 1)).astype("int64")
    path_prefix = "custom_op_inference/custom_relu"

    from paddle.inference import Config
    from paddle.inference import create_predictor

    for device in self.devices:
        predict = custom_relu_static_inference(
            self.custom_ops[0], device, np_data, np_label, path_prefix)
        # load inference model
        config = Config(path_prefix + ".pdmodel", path_prefix + ".pdiparams")
        predictor = create_predictor(config)
        input_tensor = predictor.get_input_handle(
            predictor.get_input_names()[0])
        input_tensor.reshape(np_data.shape)
        input_tensor.copy_from_cpu(np_data.copy())
        predictor.run()
        output_tensor = predictor.get_output_handle(
            predictor.get_output_names()[0])
        predict_infer = output_tensor.copy_to_cpu()
        self.assertTrue(
            np.isclose(predict, predict_infer, rtol=5e-5).any(),
            "custom op predict: {},\n custom op infer predict: {}".format(
                predict, predict_infer))
    paddle.disable_static()
def load_model(self, model_dir, use_gpu=False, enable_mkldnn=False, cpu_threads=1):
    model = os.path.join(model_dir, '__model__')
    params = os.path.join(model_dir, '__params__')
    config = Config(model, params)
    # configure the predictor
    if use_gpu:
        config.enable_use_gpu(100, 0)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(cpu_threads)
        if enable_mkldnn:
            config.enable_mkldnn()
            config.set_mkldnn_cache_capacity(10)
    config.disable_glog_info()
    config.switch_ir_optim(True)
    config.enable_memory_optim()
    config.switch_use_feed_fetch_ops(False)
    config.switch_specify_input_names(True)
    # create the predictor from the config
    predictor = create_predictor(config)
    # get the model's input and output handles
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_handle = predictor.get_output_handle(output_names[0])
    return predictor, input_handle, output_handle
def create_predictor(cls, args, config=None, profile=False, model_name=None):
    if config is None:
        config = inference.Config(
            os.path.join(args.inference_model_dir, "transformer.pdmodel"),
            os.path.join(args.inference_model_dir, "transformer.pdiparams"))
        if args.device == "gpu":
            config.enable_use_gpu(100, 0)
        elif args.device == "xpu":
            config.enable_xpu(100)
        else:
            # CPU
            config.disable_gpu()
            if args.use_mkl:
                config.enable_mkldnn()
                config.set_cpu_math_library_num_threads(args.threads)
        # Use ZeroCopy.
        config.switch_use_feed_fetch_ops(False)

    if profile:
        if args.mod is recorder:
            autolog = args.mod.Recorder(config, args.infer_batch_size,
                                        args.model_name)
        else:
            pid = os.getpid()
            autolog = args.mod.AutoLogger(
                model_name=args.model_name,
                model_precision="fp32",
                batch_size=args.infer_batch_size,
                save_path=args.save_log_path,
                inference_config=config,
                data_shape="dynamic",
                pids=pid,
                process_name=None,
                gpu_ids=0 if args.device == "gpu" else None,
                time_keys=[
                    'preprocess_time', 'inference_time', 'postprocess_time'
                ],
                warmup=0,
                logger=logger)
    else:
        autolog = None

    predictor = inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles, autolog)
def load_predictor(model_dir,
                   run_mode='fluid',
                   batch_size=1,
                   use_gpu=False,
                   min_subgraph_size=3):
    """set AnalysisConfig, generate AnalysisPredictor

    Args:
        model_dir (str): root path of __model__ and __params__
        use_gpu (bool): whether use gpu
    Returns:
        predictor (PaddlePredictor): AnalysisPredictor
    Raises:
        ValueError: predict by TensorRT need use_gpu == True.
    """
    if not use_gpu and not run_mode == 'fluid':
        raise ValueError(
            "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}"
            .format(run_mode, use_gpu))
    if run_mode == 'trt_int8':
        raise ValueError("TensorRT int8 mode is not supported now, "
                         "please use trt_fp32 or trt_fp16 instead.")
    config = Config(
        os.path.join(model_dir, 'model.pdmodel'),
        os.path.join(model_dir, 'model.pdiparams'))
    precision_map = {
        'trt_int8': Config.Precision.Int8,
        'trt_fp32': Config.Precision.Float32,
        'trt_fp16': Config.Precision.Half
    }
    if use_gpu:
        # initial GPU memory(M), device ID
        config.enable_use_gpu(200, 0)
        # optimize graph and fuse op
        # FIXME(dkp): ir optimize may prune variable inside graph
        #             and incur error in Paddle 2.0, e.g. in SSDLite
        #             FCOS model, set as False currently and should
        #             be set as True after switch_ir_optim fixed
        config.switch_ir_optim(False)
    else:
        config.disable_gpu()

    if run_mode in precision_map.keys():
        config.enable_tensorrt_engine(
            workspace_size=1 << 10,
            max_batch_size=batch_size,
            min_subgraph_size=min_subgraph_size,
            precision_mode=precision_map[run_mode],
            use_static=False,
            use_calib_mode=False)

    # disable print log when predict
    config.disable_glog_info()
    # enable shared memory
    config.enable_memory_optim()
    # disable feed, fetch OP, needed by zero_copy_run
    config.switch_use_feed_fetch_ops(False)
    predictor = create_predictor(config)
    return predictor
def init_resnet50_predictor(model_dir):
    model_file = model_dir + '.pdmodel'
    params_file = model_dir + '.pdiparams'
    config = inference.Config()
    config.set_prog_file(model_file)
    config.set_params_file(params_file)
    # enable GPU prediction: initialize 500 MB of GPU memory on device 0
    # (config.use_gpu() is only a query and was removed here)
    config.enable_use_gpu(500, 0)
    predictor = inference.create_predictor(config)
    return predictor
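# Hedged usage sketch for the predictor above; the (1, 3, 224, 224) input shape
# is the usual ResNet50 layout and is assumed here rather than taken from the source.
import numpy as np

def run_resnet50_example(model_dir):
    predictor = init_resnet50_predictor(model_dir)
    img = np.random.rand(1, 3, 224, 224).astype("float32")
    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.reshape(img.shape)
    input_handle.copy_from_cpu(img)
    predictor.run()
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return output_handle.copy_to_cpu()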
def init_predictor(args):
    config = Config()
    if args.model_dir == "":
        config.set_model(args.model_file, args.params_file)
    else:
        config.set_model(args.model_dir)
    # config.disable_glog_info()
    config.enable_use_gpu(1000, 3)
    predictor = create_predictor(config)
    return predictor
def test_wrong_input(self):
    with self.assertRaises(TypeError):
        program, params = get_sample_model()
        config = self.get_config(program, params)
        predictor = create_predictor(config)
        in_names = predictor.get_input_names()
        in_handle = predictor.get_input_handle(in_names[0])
        in_data = np.ones((1, 6, 64, 64)).astype(np.float32)
        in_handle.copy_from_cpu(list(in_data))
        predictor.run()
def test_apis(self):
    print('trt compile version:', get_trt_compile_version())
    print('trt runtime version:', get_trt_runtime_version())
    program, params = get_sample_model()
    config = self.get_config(program, params)
    predictor = create_predictor(config)
    in_names = predictor.get_input_names()
    in_handle = predictor.get_input_handle(in_names[0])
    in_data = np.ones((1, 6, 32, 32)).astype(np.float32)
    in_handle.copy_from_cpu(in_data)
    predictor.run()
def init_predictor(model_dir):
    # refer https://paddle-inference.readthedocs.io/en/latest/api_reference/python_api_doc/Config/GPUConfig.html
    model_file = model_dir + '.pdmodel'
    params_file = model_dir + '.pdiparams'
    config = inference.Config()
    config.set_prog_file(model_file)
    config.set_params_file(params_file)
    # enable GPU prediction: initialize 50 MB of GPU memory on device 0
    config.enable_use_gpu(50, 0)
    predictor = inference.create_predictor(config)
    return predictor
def _set_config(self):
    """
    predictor config setting.
    """
    # create default cpu predictor
    cpu_config = Config(self.default_pretrained_model_path)
    cpu_config.disable_glog_info()
    cpu_config.disable_gpu()
    self.cpu_predictor = create_predictor(cpu_config)

    # create predictors using various types of devices

    # npu
    npu_id = self._get_device_id("FLAGS_selected_npus")
    if npu_id != -1:
        # use npu
        npu_config = Config(self.default_pretrained_model_path)
        npu_config.disable_glog_info()
        npu_config.enable_npu(device_id=npu_id)
        self.npu_predictor = create_predictor(npu_config)

    # gpu
    gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES")
    if gpu_id != -1:
        # use gpu
        gpu_config = Config(self.default_pretrained_model_path)
        gpu_config.disable_glog_info()
        gpu_config.enable_use_gpu(
            memory_pool_init_size_mb=1000, device_id=gpu_id)
        self.gpu_predictor = create_predictor(gpu_config)

    # xpu
    xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES")
    if xpu_id != -1:
        # use xpu
        xpu_config = Config(self.default_pretrained_model_path)
        xpu_config.disable_glog_info()
        xpu_config.enable_xpu(100)
        self.xpu_predictor = create_predictor(xpu_config)
def __init__(self, args):
    """
    Prepare for prediction.
    For the usage and docs of Paddle Inference, please refer to
    https://paddleinference.paddlepaddle.org.cn/product_introduction/summary.html
    """
    self.args = args
    self.cfg = DeployConfig(args.cfg)

    self._init_base_config()
    self._init_cpu_config()

    self.predictor = create_predictor(self.pred_cfg)
def init_predictor(args):
    if args.model_dir:
        config = Config(args.model_dir)
    else:
        config = Config(args.model_file, args.params_file)
    if args.use_gpu:
        config.enable_use_gpu(1000, 0)
    else:
        config.disable_gpu()
    print(config)
    # config.delete('repeated_fc_relu_fuse_pass')
    predictor = create_predictor(config)
    return predictor
def infer(args):
    model_name = 'plato-xl'
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    context = [
        "Hi , Becky , what's up ?",
        "Not much , except that my mother-in-law is driving me up the wall .",
        "What's the problem ?"
    ]

    data = tokenizer.dialogue_encode(
        history=context,
        add_start_token_as_response=True,
        return_length=True,
        return_role_ids=args.use_role,
        position_style=args.position_style)

    # Load FasterTransformer lib.
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(args.inference_model_dir + "plato.pdmodel",
                                 args.inference_model_dir + "plato.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            input_handles[name].copy_from_cpu(
                np.expand_dims(
                    np.asarray(data[name], dtype="float32"), axis=(0, 1)))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print(" ".join(postprocess_response(sample, tokenizer)))
def init_predictor(args):
    config = Config(
        os.path.join(args.model_dir, "inference.pdmodel"),
        os.path.join(args.model_dir, "inference.pdiparams"))
    config.enable_memory_optim()
    if args.use_gpu:
        config.enable_use_gpu(1000, 0)
    else:
        # If MKL-DNN is not enabled, you can still set the BLAS thread num here.
        # The thread num should not be greater than the number of cores in the CPU.
        config.set_cpu_math_library_num_threads(4)
    predictor = create_predictor(config)
    return predictor
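# Hedged companion sketch (assumed, not from the source): push a list of numpy
# arrays through the predictor returned by init_predictor above, one array per
# model input, and collect every output.
def run(predictor, img):
    input_names = predictor.get_input_names()
    for i, name in enumerate(input_names):
        input_handle = predictor.get_input_handle(name)
        input_handle.reshape(img[i].shape)
        input_handle.copy_from_cpu(img[i])
    predictor.run()
    results = []
    for name in predictor.get_output_names():
        output_handle = predictor.get_output_handle(name)
        results.append(output_handle.copy_to_cpu())
    return results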