Example No. 1
 def create_inference_config(self, use_trt=True) -> paddle_infer.Config:
     if use_trt:
         config = paddle_infer.Config()
         config.disable_glog_info()
         config.enable_use_gpu(100, 0)
         config.set_optim_cache_dir(self.cache_dir)
         config.switch_ir_debug()
         config.enable_tensorrt_engine(
             max_batch_size=self.trt_param.max_batch_size,
             workspace_size=self.trt_param.workspace_size,
             min_subgraph_size=self.trt_param.min_subgraph_size,
             precision_mode=self.trt_param.precision,
             use_static=self.trt_param.use_static,
             use_calib_mode=self.trt_param.use_calib_mode)
         if (len(self.dynamic_shape.min_input_shape) != 0
                 and self.dynamic_shape.min_input_shape.keys() ==
                 self.dynamic_shape.max_input_shape.keys()
                 and self.dynamic_shape.min_input_shape.keys() ==
                 self.dynamic_shape.opt_input_shape.keys()):
             config.set_trt_dynamic_shape_info(
                 self.dynamic_shape.min_input_shape,
                 self.dynamic_shape.max_input_shape,
                 self.dynamic_shape.opt_input_shape,
                 self.dynamic_shape.disable_trt_plugin_fp16)
         return config
     else:
         config = paddle_infer.Config()
         config.switch_ir_debug(True)
         config.set_optim_cache_dir(self.cache_dir)
         config.disable_glog_info()
         return config
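The check above only configures dynamic shapes when the three dictionaries share exactly the same tensor names. A minimal sketch of such dictionaries, where the tensor name "x" and the shapes are assumptions rather than values from the example:

# Illustrative dynamic-shape setup; all three dicts must use the same keys ("x" is assumed).
min_input_shape = {"x": [1, 3, 112, 112]}    # smallest shape the TensorRT engine must accept
max_input_shape = {"x": [1, 3, 1024, 1024]}  # largest shape the engine must accept
opt_input_shape = {"x": [1, 3, 224, 224]}    # shape the engine is tuned for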
Example No. 2
 def create_trt_inference_config(self) -> paddle_infer.Config:
     config = paddle_infer.Config()
     config.disable_glog_info()
     config.enable_use_gpu(100, 0)
     config.set_optim_cache_dir(self.cache_dir)
     config.switch_ir_debug()
     return config
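Despite its name, create_trt_inference_config above never calls enable_tensorrt_engine, so TensorRT itself stays disabled. A hedged sketch of the extra call such a config would typically need; the parameter values below are assumptions, not taken from the original test class:

# Sketch only: illustrative TensorRT settings added on top of the GPU config above.
import paddle.inference as paddle_infer

config = paddle_infer.Config()
config.disable_glog_info()
config.enable_use_gpu(100, 0)
config.enable_tensorrt_engine(
    workspace_size=1 << 30,      # 1 GB scratch space for TensorRT
    max_batch_size=4,
    min_subgraph_size=3,
    precision_mode=paddle_infer.PrecisionType.Float32,
    use_static=False,
    use_calib_mode=False)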
Example No. 3
    def create_predictor(cls, args, config=None):
        if config is None:
            config = inference.Config(
                os.path.join(args.inference_model_dir, "transformer.pdmodel"),
                os.path.join(args.inference_model_dir,
                             "transformer.pdiparams"))
            if args.use_gpu:
                config.enable_use_gpu(100, 0)
            elif args.use_xpu:
                config.enable_xpu(100)
            else:
                # CPU
                # such as enable_mkldnn, set_cpu_math_library_num_threads
                config.disable_gpu()
            # Use ZeroCopy.
            config.switch_use_feed_fetch_ops(False)

        predictor = inference.create_predictor(config)
        input_handles = [
            predictor.get_input_handle(name)
            for name in predictor.get_input_names()
        ]
        output_handles = [
            predictor.get_output_handle(name)
            for name in predictor.get_output_names()
        ]
        return cls(predictor, input_handles, output_handles)
Example No. 4
    def get_truth_val_by_inference(self):
        try:
            import paddle.inference as paddle_infer
        except ImportError:
            # when paddle is not installed, return directly
            return
        data = np.array(
            [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795,
             -0.0332]).astype("float32")[np.newaxis, :]
        input_dict = {"x": data}

        pd_config = paddle_infer.Config("uci_housing_model/")
        pd_config.disable_gpu()
        pd_config.switch_ir_optim(False)

        predictor = paddle_infer.create_predictor(pd_config)

        input_names = predictor.get_input_names()
        for i, input_name in enumerate(input_names):
            input_handle = predictor.get_input_handle(input_name)
            input_handle.copy_from_cpu(input_dict[input_name])

        predictor.run()

        output_data_dict = {}
        output_names = predictor.get_output_names()
        for _, output_data_name in enumerate(output_names):
            output_handle = predictor.get_output_handle(output_data_name)
            output_data = output_handle.copy_to_cpu()
            output_data_dict[output_data_name] = output_data
        # convert to the same format of Serving output
        output_data_dict["prob"] = output_data_dict["fc_0.tmp_1"]
        del output_data_dict["fc_0.tmp_1"]
        self.truth_val = output_data_dict
Example No. 5
def paddle_inference(args):
    import paddle.inference as paddle_infer

    config = paddle_infer.Config(args.model_file, args.params_file)
    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])

    img = cv2.imread(args.image_path)
    # normalize to mean 0.5, std 0.5
    img = (img - 127.5) * 0.00784313725
    # BGR2RGB
    img = img[:, :, ::-1]
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, 0)
    img = img.astype('float32')

    input_handle.copy_from_cpu(img)

    predictor.run()

    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])
    output_data = output_handle.copy_to_cpu()

    print('paddle inference result: ', output_data.shape)
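A hedged driver for paddle_inference above; the flag names mirror how the function reads args.model_file, args.params_file and args.image_path, but the entry point itself is an assumption:

# Illustrative entry point, not part of the original script.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_file', required=True)
    parser.add_argument('--params_file', required=True)
    parser.add_argument('--image_path', required=True)
    paddle_inference(parser.parse_args())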
Example No. 6
 def __init__(self, model_path, param_path, use_gpu=False):
     model_path, param_path = self.check_param(model_path, param_path)
     try:
         config = paddle_infer.Config(model_path, param_path)
     except Exception:
         raise ValueError(
             "Model and parameters do not match; please check whether the model and parameter files were loaded correctly")
     if not use_gpu:
         config.enable_mkldnn()
          # TODO: fluid is being deprecated; find another way to do this check
         # if paddle.fluid.core.supports_bfloat16():
         #     config.enable_mkldnn_bfloat16()
         config.switch_ir_optim(True)
         config.set_cpu_math_library_num_threads(10)
     else:
         config.enable_use_gpu(500, 0)
         config.delete_pass("conv_elementwise_add_act_fuse_pass")
         config.delete_pass("conv_elementwise_add2_act_fuse_pass")
         config.delete_pass("conv_elementwise_add_fuse_pass")
         config.switch_ir_optim()
         config.enable_memory_optim()
          # use_tensorrt = False  # TODO: using TensorRT currently raises errors on Linux and Windows
          # if use_tensorrt:
         #     config.enable_tensorrt_engine(
         #         workspace_size=1 << 30,
         #         precision_mode=paddle_infer.PrecisionType.Float32,
         #         max_batch_size=1,
         #         min_subgraph_size=5,
         #         use_static=False,
         #         use_calib_mode=False,
         #     )
     self.model = paddle_infer.create_predictor(config)
Example No. 7
    def load(self) -> bool:
        def get_model_files(ext: str) -> str:
            file_list = []
            for filename in os.listdir(model_path):
                if filename.endswith(ext):
                    file_list.append(filename)
            if len(file_list) == 0:
                raise Exception("Missing {} model file".format(ext))
            if len(file_list) > 1:
                raise Exception("More than one {} model file".format(ext))
            return os.path.join(model_path, file_list[0])

        model_path = kserve.Storage.download(self.model_dir)
        config = inference.Config(get_model_files('.pdmodel'),
                                  get_model_files('.pdiparams'))
        # TODO: add GPU support
        config.disable_gpu()

        self.predictor = inference.create_predictor(config)

        # TODO: add support for multiple input_names/output_names
        input_names = self.predictor.get_input_names()
        self.input_tensor = self.predictor.get_input_handle(input_names[0])
        output_names = self.predictor.get_output_names()
        self.output_tensor = self.predictor.get_output_handle(output_names[0])

        self.ready = True
        return self.ready
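load above only prepares the predictor and the two tensor handles. A minimal sketch of how one prediction might run with them; the list payload and float32 dtype are assumptions, not part of the original class:

# Sketch of using the handles prepared in load(); "model" is the loaded server object.
import numpy as np

def predict_once(model, instances):
    batch = np.asarray(instances, dtype="float32")
    model.input_tensor.reshape(batch.shape)
    model.input_tensor.copy_from_cpu(batch)
    model.predictor.run()
    return {"predictions": model.output_tensor.copy_to_cpu().tolist()}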
Example No. 8
def eval(args):
    model_file = os.path.join(args.model_path, args.model_filename)
    params_file = os.path.join(args.model_path, args.params_filename)
    config = paddle_infer.Config(model_file, params_file)
    config.enable_mkldnn()

    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])

    val_dataset = dataset.ImageNetDataset(data_dir=args.data_dir, mode='val')
    eval_loader = paddle.io.DataLoader(
        val_dataset, batch_size=args.batch_size, drop_last=True)

    cost_time = 0.
    total_num = 0.
    correct_1_num = 0
    correct_5_num = 0
    for batch_id, data in enumerate(eval_loader()):
        img_np = np.array([tensor.numpy() for tensor in data[0]])
        label_np = np.array([tensor.numpy() for tensor in data[1]])

        input_handle.reshape(img_np.shape)
        input_handle.copy_from_cpu(img_np)

        t1 = time.time()
        predictor.run()
        t2 = time.time()
        cost_time += (t2 - t1)

        output_data = output_handle.copy_to_cpu()

        for i in range(len(label_np)):
            label = label_np[i][0]
            result = output_data[i, :]
            index = result.argsort()
            total_num += 1
            if index[-1] == label:
                correct_1_num += 1
            if label in index[-5:]:
                correct_5_num += 1

        if batch_id % 10 == 0:
            acc1 = correct_1_num / total_num
            acc5 = correct_5_num / total_num
            avg_time = cost_time / total_num
            print(
                "batch_id {}, acc1 {:.3f}, acc5 {:.3f}, avg time {:.5f} sec/img".
                format(batch_id, acc1, acc5, avg_time))

        if args.test_samples > 0 and \
            (batch_id + 1)* args.batch_size >= args.test_samples:
            break

    acc1 = correct_1_num / total_num
    acc5 = correct_5_num / total_num
    print("End test: test_acc1 {:.3f}, test_acc5 {:.5f}".format(acc1, acc5))
Example No. 9
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        logger.info("not find model file path {}".format(model_file_path))
        sys.exit(0)
    if not os.path.exists(params_file_path):
        logger.info("not find params file path {}".format(params_file_path))
        sys.exit(0)

    config = inference.Config(model_file_path, params_file_path)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=inference.PrecisionType.Half
                if args.use_fp16 else inference.PrecisionType.Float32,
                max_batch_size=args.max_batch_size)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(6)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()
            # TODO LDOUBLEV: fix mkldnn bug when batch_size > 1
            #config.set_mkldnn_op({'conv2d', 'depthwise_conv2d', 'pool2d', 'batch_norm'})
            args.rec_batch_num = 1

    # enable memory optim
    config.enable_memory_optim()
    config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    config.switch_use_feed_fetch_ops(False)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors
Example No. 10
    def create_predictor(cls,
                         args,
                         config=None,
                         profile=False,
                         model_name=None):
        if config is None:
            config = inference.Config(
                os.path.join(args.inference_model_dir, "transformer.pdmodel"),
                os.path.join(args.inference_model_dir,
                             "transformer.pdiparams"))
            if args.device == "gpu":
                config.enable_use_gpu(100, 0)
            elif args.device == "xpu":
                config.enable_xpu(100)
            else:
                # CPU
                config.disable_gpu()
                if args.use_mkl:
                    config.enable_mkldnn()
                    config.set_cpu_math_library_num_threads(args.threads)
            # Use ZeroCopy.
            config.switch_use_feed_fetch_ops(False)

        if profile:
            if args.mod is recorder:
                autolog = args.mod.Recorder(config, args.infer_batch_size,
                                            args.model_name)
            else:
                pid = os.getpid()
                autolog = args.mod.AutoLogger(
                    model_name=args.model_name,
                    model_precision="fp32",
                    batch_size=args.infer_batch_size,
                    save_path=args.save_log_path,
                    inference_config=config,
                    data_shape="dynamic",
                    pids=pid,
                    process_name=None,
                    gpu_ids=0 if args.device == "gpu" else None,
                    time_keys=[
                        'preprocess_time', 'inference_time', 'postprocess_time'
                    ],
                    warmup=0,
                    logger=logger)
        else:
            autolog = None

        predictor = inference.create_predictor(config)
        input_handles = [
            predictor.get_input_handle(name)
            for name in predictor.get_input_names()
        ]
        output_handles = [
            predictor.get_output_handle(name)
            for name in predictor.get_output_names()
        ]
        return cls(predictor, input_handles, output_handles, autolog)
Example No. 11
 def init_resnet50_predictor(model_dir):
     model_file = model_dir + '.pdmodel'
     params_file = model_dir + '.pdiparams'
     config = inference.Config()
     config.set_prog_file(model_file)
     config.set_params_file(params_file)
     config.use_gpu()
     config.enable_use_gpu(500, 0)
     predictor = inference.create_predictor(config)
     return predictor
Example No. 12
def init_predictor(model_dir):
    # refer to https://paddle-inference.readthedocs.io/en/latest/api_reference/python_api_doc/Config/GPUConfig.html
    model_file = model_dir + '.pdmodel'
    params_file = model_dir + '.pdiparams'
    config = inference.Config()
    config.set_prog_file(model_file)
    config.set_params_file(params_file)
    # Enable GPU for prediction: initialize 50 MB of GPU memory on device ID 0
    config.enable_use_gpu(50, 0)
    predictor = inference.create_predictor(config)
    return predictor
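Examples No. 11 and 12 only build the predictor. A minimal run sketch, where the model prefix and the 1x3x224x224 float32 input are placeholders, not values from the examples:

# Usage sketch for the predictor returned by init_predictor above.
import numpy as np

predictor = init_predictor('path/to/model_prefix')  # placeholder prefix; '.pdmodel'/'.pdiparams' are appended
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
dummy = np.zeros((1, 3, 224, 224), dtype='float32')  # assumed input shape
input_handle.reshape(dummy.shape)
input_handle.copy_from_cpu(dummy)
predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print(output_handle.copy_to_cpu().shape)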
Example No. 13
def infer(args):
    model_name = 'plato-xl'
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    context = [
        "Hi , Becky , what's up ?",
        "Not much , except that my mother-in-law is driving me up the wall .",
        "What's the problem ?"
    ]

    data = tokenizer.dialogue_encode(
        history=context,
        add_start_token_as_response=True,
        return_length=True,
        return_role_ids=args.use_role,
        position_style=args.position_style)

    # Load FasterTransformer lib. 
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(args.inference_model_dir + "plato.pdmodel",
                                 args.inference_model_dir + "plato.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            input_handles[name].copy_from_cpu(
                np.expand_dims(
                    np.asarray(
                        data[name], dtype="float32"), axis=(0, 1)))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(
                    data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print(" ".join(postprocess_response(sample, tokenizer)))
Example No. 14
    def get(self):
        # Create predictor, if one doesn't exist, when inference is run
        if not self._predictor:
            # If model isn't saved, save model to a temp dir
            # because predictor init requires the path to a saved model
            if self._model_path is None:
                self._model_path = tempfile.TemporaryDirectory().name
                self._save(self._model_path)

            config = paddle_infer.Config(self._model_path + ".pdmodel",
                                         self._model_path + ".pdiparams")
            config.enable_memory_optim()
            predictor = paddle_infer.create_predictor(config)
            self._predictor = predictor
        return self._predictor
Example No. 15
 def create_inference_config(self,
                             passes: Optional[List[str]] = None,
                             use_gpu: bool = False,
                             use_mkldnn: bool = False,
                             ir_optim: Optional[bool] = None):
     config = paddle_infer.Config()
     config.switch_ir_debug(True)
     config.disable_glog_info()
     if ir_optim is not None:
         config.switch_ir_optim(ir_optim)
     if use_gpu:
         config.enable_use_gpu(100, 0)
     if use_mkldnn:
         config.enable_mkldnn()
     if passes is not None:
         config.pass_builder().set_passes(passes)
     return config
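A brief usage sketch for the helper above, written as a hypothetical test method that restricts the IR pass list; 'conv_bn_fuse_pass' is only an illustrative pass name:

# Sketch: a hypothetical test method using create_inference_config to run a single named IR pass.
def test_single_pass_sketch(self):
    config = self.create_inference_config(
        passes=['conv_bn_fuse_pass'], use_mkldnn=True, ir_optim=True)
    return config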
Example No. 16
def create_predictor(args, mode, logger):
    # if mode == "det":
    #     model_dir = args[]
    # elif mode == 'cls':
    #     model_dir = args.cls_model_dir
    # else:
    #     model_dir = args.rec_model_dir
    model_dir = args['model_dir']
    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        logger.info("not find model file path {}".format(model_file_path))
        sys.exit(0)
    if not os.path.exists(params_file_path):
        logger.info("not find params file path {}".format(params_file_path))
        sys.exit(0)

    config = inference.Config(model_file_path, params_file_path)

    if args['use_gpu']:
        config.enable_use_gpu(8000, 0)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(6)

    # config.enable_memory_optim()
    config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    config.switch_use_feed_fetch_ops(False)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors
Example No. 17
def init_paddle_inference_config(args):
    import paddle.inference as paddle_infer
    config = paddle_infer.Config(args.model_file, args.params_file)
    if hasattr(args, 'precision'):
        if args.precision == "fp16" and args.use_tensorrt:
            precision = paddle_infer.PrecisionType.Half
        elif args.precision == "int8":
            precision = paddle_infer.PrecisionType.Int8
        else:
            precision = paddle_infer.PrecisionType.Float32
    else:
        precision = paddle_infer.PrecisionType.Float32

    if args.use_gpu:
        gpu_id = get_infer_gpuid()
        if gpu_id is None:
            raise ValueError(
                "Not found GPU in current device. Please check your device or set args.use_gpu as False"
            )
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=precision,
                max_batch_size=args.max_batch_size,
                min_subgraph_size=args.min_subgraph_size)
            # skip the minimum TRT subgraph
            min_input_shape = {"x": [1, 3, 10, 10]}
            max_input_shape = {"x": [1, 3, 1000, 1000]}
            opt_input_shape = {"x": [1, 3, 112, 112]}
            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                            opt_input_shape)

    else:
        config.disable_gpu()
        cpu_threads = args.cpu_threads if hasattr(args, "cpu_threads") else 10
        config.set_cpu_math_library_num_threads(cpu_threads)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.enable_mkldnn()
            config.set_mkldnn_cache_capacity(10)
            if args.precision == "fp16":
                config.enable_mkldnn_bfloat16()
    return config
Example No. 18
def infer(args):
    model_name = 'unimo-text-1.0-lcsts-new'
    tokenizer = UNIMOTokenizer.from_pretrained(model_name)

    inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。"

    data = tokenizer.gen_encode(inputs,
                                add_start_token_for_decoding=True,
                                return_length=True,
                                is_split_into_words=False)

    # Load FasterTransformer lib.
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(
        args.inference_model_dir + "unimo_text.pdmodel",
        args.inference_model_dir + "unimo_text.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            input_handles[name].copy_from_cpu(
                np.expand_dims(np.asarray(data[name], dtype="float32"),
                               axis=(0, 1)))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print("".join(postprocess_response(sample, tokenizer)))
Example No. 19
    def load_predictor(self, model_file_path, params_file_path):
        """load_predictor

        initialize the inference engine

        Args:
            model_file_path: inference model path (*.pdmodel)
            params_file_path: inference parameter path (*.pdiparams)
        Return:
            predictor: Predictor created using Paddle Inference.
            config: Configuration of the predictor.
            input_tensor: Input tensor of the predictor.
            output_tensor: Output tensor of the predictor.
        """
        args = self.args
        config = inference.Config(model_file_path, params_file_path)
        if args.use_gpu:
            config.enable_use_gpu(1000, 0)
        else:
            config.disable_gpu()
            # The thread num should not be greater than the number of cores in the CPU.
            config.set_cpu_math_library_num_threads(4)

        # enable memory optim
        config.enable_memory_optim()
        config.disable_glog_info()

        config.switch_use_feed_fetch_ops(False)
        config.switch_ir_optim(True)

        # create predictor
        predictor = inference.create_predictor(config)

        # get input and output tensor property
        input_names = predictor.get_input_names()
        input_tensor = predictor.get_input_handle(input_names[0])

        output_names = predictor.get_output_names()
        output_tensor = predictor.get_output_handle(output_names[0])

        return predictor, config, input_tensor, output_tensor
Example No. 20
    def create_predictor(cls,
                         args,
                         config=None,
                         profile=False,
                         model_name=None):
        if config is None:
            config = inference.Config(
                os.path.join(args.inference_model_dir, "transformer.pdmodel"),
                os.path.join(args.inference_model_dir,
                             "transformer.pdiparams"))
            if args.device == "gpu":
                config.enable_use_gpu(100, 0)
            elif args.device == "xpu":
                config.enable_xpu(100)
            else:
                # CPU
                config.disable_gpu()
                if args.use_mkl:
                    config.enable_mkldnn()
                    config.set_cpu_math_library_num_threads(args.threads)
            # Use ZeroCopy.
            config.switch_use_feed_fetch_ops(False)

        if profile:
            recorder = Recorder(config, args.infer_batch_size, model_name)
        else:
            recorder = None

        predictor = inference.create_predictor(config)
        input_handles = [
            predictor.get_input_handle(name)
            for name in predictor.get_input_names()
        ]
        output_handles = [
            predictor.get_output_handle(name)
            for name in predictor.get_output_names()
        ]
        return cls(predictor, input_handles, output_handles, recorder)
Example No. 21
def eval():
    # create predictor
    model_file = os.path.join(FLAGS.model_path, FLAGS.model_filename)
    params_file = os.path.join(FLAGS.model_path, FLAGS.params_filename)
    config = paddle_infer.Config(model_file, params_file)
    if FLAGS.use_gpu:
        config.enable_use_gpu(1000, 0)
    if not FLAGS.ir_optim:
        config.switch_ir_optim(False)

    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])

    # prepare data
    val_dataset = ImageNetValDataset(FLAGS.data_dir)
    eval_loader = paddle.io.DataLoader(val_dataset,
                                       batch_size=FLAGS.batch_size,
                                       num_workers=5)

    cost_time = 0.
    total_num = 0.
    correct_1_num = 0
    correct_5_num = 0
    for batch_id, data in enumerate(eval_loader()):
        # set input
        img_np = np.array([tensor.numpy() for tensor in data[0]])
        label_np = np.array([tensor.numpy() for tensor in data[1]])

        input_handle.reshape(img_np.shape)
        input_handle.copy_from_cpu(img_np)

        # run
        t1 = time.time()
        predictor.run()
        t2 = time.time()
        cost_time += (t2 - t1)

        output_data = output_handle.copy_to_cpu()

        # calculate accuracy
        for i in range(len(label_np)):
            label = label_np[i][0]
            result = output_data[i, :]
            index = result.argsort()
            total_num += 1
            if index[-1] == label:
                correct_1_num += 1
            if label in index[-5:]:
                correct_5_num += 1

        if batch_id % 10 == 0:
            acc1 = correct_1_num / total_num
            acc5 = correct_5_num / total_num
            avg_time = cost_time / total_num
            print(
                "batch_id {}, acc1 {:.3f}, acc5 {:.3f}, avg time {:.5f} sec/img"
                .format(batch_id, acc1, acc5, avg_time))

        if FLAGS.test_samples > 0 and \
            (batch_id + 1)* FLAGS.batch_size >= FLAGS.test_samples:
            break

    acc1 = correct_1_num / total_num
    acc5 = correct_5_num / total_num
    avg_time = cost_time / total_num
    print("End test: test image {}".format(total_num))
    print("test_acc1 {:.4f}, test_acc5 {:.4f}, avg time {:.5f} sec/img".format(
        acc1, acc5, avg_time))
    print("\n")
Example No. 22
    def load_model_config(self,
                          model_path,
                          use_gpu=False,
                          gpu_id=0,
                          use_profile=False,
                          thread_num=1,
                          mem_optim=True,
                          ir_optim=False,
                          use_trt=False,
                          use_lite=False,
                          use_xpu=False,
                          precision="fp32",
                          use_mkldnn=False,
                          mkldnn_cache_capacity=0,
                          mkldnn_op_list=None,
                          mkldnn_bf16_op_list=None,
                          use_feed_fetch_ops=False,
                          use_ascend_cl=False,
                          min_subgraph_size=3,
                          dynamic_shape_info={},
                          use_calib=False):
        """
        Load model configs and create the paddle predictor by Paddle Inference API.
   
        Args:
            model_path: model config path.
            use_gpu: calculating with gpu, False default.
            gpu_id: gpu id, 0 default.
            use_profile: use predictor profiles, False default.
            thread_num: number of threads for the CPU math library, 1 default.
            mem_optim: memory optimization, True default.
            ir_optim: enable computation graph optimization, False default.
            use_trt: use NVIDIA TensorRT optimization, False default.
            use_lite: use Paddle-Lite engine, False default.
            use_xpu: run prediction on Baidu Kunlun, False default.
            precision: precision mode, "fp32" default.
            use_mkldnn: use MKLDNN, False default.
            mkldnn_cache_capacity: cache capacity for input shapes, 0 default.
            mkldnn_op_list: op list accelerated using MKLDNN, None default.
            mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
            use_feed_fetch_ops: use feed/fetch ops, False default.
            use_ascend_cl: run predict on Huawei Ascend, False default
            min_subgraph_size: the minimal subgraph size for enabling TensorRT optimization, 3 default.
            dynamic_shape_info: dict including min_input_shape, max_input_shape and opt_input_shape, {} default.
            use_calib: use TensorRT calibration, False default
        """
        gpu_id = int(gpu_id)
        client_config = "{}/serving_server_conf.prototxt".format(model_path)
        model_conf = m_config.GeneralModelConfig()
        f = open(client_config, 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)

        # Init paddle_infer config
        # Paddle's model files and parameter files have multiple naming rules:
        #   1) __model__, __params__
        #   2) *.pdmodel, *.pdiparams
        #   3) __model__, conv2d_1.w_0, conv2d_2.w_0, fc_1.w_0, conv2d_1.b_0, ... 
        pdmodel_file_list = self.search_suffix_files(model_path, "*.pdmodel")
        pdiparams_file_list = self.search_suffix_files(model_path,
                                                       "*.pdiparams")
        if os.path.exists(os.path.join(model_path, "__params__")):
            # case 1) initializing
            config = paddle_infer.Config(
                os.path.join(model_path, "__model__"),
                os.path.join(model_path, "__params__"))
        elif pdmodel_file_list and len(
                pdmodel_file_list) > 0 and pdiparams_file_list and len(
                    pdiparams_file_list) > 0:
            # case 2) initializing
            logger.info("pdmodel_file_list:{}, pdiparams_file_list:{}".format(
                pdmodel_file_list, pdiparams_file_list))
            config = paddle_infer.Config(pdmodel_file_list[0],
                                         pdiparams_file_list[0])
        else:
            # case 3) initializing.
            config = paddle_infer.Config(model_path)

        logger.info(
            "LocalPredictor load_model_config params: model_path:{}, use_gpu:{}, "
            "gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
            "use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
            "use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
            "mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, "
            "use_ascend_cl:{}, min_subgraph_size:{}, dynamic_shape_info:{}".
            format(model_path, use_gpu, gpu_id, use_profile, thread_num,
                   mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
                   use_calib, use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
                   mkldnn_bf16_op_list, use_feed_fetch_ops, use_ascend_cl,
                   min_subgraph_size, dynamic_shape_info))

        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
        self.feed_names_to_idx_ = {}
        self.fetch_names_to_idx_ = {}

        for i, var in enumerate(model_conf.feed_var):
            self.feed_names_to_idx_[var.alias_name] = i
            self.feed_types_[var.alias_name] = var.feed_type
            self.feed_shapes_[var.alias_name] = var.shape

        for i, var in enumerate(model_conf.fetch_var):
            self.fetch_names_to_idx_[var.alias_name] = i
            self.fetch_types_[var.alias_name] = var.fetch_type
            self.fetch_names_to_type_[var.alias_name] = var.shape

        # set precision of inference.
        precision_type = paddle_infer.PrecisionType.Float32
        if precision is not None and precision.lower() in precision_map:
            precision_type = precision_map[precision.lower()]
        else:
            logger.warning("precision error!!! Please check precision:{}".
                           format(precision))
        # set profile
        if use_profile:
            config.enable_profile()
        # set memory optimization
        if mem_optim:
            config.enable_memory_optim()
        # set ir optimization, threads of cpu math library
        config.switch_ir_optim(ir_optim)
        # use feed & fetch ops
        config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
        # pass optim
        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")

        # set cpu & mkldnn
        config.set_cpu_math_library_num_threads(thread_num)
        if use_mkldnn:
            config.enable_mkldnn()
            if precision is not None and precision.lower() == "bf16":
                config.enable_mkldnn_bfloat16()
            if mkldnn_cache_capacity > 0:
                config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
            if mkldnn_op_list is not None:
                config.set_mkldnn_op(mkldnn_op_list)
        # set gpu
        if not use_gpu:
            config.disable_gpu()
        else:
            config.enable_use_gpu(100, gpu_id)
            if use_trt:
                config.enable_tensorrt_engine(
                    precision_mode=precision_type,
                    workspace_size=1 << 20,
                    max_batch_size=32,
                    min_subgraph_size=min_subgraph_size,
                    use_static=False,
                    use_calib_mode=use_calib)

                @ErrorCatch
                @ParamChecker
                def dynamic_shape_info_helper(dynamic_shape_info:lambda dynamic_shape_info: check_dynamic_shape_info(dynamic_shape_info)):
                    pass
                _, resp = dynamic_shape_info_helper(dynamic_shape_info)
                if resp.err_no != CustomExceptionCode.OK.value:
                    print("dynamic_shape_info configure error, it should contain [min_input_shape', 'max_input_shape', 'opt_input_shape' {}".format(resp.err_msg))
                    kill_stop_process_by_pid("kill", os.getpgid(os.getpid()))

                if len(dynamic_shape_info):
                    config.set_trt_dynamic_shape_info(
                        dynamic_shape_info['min_input_shape'],
                        dynamic_shape_info['max_input_shape'],
                        dynamic_shape_info['opt_input_shape'])
        # set lite
        if use_lite:
            config.enable_lite_engine(
                precision_mode=precision_type,
                zero_copy=True,
                passes_filter=[],
                ops_filter=[])
            config.switch_ir_optim(True)
        # set xpu
        if use_xpu:
            # 8 MB L3 cache
            config.enable_xpu(8 * 1024 * 1024)
            config.set_xpu_device_id(gpu_id)
        # set ascend cl
        if use_ascend_cl:
            if use_lite:
                # for ascend 310
                nnadapter_device_names = "huawei_ascend_npu"
                nnadapter_context_properties = \
                    "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={}".format(gpu_id)
                nnadapter_model_cache_dir = ""
                config.nnadapter() \
                .enable() \
                .set_device_names([nnadapter_device_names]) \
                .set_context_properties(nnadapter_context_properties) \
                .set_model_cache_dir(nnadapter_model_cache_dir)
            else:
                # for ascend 910
                config.enable_npu(gpu_id)
        # set cpu low precision
        if not use_gpu and not use_lite:
            if precision_type == paddle_infer.PrecisionType.Int8:
                logger.warning(
                    "PRECISION INT8 is not supported in CPU right now! Please use fp16 or bf16."
                )
                #config.enable_quantizer()
            if precision is not None and precision.lower() == "bf16":
                config.enable_mkldnn_bfloat16()
                if mkldnn_bf16_op_list is not None:
                    config.set_bfloat16_op(mkldnn_bf16_op_list)

        @ErrorCatch
        def create_predictor_check(config):
            predictor = paddle_infer.create_predictor(config)
            return predictor
        predictor, resp = create_predictor_check(config)
        if resp.err_no != CustomExceptionCode.OK.value:
            logger.critical(
                "failed to create predictor: {}".format(resp.err_msg),
                exc_info=False)
            print("failed to create predictor: {}".format(resp.err_msg))
            kill_stop_process_by_pid("kill", os.getpgid(os.getpid()))
        self.predictor = predictor
Example No. 23
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    elif mode == 'table':
        model_dir = args.table_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    if args.use_onnx:
        import onnxruntime as ort
        model_file_path = model_dir
        if not os.path.exists(model_file_path):
            raise ValueError(
                "not find model file path {}".format(model_file_path))
        sess = ort.InferenceSession(model_file_path)
        return sess, sess.get_inputs()[0], None, None

    else:
        model_file_path = model_dir + "/inference.pdmodel"
        params_file_path = model_dir + "/inference.pdiparams"
        if not os.path.exists(model_file_path):
            raise ValueError(
                "not find model file path {}".format(model_file_path))
        if not os.path.exists(params_file_path):
            raise ValueError(
                "not find params file path {}".format(params_file_path))

        config = inference.Config(model_file_path, params_file_path)

        if hasattr(args, 'precision'):
            if args.precision == "fp16" and args.use_tensorrt:
                precision = inference.PrecisionType.Half
            elif args.precision == "int8":
                precision = inference.PrecisionType.Int8
            else:
                precision = inference.PrecisionType.Float32
        else:
            precision = inference.PrecisionType.Float32

        if args.use_gpu:
            gpu_id = get_infer_gpuid()
            if gpu_id is None:
                logger.warning(
                    "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jeston."
                )
            config.enable_use_gpu(args.gpu_mem, 0)
            if args.use_tensorrt:
                config.enable_tensorrt_engine(
                    workspace_size=1 << 30,
                    precision_mode=precision,
                    max_batch_size=args.max_batch_size,
                    min_subgraph_size=args.min_subgraph_size)
                # skip the minimum TRT subgraph
            use_dynamic_shape = True
            if mode == "det":
                min_input_shape = {
                    "x": [1, 3, 50, 50],
                    "conv2d_92.tmp_0": [1, 120, 20, 20],
                    "conv2d_91.tmp_0": [1, 24, 10, 10],
                    "conv2d_59.tmp_0": [1, 96, 20, 20],
                    "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
                    "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
                    "conv2d_124.tmp_0": [1, 256, 20, 20],
                    "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
                    "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
                    "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
                    "elementwise_add_7": [1, 56, 2, 2],
                    "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
                }
                max_input_shape = {
                    "x": [1, 3, 1536, 1536],
                    "conv2d_92.tmp_0": [1, 120, 400, 400],
                    "conv2d_91.tmp_0": [1, 24, 200, 200],
                    "conv2d_59.tmp_0": [1, 96, 400, 400],
                    "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
                    "conv2d_124.tmp_0": [1, 256, 400, 400],
                    "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
                    "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
                    "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
                    "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
                    "elementwise_add_7": [1, 56, 400, 400],
                    "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
                }
                opt_input_shape = {
                    "x": [1, 3, 640, 640],
                    "conv2d_92.tmp_0": [1, 120, 160, 160],
                    "conv2d_91.tmp_0": [1, 24, 80, 80],
                    "conv2d_59.tmp_0": [1, 96, 160, 160],
                    "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
                    "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
                    "conv2d_124.tmp_0": [1, 256, 160, 160],
                    "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
                    "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
                    "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
                    "elementwise_add_7": [1, 56, 40, 40],
                    "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
                }
                min_pact_shape = {
                    "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
                    "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
                    "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
                    "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
                }
                max_pact_shape = {
                    "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
                    "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
                    "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
                    "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
                }
                opt_pact_shape = {
                    "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
                    "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
                    "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
                    "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
                }
                min_input_shape.update(min_pact_shape)
                max_input_shape.update(max_pact_shape)
                opt_input_shape.update(opt_pact_shape)
            elif mode == "rec":
                if args.rec_algorithm != "CRNN":
                    use_dynamic_shape = False
                min_input_shape = {"x": [1, 3, 32, 10]}
                max_input_shape = {"x": [args.rec_batch_num, 3, 32, 1536]}
                opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
            elif mode == "cls":
                min_input_shape = {"x": [1, 3, 48, 10]}
                max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}
                opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
            else:
                use_dynamic_shape = False
            if use_dynamic_shape:
                config.set_trt_dynamic_shape_info(min_input_shape,
                                                  max_input_shape,
                                                  opt_input_shape)

        else:
            config.disable_gpu()
            if hasattr(args, "cpu_threads"):
                config.set_cpu_math_library_num_threads(args.cpu_threads)
            else:
                # default cpu threads as 10
                config.set_cpu_math_library_num_threads(10)
            if args.enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if args.precision == "fp16":
                    config.enable_mkldnn_bfloat16()
        # enable memory optim
        config.enable_memory_optim()
        config.disable_glog_info()

        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
        if mode == 'table':
            config.delete_pass("fc_fuse_pass")  # not supported for table
        config.switch_use_feed_fetch_ops(False)
        config.switch_ir_optim(True)

        # create predictor
        predictor = inference.create_predictor(config)
        input_names = predictor.get_input_names()
        for name in input_names:
            input_tensor = predictor.get_input_handle(name)
        output_names = predictor.get_output_names()
        output_tensors = []
        for output_name in output_names:
            output_tensor = predictor.get_output_handle(output_name)
            output_tensors.append(output_tensor)
        return predictor, input_tensor, output_tensors, config
Example No. 24
import os

import cv2
import numpy as np
import paddle.inference as pi

from licsber.cv import parse_img

CHANNEL, HEIGHT, WIDTH = (3, 34, 92)
CHAR_LIST = '12345678ABCDEFHKNPQXYZabcdefhknpxyz'

_now_path = os.path.dirname(__file__)
MODEL_PATH = os.path.join(_now_path, 'models', 'inference.pdmodel')
PARAMS_PATH = os.path.join(_now_path, 'models', 'inference.pdiparams')

_config = pi.Config(MODEL_PATH, PARAMS_PATH)
_predictor = pi.create_predictor(_config)

if CHANNEL == 1:
    def pre_process(img):
        _, binary = cv2.threshold(img, 0x70, 1, cv2.THRESH_BINARY)
        binary = binary[:, :, 0]
        return np.array(binary, dtype='float32').reshape((1, HEIGHT, WIDTH))
elif CHANNEL == 3:
    def pre_process(img):
        return np.array(img, dtype='float32').reshape([CHANNEL, HEIGHT, WIDTH]) / 255
else:
    print('error, cannot pre_process img like this.')


def ctc_decode(text, blank=len(CHAR_LIST)):
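The snippet is truncated at ctc_decode. A hedged sketch of a conventional greedy CTC collapse over per-step class indices, reusing CHAR_LIST from above; this is an assumed implementation, not the original body:

# Greedy CTC collapse sketch: drop repeated indices and blanks, then map indices to characters.
def ctc_decode_sketch(indices, blank=len(CHAR_LIST)):
    out = []
    prev = blank
    for idx in indices:
        # keep a character only when it differs from the previous step and is not the blank label
        if idx != prev and idx != blank:
            out.append(CHAR_LIST[idx])
        prev = idx
    return ''.join(out)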
Example No. 25
    def run_test(self, quant=False, *args, **kwargs):
        status = True
        run_flags = []
        for prog_config in self.sample_program_configs(*args, **kwargs):
            # In CI, only run 10% cases
            if np.random.rand() < self.num_percent_cases:
                run_flags.append(True)
            else:
                run_flags.append(False)

        for prog_config, run_flag in zip(
                self.sample_program_configs(*args, **kwargs), run_flags):
            if not run_flag:
                continue

            # if the program is invalid, we should skip that case.
            if not self.is_program_valid(prog_config):
                continue

            model, params = create_fake_model(prog_config)
            if quant:
                model, params = create_quant_model(model, params)

            feed_data = {}
            for name, tensor_config in prog_config.inputs.items():
                feed_data[name] = {
                    'data': tensor_config.data,
                    'lod': tensor_config.lod
                }

            results: List[Dict[str, np.ndarray]] = []

            # baseline: gpu run
            logging.info('RUN program_config: ' + str(prog_config))
            gpu_config = self.create_inference_config(use_trt=False)
            results.append(
                self.run_test_config(model, params, prog_config, gpu_config,
                                     feed_data))
            self.success_log('RUN_GPU_BASELINE done')

            for pred_config, nodes_num, threshold in self.sample_predictor_configs(
                    prog_config):

                if os.path.exists(self.cache_dir):
                    shutil.rmtree(self.cache_dir)

                if isinstance(threshold, float):
                    atol = threshold
                    rtol = 1e-8
                elif isinstance(threshold, list) or isinstance(
                        threshold, tuple):
                    atol = threshold[0]
                    rtol = threshold[1]
                else:
                    raise NotImplementedError

                if quant and pred_config.tensorrt_precision_mode(
                ) != paddle_infer.PrecisionType.Int8:
                    continue
                if pred_config.tensorrt_precision_mode(
                ) == paddle_infer.PrecisionType.Int8 and not quant:
                    continue

                ignore_flag = False
                for ignore_info in self.ignore_cases:
                    if ignore_info[0](prog_config, pred_config):
                        ignore_flag = True
                        if ignore_info[1] == IgnoreReasons.TRT_NOT_IMPLEMENTED:
                            self.ignore_log(
                                "[TRT_NOT_IMPLEMENTED] " + ignore_info[2] +
                                ' ' + ' vs ' +
                                self.inference_config_str(pred_config))
                        elif ignore_info[1] == IgnoreReasons.TRT_NOT_SUPPORT:
                            self.ignore_log(
                                "[TRT_NOT_SUPPORT] " + ignore_info[2] + ' ' +
                                ' vs ' +
                                self.inference_config_str(pred_config))
                        else:
                            raise NotImplementedError
                        break

                try:
                    pred_config_deserialize = paddle_infer.Config(pred_config)
                    results.append(
                        self.run_test_config(model, params, prog_config,
                                             pred_config, feed_data))
                    self.assert_tensors_near(atol, rtol, results[-1],
                                             results[0])
                    if not ignore_flag:
                        self.assert_op_size(nodes_num[0], nodes_num[1])
                    # deserialize test
                    if nodes_num[0] > 0:
                        self.run_test_config(model, params, prog_config,
                                             pred_config_deserialize,
                                             feed_data)
                except Exception as e:
                    self.fail_log(
                        self.inference_config_str(pred_config) +
                        '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
                    if not ignore_flag:
                        status = False
                    continue
                self.success_log('RUN predictor_config ' +
                                 self.inference_config_str(pred_config) +
                                 ' done')

        self.assertTrue(status)
Example No. 26
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    elif mode == 'table':
        model_dir = args.table_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        raise ValueError("not find model file path {}".format(model_file_path))
    if not os.path.exists(params_file_path):
        raise ValueError(
            "not find params file path {}".format(params_file_path))

    config = inference.Config(model_file_path, params_file_path)

    if hasattr(args, 'precision'):
        if args.precision == "fp16" and args.use_tensorrt:
            precision = inference.PrecisionType.Half
        elif args.precision == "int8":
            precision = inference.PrecisionType.Int8
        else:
            precision = inference.PrecisionType.Float32
    else:
        precision = inference.PrecisionType.Float32

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=precision,
                max_batch_size=args.max_batch_size,
                min_subgraph_size=args.min_subgraph_size)
            # skip the minimum TRT subgraph
        if mode == "det":
            min_input_shape = {
                "x": [1, 3, 50, 50],
                "conv2d_92.tmp_0": [1, 96, 20, 20],
                "conv2d_91.tmp_0": [1, 96, 10, 10],
                "conv2d_59.tmp_0": [1, 96, 20, 20],
                "nearest_interp_v2_1.tmp_0": [1, 96, 10, 10],
                "nearest_interp_v2_2.tmp_0": [1, 96, 20, 20],
                "conv2d_124.tmp_0": [1, 96, 20, 20],
                "nearest_interp_v2_3.tmp_0": [1, 24, 20, 20],
                "nearest_interp_v2_4.tmp_0": [1, 24, 20, 20],
                "nearest_interp_v2_5.tmp_0": [1, 24, 20, 20],
                "elementwise_add_7": [1, 56, 2, 2],
                "nearest_interp_v2_0.tmp_0": [1, 96, 2, 2]
            }
            max_input_shape = {
                "x": [1, 3, 2000, 2000],
                "conv2d_92.tmp_0": [1, 96, 400, 400],
                "conv2d_91.tmp_0": [1, 96, 200, 200],
                "conv2d_59.tmp_0": [1, 96, 400, 400],
                "nearest_interp_v2_1.tmp_0": [1, 96, 200, 200],
                "conv2d_124.tmp_0": [1, 256, 400, 400],
                "nearest_interp_v2_2.tmp_0": [1, 96, 400, 400],
                "nearest_interp_v2_3.tmp_0": [1, 24, 400, 400],
                "nearest_interp_v2_4.tmp_0": [1, 24, 400, 400],
                "nearest_interp_v2_5.tmp_0": [1, 24, 400, 400],
                "elementwise_add_7": [1, 56, 400, 400],
                "nearest_interp_v2_0.tmp_0": [1, 96, 400, 400]
            }
            opt_input_shape = {
                "x": [1, 3, 640, 640],
                "conv2d_92.tmp_0": [1, 96, 160, 160],
                "conv2d_91.tmp_0": [1, 96, 80, 80],
                "conv2d_59.tmp_0": [1, 96, 160, 160],
                "nearest_interp_v2_1.tmp_0": [1, 96, 80, 80],
                "nearest_interp_v2_2.tmp_0": [1, 96, 160, 160],
                "conv2d_124.tmp_0": [1, 256, 160, 160],
                "nearest_interp_v2_3.tmp_0": [1, 24, 160, 160],
                "nearest_interp_v2_4.tmp_0": [1, 24, 160, 160],
                "nearest_interp_v2_5.tmp_0": [1, 24, 160, 160],
                "elementwise_add_7": [1, 56, 40, 40],
                "nearest_interp_v2_0.tmp_0": [1, 96, 40, 40]
            }
        elif mode == "rec":
            min_input_shape = {"x": [args.rec_batch_num, 3, 32, 10]}
            max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
            opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
        elif mode == "cls":
            min_input_shape = {"x": [args.rec_batch_num, 3, 48, 10]}
            max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
            opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
        else:
            min_input_shape = {"x": [1, 3, 10, 10]}
            max_input_shape = {"x": [1, 3, 1000, 1000]}
            opt_input_shape = {"x": [1, 3, 500, 500]}
        config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                          opt_input_shape)

    else:
        config.disable_gpu()
        if hasattr(args, "cpu_threads"):
            config.set_cpu_math_library_num_threads(args.cpu_threads)
        else:
            # default cpu threads as 10
            config.set_cpu_math_library_num_threads(10)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()

    # enable memory optim
    config.enable_memory_optim()
    #config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    if mode == 'table':
        config.delete_pass("fc_fuse_pass")  # not supported for table
    config.switch_use_feed_fetch_ops(False)
    config.switch_ir_optim(True)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors, config
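The create_predictor variants above (Examples No. 9, 16, 23 and 26) return the raw handles without showing a forward pass. A minimal usage sketch for the 'det' branch, assuming args and logger from the surrounding script and a preprocessed 1x3x640x640 float32 image (the random data is a placeholder):

# Usage sketch for the tuple returned above; input shape and data are assumptions.
import numpy as np

predictor, input_tensor, output_tensors, config = create_predictor(args, 'det', logger)
img = np.random.rand(1, 3, 640, 640).astype('float32')  # stand-in for a preprocessed image
input_tensor.copy_from_cpu(img)
predictor.run()
outputs = [t.copy_to_cpu() for t in output_tensors]
print([o.shape for o in outputs])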