Example #1
def tensorrt_backend_pfe_onnx():

    pillar_x = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_y = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_z = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_i = np.ones([1, 1, 12000, 100], dtype=np.float32)

    num_points_per_pillar = np.ones([1, 12000], dtype=np.float32)
    x_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    y_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    mask = np.ones([1, 1, 12000, 100], dtype=np.float32)

    pfe_inputs = [
        pillar_x, pillar_y, pillar_z, pillar_i, num_points_per_pillar,
        x_sub_shaped, y_sub_shaped, mask
    ]

    print("pfe_inputs length is : ", len(pfe_inputs))
    pfe_model = onnx.load("pfe.onnx")
    engine = backend.prepare(pfe_model, device="CUDA:0", max_batch_size=1)

    # time 1000 inference runs; engine build is excluded from the measurement
    start = time.time()
    for i in range(1000):
        pfe_outputs = engine.run(pfe_inputs)
    end = time.time()
    print('inference time is : ', (end - start) / 1000)
    print(pfe_outputs)
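The snippets in this listing generally omit their module-level imports. A minimal set that the function above (and most of the later examples) appears to assume, matching the imports spelled out in Examples #9 and #15:

import time

import numpy as np
import onnx
import onnx_tensorrt.backend as backend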
Example #2
def onnx2trt_infer(
        onnx_model_filename: str,
        input_values: 'Sequence[np.ndarray]',
        batch_size: int = 1,
        workspace_size: int = (1024 * 1024 * 16),
) -> 'Sequence[np.ndarray]':
    r"""infer model with 'onnx_tensorrt' backend"""

    import onnx
    # note: onnx.optimizer and onnx.utils.polish_model exist only in older
    # onnx releases; the optimizer now lives in the separate 'onnxoptimizer' package
    import onnx.optimizer as optimizer
    import onnx_tensorrt.backend as backend

    from onnx.utils import polish_model

    model = onnx.load(onnx_model_filename)
    passes = optimizer.get_available_passes()
    passes = list(filter(lambda name: not name.startswith('split_'), passes))
    logger.debug('optimizations to perform in ONNX:\n\t%s', passes)
    model = optimizer.optimize(model, passes=passes)
    model = polish_model(model)
    onnx.save(model,
              onnx_model_filename.rpartition('.onnx')[0] + '.optimized.onnx')
    engine = backend.prepare(
        model,
        device='CUDA',
        max_batch_size=batch_size,
        max_workspace_size=workspace_size,
    )
    return engine.run(input_values)
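A hypothetical call of the helper above, just to illustrate the expected argument types; the model path and input shape below are placeholders, not values from the original project:

import numpy as np

dummy_input = np.random.random(size=(1, 3, 224, 224)).astype(np.float32)  # placeholder shape
outputs = onnx2trt_infer('model.onnx', [dummy_input], batch_size=1)       # placeholder path
print([o.shape for o in outputs])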
Example #3
def load_model(self, path):
    self.model = onnx.load(path)
    if not torch.cuda.is_available():
        raise NotImplementedError(
            'TensorRT backend does not work for non-CUDA devices.')
    # get first gpu
    self.engine = backend.prepare(self.model, device='CUDA:0')
Example #4
def onnx_infer(image, model_path):
    model = onnx.load(model_path)
    engine = backend.prepare(model, device='CUDA:1')
    # input_data = np.random.random(size=(32, 3, 224, 224)).astype(np.float32)
    output_data = engine.run(image)[0]
    print(output_data)
    print(output_data.shape)
Example #5
def load_model(path, shape):
    model = onnx.load(path)
    engine = backend.prepare(model, device='CUDA:0')
    input_data = np.random.random(size=shape).astype(np.float32)
    # return
    output_data = engine.run(input_data)
    print(output_data['steer'])
    print(output_data)
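A hypothetical call of load_model above; the file name and NCHW shape are placeholders, while the 'steer' output name is taken from the snippet itself:

load_model('steering_model.onnx', (1, 3, 224, 224))  # placeholder path and shape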
Example #6
File: trt_test.py Project: karlzipser/k3
def inference_model():

    torch.backends.cudnn.deterministic = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = VellaDeconv().to(device).eval()

    points = np.random.rand(1, 3, 6, 1)#(25,100, 2, 2)
    points = np.zeros((1,3,6,1))+1
    points_trch = points #.reshape((25,100, 2, 2))

    points_t = [torch.tensor(points_trch, device=device, dtype=torch.float32)]
    points_t_np = points_t[0].detach().cpu().numpy()

    # import pdb; pdb.set_trace()
    torch.cuda.synchronize()
    with torch.no_grad():
        x_conv_trch = net(*points_t)
    torch.cuda.synchronize()
    x_conv_np = x_conv_trch.detach().cpu().numpy()

    onnx_model = onnx.load(model_onnx)
    tensorrt_engine = backend.prepare(
        onnx_model, 
        device="CUDA:0", 
        max_workspace_size=536870912, 
        max_batch_size=1, 
        using_fp16=False, 
        serialize_engine=False, 
        engine_file_path=model_tensorrt
    )

    # points_trt = points
    points_trt = points_trch
    points_nv = torch.tensor(points_trt, device=device, dtype=torch.float32)
    points_np = [points_nv.detach().cpu().numpy()]

    x_conv_nv = tensorrt_engine.run(points_np)

    print(f"x_conv_np is :{x_conv_np}")
    print(f"x_conv_np.shape is: {x_conv_np.shape}")
    print(f"x_conv_np.dtype is: {x_conv_np.dtype}")
    print(f"x_conv_nv is: {x_conv_nv[0]}")
    print(f"x_conv_nv shape is: {x_conv_nv[0].shape}")
    print(x_conv_nv[0].dtype)
    
    print("Input matching percentage")
    print((np.count_nonzero(points_t_np==points_np[0]))/points_t_np.size)
    print("Output matching percentage")
    print((np.count_nonzero(x_conv_np==x_conv_nv[0]))/x_conv_np.size) 
    print(np.count_nonzero(np.isclose(x_conv_np, x_conv_nv[0], atol=1e-8))/x_conv_np.size)

    # np.savetxt("x_conv_np.txt",x_conv_np)
    print("................")
Example #7
def tensorrt_backend_rpn_onnx():

    rpn_input_features = np.ones([1, 64, 496, 432], dtype=np.float32)

    rpn_model = onnx.load("rpn.onnx")
    engine = backend.prepare(rpn_model, device="CUDA:0", max_batch_size=1)

    # time 1000 inference runs; engine build is excluded from the measurement
    rpn_start_time = time.time()
    for i in range(1000):
        rpn_outputs = engine.run(rpn_input_features)
    rpn_end_time = time.time()

    print('rpn inference time is : ', (rpn_end_time - rpn_start_time) / 1000)
    print(rpn_outputs)
Example #8
def main(input_data_dir, output_data_dir, onnx_name):
    # sess = onnxruntime.InferenceSession(onnx_name)
    """
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess = onnxruntime.InferenceSession(onnx_name, sess_options=so)
    sess.set_providers(['CUDAExecutionProvider'])
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    """

    model = onnx.load(onnx_name)
    # engine = build_engine(onnx_name)
    engine = backend.prepare(model, device="CUDA:0")

    # print("The model expects input shape: ", sess.get_inputs()[0].shape)
    # sess.run(None, {input_name: np.random.rand(1, 3, 1024, 1024).astype(np.float32)})

    # warm-up / sanity-check run with random input before timing the real images
    result = engine.run(np.random.rand(1, 3, 1024, 1024).astype(np.float32))[0]
    image_path_list = get_image_pathes(input_data_dir)
    for image_path in tqdm(image_path_list):
        base_name = Path(image_path).name
        # cv2.imread returns BGR; convert to RGB before the transform
        bgr_image = cv2.imread(image_path)
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        img_tfmd = transform_image(add_dummy_dim(rgb_image))
        img_tfmd_ary = img_tfmd.to("cpu").detach().numpy()

        # test = add_dummy_dim(bgr_image)
        # result = sess.run(None, {input_name: np.random.rand(1, 3, 1024, 1024).astype(np.float32)})

        start = time.time()
        result = engine.run(img_tfmd_ary.astype(np.float32))[0]
        # result = sess.run(None, {input_name: img_tfmd_ary.astype(np.float32)})
        end = time.time()
        print(end - start)

        prob = result[0][0]
        label_img = (prob.argmax(0) * 255).astype(np.uint8)

        cv2.imwrite(str(Path(output_data_dir, base_name)), label_img)
        cv2.waitKey(10)
Example #9
import sys
import onnx
import onnx_tensorrt.backend as backend
import numpy as np
import time

model = onnx.load("centernet_dla34.onnx")
graph = onnx.helper.printable_graph(model.graph)
#print(graph)

engine = backend.prepare(model, device='CUDA:1')
#input_data = np.random.random(size=(1, 3, 512, 512)).astype(np.float32)
images = np.load('images.npy')
#input_data = np.random.random(size=(1, 3, 512, 512)).astype(np.float32)
output_datas = engine.run(images)
#print(output_datas[0].shape)
#print(output_datas[0])
#print("===============(output_datas[0]===============================")
#print(output_datas[1].shape)
#print(output_datas[1])
#print("================(output_datas[1]==================================")
#print(output_datas[2].shape)
print(output_datas[2][0])
#print("=================output_datas[2]=================================")
#output_hm =  np.load('output_hm.npy')
#output_wh =  np.load('output_wh.npy')
output_reg = np.load('output_reg.npy')
#print(output_hm.shape)
#print(output_hm)
#print("=============  output_hm =====================================")
#print(output_wh.shape)
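The commented-out lines above suggest the saved .npy files hold reference outputs from the original framework. A minimal sketch of checking one of them against the TensorRT result, along the lines of the np.isclose comparison in Example #6; pairing output_reg with output_datas[2] is an assumption:

ref = np.load('output_reg.npy')
trt = output_datas[2]  # assumed to be the 'reg' head
print('max abs diff:', np.abs(ref - trt).max())
print('fraction close:', np.count_nonzero(np.isclose(ref, trt, atol=1e-4)) / ref.size)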
Example #10
roop = 20
e = 0.0
inp = np.ones((1,4,480,640), dtype=np.float32)
for _ in range(roop):
    s = time.time()
    result = onnx_session.run(
        [output_name],
        {input_name: inp}
    )
    e += (time.time() - s)
print(f'elapsed time: {e/roop*1000}ms')
"""
elapsed time: 57.117438316345215ms
"""


import onnx
import onnx_tensorrt.backend as be

model = onnx.load('saved_model_sony_480x640/model_float32.onnx')
engine = be.prepare(model, device='CUDA:0')
e = 0.0
for _ in range(roop):
    s = time.time()
    output = engine.run(inp)[0]
    e += (time.time() - s)
print(f'elapsed time: {e/roop*1000}ms')
"""
elapsed time: 13.761746883392334ms
"""
Example #11
    loader = transforms.Compose(
        [transforms.Resize(128), transforms.ToTensor(), transforms.Normalize(std, mean)])
    imagen = loader(imagen).float()
    imagen = imagen.unsqueeze(0)

    return imagen


def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


if FLAG_DETECCION:
    # Start the runtime session
    modelo_onnx = onnx.load(path_modelo_deteccion)
    engine = backend.prepare(modelo_onnx, device = device)


    # Read images and run inference
    paths_imagenes = glob.glob(f"{path_imagenes_deteccion}/*.png")

    imagenes = [tr_deteccion(path_imagen) for path_imagen in paths_imagenes]

    for imagen in imagenes:

        imagen1 = imagen.numpy()
        time_start = time.time()
        salidas = engine.run(imagen1)
        time_end = time.time()
        total_time = time_end - time_start
        prediccion = salidas
Example #12
            [0, 255, 255],  # empty
        ],
        dtype=np.float32)

    FULL_LABEL_MAP = np.arange(len(LABEL_NAMES)).reshape(len(LABEL_NAMES), 1)
    FULL_COLOR_MAP = colormap[FULL_LABEL_MAP]

    # load model

    model = onnx.load("deeplab_pruned.onnx")
    engine_path = 'deeplab_pruned_' + precision + '.engine'

    engine = backend.prepare(model,
                             device='CUDA:0',
                             serialize_engine=False,
                             precision=precision,
                             max_batch_size=4,
                             load_engine=load_engine,
                             engine_path=engine_path)

    def list_images(folder, pattern='*', ext='bmp'):
        """List the images in a specified folder by pattern and extension

        Args:
            folder (str): folder containing the images to list
            pattern (str, optional): a bash-like pattern of the files to select;
                                     defaults to * (everything)
            ext (str, optional): the image extension (defaults to bmp)

        Returns:
            str list: list of image filenames matching the pattern in the folder
Example #13
def test_onnx_for_trt(onnx_path, config_path, model_dir, ckpt_path=None):
    dummy_dev_pillar_x_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_y_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_z_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_i_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_num_points_per_pillar_ = np.random.random(size=(1, 1, 12000, 1)).astype(np.float32)
    dummy_dev_x_coors_for_sub_shaped_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_y_coors_for_sub_shaped_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_feature_mask_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)

    model = onnx.load(onnx_path)
    engine = backend.prepare(model, device='CUDA:0', max_batch_size=1)
    print("model read success")
    print()
    output_data = engine.run(
        (dummy_dev_pillar_x_, dummy_dev_pillar_y_, dummy_dev_pillar_z_,
         dummy_dev_pillar_i_, dummy_dev_num_points_per_pillar_,
         dummy_dev_x_coors_for_sub_shaped_, dummy_dev_y_coors_for_sub_shaped_,
         dummy_dev_pillar_feature_mask_))

    # ##########compare with pytorch output #########################
    for i in range(len(output_data)):
        print(output_data[i].shape)
    print(output_data[0][0, 0, 0:100])

    model_dir = pathlib.Path(model_dir)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)

    model_cfg = config.model.second
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder_for_official_onnx_and_cuda.build(
        model_cfg, voxel_generator, target_assigner)
    net.cuda()
    net.eval()

    # since the model is changed, don't restore first
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    dummy_dev_pillar_x_ = torch.as_tensor(dummy_dev_pillar_x_, device="cuda")
    dummy_dev_pillar_y_ = torch.as_tensor(dummy_dev_pillar_y_, device="cuda")
    dummy_dev_pillar_z_ = torch.as_tensor(dummy_dev_pillar_z_, device="cuda")
    dummy_dev_pillar_i_ = torch.as_tensor(dummy_dev_pillar_i_, device="cuda")
    dummy_dev_num_points_per_pillar_ = torch.as_tensor(
        dummy_dev_num_points_per_pillar_, device="cuda")
    dummy_dev_x_coors_for_sub_shaped_ = torch.as_tensor(
        dummy_dev_x_coors_for_sub_shaped_, device="cuda")
    dummy_dev_y_coors_for_sub_shaped_ = torch.as_tensor(
        dummy_dev_y_coors_for_sub_shaped_, device="cuda")
    dummy_dev_pillar_feature_mask_ = torch.as_tensor(
        dummy_dev_pillar_feature_mask_, device="cuda")
    output_pytorch = net.voxel_feature_extractor(
        dummy_dev_pillar_x_, dummy_dev_pillar_y_, dummy_dev_pillar_z_,
        dummy_dev_pillar_i_, dummy_dev_num_points_per_pillar_,
        dummy_dev_x_coors_for_sub_shaped_, dummy_dev_y_coors_for_sub_shaped_,
        dummy_dev_pillar_feature_mask_)

    print(output_pytorch[0, 0, 0:100])
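The function above prints the first 100 values of both outputs but leaves the comparison to the eye. A small sketch of a numeric check in the spirit of Example #6; it assumes output_data and output_pytorch are still in scope (e.g. returned from the function), and the tolerance is arbitrary:

trt_out = np.asarray(output_data[0])
torch_out = output_pytorch.detach().cpu().numpy()
close = np.isclose(trt_out, torch_out, atol=1e-4)
print('fraction of matching elements:', np.count_nonzero(close) / close.size)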
Example #14
    def test(self):
        import onnx
        from ngraph_onnx.onnx_importer.importer import import_onnx_model
        import ngraph as ng
        global dim0, dim2, dim3

        torch.set_grad_enabled(False)

        epoch = self.optimizer.get_last_epoch() + 1
        self.ckp.write_log('\nEvaluation:')
        self.ckp.add_log(
            torch.zeros(1, len(self.loader_test), len(self.scale))
        )
        self.model.eval()

        timer_test = utility.timer()
        if self.args.save_results: self.ckp.begin_background()
        # print(self.loader_test)
        for idx_data, d in enumerate(self.loader_test):
            for idx_scale, scale in enumerate(self.scale):
                d.dataset.set_scale(idx_scale)
                print('idx_scale={}'.format(idx_scale))
                # print("len: {}".format(len(d)))
                # for lr, hr, filename, _ in tqdm(d, ncols=80):
                for batch, (lr, hr, filename, _) in enumerate(d):
                    print('{} '.format(batch), end='', flush=True)
                    lr, hr = self.prepare(lr, hr)
                    print('test lr.size: {}'.format(lr.size()))
                    dim0 = lr.size()[0]
                    dim2 = lr.size()[2]
                    dim3 = lr.size()[3]
                    
                    showbug = False
                    if showbug: print('stage1', flush=True)
                    if self.args.ngraph:
                        
                        pytorch_model_name = self.args.ngraph
                        pytorch_edsr_model = torch.load(pytorch_model_name).cuda()
                        if showbug: print('stage2-1', flush=True)
                        # print(lr.size())
                        # dummy_input = torch.randn_like(lr, device='cuda')
                        if showbug: print('stage2-2', flush=True)
                        edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                        # print('Export to onnx model {}'.format(edsr_onnx_filename))
                        torch.onnx.export(pytorch_edsr_model, lr.to(torch.device('cuda')), edsr_onnx_filename, export_params=True, verbose=False, training=False)
                        if showbug: print('stage2-3', flush=True)

                        edsr_onnx_model = onnx.load(edsr_onnx_filename)
                        # print(onnx.helper.printable_graph(edsr_onnx_model.graph))

                        if showbug: print('stage2-4', flush=True)
                        ng_models = import_onnx_model(edsr_onnx_model)

                        # print('Convert to nGreph Model')

                        ng_model = ng_models[0]
                        if showbug: print('stage2-5', flush=True)
                        runtime = ng.runtime(backend_name='CPU')
                        if showbug: print('stage2-6', flush=True)
                        edsr_ng_model = runtime.computation(ng_model['output'], *ng_model['inputs'])
                        if showbug: print('stage2-7', flush=True)

                        sr = edsr_ng_model(lr, idx_scale)
                        if showbug: print('stage2-8', flush=True)
                        sr = torch.from_numpy(sr)
                        if showbug: print('stage2-9', flush=True)
                    elif self.args.tensorrt:
                        pytorch_model_name = self.args.tensorrt
                        pytorch_edsr_model = torch.load(pytorch_model_name)
                        
                        # lr_np = lr.numpy().astype(np.float32)
                        dummy_input = torch.randn_like(lr, device='cuda')
                        edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                        print('Export to onnx model {}'.format(edsr_onnx_filename))
                        torch.onnx.export(pytorch_edsr_model, dummy_input, edsr_onnx_filename, export_params=True, verbose=False, training=False)

                        import os
                        import onnx

                        edsr_onnx_model = onnx.load(edsr_onnx_filename)
                        # print(onnx.helper.printable_graph(edsr_onnx_model.graph))

                        import tensorrt
                        import onnx_tensorrt.backend as backend
                        import numpy as np

                        tensorrt_engine = backend.prepare(edsr_onnx_model, device='CUDA:0')
                        # lr_np = lr_np.to(torch.device("cuda:0"))
                        # lr.numpy().astype(np.float32)

                        sr = tensorrt_engine.run(lr.numpy().astype(np.float32))[0]
                        sr = torch.from_numpy(sr)

                        print('complete one')   



                        pytorch_model_name = self.args.tensorrt
                        pytorch_edsr_model = torch.load(pytorch_model_name)
                        
                        # lr_np = lr.numpy().astype(np.float32)
                        dummy_input = torch.randn_like(lr, device='cuda')
                        edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                        print('Export to onnx model {}'.format(edsr_onnx_filename))
                        torch.onnx.export(pytorch_edsr_model, dummy_input, edsr_onnx_filename, export_params=True, verbose=False, training=False)

                        import os
                        import onnx

                        edsr_onnx_model = onnx.load(edsr_onnx_filename)
                        # print(onnx.helper.printable_graph(edsr_onnx_model.graph))

                        import tensorrt
                        import onnx_tensorrt.backend as backend
                        import numpy as np

                        tensorrt_engine = backend.prepare(edsr_onnx_model, device='CUDA:0')
                        # lr_np = lr_np.to(torch.device("cuda:0"))
                        # lr.numpy().astype(np.float32)

                        sr = tensorrt_engine.run(lr.numpy().astype(np.float32))[0]
                        sr = torch.from_numpy(sr)
                        
                        print('complete two')   
                    else:
                        sr = self.model(lr, idx_scale)

                    if showbug: print('stage3', flush=True)
                    sr = utility.quantize(sr, self.args.rgb_range)

                    if showbug: print('stage4', flush=True)
                    save_list = [sr]
                    if showbug: print('stage5', flush=True)
                    self.ckp.log[-1, idx_data, idx_scale] += utility.calc_psnr(
                        sr, hr, scale, self.args.rgb_range, dataset=d
                    )
                    if showbug: print('stage6', flush=True)
                    if self.args.save_gt:
                        save_list.extend([lr, hr])
                    if showbug: print('stage7', flush=True)

                    if self.args.save_results:
                        self.ckp.save_results(d, filename[0], save_list, scale)
                    if showbug: print('stage8', flush=True)

                self.ckp.log[-1, idx_data, idx_scale] /= len(d)
                best = self.ckp.log.max(0)
                psnr = self.ckp.log[-1, idx_data, idx_scale].numpy()
                print('')
                self.ckp.write_log(
                    '[{} x{}]\tPSNR: {:.3f} (Best: {:.3f} @epoch {})'.format(
                        d.dataset.name,
                        scale,
                        self.ckp.log[-1, idx_data, idx_scale],
                        best[0][idx_data, idx_scale],
                        best[1][idx_data, idx_scale] + 1
                    )
                )
                
        self.ckp.write_log('Forward: {:.2f}s\n'.format(timer_test.toc()))
        self.ckp.write_log('Saving...')

        if self.args.save_results:
            self.ckp.end_background()

        if not self.args.test_only:
            self.ckp.save(self, epoch, is_best=(best[1][0, 0] + 1 == epoch))

        self.ckp.write_log(
           'Total: {:.2f}s\n'.format(timer_test.toc()), refresh=True
        )

        torch.set_grad_enabled(True)
        return psnr
Example #15
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
# import matplotlib.pyplot as plt
import time
import os
import copy

import onnx
import tensorrt
import onnx_tensorrt.backend as backend

model = onnx.load("resnet18.onnx")
engine = backend.prepare(model, device='CUDA:0', max_batch_size=64)
input_data = np.random.random(size=(1, 3, 224, 224)).astype(np.float32)
output_data = engine.run(input_data)[0]
print(output_data)
print(output_data.shape)

# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation((-180,180)),
        transforms.RandomAffine(degrees=(-30,30), shear=(-20,20)),
        #transforms.Resize(256),
        #transforms.CenterCrop(224),
Example #16
def tensorrt_backend_pointpillars_onnx(config_path=None):
    import torch
    from second.protos import pipeline_pb2
    from google.protobuf import text_format
    from second.builder import voxel_builder
    from second.pytorch.models.pointpillars import PointPillarsScatter

    ############################# PFE-Layer TensorRT ################################
    pillar_x = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_y = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_z = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_i = np.ones([1, 1, 12000, 100], dtype=np.float32)
    num_points_per_pillar = np.ones([1, 12000], dtype=np.float32)
    x_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    y_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    mask = np.ones([1, 1, 12000, 100], dtype=np.float32)

    pfe_inputs = [
        pillar_x, pillar_y, pillar_z, pillar_i, num_points_per_pillar,
        x_sub_shaped, y_sub_shaped, mask
    ]

    pfe_model = onnx.load("pfe.onnx")
    engine = backend.prepare(pfe_model, device="CUDA:0", max_batch_size=1)

    pfe_start_time = time.time()
    pfe_outputs = engine.run(pfe_inputs)
    pfe_end_time = time.time()

    print('inference time is : ', (pfe_end_time - pfe_start_time))

    ###################### PillarScatter Python Coder Transfer #########################
    # numpy --> tensor
    pfe_outs = np.array(pfe_outputs)
    voxel_features_tensor = torch.from_numpy(pfe_outs)

    voxel_features = voxel_features_tensor.squeeze()
    voxel_features = voxel_features.permute(1, 0)

    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
    model_cfg = config.model.second
    vfe_num_filters = list(model_cfg.voxel_feature_extractor.num_filters)
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    grid_size = voxel_generator.grid_size
    output_shape = [1] + grid_size[::-1].tolist() + [vfe_num_filters[-1]]
    num_input_features = vfe_num_filters[-1]
    batch_size = 1
    mid_feature_extractor = PointPillarsScatter(output_shape,
                                                num_input_features, batch_size)

    device = torch.device("cuda:0")
    coors_numpy = np.loadtxt('coors.txt', dtype=np.int32)
    coors = torch.from_numpy(coors_numpy)
    coors = coors.to(device).cuda()  #CPU Tensor --> GPU Tensor

    voxel_features = voxel_features.to(device).cuda()
    rpn_input_features = mid_feature_extractor(voxel_features, coors)

    ########################### RPN Network TensorRT #################################

    rpn_input_features = rpn_input_features.data.cpu().numpy()

    rpn_model = onnx.load("rpn.onnx")
    engine_rpn = backend.prepare(rpn_model, device="CUDA:0", max_batch_size=1)

    rpn_start_time = time.time()
    rpn_outputs = engine_rpn.run(rpn_input_features)
    rpn_end_time = time.time()

    print('rpn inference time is : ', (rpn_end_time - rpn_start_time))
    print(rpn_outputs)