Example #1
def build_engine_onnx(model_file):
    # Create the builder, network, builder config, and ONNX parser instances
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            common.EXPLICIT_BATCH) as network, builder.create_builder_config(
            ) as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        # Set the builder configuration parameters
        config.max_workspace_size = common.GiB(1)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        # Build the engine
        return builder.build_engine(network, config)
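A minimal usage sketch for the function above; the file name is a placeholder, and TRT_LOGGER/common are assumed to be defined as in the TensorRT Python samples:

# Hypothetical usage; "model.onnx" is a placeholder path.
engine = build_engine_onnx("model.onnx")
if engine is None:
    raise RuntimeError("Failed to build the engine")
with engine.create_execution_context() as context:
    # Allocate buffers (e.g. via common.allocate_buffers(engine) from the
    # samples) and run inference here.
    pass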
Example #2
def build_engine_caffe(model_file, deploy_file):
    # You can set the logger severity higher to suppress messages (or lower to display more messages).
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.CaffeParser() as parser:
        # Workspace size is the maximum amount of memory available to the builder while building an engine.
        # It should generally be set as high as possible.
        builder.max_workspace_size = common.GiB(1)
        # Load the Caffe model and parse it in order to populate the TensorRT network.
        # This function returns an object that we can query to find tensors by name.
        model_tensors = parser.parse(deploy=deploy_file,
                                     model=model_file,
                                     network=network,
                                     dtype=ModelData.DTYPE)
        # For Caffe, we need to manually mark the output of the network.
        # Since we know the name of the output tensor, we can find it in model_tensors.
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        return builder.build_cuda_engine(network)
Example #3
def build_int8_engine(deploy_file, model_file, calib, batch_size=32):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.CaffeParser() as parser:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        builder.max_batch_size = batch_size
        builder.max_workspace_size = common.GiB(1)
        builder.int8_mode = True
        builder.int8_calibrator = calib
        # Parse Caffe model
        model_tensors = parser.parse(deploy=deploy_file,
                                     model=model_file,
                                     network=network,
                                     dtype=ModelData.DTYPE)
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        # Build engine and do int8 calibration.
        return builder.build_cuda_engine(network)
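The calib argument above must implement TensorRT's calibrator interface. A minimal sketch of such a calibrator, loosely following the MNIST int8 sample; the class name and the random placeholder data are assumptions:

import os
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import tensorrt as trt

class DummyEntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """Serves a few random batches; replace get_batch() with real data."""

    def __init__(self, batch_size, input_shape, cache_file="calibration.cache"):
        super().__init__()
        self.batch_size = batch_size
        self.cache_file = cache_file
        self.data = np.random.rand(batch_size, *input_shape).astype(np.float32)
        self.device_input = cuda.mem_alloc(self.data.nbytes)
        self.batches_left = 4

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.batches_left == 0:
            return None  # no more calibration data
        self.batches_left -= 1
        cuda.memcpy_htod(self.device_input, self.data)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        # Reuse a cached calibration table if one exists.
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)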
Example #4
def build_engine_onnx_int8(TRT_LOGGER, model_file, calib):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(common.EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.int8_mode = True
        # calibration_cache = "res50_calibration.cache"
        # calib = ResNetEntropyCalibrator(training_data='../datasets/ic15/train_list.txt', cache_file=calibration_cache)
        builder.int8_calibrator = calib

        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
                return None
        return builder.build_cuda_engine(network)
Example #5
def build_engine_with_some_missing_weights(weights):
    # For more information on TRT basics, refer to the introductory samples.
    # Create builder and network instances
    with trt.Builder(
            TRT_LOGGER) as builder, builder.create_network() as network:
        # Set the maximum workspace size; see common.py for the GiB helper
        builder.max_workspace_size = common.GiB(1)
        # Set the refit flag in the builder
        builder.refittable = True
        # Populate the network using weights from the PyTorch model.
        # See populate_network_with_some_dummy_weights in this file for the implementation
        populate_network_with_some_dummy_weights(network, weights)
        # Build and return an engine.
        # Builds an ICudaEngine from an INetworkDefinition
        return builder.build_cuda_engine(network)
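Since the engine above is built refittable, its weights can later be updated in place. A sketch of the refit step, assuming the engine came from the function above and that a convolution layer named "conv_1" was registered when the network was populated (the layer name and weights are hypothetical):

def refit_engine(engine, new_conv_weights):
    # new_conv_weights: NumPy array matching the original kernel's shape.
    with trt.Refitter(engine, TRT_LOGGER) as refitter:
        refitter.set_weights("conv_1", trt.WeightsRole.KERNEL, new_conv_weights)
        # get_missing() lists weights that still must be supplied.
        missing_layers, _ = refitter.get_missing()
        assert not missing_layers
        assert refitter.refit_cuda_engine()
    return engine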
Example #6
def build_engine():
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.CaffeParser() as parser:
        builder.fp16_mode = True
        builder.strict_type_constraints = True
        builder.max_batch_size = 16
        # Workspace size is the maximum amount of memory available to the builder while building an engine.
        # It should generally be set as high as possible.
        builder.max_workspace_size = common.GiB(1)
        # Load the Caffe model and parse it in order to populate the TensorRT network.
        # This function returns an object that we can query to find tensors by name.
        model_tensors = parser.parse(deploy=deploy_file, model=model_file, network=network, dtype=ModelData.DTYPE)
        # For Caffe, we need to manually mark the output of the network.
        # Since we know the name of the output tensor, we can find it in model_tensors.
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        engine = builder.build_cuda_engine(network)
        with open(engine_file_path, "wb") as f:
            f.write(engine.serialize())
        return engine
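The serialized plan written above can be loaded back later without rebuilding. A minimal sketch, assuming the same TRT_LOGGER:

def load_engine(engine_file_path):
    # Deserialize a previously saved engine plan from disk.
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())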
Example #7
def build_int8_engine(deploy_file, model_file, calib, batch_size=32):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, builder.create_builder_config() as config, trt.CaffeParser(
    ) as parser, trt.Runtime(TRT_LOGGER) as runtime:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        builder.max_batch_size = batch_size
        config.max_workspace_size = common.GiB(1)
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = calib
        # Parse Caffe model
        model_tensors = parser.parse(deploy=deploy_file,
                                     model=model_file,
                                     network=network,
                                     dtype=ModelData.DTYPE)
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        # Build engine and do int8 calibration.
        plan = builder.build_serialized_network(network, config)
        return runtime.deserialize_cuda_engine(plan)
Example #8
def build_engine_uff(model_file):
    # You can set the logger severity higher to suppress messages (or lower to display more messages).
    # Create the builder, network, builder config, and UFF parser instances
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, builder.create_builder_config() as config, trt.UffParser(
    ) as parser:
        # Workspace size is the maximum amount of memory available to the builder while building an engine.
        # It should generally be set as high as possible.
        # Set the builder configuration parameters
        config.max_workspace_size = common.GiB(1)
        # We need to manually register the input and output nodes for UFF.
        # Register the UFF input and output nodes; register_input takes the input name and its shape.
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        parser.register_output(ModelData.OUTPUT_NAME)
        # Load the UFF model and parse it in order to populate the TensorRT network.
        if not parser.parse(model_file, network):
            print('ERROR: Failed to parse the UFF file.')
            return None
        # Build and return an engine.
        return builder.build_engine(network, config)
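For reference, the UFF file consumed above is typically produced from a frozen TensorFlow graph with the (now deprecated) uff converter package. A sketch, assuming a frozen graph "model.pb":

import uff

# Convert the frozen graph to UFF; the file names are placeholders.
uff.from_tensorflow_frozen_model("model.pb",
                                 output_nodes=[ModelData.OUTPUT_NAME],
                                 output_filename="model.uff")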
Example #9
def build_int8_engine(deploy_file, model_file, batch_size=32, trt_engine_datatype=trt.DataType.FLOAT):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.CaffeParser() as parser:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        builder.max_batch_size = batch_size
        builder.max_workspace_size = common.GiB(1)
        if trt_engine_datatype == trt.DataType.HALF:
            builder.fp16_mode = True
        elif trt_engine_datatype == trt.DataType.INT8:
            # Now we create a calibrator and give it the location of our calibration data.
            # We also allow it to cache calibration data for faster engine building.
            _, [calib_data] = common.find_sample_data(description="Runs a Caffe MNIST network in Int8 mode", subfolder="mnist", find_files=["t10k-images-idx3-ubyte"])
            calibration_cache = "mnist_calibration.cache"
            builder.int8_mode = True
            builder.int8_calibrator = MNISTEntropyCalibrator(calib_data, cache_file=calibration_cache)
        # Parse Caffe model
        model_tensors = parser.parse(deploy=deploy_file, model=model_file, network=network, dtype=ModelData.DTYPE)
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        # Build engine and do int8 calibration.
        return builder.build_cuda_engine(network)
Example #10
def build_engine_caffe(model_file, deploy_file, precision):
    # precision: float, half, int8
    # You can set the logger severity higher to suppress messages (or lower to display more messages).
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.CaffeParser() as parser:
        # Workspace size is the maximum amount of memory available to the builder while building an engine.
        # It should generally be set as high as possible.
        builder.max_workspace_size = common.GiB(1)
        # Load the Caffe model and parse it in order to populate the TensorRT network.
        # This function returns an object that we can query to find tensors by name.
        model_tensors = parser.parse(deploy=deploy_file, model=model_file, network=network, dtype=ModelData.DTYPE)
        # For Caffe, we need to manually mark the output of the network.
        # Since we know the name of the output tensor, we can find it in model_tensors.
        print(model_tensors)
        print(ModelData.OUTPUT_NAME)
        print(model_tensors.find(ModelData.OUTPUT_NAME))
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))

        if precision == "half":
            # enable fp16 (chenrong06)
            builder.fp16_mode = True
            builder.strict_type_constraints = True
            print("pricision: half")
        elif precision == "int8":
            # enable int8 and set quantize (chenrong06)
            # Incomplete version, please refer to workspace/tensorrt/samples/sampleINT8API/sampleINT8API.cpp
            builder.int8_mode = True
            builder.int8_calibrator = None
            builder.strict_type_constraints = True
            print(network.num_layers)
            for i in range(network.num_layers):
                layer = network[i]
                # Guard against None: constant layers have no input tensor.
                tensor = layer.get_output(0)
                if tensor is not None:
                    tensor.set_dynamic_range(-1.0, 1.0)
                tensor = layer.get_input(0)
                if tensor is not None:
                    tensor.set_dynamic_range(-1.0, 1.0)
            print("precision: int8")
        else:
            print("pricision: float")

        return builder.build_cuda_engine(network)
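The constant [-1, 1] range above is only a placeholder. A sketch of applying per-tensor ranges from a precomputed table instead, in the spirit of sampleINT8API; the ranges dict mapping tensor names to absolute maxima is a hypothetical format:

def set_dynamic_ranges(network, ranges):
    # Apply symmetric per-tensor dynamic ranges from a name -> amax table.
    for i in range(network.num_layers):
        layer = network[i]
        for j in range(layer.num_outputs):
            tensor = layer.get_output(j)
            if tensor is not None and tensor.name in ranges:
                amax = ranges[tensor.name]
                tensor.set_dynamic_range(-amax, amax)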
Example #11
def build_int8_engine(onnx_file_path, calib, batch_size=32):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(common.EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        builder.max_batch_size = batch_size

        config.max_workspace_size = common.GiB(1)
        config.set_flag(trt.BuilderFlag.INT8)
        config.set_flag(trt.BuilderFlag.STRICT_TYPES)
        config.int8_calibrator = calib

        # Parse Onnx model
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        network.get_input(0).shape = [batch_size, 3, 32, 32]

        # Decide which layers fall back to FP32.
        # If all layers should fall back to FP32, use 'index > -1'.
        for index, layer in enumerate(network):
            print('layer index', index, ':', layer.type)
            if index < 10:
                if layer.type == trt.LayerType.ACTIVATION or \
                        layer.type == trt.LayerType.CONVOLUTION or \
                        layer.type == trt.LayerType.FULLY_CONNECTED or \
                        layer.type == trt.LayerType.SCALE:
                    print('fallback to fp32!')
                    layer.precision = trt.float32
                    layer.set_output_type(0, trt.float32)

        # Build engine and do int8 calibration.
        return builder.build_engine(network, config)
Example #12
def build_engine_onnx(model_file):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = args.batch_size

        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            ok = parser.parse(model.read())
            if not ok:
                print("Error: Parse onnx model \"{}\" failed.".format(
                    model_file))
                error = parser.get_error(0)
                print("  code: {}".format(error.code()))
                print("  desc: {}".format(error.desc()))
                print("  file: {}".format(error.file()))
                print("  func: {}".format(error.func()))
                print("  line: {}".format(error.line()))
                print("  node: {}".format(error.node()))
                exit(-1)

        if args.q:
            # enable int8 and set quantize (chenrong06)
            # Incomplete version, please refer to workspace/tensorrt/samples/sampleINT8API/sampleINT8API.cpp
            builder.int8_mode = True
            builder.int8_calibrator = None
            builder.strict_type_constraints = True
            # print(network.num_layers)
            for i in range(network.num_layers):
                layer = network[i]
                tensor = layer.get_output(0)
                if tensor:
                    tensor.set_dynamic_range(-1.0, 1.0)
                tensor = layer.get_input(0)
                if tensor:
                    tensor.set_dynamic_range(-1.0, 1.0)
            # print("pricision: int8")

        return builder.build_cuda_engine(network)
Example #13
def test_trt_export(model_name=ONNX_MODEL_NAME):
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            common.EXPLICIT_BATCH) as network, trt.OnnxParser(
                network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.fp16_mode = False
        builder.max_batch_size = 1

        with open(model_name, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None

        engine = builder.build_cuda_engine(network)
        if engine is not None:
            print("CUDA engine built successfully!")
        return engine
Example #14
def build_engine_onnx(model_file):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            common.EXPLICIT_BATCH) as network, trt.OnnxParser(
                network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.fp16_mode = True
        builder.max_batch_size = 1  # always 1 for explicit batch
        config = builder.create_builder_config()
        # The FP16 flag must also be set on the config when a builder config is used alongside fp16_mode.
        config.set_flag(trt.BuilderFlag.FP16)
        profile = builder.create_optimization_profile()
        profile.set_shape('input', (1, 1, 4, 4), (2, 1, 4, 4), (4, 1, 4, 4))
        profile.set_shape('grid', (1, 4, 4, 2), (2, 4, 4, 2), (4, 4, 4, 2))
        config.add_optimization_profile(profile)

        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        return builder.build_engine(network, config)
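At inference time the optimization profile above requires the binding shapes to be set before execution. A hypothetical usage sketch (the ONNX file name is a placeholder):

engine = build_engine_onnx("grid_sample.onnx")
with engine.create_execution_context() as context:
    # Pick concrete shapes within the profile's min/max bounds.
    context.set_binding_shape(engine.get_binding_index("input"), (2, 1, 4, 4))
    context.set_binding_shape(engine.get_binding_index("grid"), (2, 4, 4, 2))
    # Allocate buffers for the resolved shapes, then run context.execute_v2(...)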
Example #15
def build_engine_onnx(model_file, calibrator=None):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = 8
        precision = "fp32"
        if calibrator:
            builder.int8_mode = True
            builder.int8_calibrator = calibrator
            precision = "int8"
        else:
            builder.fp16_mode = True
            precision = "fp16"
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                return None
        engine = builder.build_cuda_engine(network)
        serialized = engine.serialize()
        with open(
                "/work/models/flowers-152-b{}-{}.engine".format(
                    builder.max_batch_size, precision), "wb") as file:
            file.write(serialized)
        return engine
Example #16
def build_int8_engine(deploy_file, model_file, calib, batch_size=32):
    # Create the builder, network, builder config, and Caffe parser instances
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, builder.create_builder_config() as config, trt.CaffeParser(
    ) as parser:
        # We set the builder batch size to be the same as the calibrator's, as we use the same batches
        # during inference. Note that this is not required in general, and inference batch size is
        # independent of calibration batch size.
        # Set the builder batch size and config parameters
        builder.max_batch_size = batch_size
        config.max_workspace_size = common.GiB(1)
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = calib
        # Parse Caffe model
        # The parser reads a prototxt file for the network definition and a binaryproto file for the weights.
        model_tensors = parser.parse(deploy=deploy_file,
                                     model=model_file,
                                     network=network,
                                     dtype=ModelData.DTYPE)
        # Mark the network output
        network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
        # Build engine and do int8 calibration.
        return builder.build_engine(network, config)
Example #17
def build_engine(model_dir):
    """Build TensorRT engine through the Python API.
    Args:
        model_dir: the trained TensorFlow PSENet model dir.

    Returns:
        engine: the built TensorRT engine.
    """
    ckpt = tf.train.get_checkpoint_state(model_dir)
    ckpt_path = ckpt.model_checkpoint_path
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
    explicit_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            explicit_flag) as network, builder.create_builder_config(
            ) as config:
        data = network.add_input(INPUT_NAME, trt.float32, (-1, 3, -1, -1))

        w = reader.get_tensor("resnet_v1_50/conv1/weights").transpose(
            3, 2, 0, 1).reshape(-1)
        b = np.zeros(64, dtype=np.float32)
        conv1 = network.add_convolution(data, 64, (7, 7), trt.Weights(w),
                                        trt.Weights(b))
        conv1.stride = (2, 2)
        conv1.padding = (3, 3)

        bn1 = add_batchnorm(reader, network, conv1.get_output(0),
                            "resnet_v1_50/conv1/BatchNorm/", 1e-5)
        relu1 = network.add_activation(bn1.get_output(0),
                                       trt.ActivationType.RELU)

        # C2
        pool1 = network.add_pooling(relu1.get_output(0), trt.PoolingType.MAX,
                                    (3, 3))
        pool1.stride = (2, 2)
        pool1.pre_padding = (0, 0)
        pool1.post_padding = (1, 1)

        x = bottleneck(reader, network, pool1.get_output(0), 64, 1,
                       "resnet_v1_50/block1/unit_1/bottleneck_v1/", 1)

        x = bottleneck(reader, network, x.get_output(0), 64, 1,
                       "resnet_v1_50/block1/unit_2/bottleneck_v1/", 0)

        # C3
        block1 = bottleneck(reader, network, x.get_output(0), 64, 2,
                            "resnet_v1_50/block1/unit_3/bottleneck_v1/", 2)

        x = bottleneck(reader, network, block1.get_output(0), 128, 1,
                       "resnet_v1_50/block2/unit_1/bottleneck_v1/", 1)
        x = bottleneck(reader, network, x.get_output(0), 128, 1,
                       "resnet_v1_50/block2/unit_2/bottleneck_v1/", 0)
        x = bottleneck(reader, network, x.get_output(0), 128, 1,
                       "resnet_v1_50/block2/unit_3/bottleneck_v1/", 0)
        # C4
        block2 = bottleneck(reader, network, x.get_output(0), 128, 2,
                            "resnet_v1_50/block2/unit_4/bottleneck_v1/", 2)

        x = bottleneck(reader, network, block2.get_output(0), 256, 1,
                       "resnet_v1_50/block3/unit_1/bottleneck_v1/", 1)
        x = bottleneck(reader, network, x.get_output(0), 256, 1,
                       "resnet_v1_50/block3/unit_2/bottleneck_v1/", 0)
        x = bottleneck(reader, network, x.get_output(0), 256, 1,
                       "resnet_v1_50/block3/unit_3/bottleneck_v1/", 0)
        x = bottleneck(reader, network, x.get_output(0), 256, 1,
                       "resnet_v1_50/block3/unit_4/bottleneck_v1/", 0)
        x = bottleneck(reader, network, x.get_output(0), 256, 1,
                       "resnet_v1_50/block3/unit_5/bottleneck_v1/", 0)
        block3 = bottleneck(reader, network, x.get_output(0), 256, 2,
                            "resnet_v1_50/block3/unit_6/bottleneck_v1/", 2)

        x = bottleneck(reader, network, block3.get_output(0), 512, 1,
                       "resnet_v1_50/block4/unit_1/bottleneck_v1/", 1)
        x = bottleneck(reader, network, x.get_output(0), 512, 1,
                       "resnet_v1_50/block4/unit_2/bottleneck_v1/", 0)
        # C5
        block4 = bottleneck(reader, network, x.get_output(0), 512, 1,
                            "resnet_v1_50/block4/unit_3/bottleneck_v1/", 0)

        build_p5_r1 = add_conv_relu(reader, network, block4.get_output(0), 256,
                                    1, 1, "build_feature_pyramid/build_P5/")

        build_p4_r1 = add_conv_relu(
            reader, network, block2.get_output(0), 256, 1, 1,
            "build_feature_pyramid/build_P4/reduce_dimension/")

        bfp_layer4_resize = network.add_resize(build_p5_r1.get_output(0))
        build_p4_r1_shape = network.add_shape(
            build_p4_r1.get_output(0)).get_output(0)
        bfp_layer4_resize.set_input(1, build_p4_r1_shape)
        bfp_layer4_resize.resize_mode = trt.ResizeMode.NEAREST
        bfp_layer4_resize.align_corners = False

        bfp_add = network.add_elementwise(build_p4_r1.get_output(0),
                                          bfp_layer4_resize.get_output(0),
                                          trt.ElementWiseOperation.SUM)

        build_p4_r2 = add_conv_relu(
            reader, network, bfp_add.get_output(0), 256, 3, 1,
            "build_feature_pyramid/build_P4/avoid_aliasing/")

        build_p3_r1 = add_conv_relu(
            reader, network, block1.get_output(0), 256, 1, 1,
            "build_feature_pyramid/build_P3/reduce_dimension/")

        bfp_layer3_resize = network.add_resize(build_p4_r2.get_output(0))
        bfp_layer3_resize.resize_mode = trt.ResizeMode.NEAREST
        build_p3_r1_shape = network.add_shape(
            build_p3_r1.get_output(0)).get_output(0)
        bfp_layer3_resize.set_input(1, build_p3_r1_shape)
        bfp_layer3_resize.align_corners = False

        bfp_add1 = network.add_elementwise(bfp_layer3_resize.get_output(0),
                                           build_p3_r1.get_output(0),
                                           trt.ElementWiseOperation.SUM)

        build_p3_r2 = add_conv_relu(
            reader, network, bfp_add1.get_output(0), 256, 3, 1,
            "build_feature_pyramid/build_P3/avoid_aliasing/")

        build_p2_r1 = add_conv_relu(
            reader, network, pool1.get_output(0), 256, 1, 1,
            "build_feature_pyramid/build_P2/reduce_dimension/")

        bfp_layer2_resize = network.add_resize(build_p3_r2.get_output(0))
        bfp_layer2_resize.resize_mode = trt.ResizeMode.NEAREST
        build_p2_r1_shape = network.add_shape(
            build_p2_r1.get_output(0)).get_output(0)
        bfp_layer2_resize.set_input(1, build_p2_r1_shape)
        bfp_layer2_resize.align_corners = False

        bfp_add2 = network.add_elementwise(bfp_layer2_resize.get_output(0),
                                           build_p2_r1.get_output(0),
                                           trt.ElementWiseOperation.SUM)

        # P2
        build_p2_r2 = add_conv_relu(
            reader, network, bfp_add2.get_output(0), 256, 3, 1,
            "build_feature_pyramid/build_P2/avoid_aliasing/")
        build_p2_r2_shape = network.add_shape(
            build_p2_r2.get_output(0)).get_output(0)

        # P3 x2
        layer1_resize = network.add_resize(build_p3_r2.get_output(0))
        layer1_resize.resize_mode = trt.ResizeMode.LINEAR
        layer1_resize.set_input(1, build_p2_r2_shape)
        layer1_resize.align_corners = False

        # P4 x4
        layer2_resize = network.add_resize(build_p4_r2.get_output(0))
        layer2_resize.resize_mode = trt.ResizeMode.LINEAR
        layer2_resize.set_input(1, build_p2_r2_shape)
        layer2_resize.align_corners = False

        # p5 right
        # P5 x8
        layer3_resize = network.add_resize(build_p5_r1.get_output(0))
        layer3_resize.resize_mode = trt.ResizeMode.LINEAR
        layer3_resize.set_input(1, build_p2_r2_shape)
        layer3_resize.align_corners = False

        # C(P5,P4,P3,P2)
        concat = network.add_concatenation([
            layer3_resize.get_output(0),
            layer2_resize.get_output(0),
            layer1_resize.get_output(0),
            build_p2_r2.get_output(0),
        ])

        w = reader.get_tensor("feature_results/Conv/weights").transpose(
            3, 2, 0, 1).reshape(-1)
        b = np.zeros(256, dtype=np.float32)
        feature_result_conv = network.add_convolution(concat.get_output(0),
                                                      256, (3, 3),
                                                      trt.Weights(w),
                                                      trt.Weights(b))
        feature_result_conv.padding = (1, 1)

        feature_result_bn = add_batchnorm(reader, network,
                                          feature_result_conv.get_output(0),
                                          "feature_results/Conv/BatchNorm/",
                                          1e-5)

        feature_result_relu = network.add_activation(
            feature_result_bn.get_output(0), trt.ActivationType.RELU)
        w = reader.get_tensor("feature_results/Conv_1/weights").transpose(
            3, 2, 0, 1).reshape(-1)
        b = reader.get_tensor("feature_results/Conv_1/biases")
        feature_result_conv_1 = network.add_convolution(
            feature_result_relu.get_output(0), 6, (1, 1), trt.Weights(w),
            trt.Weights(b))

        sigmoid = network.add_activation(feature_result_conv_1.get_output(0),
                                         trt.ActivationType.SIGMOID)
        sigmoid.get_output(0).name = OUTPUT_NAME
        network.mark_output(sigmoid.get_output(0))

        profile = builder.create_optimization_profile()
        profile.set_shape("input",
                          min=(1, 3, 128, 128),
                          opt=(1, 3, 640, 640),
                          max=(4, 3, 1200, 1200))
        config.add_optimization_profile(profile)

        config.max_workspace_size = common.GiB(1)
        if USE_FP16:
            config_flags = 1 << int(trt.BuilderFlag.FP16)
            config.flags = config_flags

        engine = builder.build_engine(network, config)

        return engine
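This example relies on helper functions (add_batchnorm, bottleneck, add_conv_relu) that are not shown. A plausible sketch of add_batchnorm, assuming TF-slim variable names ("gamma", "beta", "moving_mean", "moving_variance") under prefix; batch norm is folded into a per-channel scale layer, and the variable names are assumptions:

def add_batchnorm(reader, network, input_tensor, prefix, eps):
    gamma = reader.get_tensor(prefix + "gamma")
    beta = reader.get_tensor(prefix + "beta")
    mean = reader.get_tensor(prefix + "moving_mean")
    var = reader.get_tensor(prefix + "moving_variance")
    # Fold y = gamma * (x - mean) / sqrt(var + eps) + beta into scale/shift.
    scale = (gamma / np.sqrt(var + eps)).astype(np.float32)
    shift = (beta - mean * scale).astype(np.float32)
    power = np.ones_like(scale)
    return network.add_scale(input_tensor, trt.ScaleMode.CHANNEL,
                             trt.Weights(shift), trt.Weights(scale),
                             trt.Weights(power))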