Example #1
def do_inference(engine,
                 pics,
                 h_input,
                 d_input,
                 h_output,
                 d_output,
                 stream,
                 batch_size,
                 height,
                 width,
                 output_image=False):
    load_images_to_buffer(pics, h_input)

    with engine.create_execution_context() as context:
        # transfer input data to the GPU
        cuda.memcpy_htod_async(d_input, h_input, stream)

        # run inference
        context.profiler = trt.Profiler()
        context.execute(batch_size=batch_size, bindings=[int(d_input), int(d_output)])

        # transfer predictions batch from the GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # synchronize the stream
        stream.synchronize()

        if output_image:
            out = h_output.reshape(
                (batch_size, -1, height, width)
            )  #TODO: why is the output a picture? Why is it channel first?
        else:
            out = h_output.reshape((batch_size, -1))
        return out
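Every do_inference() variant in this listing expects page-locked host buffers, device allocations, and a CUDA stream that are created elsewhere. A minimal sketch of that allocation step, assuming the same legacy binding-index TensorRT API these snippets already use (the helper name allocate_buffers is illustrative):

import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt

def allocate_buffers(engine, batch_size=1):
    # Page-locked host buffers sized from the engine's input/output bindings.
    h_input = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(0)),
        dtype=trt.nptype(engine.get_binding_dtype(0)))
    h_output = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(1)),
        dtype=trt.nptype(engine.get_binding_dtype(1)))
    # Matching device allocations and a stream for the async copies.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream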
Example #2
File: fx2trt.py  Project: skidipap/pytorch
    def enable_profiling(self):
        raise RuntimeError(
            "Profiling is not supported right now because it requires calling"
            " execute() instead of execute_async()."
        )
        # NOTE: the code below is unreachable because of the raise above.
        if not self.context.profiler:
            self.context.profiler = trt.Profiler()
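The error raised above says that per-layer profiling needs the synchronous execute() path rather than execute_async(). A minimal sketch of what that synchronous path looks like, assuming the same legacy bindings API used elsewhere in this listing (bindings is the usual list of device-pointer ints):

def run_with_profiling(context, bindings):
    # trt.Profiler prints per-layer timings to stdout after each run.
    context.profiler = trt.Profiler()
    # execute_v2 is the synchronous call without an implicit batch size.
    context.execute_v2(bindings=bindings)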
Example #3
File: fx2trt.py  Project: xsacha/pytorch
    def enable_profiling(self):
        """
        Enable TensorRT profiling. After calling this function, TensorRT will report
        time spent on each layer in stdout for each forward run.
        """
        self._check_initialized()

        if not self.context.profiler:
            self.context.profiler = trt.Profiler()
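The default trt.Profiler used above only prints the per-layer times to stdout. If the timings are needed programmatically, TensorRT also accepts a custom profiler built by subclassing trt.IProfiler; a minimal sketch (the class name DictProfiler is illustrative):

import tensorrt as trt

class DictProfiler(trt.IProfiler):
    def __init__(self):
        super().__init__()
        self.layer_times_ms = {}

    def report_layer_time(self, layer_name, ms):
        # TensorRT calls this once per layer for every profiled execution.
        self.layer_times_ms[layer_name] = ms

An instance can then be assigned to context.profiler exactly like trt.Profiler() in the snippet above.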
Example #4
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    Run inference on a TensorRT engine.
    Args:
        engine: The deserialized TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input buffer in the host (page-locked).
        d_input_1: Input buffer in the device.
        h_output: Output buffer in the host (page-locked).
        d_output: Output buffer in the device.
        stream: CUDA stream used for the async copies.
        batch_size: Batch size used for execution.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The flattened host output for the batch.
    """
    print('load images to buffer')
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        context.debug_sync = False
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        print('load profiler')
        context.profiler = trt.Profiler()
        print('execute')
        context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])
        print('Transfer predictions back from the GPU.')
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        print(h_output.shape)
        out = h_output.reshape((batch_size, -1))
        return out
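Each do_inference() variant calls load_images_to_buffer(), which is not shown anywhere in this listing. A minimal sketch of what that helper is assumed to do, namely flatten the preprocessed batch and copy it into the page-locked host buffer:

import numpy as np

def load_images_to_buffer(pics, pagelocked_buffer):
    # Flatten the image batch and cast it to the buffer's dtype before copying.
    preprocessed = np.asarray(pics).ravel().astype(pagelocked_buffer.dtype)
    np.copyto(pagelocked_buffer, preprocessed)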
Example #5
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size, height, width):
    """
    Run inference on a TensorRT engine.
    Args:
        engine: The deserialized TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input buffer in the host (page-locked).
        d_input_1: Input buffer in the device.
        h_output: Output buffer in the host (page-locked).
        d_output: Output buffer in the device.
        stream: CUDA stream used for the async copies.
        batch_size: Batch size used for execution.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The output maps reshaped to (batch_size, 68, 64, 64).
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        out = h_output.reshape((batch_size, 68, 64, 64))
        return out
Example #6
def do_inference(context, engine, pics_1, h_input_1, d_input_1, h_output,
                 d_output, stream, batch_size, height, width):
    """
    Run inference on a TensorRT engine using an existing execution context.
    Args:
        context: The TensorRT execution context.
        engine: The deserialized TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input buffer in the host (page-locked).
        d_input_1: Input buffer in the device.
        h_output: Output buffer in the host (page-locked).
        d_output: Output buffer in the device.
        stream: CUDA stream used for the async copies.
        batch_size: Batch size used for execution.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The raw host output buffer.
    """
    load_images_to_buffer(pics_1, h_input_1)

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

    # Run inference.
    context.profiler = trt.Profiler()
    context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])

    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream.
    stream.synchronize()
    # Return the host output.
    return h_output
Example #7
    def enable_profiling(self):
        if not self.context.profiler:
            self.context.profiler = trt.Profiler()
Example #8
def convert_validate_save(
        onnx_model_filename: str,
        golden_data_filename: 'Optional[str]' = '',
        atol: float = 1e-3,
        rtol: float = 1e-3,
        batch_size: int = 1,  #
        debug: bool = False,
        **kwargs) -> bool:
    r"""
        inference model in 'tensorrt'
        validate with given golden data
        save if accuracy passed
    """

    import numpy as np
    import pycuda.autoinit  # noqa: just import, no code check
    import pycuda.driver as cuda
    import tensorrt as trt

    trt_logger = trt.Logger(
        trt.Logger.VERBOSE if debug else trt.Logger.WARNING)
    builder = trt.Builder(trt_logger)
    network = builder.create_network()
    parser = trt.OnnxParser(network, trt_logger)

    logger.info('loading ONNX model: %s ...', onnx_model_filename)
    with open(onnx_model_filename, 'rb') as fp:
        onnx_model_proto_str = fp.read()
    success = parser.parse(onnx_model_proto_str)
    if not success:
        logger.error('model parsing failed:')
        for idx_error in range(parser.num_errors):
            logger.error('\t%s', parser.get_error(idx_error))
        return False
    logger.info('model parsing passed')

    workspace_size = kwargs.pop('workspace_size',
                                1024 * 1024 * 16)  # default to 1024*1024*16
    fp16_mode = kwargs.pop('fp16_mode', builder.platform_has_fast_fp16)
    int8_mode = kwargs.pop('int8_mode', builder.platform_has_fast_int8)

    builder.debug_sync = debug
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = batch_size
    builder.max_workspace_size = workspace_size
    builder.refittable = False
    builder.strict_type_constraints = True

    logger.info('using batch_size: %d', builder.max_batch_size)
    logger.info('I/O type-shape info:')
    if int8_mode:
        default_range = (-127, +127)
        builder.int8_mode = int8_mode
        for layer in network:
            for idx_out in range(layer.num_outputs):
                var_out = layer.get_output(idx_out)
                var_out.set_dynamic_range(-127, +127)
        dynamic_ranges = kwargs.pop('io_dynamic_ranges', dict())
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            dr_lo, dr_hi = dynamic_ranges.get(var_inp.name, default_range)
            var_inp.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\t input %d (%12s): %s%s in [%d, %d]', idx_inp,
                        var_inp.name, var_inp.dtype, var_inp.shape, dr_lo,
                        dr_hi)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            dr_lo, dr_hi = dynamic_ranges.get(var_out.name, default_range)
            var_out.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\toutput %d (%12s): %s%s in [%d, %d]', idx_out,
                        var_out.name, var_out.dtype, var_out.shape, dr_lo,
                        dr_hi)
        # TODO: int8 calibrate
    else:
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            logger.info('\t input %d (%12s): %s%s', idx_inp, var_inp.name,
                        var_inp.dtype, var_inp.shape)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            logger.info('\toutput %d (%12s): %s%s', idx_out, var_out.name,
                        var_out.dtype, var_out.shape)

    # not exposed
#    builder.getNbDLACores() > 0
#    builder.allowGPUFallback(True)
#    builder.setDefaultDeviceType(kDLA)
#    builder.setDLACore(1)

    engine = builder.build_cuda_engine(network)
    if engine is None:
        logger.info('engine building failed')
        return False
    logger.info('engine building passed')

    #    globals().update(locals())

    if golden_data_filename:
        logger.info('using golden data %s', golden_data_filename)
        if golden_data_filename.endswith('.npz'):
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            )
            input_data = test_data['inputs'].tolist()
            output_data = test_data['outputs'].tolist()
        else:
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            ).tolist()
            input_data = test_data['inputs']
            output_data = test_data['outputs']

        input_data = flatten_dict(input_data)
        output_data = flatten_dict(output_data)
        #        input_names = input_data.keys()
        output_names = output_data.keys()
        logger.info('with %d inputs and %d outputs', len(input_data),
                    len(output_data))

        input_device_data = {
            name: cuda.to_device(value)
            for name, value in input_data.items()
        }
        output_device_data = {
            name: cuda.mem_alloc(value.nbytes)
            for name, value in output_data.items()
        }
        output_host_data = {
            name: cuda.pagelocked_empty_like(value)
            for name, value in output_data.items()
        }
        logger.info('data transferred to device')

        profiler = trt.Profiler()
        with engine.create_execution_context() as context:
            if debug:
                context.profiler = profiler
            stream = cuda.Stream()

            #            for name in input_names:
            #                cuda.memcpy_htod_async(
            #                        input_data[name], input_device_data[name],
            #                                       stream=stream)

            device_data = list(input_device_data.values()) + list(
                output_device_data.values())
            success = context.execute_async(batch_size,
                                            bindings=list(map(
                                                int, device_data)),
                                            stream_handle=stream.handle,
                                            input_consumed=None)
            if not success:
                logger.error('execution failed')
                return False

            for name in output_names:
                cuda.memcpy_dtoh_async(
                    output_host_data[name],
                    output_device_data[name],
                    stream=stream,
                )

            stream.synchronize()

        logger.info('execution passed')


    #        output_host_data[name] = onnx2trt_inference(
    #                onnx_model_filename, list(input_data.values()),
    #                batch_size, workspace_size)[0]

    # validate
    passed = True
    if golden_data_filename:
        for name in output_names:
            pr = output_host_data[name]
            gt = output_data[name]
            logger.info('testing on output %s ...', name)
            try:
                np.testing.assert_allclose(
                    pr,
                    gt,
                    rtol=rtol,
                    atol=atol,
                    equal_nan=False,
                    verbose=True,
                )
            except AssertionError as e:
                passed = False
                logger.error('failed: %s\n', e)
        logger.info('accuracy %spassed', '' if passed else 'not ')

    globals().update(locals())

    if passed:
        trt_engine_filename = onnx_model_filename[:-len('.onnx')] + '.bin'  # or .trt
        with open(trt_engine_filename, 'wb') as fp:
            fp.write(engine.serialize())
        logger.info('engine saved to %s', trt_engine_filename)

    return passed
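convert_validate_save() writes the serialized engine to disk with engine.serialize(). A minimal sketch of loading that file back for inference later, using the standard trt.Runtime API (the helper name load_engine is illustrative):

import tensorrt as trt

def load_engine(trt_engine_filename):
    trt_logger = trt.Logger(trt.Logger.WARNING)
    with open(trt_engine_filename, 'rb') as fp, trt.Runtime(trt_logger) as runtime:
        return runtime.deserialize_cuda_engine(fp.read())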
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    Run inference on a TensorRT engine.
    Args:
        engine: The deserialized TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input buffer in the host (page-locked).
        d_input_1: Input buffer in the device.
        h_output: Output buffer in the host (page-locked).
        d_output: Output buffer in the device.
        stream: CUDA stream used for the async copies.
        batch_size: Batch size used for execution.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The output scores as a torch tensor.
    """
    #print(h_input_1.shape)
    #print(pics_1[0])

    load_images_to_buffer(pics_1, h_input_1)

    #print("Did the images get loaded?")
    #print(h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])
        #context.execute(bindings=[int(d_input_1), int(d_output)], stream_handle=stream.handle)

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()

        # Reshape the host output to (batch, channels, 80, 60).
        out = h_output.reshape((batch_size, -1, 80, 60))
        #print(out[0])

        torch_out = torch.from_numpy(out)
        # Keep the first item of the batch and exponentiate it
        # (the model presumably outputs log-scores).
        torch_out = torch_out[0]
        #print("Original torch output: " + str(torch_out))

        torch_out = torch.exp(torch_out)
        #print("torch_score: " + str(torch_out))

        return torch_out