def do_inference(engine, pics, h_input, d_input, h_output, d_output, stream,
                 batch_size, height, width, output_image=False):
    load_images_to_buffer(pics, h_input)

    with engine.create_execution_context() as context:
        # transfer input data to the GPU
        cuda.memcpy_htod_async(d_input, h_input, stream)

        # run inference
        context.profiler = trt.Profiler()
        context.execute(batch_size=1, bindings=[int(d_input), int(d_output)])

        # transfer predictions batch from the GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # synchronize the stream
        stream.synchronize()

        if output_image:
            # TODO: why is the output a picture? Why is it channel first?
            out = h_output.reshape((batch_size, -1, height, width))
        else:
            out = h_output.reshape((batch_size, -1))
        return out
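# The do_inference variants in this collection assume that page-locked host
# buffers, device buffers, and a CUDA stream were allocated beforehand, and
# they call a load_images_to_buffer helper whose body is never shown. The
# sketch below is an assumed setup under those conditions; allocate_buffers
# and the body of load_images_to_buffer are illustrative, not taken from the
# original code, and it assumes a single-input, single-output engine built
# with the implicit-batch API used throughout these examples.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt


def allocate_buffers(engine, batch_size, trt_dtype=trt.float32):
    """Allocate page-locked host buffers and device buffers for bindings 0/1."""
    dtype = trt.nptype(trt_dtype)
    h_input = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(0)), dtype=dtype)
    h_output = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(1)), dtype=dtype)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream


def load_images_to_buffer(pics, pagelocked_buffer):
    """Flatten the image batch and copy it into the page-locked host buffer
    (assumes the batch exactly fills the buffer)."""
    preprocessed = np.asarray(pics).ravel()
    np.copyto(pagelocked_buffer, preprocessed)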
def enable_profiling(self):
    raise RuntimeError(
        "Profiling is not supported right now because it requires calling"
        " execute() instead of execute_async()."
    )
    # Unreachable while the RuntimeError above is raised; kept for when
    # profiling with execute() is re-enabled.
    if not self.context.profiler:
        self.context.profiler = trt.Profiler()
def enable_profiling(self): """ Enable TensorRT profiling. After calling this function, TensorRT will report time spent on each layer in stdout for each forward run. """ self._check_initialized() if not self.context.profiler: self.context.profiler = trt.Profiler()
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size, height, width):
    """
    Run inference on a batch of images.

    Args:
       engine: The TensorRT engine used to create the execution context.
       pics_1: Input images to the model.
       h_input_1: Input buffer on the host.
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host.
       d_output: Output buffer on the device.
       stream: CUDA stream.
       batch_size: Batch size used for inference.
       height: Height of the output image.
       width: Width of the output image.
    Output:
       The list of output images.
    """
    print('load images to buffer')
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        context.debug_sync = False

        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        print('load profiler')
        context.profiler = trt.Profiler()
        print('execute')
        context.execute(batch_size=1,
                        bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        print('Transfer predictions back from the GPU.')
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Synchronize the stream.
        stream.synchronize()

        # Return the host output.
        print(h_output.shape)
        out = h_output.reshape((1, -1))
        return out
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size, height, width):
    """
    Run inference on a batch of images.

    Args:
       engine: The TensorRT engine used to create the execution context.
       pics_1: Input images to the model.
       h_input_1: Input buffer on the host.
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host.
       d_output: Output buffer on the device.
       stream: CUDA stream.
       batch_size: Batch size used for inference.
       height: Height of the output image.
       width: Width of the output image.
    Output:
       The list of output images.
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        context.execute(batch_size=1,
                        bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Synchronize the stream.
        stream.synchronize()

        # Return the host output, reshaped to this model's fixed output
        # layout of 68 channels of 64x64 maps.
        out = h_output.reshape((batch_size, 68, 64, 64))
        return out
def do_inference(context, engine, pics_1, h_input_1, d_input_1, h_output,
                 d_output, stream, batch_size, height, width):
    """
    Run inference on a batch of images.

    Args:
       context: The TensorRT execution context.
       engine: The TensorRT engine the context was created from.
       pics_1: Input images to the model.
       h_input_1: Input buffer on the host.
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host.
       d_output: Output buffer on the device.
       stream: CUDA stream.
       batch_size: Batch size used for inference.
       height: Height of the output image.
       width: Width of the output image.
    Output:
       The list of output images.
    """
    load_images_to_buffer(pics_1, h_input_1)

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

    # Run inference.
    context.profiler = trt.Profiler()
    context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])

    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)

    # Synchronize the stream.
    stream.synchronize()

    # Return the host output.
    return h_output
def enable_profiling(self):
    if not self.context.profiler:
        self.context.profiler = trt.Profiler()
def convert_validate_save(
        onnx_model_filename: str,
        golden_data_filename: 'Optional[str]' = '',
        atol: float = 1e-3,
        rtol: float = 1e-3,
        batch_size: int = 1,
        debug: bool = False,
        **kwargs) -> bool:
    r"""
    Run inference of the model with TensorRT,
    validate against the given golden data,
    and save the engine if the accuracy check passes.
    """

    import numpy as np
    import pycuda.autoinit  # noqa: just import, no code check
    import pycuda.driver as cuda
    import tensorrt as trt

    trt_logger = trt.Logger(
        trt.Logger.VERBOSE if debug else trt.Logger.WARNING)
    builder = trt.Builder(trt_logger)
    network = builder.create_network()
    parser = trt.OnnxParser(network, trt_logger)

    logger.info('loading ONNX model: %s ...', onnx_model_filename)
    with open(onnx_model_filename, 'rb') as fp:
        onnx_model_proto_str = fp.read()
    success = parser.parse(onnx_model_proto_str)
    if not success:
        logger.error('model parsing failed:')
        for idx_error in range(parser.num_errors):
            logger.error('\t%s', parser.get_error(idx_error))
        return False
    logger.info('model parsing passed')

    workspace_size = kwargs.pop('workspace_size',
                                1024 * 1024 * 16)  # default to 16 MiB
    fp16_mode = kwargs.pop('fp16_mode', builder.platform_has_fast_fp16)
    int8_mode = kwargs.pop('int8_mode', builder.platform_has_fast_int8)

    builder.debug_sync = debug
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = batch_size
    builder.max_workspace_size = workspace_size
    builder.refittable = False
    builder.strict_type_constraints = True

    logger.info('using batch_size: %d', builder.max_batch_size)
    logger.info('I/O type-shape info:')
    if int8_mode:
        default_range = (-127, +127)
        builder.int8_mode = int8_mode
        for layer in network:
            for idx_out in range(layer.num_outputs):
                var_out = layer.get_output(idx_out)
                var_out.set_dynamic_range(-127, +127)
        dynamic_ranges = kwargs.pop('io_dynamic_ranges', dict())
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            dr_lo, dr_hi = dynamic_ranges.get(var_inp.name, default_range)
            var_inp.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\t input %d (%12s): %s%s in [%d, %d]', idx_inp,
                        var_inp.name, var_inp.dtype, var_inp.shape,
                        dr_lo, dr_hi)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            dr_lo, dr_hi = dynamic_ranges.get(var_out.name, default_range)
            var_out.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\toutput %d (%12s): %s%s in [%d, %d]', idx_out,
                        var_out.name, var_out.dtype, var_out.shape,
                        dr_lo, dr_hi)
        # TODO: int8 calibrate
    else:
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            logger.info('\t input %d (%12s): %s%s', idx_inp, var_inp.name,
                        var_inp.dtype, var_inp.shape)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            logger.info('\toutput %d (%12s): %s%s', idx_out, var_out.name,
                        var_out.dtype, var_out.shape)

    # not exposed
    # builder.getNbDLACores() > 0
    # builder.allowGPUFallback(True)
    # builder.setDefaultDeviceType(kDLA)
    # builder.setDLACore(1)

    engine = builder.build_cuda_engine(network)
    if engine is None:
        logger.info('engine building failed')
        return False
    logger.info('engine building passed')

    # globals().update(locals())

    if golden_data_filename:
        logger.info('using golden data %s', golden_data_filename)
        if golden_data_filename.endswith('.npz'):
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            )
            input_data = test_data['inputs'].tolist()
            output_data = test_data['outputs'].tolist()
        else:
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            ).tolist()
            input_data = test_data['inputs']
            output_data = test_data['outputs']
    input_data = flatten_dict(input_data)
    output_data = flatten_dict(output_data)
    # input_names = input_data.keys()
    output_names = output_data.keys()
    logger.info('with %d inputs and %d outputs', len(input_data),
                len(output_data))

    input_device_data = {
        name: cuda.to_device(value)
        for name, value in input_data.items()
    }
    output_device_data = {
        name: cuda.mem_alloc(value.nbytes)
        for name, value in output_data.items()
    }
    output_host_data = {
        name: cuda.pagelocked_empty_like(value)
        for name, value in output_data.items()
    }
    logger.info('data transferred to device')

    profiler = trt.Profiler()
    with engine.create_execution_context() as context:
        if debug:
            context.profiler = profiler
        stream = cuda.Stream()

        # for name in input_names:
        #     cuda.memcpy_htod_async(
        #         input_data[name], input_device_data[name],
        #         stream=stream)

        device_data = list(input_device_data.values()) + list(
            output_device_data.values())
        success = context.execute_async(
            batch_size,
            bindings=list(map(int, device_data)),
            stream_handle=stream.handle,
            input_consumed=None)
        if not success:
            logger.error('execution failed')
            return False

        for name in output_names:
            cuda.memcpy_dtoh_async(
                output_host_data[name],
                output_device_data[name],
                stream=stream,
            )
        stream.synchronize()
        logger.info('execution passed')

    # output_host_data[name] = onnx2trt_inference(
    #     onnx_model_filename, list(input_data.values()),
    #     batch_size, workspace_size)[0]

    # validate
    passed = True
    if golden_data_filename:
        for name in output_names:
            pr = output_host_data[name]
            gt = output_data[name]
            logger.info('testing on output %s ...', name)
            try:
                np.testing.assert_allclose(
                    pr,
                    gt,
                    rtol=rtol,
                    atol=atol,
                    equal_nan=False,
                    verbose=True,
                )
            except AssertionError as e:
                passed = False
                logger.error('failed: %s\n', e)
        logger.info('accuracy %spassed', '' if passed else 'not ')

    globals().update(locals())

    if passed:
        trt_engine_filename = onnx_model_filename[:-len('.onnx')] + '.bin'  # or .trt
        with open(trt_engine_filename, 'wb') as fp:
            fp.write(engine.serialize())
        logger.info('engine saved to %s', trt_engine_filename)
    return passed
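# convert_validate_save serializes the built engine to a file next to the
# ONNX model. Loading that file back for inference could look like the
# sketch below; the file name and logger severity are assumptions, while
# trt.Runtime and deserialize_cuda_engine are the standard TensorRT APIs
# for this step.
import tensorrt as trt


def load_engine(trt_engine_filename):
    """Deserialize a previously saved TensorRT engine from disk."""
    trt_logger = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(trt_logger)
    with open(trt_engine_filename, 'rb') as fp:
        engine = runtime.deserialize_cuda_engine(fp.read())
    return engine


# Usage (assumed): pair the loaded engine with one of the do_inference
# helpers in this collection.
# engine = load_engine('model.bin')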
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size, height, width):
    """
    Run inference on a batch of images.

    Args:
       engine: The TensorRT engine used to create the execution context.
       pics_1: Input images to the model.
       h_input_1: Input buffer on the host.
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host.
       d_output: Output buffer on the device.
       stream: CUDA stream.
       batch_size: Batch size used for inference.
       height: Height of the output image.
       width: Width of the output image.
    Output:
       The output of the first image as a torch tensor.
    """
    # print(h_input_1.shape)
    # print(pics_1[0])
    load_images_to_buffer(pics_1, h_input_1)
    # print("Did the load work?")
    # print(h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        context.execute(batch_size=1,
                        bindings=[int(d_input_1), int(d_output)])
        # context.execute(bindings=[int(d_input_1), int(d_output)],
        #                 stream_handle=stream.handle)

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Synchronize the stream.
        stream.synchronize()

        # Reshape the host output to (batch, channels, 80, 60), take the
        # first image, and apply exp() before returning a torch tensor.
        out = h_output.reshape((batch_size, -1, 80, 60))
        # print(out[0])
        torch_out = torch.from_numpy(out)
        torch_out = torch_out[0]
        # print("Original tensor: " + str(torch_out))
        torch_out = torch.exp(torch_out)
        # print("torch_score: " + str(torch_out))
        return torch_out
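# Post-processing sketch for the torch tensor returned by the variant above.
# The interpretation (per-class score maps of size 80x60) follows from the
# reshape in that function; the channel count of 21 and the dummy tensor are
# assumptions standing in for a real do_inference result.
import torch

scores = torch.rand(21, 80, 60)          # stand-in for do_inference(...)
class_map = torch.argmax(scores, dim=0)  # per-pixel index of the best class
print(class_map.shape)                   # torch.Size([80, 60])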