def allocate_buffers(engine: trt.ICudaEngine, batch_size: int):
    """Create one host/device buffer pair for every binding of ``engine``.

    For each binding a page-locked host array and a device allocation of the
    same byte size are created. The element count is ``batch_size`` times the
    absolute value of the binding's volume.

    Args:
        engine: Engine whose bindings are iterated.
        batch_size: Multiplier applied to each binding's volume.

    Returns:
        A tuple ``(inputs, outputs, dbindings, stream)``: the ``HostDeviceMem``
        pairs of the input bindings, those of the output bindings, the device
        pointers of all bindings (as ints, in iteration order), and a newly
        created CUDA stream.
    """
    print('Allocating buffers ...')
    stream = cuda.Stream()
    inputs, outputs, dbindings = [], [], []

    for binding in engine:
        # abs() keeps the count positive if the reported volume is negative
        # (presumably when the shape contains dynamic -1 dimensions).
        element_count = batch_size * abs(
            trt.volume(engine.get_binding_shape(binding)))
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)

        # Every binding contributes its device pointer, input or output.
        dbindings.append(int(device_buf))

        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_buf, device_buf))

    return inputs, outputs, dbindings, stream
def __init__(
    self,
    engine: trt.ICudaEngine,
    idx_or_name: Union[int, str],
    max_batch_size: int,
    device: str,
):
    """Describe one engine binding, looked up by index or by name.

    Args:
        engine: Engine that owns the binding.
        idx_or_name: Binding name (string) or binding index.
        max_batch_size: Value used as the leading dimension of the stored
            shape, replacing the first dimension reported by the engine.
        device: Device specifier passed to ``torch.device``.

    Raises:
        IndexError: If the name is unknown to the engine (index lookup
            returns -1) or the index has no name (name lookup returns None).
    """
    if not isinstance(idx_or_name, six.string_types):
        # Anything that is not a string is treated as a binding index.
        self.index = idx_or_name
        self.name = engine.get_binding_name(self.index)
        if self.name is None:
            raise IndexError(f"Binding index out of range: {self.index}")
    else:
        self.name = idx_or_name
        self.index = engine.get_binding_index(self.name)
        if self.index == -1:
            raise IndexError(f"Binding name not found: {self.name}")

    self._dtype = TYPE_TRT_2_TORCH[engine.get_binding_dtype(self.index)]

    # Keep every dimension but the first, which becomes max_batch_size.
    engine_shape = tuple(engine.get_binding_shape(self.index))
    self._shape = (max_batch_size, *engine_shape[1:])

    self._device = torch.device(device)
    self._is_input = engine.binding_is_input(self.index)

    # Only output bindings get a zero-filled tensor up front; inputs start
    # with no data attached.
    self._binding_data = (
        None
        if self.is_input
        else torch.zeros(size=self.shape, dtype=self.dtype, device=self.device)
    )
def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
    seed: int = 42,
):
    """Build one random float32 array per input binding.

    The shape of each array is the context's current binding shape. When
    ``is_dynamic`` reports that shape as dynamic, the second entry (index 1)
    of the active optimization profile's shapes is used instead.

    Args:
        engine: Engine queried for binding names and profile shapes.
        context: Execution context queried for binding shapes and the active
            optimization profile.
        input_binding_idxs: Indices of the input bindings to generate for.
        seed: Seed passed to ``np.random.seed`` before generating.

    Returns:
        List of ``np.float32`` arrays, in the order of ``input_binding_idxs``.
    """
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)

    def resolve_shape(binding_index):
        """Return the shape to generate for one binding, logging the choice."""
        shape = context.get_binding_shape(binding_index)
        name = engine.get_binding_name(binding_index)
        print("\tInput [{}] shape: {}".format(name, shape))
        if not is_dynamic(shape):
            # Fixed shape: use it as-is.
            return shape

        profile_index = context.active_optimization_profile
        profile_shapes = engine.get_profile_shape(profile_index, binding_index)
        print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".
              format(name, *profile_shapes))
        # 0=min, 1=opt, 2=max; any shape with min <= shape <= max would do.
        shape = profile_shapes[1]
        print(
            "\tInput [{}] shape was dynamic, setting inference shape to {}"
            .format(name, shape))
        return shape

    return [
        np.random.random(resolve_shape(binding_index)).astype(np.float32)
        for binding_index in input_binding_idxs
    ]
def allocate_buffers_torch(engine: trt.ICudaEngine, device):
    """Allocate a page-locked host array and a torch tensor per binding.

    Both buffers of a binding have shape ``[engine.max_batch_size, *shape]``
    where ``shape`` is the binding shape reported by the engine. The torch
    dtype is looked up from the host array's dtype via
    ``np_to_torch_dtype_map()``.

    Args:
        engine: Engine whose bindings are iterated.
        device: Device on which the torch tensors are created.

    Returns:
        A tuple ``(inputs, outputs, bindings)``: ``HostDeviceMem`` entries for
        input bindings, for output bindings, and the ``data_ptr()`` of every
        torch tensor in iteration order.
    """
    import torch

    torch_dtype_of = np_to_torch_dtype_map()
    inputs, outputs, bindings = [], [], []

    # `index` counts bindings in iteration order, starting at 0.
    for index, binding in enumerate(engine):
        binding_shape = engine.get_binding_shape(binding)
        max_batch = engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        element_count = trt.volume(binding_shape) * max_batch
        batched_shape = [max_batch] + list(binding_shape)

        # Host side: flat page-locked buffer viewed with the batched shape.
        host_mem = cuda.pagelocked_empty(element_count,
                                         np_dtype).reshape(batched_shape)
        # Device side: a torch tensor with the same shape.
        device_mem = torch.empty(*host_mem.shape,
                                 device=device,
                                 dtype=torch_dtype_of[host_mem.dtype])

        bindings.append(device_mem.data_ptr())

        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem, binding, index))

    return inputs, outputs, bindings
def run_trt_engine(context: trt.IExecutionContext, engine: trt.ICudaEngine,
                   h_tensors: dict):
    """Run a TRT model.

    The model output is written in place inside the tensors provided in
    h_tensors['outputs'].

    Args:
        context (trt.IExecutionContext): Execution context used to run the
            engine and to set dynamic input shapes.
        engine (trt.ICudaEngine): Engine used to resolve tensor names to
            binding indices.
        h_tensors (dict): A dictionary with keys "inputs" and "outputs" whose
            values are dictionaries with tensor names as keys and
            numpy.ndarrays as values.

    Raises:
        IndexError: If a tensor name is not a binding of ``engine``.
        RuntimeError: If ``context.execute_v2`` reports failure.
    """
    h_inputs = h_tensors['inputs']
    h_outputs = h_tensors['outputs']

    def _binding_index(name):
        """Resolve a tensor name, rejecting names the engine does not know."""
        idx = engine.get_binding_index(name)
        # -1 would otherwise be used as a list index and silently overwrite
        # the last binding slot.
        if idx == -1:
            raise IndexError(f"Binding name not found: {name}")
        return idx

    # Resolve every name before touching the GPU so that a bad name fails
    # without any device allocation having been made.
    input_idxs = {name: _binding_index(name) for name in h_inputs}
    output_idxs = {name: _binding_index(name) for name in h_outputs}

    # Allocate GPU memory.
    d_inputs = {k: cuda.mem_alloc(v.nbytes) for k, v in h_inputs.items()}
    d_outputs = {k: cuda.mem_alloc(v.nbytes) for k, v in h_outputs.items()}

    # Copy input buffers to GPU.
    for name, h_tensor in h_inputs.items():
        cuda.memcpy_htod(d_inputs[name], h_tensor)

    # One slot per engine binding, filled in below.
    bindings = [None] * engine.num_bindings

    for name, h_tensor in h_inputs.items():
        idx = input_idxs[name]
        bindings[idx] = int(d_inputs[name])

        # Dynamic inputs need their concrete shape (or shape-tensor values)
        # set on the context before execution.
        if engine.is_shape_binding(idx) and is_shape_dynamic(
                context.get_shape(idx)):
            context.set_shape_input(idx, h_tensor)
        elif is_shape_dynamic(engine.get_binding_shape(idx)):
            context.set_binding_shape(idx, h_tensor.shape)

    for name, d_tensor in d_outputs.items():
        bindings[output_idxs[name]] = int(d_tensor)

    # Run engine; a falsy result means the outputs are not valid.
    if not context.execute_v2(bindings=bindings):
        raise RuntimeError(
            "TensorRT execution failed (execute_v2 returned False)")

    # Copy output buffers to CPU.
    for name, h_tensor in h_outputs.items():
        cuda.memcpy_dtoh(h_tensor, d_outputs[name])
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """Return the input and output binding indices of one profile.

    The engine's bindings are divided evenly among its optimization profiles
    (integer division); profile ``profile_index`` owns the contiguous range
    starting at ``profile_index * bindings_per_profile``. A short summary of
    that range is printed.

    Args:
        engine: Engine queried for binding/profile counts and input flags.
        profile_index: Index of the optimization profile of interest.

    Returns:
        A tuple ``(input_binding_idxs, output_binding_idxs)`` of index lists.
    """
    profile_count = engine.num_optimization_profiles
    per_profile = engine.num_bindings // profile_count
    first = profile_index * per_profile
    stop = first + per_profile  # exclusive upper bound

    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(profile_count))
    print("\tNumber of bindings per profile: {}".format(per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index, first))
    print("\tLast binding for profile {}: {}".format(profile_index, stop - 1))

    input_binding_idxs, output_binding_idxs = [], []
    for idx in range(first, stop):
        bucket = (input_binding_idxs
                  if engine.binding_is_input(idx) else output_binding_idxs)
        bucket.append(idx)

    return input_binding_idxs, output_binding_idxs
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """Split one profile's binding indices into inputs and outputs.

    :param engine: engine queried for ``num_bindings``,
        ``num_optimization_profiles`` and ``binding_is_input``
    :param profile_index: index of the optimization profile whose bindings
        are wanted
    :return: tuple ``(input_binding_idxs, output_binding_idxs)``, each a list
        of binding indices in ascending order
    """
    # Bindings are laid out profile after profile, each profile owning an
    # equally sized contiguous range.
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    first = profile_index * per_profile
    indices = range(first, first + per_profile)

    # Query each binding exactly once, then partition on the result.
    input_flags = [engine.binding_is_input(idx) for idx in indices]

    input_binding_idxs = [
        idx for idx, flag in zip(indices, input_flags) if flag
    ]
    output_binding_idxs = [
        idx for idx, flag in zip(indices, input_flags) if not flag
    ]
    return input_binding_idxs, output_binding_idxs
def detect(engine: trt.ICudaEngine, img: np.ndarray) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """Run the network on ``img``, including pre- and postprocessing.

    Binding 0 is used as the network input and binding 1 as the output; both
    are handled as flat float32 buffers. Preprocessing, postprocessing and
    total times are printed in milliseconds.

    Args:
        engine: TensorRT engine created from the network weights.
        img: Image to perform detection on.

    Returns:
        Predictions in original image coordinates: bounding boxes,
        confidences, class_ids.
    """
    with engine.create_execution_context() as context:
        # Page-locked host buffers sized from the context's binding shapes.
        input_elems = trt.volume(context.get_binding_shape(0))
        h_input = cuda.pagelocked_empty(input_elems, dtype=np.float32)
        output_elems = trt.volume(context.get_binding_shape(1))
        h_output = cuda.pagelocked_empty(output_elems, dtype=np.float32)

        # Preprocess (timed on its own).
        preprocess_start = time.time()
        preprocessed_img, resize_ratio, padding_size = preprocess_img(img)
        preprocess_stop = time.time()

        # Stage the preprocessed image in the host input buffer.
        np.copyto(h_input, preprocessed_img.flatten())

        # Device memory matching the host buffers byte for byte.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)

        # All copies and the inference are queued on this one stream.
        stream = cuda.Stream()

        # Host -> device, inference, device -> host.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        device_ptrs = [int(d_input), int(d_output)]
        context.execute_async(bindings=device_ptrs,
                              stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Wait until everything queued above has finished.
        stream.synchronize()

        postprocess_start = time.time()
        predictions = postprocess(h_output)
        predictions_in_original_coords = transform_detected_coords_to_original(
            predictions, resize_ratio, padding_size)
        postprocess_stop = time.time()

        print(f"Preprocessing time: {(preprocess_stop - preprocess_start) * 1000:.4f} ms")
        print(f"Postprocessing time: {(postprocess_stop - postprocess_start) * 1000:.4f} ms")
        print(f"Complete detection time: {(postprocess_stop - preprocess_start) * 1000:.4f} ms")

        return predictions_in_original_coords
def setup_binding_shapes(engine: trt.ICudaEngine,
                         context: trt.IExecutionContext, host_inputs,
                         input_binding_idxs, output_binding_idxs):
    """Set the input binding shapes and allocate matching output buffers.

    Each input binding's shape is set on ``context`` from the corresponding
    host array. The output shapes are then read back from the context, and a
    host array plus a device allocation of the same byte size is created for
    every output binding.

    Args:
        engine: Engine queried for the data type of each output binding.
        context: Execution context whose binding shapes are set and read.
        host_inputs: Host arrays, paired positionally with
            ``input_binding_idxs``; only their ``.shape`` is used.
        input_binding_idxs: Binding indices of the inputs.
        output_binding_idxs: Binding indices of the outputs.

    Returns:
        A tuple ``(host_outputs, device_outputs)``: uninitialised host arrays
        and the device allocations for the outputs, in the order of
        ``output_binding_idxs``.

    Raises:
        AssertionError: If the context still reports unspecified binding
            shapes after the input shapes have been set.
    """
    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally.
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)

    # Raised explicitly rather than via `assert` so the check also runs under
    # `python -O`; the exception type is kept for existing callers.
    if not context.all_binding_shapes_specified:
        raise AssertionError(
            "Not all binding shapes are specified after setting input shapes")

    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Use the binding's own element type so that non-float32 outputs get
        # a correctly typed and sized buffer.
        output_dtype = trt.nptype(engine.get_binding_dtype(binding_index))

        # Host buffer that receives the result after copying back.
        buffer = np.empty(output_shape, dtype=output_dtype)
        host_outputs.append(buffer)

        # Device buffer of the same byte size.
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))

    return host_outputs, device_outputs
def save_engine(engine: trt.ICudaEngine, engine_dest_path: str):
    """Serialize ``engine`` and write the result to ``engine_dest_path``.

    The engine is serialized before the destination is opened, so a failing
    serialization does not create or truncate the file.

    Args:
        engine: Engine to serialize.
        engine_dest_path: Path of the file to write, opened in binary mode.
    """
    logging.info(f"\t{sub_prefix}Saving TensorRT engine")
    serialized = engine.serialize()
    with open(engine_dest_path, 'wb') as out_file:
        out_file.write(serialized)