def allocate_buffers(engine):
    """Create host/device buffers and a CUDA stream for a two-binding engine.

    The host arrays are page-locked and sized from the shapes of binding 0
    (used for the input) and binding 1 (used for the output). Both use the
    numpy dtype derived from ``ModelData.DTYPE``.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    # Host side: page-locked arrays, one element per tensor entry.
    input_elems = trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(
        input_elems, dtype=trt.nptype(ModelData.DTYPE))
    output_elems = trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(
        output_elems, dtype=trt.nptype(ModelData.DTYPE))

    # Device side: one allocation per host array, matching its byte size.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Stream used by callers for the copies and the inference itself.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream
def inference(self, resized_rgb_image) -> tuple:
    """Classify every image with the TensorRT engine at ``self.model_path``.

    On each call the engine is deserialized, an execution context and a CUDA
    stream are created, and page-locked host / device buffers are allocated
    for bindings 0 and 1. Each image is then copied to the device, executed
    asynchronously on ``self.stream`` and the output copied back.

    Args:
        resized_rgb_image: Array of images with shape
            (no_images, img_height, img_width, channels).

    Returns:
        A ``(result, scores)`` pair:
        result: List with the argmax class id of each input image,
            e.g. [0, 0, 1, 1, 0].
        scores: List with the output value at the predicted class for each
            image, e.g. [.99, .75, .80, 1.0].
        Both lists are empty when no images are given.
    """
    self.INPUT_DATA_TYPE = np.float32
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(self.trt_logger)
    with open(self.model_path, "rb") as f:
        self.engine = runtime.deserialize_cuda_engine(f.read())
    self.context = self.engine.create_execution_context()
    self.stream = cuda.Stream()
    self.host_in = cuda.pagelocked_empty(
        trt.volume(self.engine.get_binding_shape(0)),
        dtype=self.INPUT_DATA_TYPE)
    self.host_out = cuda.pagelocked_empty(
        trt.volume(self.engine.get_binding_shape(1)),
        dtype=self.INPUT_DATA_TYPE)
    self.devide_in = cuda.mem_alloc(self.host_in.nbytes)
    self.devide_out = cuda.mem_alloc(self.host_out.nbytes)

    if np.shape(resized_rgb_image)[0] == 0:
        return [], []

    # The device pointers do not change between images.
    bindings = [int(self.devide_in), int(self.devide_out)]
    result = []
    scores = []
    for img in resized_rgb_image:
        img = np.expand_dims(img, axis=0)
        np.copyto(self.host_in, img.ravel())

        t_begin = time.perf_counter()
        cuda.memcpy_htod_async(self.devide_in, self.host_in, self.stream)
        self.context.execute_async(
            bindings=bindings, stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_out, self.devide_out, self.stream)
        self.stream.synchronize()
        inference_time = time.perf_counter() - t_begin  # Seconds
        self.fps = convert_infr_time_to_fps(inference_time)

        pred = np.argmax(self.host_out)
        result.append(pred)
        # host_out is overwritten by the next image, so the score has to be
        # read now; keeping a reference to the buffer and indexing it after
        # the loop would return the last image's values for every entry.
        scores.append(self.host_out[pred])
    return result, scores
def evaluate(asr_model, asr_onnx, labels_map, wer, qat):
    """Run the model's test set through a TensorRT engine and score it.

    An engine file is built with ``build_trt_engine``, deserialized, and used
    via ``trt_inference`` on every batch from ``asr_model.test_dataloader()``.
    Decoded predictions and label-derived reference strings are collected and
    passed to ``word_error_rate``.

    Returns:
        The value returned by ``word_error_rate``.
    """
    predicted_texts = []
    reference_texts = []
    stream = cuda.Stream()
    # One more output class than there are labels.
    vocabulary_size = len(labels_map) + 1
    engine_file_path = build_trt_engine(asr_model, asr_onnx, qat)
    with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        trt_engine = runtime.deserialize_cuda_engine(f.read())
        trt_ctx = trt_engine.create_execution_context()

        # Device buffers are sized for the largest shape in profile 0.
        profile_shape = trt_engine.get_profile_shape(profile_index=0, binding=0)
        min_shape = profile_shape[0]
        opt_shape = profile_shape[1]
        max_input_shape = profile_shape[2]
        print("profile shape min:{}, opt:{}, max:{}".format(
            min_shape, opt_shape, max_input_shape))

        input_nbytes = trt.volume(max_input_shape) * trt.float32.itemsize
        d_input = cuda.mem_alloc(input_nbytes)

        max_output_shape = [
            max_input_shape[0],
            vocabulary_size,
            (max_input_shape[-1] + 1) // 2,
        ]
        output_nbytes = trt.volume(max_output_shape) * trt.float32.itemsize
        d_output = cuda.mem_alloc(output_nbytes)

        for test_batch in asr_model.test_dataloader():
            if can_gpu:
                test_batch = [x.cuda() for x in test_batch]
            processed_signal, processed_signal_length = asr_model.preprocessor(
                input_signal=test_batch[0], length=test_batch[1])

            greedy_predictions = trt_inference(
                stream,
                trt_ctx,
                d_input,
                d_output,
                input_signal=processed_signal,
                input_signal_length=processed_signal_length,
            )
            predicted_texts.extend(
                wer.ctc_decoder_predictions_tensor(greedy_predictions))

            # Rebuild each reference transcript from its label ids.
            for batch_ind in range(greedy_predictions.shape[0]):
                seq_len = test_batch[3][batch_ind].cpu().detach().numpy()
                seq_ids = test_batch[2][batch_ind].cpu().detach().numpy()
                reference_texts.append(
                    ''.join(labels_map[c] for c in seq_ids[0:seq_len]))
            del test_batch

        wer_value = word_error_rate(hypotheses=predicted_texts,
                                    references=reference_texts,
                                    use_cer=wer.use_cer)
    return wer_value
def alloc_buf(engine):
    """Allocate host and device buffers for bindings 0 and 1 of ``engine``.

    Element counts and numpy dtypes are taken from the engine's own binding
    shapes and binding dtypes.

    Returns:
        Tuple ``(in_cpu, out_cpu, in_gpu, out_gpu, stream)``.
    """
    # Element counts and dtypes, queried per binding index.
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))

    # Page-locked host arrays.
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)

    # Matching device allocations.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)

    return in_cpu, out_cpu, in_gpu, out_gpu, cuda.Stream()
def allocate_buffers(engine):
    """Allocate page-locked host arrays, device memory and a CUDA stream.

    Binding 0 backs the input buffers and binding 1 the output buffers; the
    size and dtype of each come from the engine.

    Returns:
        Tuple ``(host_input, device_input, host_output, device_output,
        stream)``.
    """
    in_count = trt.volume(engine.get_binding_shape(0))
    in_dtype = trt.nptype(engine.get_binding_dtype(0))
    host_input = cuda.pagelocked_empty(in_count, in_dtype)

    out_count = trt.volume(engine.get_binding_shape(1))
    out_dtype = trt.nptype(engine.get_binding_dtype(1))
    host_output = cuda.pagelocked_empty(out_count, out_dtype)

    # Device buffers mirror the byte size of their host arrays.
    device_input = cuda.mem_alloc(host_input.nbytes)
    device_output = cuda.mem_alloc(host_output.nbytes)

    stream = cuda.Stream()
    return host_input, device_input, host_output, device_output, stream
def initialize_engine(self):
    """Allocate float32 I/O buffers, a stream and an execution context.

    Uses ``self.engine`` and stores the results on ``self`` as ``h_input``,
    ``h_output``, ``d_input``, ``d_output``, ``stream`` and
    ``execution_context``.
    """
    print("initializing engine")
    engine = self.engine

    # Page-locked float32 host arrays for binding 0 and binding 1.
    input_elems = trt.volume(engine.get_binding_shape(0))
    self.h_input = cuda.pagelocked_empty(input_elems, dtype=np.float32)
    output_elems = trt.volume(engine.get_binding_shape(1))
    self.h_output = cuda.pagelocked_empty(output_elems, dtype=np.float32)

    # Device memory of the same byte size as each host array.
    self.d_input = cuda.mem_alloc(self.h_input.nbytes)
    self.d_output = cuda.mem_alloc(self.h_output.nbytes)

    self.stream = cuda.Stream()
    self.execution_context = engine.create_execution_context()
    print("engine initialized")
def _context_init(self):
    """Allocate I/O buffers and create the execution context and stream.

    Buffers for bindings 0 and 1 of ``self.trt_engine`` are sized as the
    binding volume multiplied by the engine's ``max_batch_size``. The input
    dtype is also recorded in ``self.input_dtype``.
    """
    engine = self.trt_engine

    # Input side (binding 0).
    in_elems = trt.volume(engine.get_binding_shape(0)) * engine.max_batch_size
    self.input_dtype = trt.nptype(engine.get_binding_dtype(0))
    self.host_input = cuda.pagelocked_empty(in_elems, dtype=self.input_dtype)

    # Output side (binding 1).
    out_elems = trt.volume(engine.get_binding_shape(1)) * engine.max_batch_size
    out_dtype = trt.nptype(engine.get_binding_dtype(1))
    self.host_output = cuda.pagelocked_empty(out_elems, dtype=out_dtype)

    # Device memory matching the host arrays byte for byte.
    self.cuda_input = cuda.mem_alloc(self.host_input.nbytes)
    self.cuda_output = cuda.mem_alloc(self.host_output.nbytes)

    self.context = engine.create_execution_context()
    # Select optimization profile 0 on the new context.
    self.context.active_optimization_profile = 0
    self.stream = cuda.Stream()
def allocate_buffers(self, engine):
    """Allocate host/device buffers for bindings 0 and 1 plus a CUDA stream.

    Args:
        engine: Engine whose binding shapes and dtypes size the buffers.

    Returns:
        Tuple ``(stream, h_input, d_input, h_output, d_output)``; note that
        the stream comes first.
    """
    print('allocate buffers')

    def pinned_for(index):
        # Page-locked host array with the binding's element count and dtype.
        count = trt.volume(engine.get_binding_shape(index))
        return cuda.pagelocked_empty(
            count, trt.nptype(engine.get_binding_dtype(index)))

    h_input = pinned_for(0)
    h_output = pinned_for(1)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()
    return stream, h_input, d_input, h_output, d_output
def allocate_buffers(engine, batch_size, data_type):
    """Allocate input and output buffers on the host and on the device.

    Args:
        engine: Engine whose bindings 0 and 1 give the per-sample volumes.
        batch_size: Multiplier applied to each binding volume.
        data_type: TensorRT data type, converted to numpy via ``trt.nptype``.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    input_elems = batch_size * trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(input_elems, dtype=trt.nptype(data_type))

    output_elems = batch_size * trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(output_elems, dtype=trt.nptype(data_type))

    # Device allocations sized from the host arrays.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    return h_input, d_input, h_output, d_output, cuda.Stream()
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine.

    Every binding gets a page-locked host array and a device allocation
    sized as the binding volume times ``engine.max_batch_size``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` and ``bindings`` holds the
        device pointers as ints, in binding order.

    Raises:
        AssertionError: If the engine does not have 3 or 4 bindings, or an
            output's element count is not a multiple of 7.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    assert 3 <= len(engine) <= 4  # expect 1 input, plus 2 or 3 outputs
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * \
            engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # each grid has 3 anchors, each anchor generates a detection
            # output of 7 float32 values
            assert size % 7 == 0
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def __init__(self, model):
    """Load a serialized engine and allocate float32 buffers for its bindings.

    Args:
        model: Path of the serialized TensorRT engine file.
    """
    self.input_shape = (300, 300)

    # Set up logging and register the TensorRT plugins before deserializing.
    trt_logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(trt_logger, '')
    with open(model, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    self.host_inputs, self.cuda_inputs = [], []
    self.host_outputs, self.cuda_outputs = [], []
    self.bindings = []
    self.stream = cuda.Stream()

    for binding in engine:
        # Every binding is allocated as float32, whatever the engine reports.
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(element_count, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(cuda_mem))

        is_input = engine.binding_is_input(binding)
        host_list = self.host_inputs if is_input else self.host_outputs
        cuda_list = self.cuda_inputs if is_input else self.cuda_outputs
        host_list.append(host_mem)
        cuda_list.append(cuda_mem)

    self.context = engine.create_execution_context()
    self.watch = Stopwatch()
def _allocate_buffers(self, context):
    """
    Allocate host and device memory for every binding of ``self._engine``.
    :param context: not used by this method
    :return: (inputs, outputs, bindings, stream)
    """
    engine = self._engine
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array plus a device allocation of the same size.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # The device pointer is recorded in binding order.
        bindings.append(int(device_mem))

        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_batch(self, names):
    """Load, preprocess and upload the next batch of calibration images.

    Each image is read with OpenCV, resized using ``model_shape[2]`` and
    ``model_shape[1]`` as width and height, converted from BGR to RGB,
    transposed from HWC to CHW and scaled to the [-1.0, 1.0] interval. The
    batch is then copied into ``self.device_input``.

    Args:
        names: Not used by this implementation.

    Returns:
        A one-element list holding the device input pointer as an int, or
        None when fewer than ``batch_size`` images remain.
    """
    # If there are not enough calibration images left to form a batch, we
    # have reached the end of the data set. Testing only for equality would
    # run past the end of calib_imgs whenever the number of images is not a
    # multiple of batch_size.
    if self.counter + self.batch_size > self.num_calib_imgs:
        return None

    # Built directly as float32 so no extra copy is needed before upload.
    batch_imgs = np.zeros(
        (self.batch_size, trt.volume(self.model_shape)), dtype=np.float32)
    for i in range(self.batch_size):
        img = cv2.imread(self.calib_imgs[self.counter + i])
        img = cv2.resize(img, (self.model_shape[2], self.model_shape[1]))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # HWC -> CHW
        img = img.transpose((2, 0, 1))
        # Normalize to [-1.0, 1.0] interval (expected by model)
        img = (2.0 / 255.0) * img - 1.0
        # add this image to the batch array
        batch_imgs[i, :] = img.ravel()

    # increase the counter for this batch
    self.counter += self.batch_size

    # Copy to device, then return a list containing pointers to input
    # device buffers.
    cuda.memcpy_htod(self.device_input, batch_imgs)
    return [int(self.device_input)]
def _allocate_buffers(self, engine):
    """Allocate host/device buffers for every binding and collect metadata.

    The element count of a binding is the absolute value of its volume
    (a -1 entry in the shape marks a dynamic dimension) multiplied by
    ``engine.max_batch_size``.

    Returns:
        Tuple ``(bindings, stream, max_batch_size, inputs, input_shapes,
        input_names, outputs, out_shapes, out_names)``.
    """
    stream = cuda.Stream()
    max_batch_size = engine.max_batch_size

    bindings = []
    inputs, input_shapes, input_names = [], [], []
    outputs, out_shapes, out_names = [], [], []

    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        element_count = abs(trt.volume(binding_shape)) * max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array and equally sized device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        # File the buffer, its shape and its name under inputs or outputs.
        if engine.binding_is_input(binding):
            mems, shapes, names = inputs, input_shapes, input_names
        else:
            mems, shapes, names = outputs, out_shapes, out_names
        mems.append(HostDeviceMem(host_mem, device_mem))
        shapes.append(binding_shape)
        names.append(binding)

    return (bindings, stream, max_batch_size, inputs, input_shapes,
            input_names, outputs, out_shapes, out_names)
def __init__(self, images, width=256, height=256, channel=3, batch_size=1, cache_file='./{}.cache'.format('int8')):
    """
    Store the calibration settings and allocate the device input buffer.

    :param images: type: list, e.g: [img1.jpg, img2.jpg, ...]; must be a
        non-empty list
    :param width: stored as ``self.width`` and used to size the device buffer
    :param height: stored as ``self.height`` and used to size the device buffer
    :param channel: stored as ``self.channel`` and used to size the device
        buffer
    :param batch_size: number of images per batch
    :param cache_file: stored as ``self.cache_file``
    """
    super(ImageCalibrator, self).__init__()
    self.cache_file = cache_file
    self.batch_size = batch_size
    self.channel = channel
    self.height = height
    self.width = width

    assert isinstance(images, list) and len(images) > 0
    self.imgs = images

    self.batch_idx = 0
    # Only whole batches are counted; a trailing partial batch is dropped.
    self.max_batch_idx = len(self.imgs) // self.batch_size

    # Bytes for one float32 batch of shape (batch, channel, height, width).
    batch_shape = [self.batch_size, self.channel, self.height, self.width]
    self.data_size = trt.volume(batch_shape) * trt.float32.itemsize
    self.device_input = cuda.mem_alloc(self.data_size)

    self.one_batch = self.batch_generator()
def _setup_bindings(self, engine):
    """
    Allocate a host/device buffer pair for every binding of the engine.

    Inputs are stored in the ``self._inputs`` dict keyed by binding name;
    outputs are appended to the ``self._outputs`` list.

    :param engine: engine whose bindings are iterated
    """
    self._inputs = {}
    self._outputs = []
    self._stream = cuda.Stream()

    for binding in engine:
        # The leading dimension is overwritten with the maximum batch size.
        shape = engine.get_binding_shape(binding)
        shape[0] = engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        element_count = trt.volume(shape)

        # Allocate host and device buffers
        # https://documen.tician.de/pycuda/util.html
        host_mem = cuda.pagelocked_empty(element_count, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        if engine.binding_is_input(binding):
            self._inputs[binding] = MemoryBinding(
                binding, shape, dtype, host_mem, device_mem, True)
            continue
        self._outputs.append(
            MemoryBinding(binding, shape, dtype, host_mem, device_mem, False))
def __init__(self, loader, cache_file, c, h, w):
    """Set up calibration data, the device input buffer and a batch generator.

    Args:
        loader: Iterable whose items expose ``.numpy()``.
        cache_file: Stored as ``self.cache_file``.
        c, h, w: Used to build the batch shape ``[1, c, h, w]``.
    """
    # Whenever you specify a custom constructor for a TensorRT class,
    # you MUST call the constructor of the parent explicitly.
    trt.IInt8EntropyCalibrator2.__init__(self)
    self.cache_file = cache_file

    # Each iteration overwrites the previous value, so only the last item
    # yielded by the loader is kept.
    for cal_data in loader:
        self.all_files = cal_data.numpy()

    # One batch holds a single sample; each element is a float32.
    self.shape = [1, c, h, w]
    buffer_bytes = trt.volume(self.shape) * trt.float32.itemsize
    self.device_input = cuda.mem_alloc(buffer_bytes)

    # Generator that yields one batch per index; advance it with next().
    def iter_batches():
        for idx in range(len(self.all_files)):
            yield self.read_batch_file(idx)

    self.batches = iter_batches()
def __init__(self, batch_data_dir, cache_file):
    """Set up the batch file list, device input buffer and batch generator.

    Args:
        batch_data_dir: Directory whose entries are used as batch files.
        cache_file: Stored as ``self.cache_file``.
    """
    # Whenever you specify a custom constructor for a TensorRT class,
    # you MUST call the constructor of the parent explicitly.
    trt.IInt8EntropyCalibrator2.__init__(self)
    self.cache_file = cache_file

    # Every entry of the directory is treated as a batch file.
    self.batch_files = [
        os.path.join(batch_data_dir, entry)
        for entry in os.listdir(batch_data_dir)
    ]

    self.batch_size = 1
    self.batch_round = 100

    # Read the first batch once just to learn its shape.
    first_batch = self.read_batch_file(self.batch_files[0:self.batch_size])
    self.shape = first_batch.shape
    print(self.shape)

    # Each element of the calibration data is a float32.
    buffer_bytes = trt.volume(self.shape) * trt.float32.itemsize
    self.device_input = cuda.mem_alloc(buffer_bytes)

    # Generator producing batch_round batches; advance it with next().
    def iter_batches():
        offset = 0
        for round_idx in range(self.batch_round):
            print("Start Calibration using batch {:d}".format(round_idx))
            yield self.read_batch_file(
                self.batch_files[offset:offset + self.batch_size])
            offset += self.batch_size

    self.batches = iter_batches()
def alloc_buf(engine):
    """Allocate host/device buffers for bindings 0 and 1 and a CUDA stream.

    Both host arrays use the numpy dtype derived from the module-level
    ``DTYPE``.

    Returns:
        Tuple ``(h_input, h_output, d_input, d_output, stream)``.
    """
    np_dtype = trt.nptype(DTYPE)

    # Page-locked host arrays, one element per tensor entry.
    input_elems = trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(input_elems, dtype=np_dtype)
    output_elems = trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(output_elems, dtype=np_dtype)

    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    return h_input, h_output, d_input, d_output, cuda.Stream()
def init_model(self, trt_path, ctx_id):
    """Create a CUDA context, load the engine and allocate its buffers.

    Args:
        trt_path: Path of the serialized TensorRT engine file.
        ctx_id: Index of the CUDA device on which the context is created.

    Buffers are stored by binding name in ``self.input_buffs`` and
    ``self.output_buffs``; the execution context is stored in ``self.model``.
    Ten warm-up loops are run at the end.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    # Initialize the driver and make a context on the requested device.
    cuda.init()
    self.ctx = cuda.Device(ctx_id).make_context()

    with open(trt_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    self.input_buffs = {}
    self.output_buffs = {}
    self.bindings = []
    self.stream = cuda.Stream()

    for name in engine:
        shape = engine.get_binding_shape(name)
        element_count = trt.volume(shape) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # Page-locked host array and equally sized device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Device pointers are recorded in binding order.
        self.bindings.append(int(device_mem))

        buffs = (self.input_buffs if engine.binding_is_input(name)
                 else self.output_buffs)
        buffs[name] = HostDeviceMem(host_mem, device_mem, shape)

    self.model = engine.create_execution_context()
    self.logger.info("Warmup up...")
    self.inference_loops(10)
def __init__(self, model='models/yolov5s-simple-2.trt'):
    """Load a serialized engine and allocate buffers for all of its bindings.

    Args:
        model: Path of the serialized TensorRT engine file. Defaults to the
            path that was previously hard-coded, so existing callers that
            pass no argument behave as before.

    Sets ``self.context``, ``self.inputs``, ``self.outputs``,
    ``self.bindings`` and ``self.stream``. ``inputs`` and ``outputs`` are
    lists of dicts with the keys ``'host'`` and ``'device'``.
    """
    logger = trt.Logger(trt.Logger.INFO)
    with open(model, 'rb') as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # The binding volume is used as is; it is not multiplied by
        # engine.max_batch_size.
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        buffers = {'host': host_mem, 'device': device_mem}
        if engine.binding_is_input(binding):
            inputs.append(buffers)
        else:
            outputs.append(buffers)

    # save to class
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream
def allocate_buffers(self):
    """Allocate host and GPU memory for later use and create a CUDA stream.

    Buffers for bindings 0 and 1 of ``self.engine`` are stored on ``self``
    as ``h_input``, ``h_output``, ``d_input`` and ``d_output``; the host
    arrays use the numpy dtype derived from ``self.CONSTANTS["dtype"]``.
    """
    engine = self.engine

    # Page-locked host arrays sized from the binding shapes.
    input_elems = trt.volume(engine.get_binding_shape(0))
    self.h_input = cuda.pagelocked_empty(
        input_elems, dtype=trt.nptype(self.CONSTANTS["dtype"]))
    output_elems = trt.volume(engine.get_binding_shape(1))
    self.h_output = cuda.pagelocked_empty(
        output_elems, dtype=trt.nptype(self.CONSTANTS["dtype"]))

    # Device memory with the same byte size as each host array.
    self.d_input = cuda.mem_alloc(self.h_input.nbytes)
    self.d_output = cuda.mem_alloc(self.h_output.nbytes)

    self.stream = cuda.Stream()
def allocate_buffers(engine: trt.ICudaEngine, batch_size: int):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    The element count of a binding is ``batch_size`` times the absolute
    value of its volume.

    Returns:
        Tuple ``(inputs, outputs, dbindings, stream)``; ``dbindings`` holds
        the device pointers as ints, in binding order.
    """
    print('Allocating buffers ...')

    inputs, outputs, dbindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        element_count = batch_size * abs(
            trt.volume(engine.get_binding_shape(binding)))
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array and equally sized device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        dbindings.append(int(device_mem))

        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)

    return inputs, outputs, dbindings, stream
def __allocate_buffers(engine):
    """Allocate host and device buffers for every binding of the engine.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``. The stream is created
        after all buffers have been allocated.
    """
    inputs, outputs, bindings = [], [], []

    for binding in engine:
        # Element count of the binding, scaled by the maximum batch size.
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        # numpy equivalent of the binding's data type.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Pinned (page-locked) host array and a linear device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(__HostDeviceTuple(host_mem, device_mem))

    stream = cuda.Stream()
    return inputs, outputs, bindings, stream
def _allocate_buffers(self):
    """Allocate host/device buffers for the engine's bindings.

    Results are stored in ``self.inputs``, ``self.outputs``,
    ``self.bindings`` and ``self.stream``. The numpy dtype of each binding is
    looked up by binding name in a fixed table, so a binding with any other
    name raises ``KeyError``.
    """
    self.inputs = []
    self.outputs = []
    self.bindings = []
    self.stream = cuda.Stream()

    # NMS implementation in TRT 6 only supports DataType.FLOAT
    binding_to_type = {
        "Input": np.float32,
        "NMS": np.float32,
        "NMS_1": np.int32
    }

    engine = self.__trt_engine
    for binding in engine:
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = binding_to_type[str(binding)]

        # Page-locked host array and equally sized device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Device pointers are recorded in binding order.
        self.bindings.append(int(device_mem))

        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            self.inputs.append(pair)
        else:
            self.outputs.append(pair)
def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` (implemented in common.py)
        and ``bindings`` holds the device pointers as ints.
    """
    inputs, outputs, bindings = [], [], []
    # Create a CUDA stream.
    stream = cuda.Stream()

    for binding in engine:
        # get_binding_shape returns the dimensions of the binding and
        # trt.volume computes the volume of that iterable; multiplied by the
        # maximum batch size this is the element count for the binding.
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        # get_binding_dtype returns the data type of the binding.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked memory for the host, then memory for the device.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Add the device allocation to the device bindings.
        bindings.append(int(device_mem))

        # Store the pair in the list matching the binding's direction.
        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    """
    Allocate host and device buffers for each binding of the given engine.

    Returns a tuple ``(inputs, outputs, bindings, stream)``; the stream is
    created after all buffers have been allocated.
    """
    inputs, outputs, bindings = [], [], []

    # Iterating the engine yields its binding names.
    for binding in engine:
        # Element count of the binding, scaled by the maximum batch size.
        element_count = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        # numpy equivalent of the binding's data type.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Pinned (page-locked) host array and a linear device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)

    # Stream for the eventual input/output copies and the inference.
    stream = cuda.Stream()
    return inputs, outputs, bindings, stream
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=()):
    """Allocate host and device buffers for every binding of ``engine``.

    Args:
        engine: Engine whose bindings are iterated.
        is_explicit_batch: Not used by this implementation.
        dynamic_shapes: Sequence whose first element replaces a leading
            dimension of -1 in a binding shape. Must be non-empty when any
            binding has such a dimension. The default is an immutable empty
            tuple rather than a shared mutable list.

    Returns:
        Tuple ``(inputs, outputs, bindings)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` objects (with ``host`` and
        ``device`` attributes) and ``bindings`` holds the device pointers as
        ints, in binding order. No stream is created.

    Raises:
        AssertionError: If a binding has a leading dimension of -1 and
            ``dynamic_shapes`` is empty.
    """
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        if dims[0] == -1:
            # A dynamic leading dimension needs a concrete value before the
            # buffer size can be computed.
            assert(len(dynamic_shapes) > 0)
            dims[0] = dynamic_shapes[0]
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def allocate_buffers(engine, batch_size):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    The element count of a binding is its volume times ``batch_size``. When
    the leading dimension is negative (dynamic batch), the sign of that
    product is flipped.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        element_count = trt.volume(dims) * batch_size
        # in case batch dimension is -1 (dynamic)
        if dims[0] < 0:
            element_count = -element_count
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array and equally sized device allocation.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Device pointers are recorded in binding order.
        bindings.append(int(device_mem))

        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream
def main_tensorrt():
    """Run the test board predictions through a TensorRT engine.

    The engine at ``MODEL_PATH_TRT`` is deserialized, buffers are allocated
    for it, and ``test_predict_board`` is called with a function that returns
    the per-piece outputs.

    Raises:
        ImportError: If ``cuda`` or ``trt`` is None.
    """
    print("TensorRT predictions")
    if cuda is None or trt is None:
        raise ImportError("Unable to import pycuda or tensorrt")

    trt_logger = trt.Logger(trt.Logger.VERBOSE)

    # Read and deserialize the serialized ICudaEngine.
    with open(MODEL_PATH_TRT, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    inputs, outputs, bindings, stream = __allocate_buffers(engine)

    # Staging array: one flattened image per row, max_batch_size rows.
    flat_image_len = trt.volume((IMG_SIZE_TRT, IMG_SIZE_TRT, 3))
    img_array = np.zeros((engine.max_batch_size, flat_image_len))

    # The IExecutionContext runs the inference and is released on exit.
    with engine.create_execution_context() as context:

        def obtain_pieces_probs(pieces):
            # Assuming batch size == 64
            for row, piece in enumerate(pieces):
                img_array[row] = load_image(
                    piece, IMG_SIZE_TRT, PRE_INPUT_TRT).ravel()
            np.copyto(inputs[0].host, img_array.ravel())
            trt_outputs = __infer(
                context, bindings, inputs, outputs, stream)[-1]
            # Split the flat output into 64 consecutive chunks of 13 values.
            return [trt_outputs[start:start + 13]
                    for start in range(0, 13 * 64, 13)]

        test_predict_board(obtain_pieces_probs)