Example #1
def allocate_buffers(engine: trt.ICudaEngine, batch_size: int):
    print('Allocating buffers ...')

    inputs = []
    outputs = []
    dbindings = []

    stream = cuda.Stream()

    for binding in engine:
        size = batch_size * abs(trt.volume(engine.get_binding_shape(binding)))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        dbindings.append(int(device_mem))

        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, dbindings, stream
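The snippet above assumes a small HostDeviceMem container pairing each pagelocked host buffer with its device allocation. A minimal sketch of such a helper (the class in the original codebase may carry extra fields):

class HostDeviceMem:
    # Sketch of the assumed helper: pairs a pagelocked host array with
    # its pycuda device allocation.
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __repr__(self):
        return "Host:\n{}\nDevice:\n{}".format(self.host, self.device)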
Example #2
    def __init__(
        self,
        engine: trt.ICudaEngine,
        idx_or_name: Union[int, str],
        max_batch_size: int,
        device: str,
    ):
        if isinstance(idx_or_name, six.string_types):
            self.name = idx_or_name
            self.index = engine.get_binding_index(self.name)
            if self.index == -1:
                raise IndexError(f"Binding name not found: {self.name}")
        else:
            self.index = idx_or_name
            self.name = engine.get_binding_name(self.index)
            if self.name is None:
                raise IndexError(f"Binding index out of range: {self.index}")

        self._dtype = TYPE_TRT_2_TORCH[engine.get_binding_dtype(self.index)]
        self._shape = (max_batch_size, ) + tuple(
            engine.get_binding_shape(self.index))[1:]
        self._device = torch.device(device)
        self._is_input = engine.binding_is_input(self.index)
        if self.is_input:
            self._binding_data = None
        else:
            self._binding_data = torch.zeros(size=self.shape,
                                             dtype=self.dtype,
                                             device=self.device)
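TYPE_TRT_2_TORCH is not shown in the snippet; it is assumed to map TensorRT data types to their torch equivalents, roughly:

import tensorrt as trt
import torch

# Assumed mapping from TensorRT dtypes to torch dtypes (name taken from the snippet).
TYPE_TRT_2_TORCH = {
    trt.float32: torch.float32,
    trt.float16: torch.float16,
    trt.int32: torch.int32,
    trt.int8: torch.int8,
    trt.bool: torch.bool,
}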
Example #3
def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
    seed: int = 42,
):
    # Input data for inference
    host_inputs = []
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)
    for binding_index in input_binding_idxs:
        # If input shape is fixed, we'll just use it
        input_shape = context.get_binding_shape(binding_index)
        input_name = engine.get_binding_name(binding_index)
        print("\tInput [{}] shape: {}".format(input_name, input_shape))
        # If input shape is dynamic, we'll arbitrarily select one of the
        # min/opt/max shapes from our optimization profile
        if is_dynamic(input_shape):
            profile_index = context.active_optimization_profile
            profile_shapes = engine.get_profile_shape(profile_index,
                                                      binding_index)
            print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".
                  format(input_name, *profile_shapes))
            # 0=min, 1=opt, 2=max, or choose any shape, (min <= shape <= max)
            input_shape = profile_shapes[1]
            print(
                "\tInput [{}] shape was dynamic, setting inference shape to {}"
                .format(input_name, input_shape))

        host_inputs.append(np.random.random(input_shape).astype(np.float32))

    return host_inputs
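is_dynamic is assumed rather than shown. TensorRT reports dynamic dimensions as -1, so a helper along these lines would fit:

# Assumed helper: a shape is dynamic if any dimension is the -1 wildcard.
def is_dynamic(shape) -> bool:
    return any(dim < 0 for dim in shape)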
Example #4
def allocate_buffers_torch(engine: trt.ICudaEngine, device):
    import torch
    inputs = []
    outputs = []
    bindings = []
    index = 0
    dtype_map = np_to_torch_dtype_map()
    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        shape = [engine.max_batch_size] + list(
            engine.get_binding_shape(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype).reshape(shape)
        device_mem = torch.empty(*host_mem.shape,
                                 device=device,
                                 dtype=dtype_map[host_mem.dtype])
        # Append the device buffer to device bindings.
        bindings.append(device_mem.data_ptr())
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem, binding, index))
        index += 1
    return inputs, outputs, bindings
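np_to_torch_dtype_map is assumed to translate numpy dtypes into torch dtypes for the device buffers; a plausible sketch:

import numpy as np
import torch

# Assumed helper: numpy dtype -> torch dtype, covering the common TensorRT types.
def np_to_torch_dtype_map():
    return {
        np.dtype(np.float32): torch.float32,
        np.dtype(np.float16): torch.float16,
        np.dtype(np.int32): torch.int32,
        np.dtype(np.int8): torch.int8,
        np.dtype(np.bool_): torch.bool,
    }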
Example #5
def run_trt_engine(context: trt.IExecutionContext, engine: trt.ICudaEngine,
                   h_tensors: dict):
    """Run a TRT model.

  The model output is written in place inside the tensors provided in h_tensors['outputs'].

  Args:
      context (trt.IExecutionContext): 
      engine (trt.ICudaEngine): 
      h_tensors (dict): A dictionary with keys "inputs" and "outputs" and values which are another 
      dictionaries with tensor names as keys and numpy.ndarrays as values.
  """
    # Allocate GPU memory.
    d_tensors = {}
    d_tensors['inputs'] = {
        k: cuda.mem_alloc(v.nbytes)
        for k, v in h_tensors['inputs'].items()
    }
    d_tensors['outputs'] = {
        k: cuda.mem_alloc(v.nbytes)
        for k, v in h_tensors['outputs'].items()
    }

    # Copy input buffers to GPU.
    for h_tensor, d_tensor in zip(h_tensors['inputs'].values(),
                                  d_tensors['inputs'].values()):
        cuda.memcpy_htod(d_tensor, h_tensor)

    # Initialise bindings list.
    bindings = [None] * engine.num_bindings

    # Populate bindings list.
    for (name, h_tensor), (_, d_tensor) in zip(h_tensors['inputs'].items(),
                                               d_tensors['inputs'].items()):
        idx = engine.get_binding_index(name)
        bindings[idx] = int(d_tensor)
        if engine.is_shape_binding(idx) and is_shape_dynamic(
                context.get_shape(idx)):
            context.set_shape_input(idx, h_tensor)
        elif is_shape_dynamic(engine.get_binding_shape(idx)):
            context.set_binding_shape(idx, h_tensor.shape)

    for name, d_tensor in d_tensors['outputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = int(d_tensor)

    # Run engine.
    context.execute_v2(bindings=bindings)

    # Copy output buffers to CPU.
    for h_tensor, d_tensor in zip(h_tensors['outputs'].values(),
                                  d_tensors['outputs'].values()):
        cuda.memcpy_dtoh(h_tensor, d_tensor)
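is_shape_dynamic is assumed to flag shapes containing -1 wildcards, analogous to is_dynamic in Example #3:

# Assumed helper: True if any dimension of the shape is dynamic (-1).
def is_shape_dynamic(shape) -> bool:
    return any(dim < 0 for dim in shape)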
Example #6
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    # Calculate start/end binding indices for current context's profile
    num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * num_bindings_per_profile
    end_binding = start_binding + num_bindings_per_profile
    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(
        engine.num_optimization_profiles))
    print("\tNumber of bindings per profile: {}".format(
        num_bindings_per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index,
                                                      start_binding))
    print("\tLast binding for profile {}: {}".format(profile_index,
                                                     end_binding - 1))

    # Separate input and output binding indices for convenience
    input_binding_idxs = []
    output_binding_idxs = []
    for binding_index in range(start_binding, end_binding):
        if engine.binding_is_input(binding_index):
            input_binding_idxs.append(binding_index)
        else:
            output_binding_idxs.append(binding_index)

    return input_binding_idxs, output_binding_idxs
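A hedged usage sketch, assuming the engine was built with explicit optimization profiles:

# Hypothetical driver: pick the context's active profile and split its bindings.
with engine.create_execution_context() as context:
    profile_index = context.active_optimization_profile
    input_idxs, output_idxs = get_binding_idxs(engine, profile_index)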
Example #7
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """
    :param engine:
    :param profile_index:
    :return:
    """
    # Calculate start/end binding indices for current context's profile
    num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * num_bindings_per_profile
    end_binding = start_binding + num_bindings_per_profile

    # Separate input and output binding indices for convenience
    input_binding_idxs = []
    output_binding_idxs = []
    for binding_index in range(start_binding, end_binding):
        if engine.binding_is_input(binding_index):
            input_binding_idxs.append(binding_index)
        else:
            output_binding_idxs.append(binding_index)
    return input_binding_idxs, output_binding_idxs
Example #8
def detect(engine: trt.ICudaEngine, img: np.ndarray) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """Perform network execution on the given image.

    Also handles preprocessing and postprocessing of img.

    :param engine: TensorRT engine created from the network weights
    :param img: image to perform detection on
    :return: predictions (bounding boxes, confidences, class_ids) in original image coordinates
    """

    with engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
        #preprocess
        preprocess_start = time.time()
        preprocessed_img, resize_ratio, padding_size = preprocess_img(img)
        preprocess_stop = time.time()
        #copy our input image to buffer
        np.copyto(h_input, preprocessed_img.flatten())
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        postprocess_start = time.time()
        predictions = postprocess(h_output)
        predictions_in_original_coords = transform_detected_coords_to_original(predictions, resize_ratio, padding_size)
        postprocess_stop = time.time()
        print(f"Preprocessing time: {(preprocess_stop - preprocess_start) * 1000:.4f} ms")
        print(f"Postprocessing time: {(postprocess_stop - postprocess_start) * 1000:.4f} ms")
        print(f"Complete detection time: {(postprocess_stop - preprocess_start) * 1000:.4f} ms")
    return predictions_in_original_coords
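A hedged usage sketch for detect(); the engine path and test image below are illustrative only:

import tensorrt as trt
import cv2

# Hypothetical driver: deserialize a saved engine and run detection on one image.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
with open("yolo.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

img = cv2.imread("test.jpg")
boxes, confidences, class_ids = detect(engine, img)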
Example #9
def setup_binding_shapes(engine: trt.ICudaEngine,
                         context: trt.IExecutionContext, host_inputs,
                         input_binding_idxs, output_binding_idxs):
    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)
    assert context.all_binding_shapes_specified
    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Allocate buffers to hold output results after copying back to host
        buffer = np.empty(output_shape, dtype=np.float32)
        host_outputs.append(buffer)
        # Allocate output buffers on device
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))
    # Output binding names
    output_names = [
        engine.get_binding_name(binding_idx)
        for binding_idx in output_binding_idxs
    ]
    return host_outputs, device_outputs
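This helper composes with Examples #3 and #6 into the usual dynamic-shape flow; a sketch under the assumption that input bindings precede output bindings within the profile:

# Hypothetical end-to-end flow for a dynamic-shape engine.
context = engine.create_execution_context()
profile_index = context.active_optimization_profile
input_idxs, output_idxs = get_binding_idxs(engine, profile_index)

host_inputs = get_random_inputs(engine, context, input_idxs)
device_inputs = [cuda.mem_alloc(h.nbytes) for h in host_inputs]
for h, d in zip(host_inputs, device_inputs):
    cuda.memcpy_htod(d, h)

host_outputs, device_outputs = setup_binding_shapes(
    engine, context, host_inputs, input_idxs, output_idxs)
# Assumes inputs come before outputs in binding order for this profile.
bindings = device_inputs + device_outputs
context.execute_v2([int(b) for b in bindings])
for h, d in zip(host_outputs, device_outputs):
    cuda.memcpy_dtoh(h, d)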
Example #10
def save_engine(engine: trt.ICudaEngine, engine_dest_path: str):
    logging.info(f"\t{sub_prefix}Saving TensorRT engine")
    buf = engine.serialize()
    with open(engine_dest_path, 'wb') as f:
        f.write(buf)
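For completeness, the serialized plan can be reloaded later with a trt.Runtime (sub_prefix above is assumed to be a module-level logging prefix); a minimal sketch:

import tensorrt as trt

# Minimal sketch: reload an engine saved by save_engine().
def load_engine(engine_path: str) -> trt.ICudaEngine:
    logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
        return runtime.deserialize_cuda_engine(f.read())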