def generate_multidata(num_batches):
    """
    Yields `num_batches` feed dicts that exercise every accepted input form:
    a host NumPy array ("x0"), a device array ("x1"), and a raw GPU pointer ("x2").
    """
    for _ in range(num_batches):
        host_zeros = np.zeros((4, 5), dtype=np.float32)
        dev_ones = cuda.DeviceArray(dtype=np.float32).copy_from(
            np.ones((4, 5), dtype=np.float32))
        dev_twos = cuda.DeviceArray(dtype=np.float32).copy_from(
            np.ones((4, 5), dtype=np.float32) * 2)
        yield {
            "x0": host_zeros,
            "x1": dev_ones,
            # Only the pointer is handed out here, not the DeviceArray itself.
            "x2": dev_twos.ptr,
        }
def allocate_buffers(engine):
    """
    Allocates a device buffer for every binding in the engine, plus a host
    staging buffer for each output binding.

    Args:
        engine: A TensorRT engine (iterated for its binding names).

    Returns:
        Tuple[OrderedDict, OrderedDict, cuda.Stream]:
            (input_buffers, output_buffers, stream), where buffer dicts map
            binding name -> TrtLegacyRunner.HostDeviceMem.
    """
    input_buffers = OrderedDict()
    output_buffers = OrderedDict()
    # NOTE: Removed unused local `bindings = []` — it was never populated or returned.
    stream = cuda.Stream()
    G_LOGGER.verbose("Using batch size: " + str(engine.max_batch_size) + " during buffer allocation")
    for binding in engine:
        # Implicit-batch style: prepend max batch size to the per-binding shape.
        shape = (engine.max_batch_size, ) + tuple(engine.get_binding_shape(binding))
        dtype = engine.get_binding_dtype(binding)

        device_mem = cuda.DeviceArray(shape=shape, dtype=trt.nptype(dtype))
        G_LOGGER.extra_verbose("Tensor: " "{:35} | Allocated: {:}".format(binding, device_mem))

        if engine.binding_is_input(binding):
            # Inputs are copied in from the caller, so no host staging buffer is needed.
            input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(None, device_mem)
        else:
            host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
            output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(host_mem, device_mem)
    return input_buffers, output_buffers, stream
def test_cannot_use_device_view_shape_tensor(self):
    """Shape-tensor inputs must be host-resident; a device array should be rejected."""
    model = ONNX_MODELS["empty_tensor_expand"]
    engine_loader = EngineFromNetwork(NetworkFromOnnxBytes(model.loader))
    with TrtRunner(engine_loader) as runner, cuda.DeviceArray(shape=(5, ), dtype=np.int32) as arr:
        feed_dict = {
            "data": np.ones((2, 0, 3, 0), dtype=np.float32),
            # Passing the shape tensor on the device must trigger the error below.
            "new_shape": arr,
        }
        with pytest.raises(PolygraphyException, match="it must reside in host memory"):
            runner.infer(feed_dict)
def test_subsequent_infers_with_different_input_types(self):
    """The runner should accept host and device inputs interchangeably across calls."""
    model = ONNX_MODELS["identity"]
    network_loader = NetworkFromOnnxBytes(model.loader)
    with TrtRunner(EngineFromNetwork(network_loader)) as runner:
        inp = np.ones(shape=(1, 1, 2, 2), dtype=np.float32)

        def check(outputs):
            # Identity model: output must equal the input exactly.
            assert np.all(outputs["y"] == inp)

        # Host array first, then a device array, then back to host.
        check(runner.infer({"x": inp}))
        check(runner.infer({"x": cuda.DeviceArray().copy_from(inp)}))
        check(runner.infer({"x": inp}))
def test_device_view_dynamic_shapes(self, use_view): model = ONNX_MODELS["dynamic_identity"] profiles = [ Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)), ] runner = TrtRunner(EngineFromNetwork(NetworkFromOnnxBytes(model.loader), CreateConfig(profiles=profiles))) with runner, cuda.DeviceArray(shape=(1, 2, 3, 3), dtype=np.float32) as arr: inp = np.random.random_sample(size=(1, 2, 3, 3)).astype(np.float32) arr.copy_from(inp) outputs = runner.infer({"X": cuda.DeviceView(arr.ptr, arr.shape, arr.dtype) if use_view else arr}) assert np.all(outputs["Y"] == inp) assert outputs["Y"].shape == (1, 2, 3, 3)
def test_device_views(self, use_view):
    """One device input (array or view) mixed with one host input must both be accepted."""
    model = ONNX_MODELS["reducable"]
    network_loader = NetworkFromOnnxBytes(model.loader)
    with TrtRunner(EngineFromNetwork(network_loader)) as runner, \
            cuda.DeviceArray((1,), dtype=np.float32) as dev_x:
        dev_x.copy_from(np.ones((1,), dtype=np.float32))
        feed_dict = {
            "X0": dev_x.view() if use_view else dev_x,
            "Y0": np.ones((1,), dtype=np.float32),
        }
        outputs = runner.infer(feed_dict)
        # Both identity outputs should see 1 + 1 == 2.
        assert outputs["identity_out_6"][0] == 2
        assert outputs["identity_out_8"][0] == 2
def generate_dev_data(num_batches):
    """
    Yields `num_batches` feed dicts whose "X0" input lives on the device,
    delivered in the representation selected by the enclosing-scope `mode`
    ("array", "view", or "pointer").
    """
    with cuda.DeviceArray(shape=(1, ), dtype=np.float32) as dev_arr:
        for _ in range(num_batches):
            dev_arr.copy_from(np.ones((1, ), dtype=np.float32))
            # All three representations are built eagerly; `mode` (a free
            # variable from the enclosing scope) selects which one is fed.
            x_feed = {
                "array": dev_arr,
                "view": cuda.DeviceView(dev_arr.ptr, dev_arr.shape, dev_arr.dtype),
                "pointer": dev_arr.ptr,
            }[mode]
            yield {
                "X0": x_feed,
                "Y0": np.zeros((1, ), dtype=np.float32),
            }
def test_infer_overhead(self, copy_inputs, copy_outputs):
    """
    Benchmarks TrtRunner.infer() against a hand-rolled TensorRT inference loop
    and asserts the runner's overhead is negligible.

    Parametrized by whether host->device input copies and device->host output
    copies are included in the timed work.
    """
    inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
    dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

    out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
    dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

    stream = cuda.Stream()

    model = ONNX_MODELS["dynamic_identity"]
    # Single static profile pinned to the benchmark shape.
    profiles = [
        Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
    ]
    inp_name = list(model.input_metadata.keys())[0]

    with engine_from_network(
        network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
    ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
        # Inference outside the TrtRunner
        def infer():
            # Baseline: copy (optionally), enqueue, copy back (optionally), then
            # synchronize so the full pipeline is included in the measurement.
            if copy_inputs:
                dev_inp.copy_from(inp, stream=stream)
            context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
            if copy_outputs:
                dev_out.copy_to(out, stream=stream)
            stream.synchronize()

        native_time = time_func(infer)

        # When inputs are not copied, feed the device array directly so the
        # runner path does the same amount of data movement as the baseline.
        feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
        runner_time = time_func(
            lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
        )

        # The overhead should be less than 0.5ms, or the runtime should be within 5%
        print("Absolute difference: {:.5g}".format(runner_time - native_time))
        print("Relative difference: {:.5g}".format(runner_time / native_time))
        assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
def get_batch(self, names):
    """
    Supplies device pointers for the next calibration batch (TensorRT
    int8-calibrator entry point).

    Args:
        names (List[str]): Names of the inputs for which pointers are requested.

    Returns:
        Optional[List[int]]: One device pointer per name, or None when the data
        loader is exhausted, the calibrator is inactive, or the data is invalid.
    """
    if not self.is_active:
        G_LOGGER.error(
            "Calibrator must be activated prior to use. Please use a context manager. "
            "For example:\nwith calibrator:\n\t# Use calibrator here")
        return None

    try:
        batch = next(self.data_loader_iter)
    except StopIteration:
        # Exhausted. Only complain if the loader never produced anything at all.
        if not self.num_batches:
            G_LOGGER.error(
                "Calibrator data loader provided no data.\nPossible reasons for this include:\n(1) data loader "
                "has no data to provide\n(2) data loader was a generator, and the calibrator is being "
                "used multiple times (generators cannot be rewound)")
        return None
    else:
        self.num_batches += 1

    if not util.check_dict_contains(batch, names, dict_name="calibration data", log_func=G_LOGGER.error):
        return None

    pointers = []
    for name in names:
        data = batch[name]
        if isinstance(data, cuda.DeviceView):
            pointers.append(data.ptr)
        elif isinstance(data, np.ndarray):
            # Host array: lazily allocate one reusable device buffer per input
            # name, then copy the batch into it.
            if name not in self.device_buffers:
                self.device_buffers[name] = cuda.DeviceArray(shape=data.shape, dtype=data.dtype)
                G_LOGGER.verbose("Allocated: {:}".format(self.device_buffers[name]))
            pointers.append(self.device_buffers[name].copy_from(data).ptr)
        elif isinstance(data, int):
            # Assumed to already be a raw device pointer.
            pointers.append(data)
        else:
            G_LOGGER.error(
                "Calibration data loader provided an unrecognized type: {:} for input: {:}.\n"
                "Please provide either a NumPy array, Polygraphy DeviceView, or GPU pointer. "
                .format(type(data).__name__, name))
            return None
    return pointers
def make_buffers(engine):
    """
    Creates empty host and device buffers for the specified engine.
    Always uses binding names from Profile 0.
    """
    device_buffers = OrderedDict()
    host_output_buffers = OrderedDict()

    for idx in range(trt_util.get_bindings_per_profile(engine)):
        name = engine[idx]
        np_dtype = trt_util.np_dtype_from_trt(engine.get_binding_dtype(name))
        # Device buffers start empty; they are resized when data arrives.
        device_buffers[name] = cuda.DeviceArray(dtype=np_dtype)
        # Only outputs need a host-side staging buffer.
        if not engine.binding_is_input(name):
            host_output_buffers[name] = np.empty(shape=tuple(), dtype=np_dtype)

    G_LOGGER.extra_verbose("Created device buffers: {:}".format(device_buffers))
    return device_buffers, host_output_buffers