def create_set_either_shm_region(shm_region_names, input_list, input_byte_size,
                                 output_byte_size, use_system_shared_memory,
                                 use_cuda_shared_memory):
    """Create one input and one output shared-memory region and copy
    'input_list' into the input region.

    'shm_region_names' supplies the (input, output) base names. Returns
    [input_handle, output_handle], or [] when neither shared-memory flavor
    is requested. Raises ValueError when both flavors are requested.
    """
    if use_cuda_shared_memory and use_system_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")
    if not use_system_shared_memory and not use_cuda_shared_memory:
        return []

    if use_cuda_shared_memory:
        ip_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + "_data", input_byte_size, 0)
        op_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + "_data", output_byte_size, 0)
        cudashm.set_shared_memory_region(ip_handle, input_list)
    else:
        # System shared memory: the second argument is the shm key/path.
        ip_handle = shm.create_shared_memory_region(
            shm_region_names[0] + "_data", "/" + shm_region_names[0],
            input_byte_size)
        op_handle = shm.create_shared_memory_region(
            shm_region_names[1] + "_data", "/" + shm_region_names[1],
            output_byte_size)
        shm.set_shared_memory_region(ip_handle, input_list)

    return [ip_handle, op_handle]
def _configure_sever(self):
    """Create, seed and register four 64-byte system shared-memory regions
    (two inputs, two outputs) and return their handles in the order
    [input0, input1, output0, output1].

    NOTE(review): "sever" looks like a typo for "server", but the name is
    part of the public interface so it is kept.
    """
    names = ["input0_data", "input1_data", "output0_data", "output1_data"]
    handles = [
        shm.create_shared_memory_region(name, "/" + name, 64)
        for name in names
    ]
    # Seed the input regions: INPUT0 = 0..15, INPUT1 = all ones.
    shm.set_shared_memory_region(
        handles[0], [np.arange(start=0, stop=16, dtype=np.int32)])
    shm.set_shared_memory_region(handles[1],
                                 [np.ones(shape=16, dtype=np.int32)])
    client_cls = httpclient if _protocol == "http" else grpcclient
    triton_client = client_cls.InferenceServerClient(_url, verbose=True)
    # Make all four regions visible to the server.
    for name in names:
        triton_client.register_system_shared_memory(name, "/" + name, 64)
    return handles
def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np):
    """Copy 'tensor_np' into a new system shared-memory region named 'name',
    register the region with the Triton server, and point 'tensor' at it.

    The created handle is appended to self.shm_handles for later cleanup.
    """
    byte_size = tensor_np.size * tensor_np.dtype.itemsize
    handle = shm.create_shared_memory_region(name, name, byte_size)
    self.shm_handles.append(handle)
    # Put data values into shared memory.
    shm.set_shared_memory_region(handle, [tensor_np])
    # Register shared memory with Triton Server.
    self.triton_client.register_system_shared_memory(name, name, byte_size)
    # Instruct the request tensor to source its data from shared memory.
    tensor.set_shared_memory(name, byte_size)
def predict(self, input_images):
    """Run inference on 'input_images' (staged via shared memory) and
    return the "output" tensor read back from the output region."""
    # Stage the input batch in the pre-created input region.
    shm.set_shared_memory_region(self.input_images_handle, [input_images])
    results = self.triton_client.infer(model_name=self.model_name,
                                       inputs=self.inputs,
                                       outputs=self.outputs)
    # Read results out of the output shared-memory region.
    output = results.get_output("output")
    return shm.get_contents_as_numpy(
        self.output_handle, utils.triton_to_np_dtype(output.datatype),
        output.shape)
def test_valid_create_set_register(self):
    """Create a valid system shared-memory region, fill it, register it,
    and verify the server reports exactly one registered region."""
    client_cls = httpclient if _protocol == "http" else grpcclient
    triton_client = client_cls.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    shm.set_shared_memory_region(shm_op0_handle,
                                 [np.array([1, 2], dtype=np.float32)])
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data",
                                                8)
    shm_status = triton_client.get_system_shared_memory_status()
    # HTTP returns a plain list; gRPC wraps the list in a .regions field.
    region_count = len(shm_status) if _protocol == "http" else len(
        shm_status.regions)
    self.assertTrue(region_count == 1)
    shm.destroy_shared_memory_region(shm_op0_handle)
"/output0_simple", output0_byte_size) triton_client.register_system_shared_memory("output1_data", "/output1_simple", output1_byte_size) # Create Input0 and Input1 in Shared Memory and store shared memory handles shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_simple", input0_byte_size) shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_simple", input1_byte_size) # Put input data values into shared memory shm.set_shared_memory_region(shm_ip0_handle, [input0_data_serialized]) shm.set_shared_memory_region(shm_ip1_handle, [input1_data_serialized]) # Register Input0 and Input1 shared memory with Triton Server triton_client.register_system_shared_memory("input0_data", "/input0_simple", input0_byte_size) triton_client.register_system_shared_memory("input1_data", "/input1_simple", input1_byte_size) # Set the parameters to use data from shared memory inputs = [] inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES")) inputs[-1].set_shared_memory("input0_data", input0_byte_size)
def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    """Run shape-tensor inference against a zero model and validate results.

    For each io pair a dummy data tensor (DUMMY_INPUT*/DUMMY_OUTPUT*) and a
    shape tensor (INPUT*/OUTPUT*) are sent; OUTPUT* is expected to echo the
    shape values and DUMMY_OUTPUT*'s shape must match them. Each requested
    protocol (http/grpc/streaming) is exercised in turn.

    FIXES vs. previous revision:
    - np.bool / np.object (removed in NumPy 1.24) -> np.bool_ / np.object_.
    - per-config unregister now releases ALL io regions, not just the last
      io_num's pair, so re-registration on the next config cannot collide.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))
    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()

    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor with random content of the requested dtype.
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool_):
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object_:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    input_name + shm_suffix, '/' + input_name + shm_suffix,
                    input_byte_size), input_byte_size))
            output_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    output_name + shm_suffix, '/' + output_name + shm_suffix,
                    output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []
        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            # Streaming path: results are delivered via the callback queue.
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape
                # values specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                # Every batch element must carry the same shape-tensor value.
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

        # unregister shared memory region for next config.
        # BUGFIX: previously only the last io_num's pair was unregistered,
        # which left stale registrations behind when io_cnt > 1.
        if use_system_shared_memory:
            for io_num in range(io_cnt):
                triton_client.unregister_system_shared_memory(
                    "INPUT{}".format(io_num) + shm_suffix)
                triton_client.unregister_system_shared_memory(
                    "OUTPUT{}".format(io_num) + shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    """Create (or reuse precreated) shared-memory regions for INPUT0/INPUT1
    and whichever of OUTPUT0/OUTPUT1 appear in 'outputs', then copy the
    input tensors into the input regions.

    Returns (shm_region_names, [ip0, ip1, op0, op1]); an output handle is
    None when that output is not requested. Returns ([], []) when neither
    shared-memory flavor is enabled. Raises ValueError when both flavors
    are requested at once.
    """
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    input0_byte_size = sum([i0.nbytes for i0 in input0_list])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)
        # 'i' offsets the name/precreated index: when OUTPUT0 is absent,
        # OUTPUT1 uses shm_region_names[2] / precreated_shm_regions[0].
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)
        # Same offset logic as the system-shared-memory branch above.
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
shm_op_handle = shm.create_shared_memory_region("output_data", "/output_simple", output_byte_size * 2) # Register shared memory region for outputs with Triton Server triton_client.register_system_shared_memory("output_data", "/output_simple", output_byte_size * 2) # Create shared memory region for input and store shared memory handle shm_ip_handle = shm.create_shared_memory_region("input_data", "/input_simple", input_byte_size * 2) # Put input data values into shared memory shm.set_shared_memory_region(shm_ip_handle, [input0_data]) shm.set_shared_memory_region(shm_ip_handle, [input1_data], offset=input_byte_size) # Register shared memory region for inputs with Triton Server triton_client.register_system_shared_memory("input_data", "/input_simple", input_byte_size * 2) # Set the parameters to use data from shared memory inputs = [] inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) inputs[-1].set_shared_memory("input_data", input_byte_size) inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) inputs[-1].set_shared_memory("input_data", input_byte_size,
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    """Create (or reuse precreated) shared-memory regions for INPUT0/INPUT1
    and whichever of OUTPUT0/OUTPUT1 appear in 'outputs', then copy the
    input tensors into the input regions.

    This variant lazily imports the tritonclient shm modules and sizes
    BYTES (np.object_) tensors by their serialized size rather than .nbytes.

    Returns (shm_region_names, [ip0, ip1, op0, op1]); an output handle is
    None when that output is not requested. Returns ([], []) when neither
    shared-memory flavor is enabled.
    """
    # Lazy shm imports...
    if use_system_shared_memory:
        import tritonclient.utils.shared_memory as shm
    if use_cuda_shared_memory:
        import tritonclient.utils.cuda_shared_memory as cudashm

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    # Serialized string tensors occupy a different number of bytes than the
    # raw numpy object array reports, so size those via serialized_byte_size.
    if input0_list[0].dtype == np.object_:
        input0_byte_size = sum(
            [serialized_byte_size(i0) for i0 in input0_list])
    else:
        input0_byte_size = sum([i0.nbytes for i0 in input0_list])

    if input1_list[0].dtype == np.object_:
        input1_byte_size = sum(
            [serialized_byte_size(i1) for i1 in input1_list])
    else:
        input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)
        # 'i' offsets the name/precreated index: when OUTPUT0 is absent,
        # OUTPUT1 uses shm_region_names[2] / precreated_shm_regions[0].
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)
        # Same offset logic as the system-shared-memory branch above.
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
def check_sequence(self,
                   trial,
                   model_name,
                   input_dtype,
                   correlation_id,
                   sequence_thresholds,
                   values,
                   expected_result,
                   protocol,
                   batch_size=1,
                   sequence_name="<unknown>",
                   tensor_shape=(1,)):
    """Perform sequence of inferences. The 'values' holds a list of tuples,
    one for each inference with format:

    (flag_str, value, (ls_ms, gt_ms), (pre_delay_ms, post_delay_ms))

    Runs the whole sequence on exactly one protocol config, optionally via
    system or CUDA shared memory (module flags), checks per-request and
    whole-sequence latency thresholds, and asserts the final result equals
    'expected_result'. Exceptions are deferred via add_deferred_exception.
    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
        ("custom" not in trial) and ("onnx" not in trial) and
        ("libtorch" not in trial) and ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)

    # Can only send the request exactly once since it is a
    # sequence model with state, so can have only a single config.
    configs = []
    if protocol == "http":
        configs.append((f"{_tritonserver_ipaddr}:8000", "http", False))
    if protocol == "grpc":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", False))
    if protocol == "streaming":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", True))

    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")
    self.assertEqual(len(configs), 1)

    full_shape = tensor_shape if "nobatch" in trial else (
        batch_size,) + tensor_shape

    # create and register shared memory output region in advance,
    # knowing that this function will not be called concurrently.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        output_byte_size = 512
        if _test_system_shared_memory:
            shm_op_handle = shm.create_shared_memory_region(
                "output_data", "/output", output_byte_size)
            self.triton_client_.register_system_shared_memory(
                "output_data", "/output", output_byte_size)
        elif _test_cuda_shared_memory:
            shm_op_handle = cudashm.create_shared_memory_region(
                "output_data", output_byte_size, 0)
            self.triton_client_.register_cuda_shared_memory(
                "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                output_byte_size)

    shm_ip_handles = []

    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)
        if config[2]:
            # Streaming: responses arrive through the completion callback.
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))

        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            # libtorch models name their IO with a "__<index>" suffix.
            INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
            OUTPUT = "OUTPUT__0" if trial.startswith(
                "libtorch") else "OUTPUT"
            for flag_str, value, thresholds, delay_ms in values:
                # Under valgrind/jetson everything is slower, so enforce a
                # minimum pre/post delay for each request.
                if _test_valgrind or _test_jetson:
                    if delay_ms is not None:
                        delay_ms[0] = max(_valgrind_delay_ms, delay_ms[0])
                        delay_ms[1] = max(_valgrind_delay_ms, delay_ms[1])
                    else:
                        delay_ms = (_valgrind_delay_ms, _valgrind_delay_ms)

                if delay_ms is not None:
                    time.sleep(delay_ms[0] / 1000.0)

                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)

                # Construct request IOs
                inputs = []
                outputs = []
                inputs.append(
                    client_utils.InferInput(
                        INPUT, full_shape,
                        np_to_triton_dtype(input_dtype)))
                outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                if input_dtype == np.object_:
                    # String input: fill with the int value's string form.
                    in0 = np.full(full_shape, value, dtype=np.int32)
                    in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                    dtype=object)
                    in0 = in0n.reshape(full_shape)
                else:
                    in0 = np.full(full_shape, value, dtype=input_dtype)

                # create input shared memory and copy input data values into it
                if _test_system_shared_memory or _test_cuda_shared_memory:
                    if input_dtype == np.object_:
                        input_list_tmp = iu.serialize_byte_tensor_list(
                            [in0])
                        input_byte_size = sum([
                            serialized_byte_size(i0)
                            for i0 in input_list_tmp
                        ])
                    else:
                        input_list_tmp = [in0]
                        input_byte_size = sum(
                            [i0.nbytes for i0 in input_list_tmp])
                    # One fresh input region per request in the sequence.
                    ip_name = "ip{}".format(len(shm_ip_handles))
                    if _test_system_shared_memory:
                        shm_ip_handles.append(
                            shm.create_shared_memory_region(
                                ip_name, "/" + ip_name, input_byte_size))
                        shm.set_shared_memory_region(
                            shm_ip_handles[-1], input_list_tmp)
                        triton_client.register_system_shared_memory(
                            ip_name, "/" + ip_name, input_byte_size)
                    elif _test_cuda_shared_memory:
                        shm_ip_handles.append(
                            cudashm.create_shared_memory_region(
                                ip_name, input_byte_size, 0))
                        cudashm.set_shared_memory_region(
                            shm_ip_handles[-1], input_list_tmp)
                        triton_client.register_cuda_shared_memory(
                            ip_name,
                            cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                            input_byte_size)

                    inputs[0].set_shared_memory(ip_name, input_byte_size)
                    outputs[0].set_shared_memory("output_data",
                                                 output_byte_size)
                else:
                    inputs[0].set_data_from_numpy(in0)

                start_ms = int(round(time.time() * 1000))

                if config[2]:
                    triton_client.async_stream_infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                    (results, error) = user_data._completed_requests.get()
                    if error is not None:
                        raise error
                else:
                    results = triton_client.infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                end_ms = int(round(time.time() * 1000))

                # Get value of "OUTPUT", for shared memory, need to get it via
                # shared memory utils
                if (not _test_system_shared_memory) and (
                        not _test_cuda_shared_memory):
                    out = results.as_numpy(OUTPUT)
                else:
                    output = results.get_output(OUTPUT)
                    # HTTP returns a dict, gRPC a message object.
                    if config[1] == "http":
                        output_shape = output["shape"]
                    else:
                        output_shape = output.shape
                    output_type = input_dtype
                    if _test_system_shared_memory:
                        out = shm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                    else:
                        out = cudashm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                result = out[0] if "nobatch" in trial else out[0][0]
                print("{}: {}".format(sequence_name, result))

                # Per-request latency thresholds: (lt_ms, gt_ms).
                if thresholds is not None:
                    lt_ms = thresholds[0]
                    gt_ms = thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue((end_ms - start_ms) < lt_ms,
                                        "expected less than " + str(lt_ms) +
                                        "ms response time, got " +
                                        str(end_ms - start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) > gt_ms,
                            "expected greater than " + str(gt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                if delay_ms is not None:
                    time.sleep(delay_ms[1] / 1000.0)

            seq_end_ms = int(round(time.time() * 1000))

            # Only the final request's result is checked against the
            # expected accumulated value.
            if input_dtype == np.object_:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            # Whole-sequence latency thresholds: (lt_ms, gt_ms).
            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    if _test_jetson:
                        lt_ms *= _jetson_slowdown_factor
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            # Defer so the test harness can surface it after cleanup.
            self.add_deferred_exception(ex)

        if config[2]:
            triton_client.stop_stream()

    # Unregister and destroy every region created above.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        destroy_func = shm.destroy_shared_memory_region if _test_system_shared_memory else cudashm.destroy_shared_memory_region
        destroy_func(shm_op_handle)
        for shm_ip_handle in shm_ip_handles:
            destroy_func(shm_ip_handle)
def precreate_register_dynaseq_shape_tensor_regions(self,
                                                    value_list,
                                                    dtype,
                                                    i,
                                                    batch_size=1,
                                                    tensor_shape=(1,)):
    """Pre-create, fill and register the system shared-memory regions used
    by one dynamic-sequence shape-tensor test.

    For each (shape_value, value) pair in 'value_list', six regions are
    created: input, shape input, dummy input, shape output, output, and
    resized output; the three input regions are filled and all six are
    registered with the server.

    Returns a list of (region_name, byte_size, handle) tuples (six per
    value_list entry), or [] when system shared memory is disabled.

    BUGFIX: the np.object_ branch previously reshaped with 'in0.size'
    before 'in0' was assigned (NameError); it now uses dummy_in0's own size.
    """
    self.assertFalse(_test_cuda_shared_memory,
                     "Shape tensors does not support CUDA shared memory")
    if _test_system_shared_memory:
        shm_region_handles = []
        for j, (shape_value, value) in enumerate(value_list):
            input_list = list()
            shape_input_list = list()
            dummy_input_list = list()

            for b in range(batch_size):
                if dtype == np.object_:
                    dummy_in0 = np.full(tensor_shape, value, dtype=np.int32)
                    dummy_in0n = np.array(
                        [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                        dtype=object)
                    dummy_in0 = dummy_in0n.reshape(tensor_shape)
                else:
                    dummy_in0 = np.full(tensor_shape, value, dtype=dtype)
                dummy_input_list.append(dummy_in0)
                in0 = np.full(tensor_shape, value, dtype=np.int32)
                input_list.append(in0)

            # Only one shape tensor input per batch
            shape_input_list.append(
                np.full(tensor_shape, shape_value, dtype=np.int32))

            # Strings are serialized, so their region size differs from the
            # raw numpy nbytes.
            if dtype == np.object_:
                input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                input_byte_size = sum(
                    [serialized_byte_size(i0) for i0 in input_list_tmp])
            else:
                input_list_tmp = input_list
                input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

            dummy_input_byte_size = sum(
                [i0.nbytes for i0 in dummy_input_list])
            shape_input_byte_size = sum(
                [i0.nbytes for i0 in shape_input_list])
            shape_output_byte_size = shape_input_byte_size
            output_byte_size = np.dtype(np.int32).itemsize + 2
            resized_output_byte_size = 32 * shape_value

            # create shared memory regions and copy data for input values
            ip_name = 'ip{}{}'.format(i, j)
            shape_ip_name = 'shape_ip{}{}'.format(i, j)
            dummy_ip_name = 'dummy_ip{}{}'.format(i, j)
            shape_op_name = 'shape_op{}{}'.format(i, j)
            op_name = 'op{}{}'.format(i, j)
            resized_op_name = 'resized_op{}{}'.format(i, j)

            shm_ip_handle = shm.create_shared_memory_region(
                ip_name, '/' + ip_name, input_byte_size)
            shm_shape_ip_handle = shm.create_shared_memory_region(
                shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
            shm_dummy_ip_handle = shm.create_shared_memory_region(
                dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
            shm_shape_op_handle = shm.create_shared_memory_region(
                shape_op_name, '/' + shape_op_name, shape_output_byte_size)
            shm_op_handle = shm.create_shared_memory_region(
                op_name, '/' + op_name, output_byte_size)
            shm_resized_op_handle = shm.create_shared_memory_region(
                resized_op_name, '/' + resized_op_name,
                resized_output_byte_size)

            shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
            shm.set_shared_memory_region(shm_shape_ip_handle,
                                         shape_input_list)
            shm.set_shared_memory_region(shm_dummy_ip_handle,
                                         dummy_input_list)

            self.triton_client_.register_system_shared_memory(
                ip_name, '/' + ip_name, input_byte_size)
            self.triton_client_.register_system_shared_memory(
                shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
            self.triton_client_.register_system_shared_memory(
                dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
            self.triton_client_.register_system_shared_memory(
                shape_op_name, '/' + shape_op_name, shape_output_byte_size)
            self.triton_client_.register_system_shared_memory(
                op_name, '/' + op_name, output_byte_size)
            self.triton_client_.register_system_shared_memory(
                resized_op_name, '/' + resized_op_name,
                resized_output_byte_size)

            shm_region_handles.append(
                (ip_name, input_byte_size, shm_ip_handle))
            shm_region_handles.append(
                (shape_ip_name, shape_input_byte_size, shm_shape_ip_handle))
            shm_region_handles.append(
                (dummy_ip_name, dummy_input_byte_size, shm_dummy_ip_handle))
            shm_region_handles.append(
                (shape_op_name, shape_output_byte_size, shm_shape_op_handle))
            shm_region_handles.append(
                (op_name, output_byte_size, shm_op_handle))
            shm_region_handles.append(
                (resized_op_name, resized_output_byte_size,
                 shm_resized_op_handle))
        return shm_region_handles
    else:
        return []
def precreate_register_regions(self,
                               value_list,
                               dtype,
                               i,
                               batch_size=1,
                               tensor_shape=(1,)):
    """Create, fill and register one input and one output shared-memory
    region (system or CUDA, per module flags) for each entry in
    'value_list'.

    Returns a list of (region_name, byte_size, handle) tuples, two per
    value_list entry, or [] when neither shared-memory mode is enabled.
    """
    if _test_system_shared_memory or _test_cuda_shared_memory:
        shm_region_handles = []
        for j, value in enumerate(value_list):
            # For string we can't know the size of the output
            # so we conservatively assume 64 bytes for each
            # element of the output
            if dtype == np.object_:
                output_byte_size = 4  # size of empty string
            else:
                output_byte_size = 0
            # create data
            input_list = list()
            for b in range(batch_size):
                if dtype == np.object_:
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                    in0n = np.array([
                        str(x).encode('utf-8')
                        for x in in0.reshape(in0.size)
                    ],
                                    dtype=object)
                    in0 = in0n.reshape(tensor_shape)
                    output_byte_size += 64 * in0.size
                else:
                    in0 = np.full(tensor_shape, value, dtype=dtype)
                    output_byte_size += np.dtype(dtype).itemsize * in0.size
                input_list.append(in0)

            # Strings are serialized, so their region size differs from the
            # raw numpy nbytes.
            if dtype == np.object_:
                input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                input_byte_size = sum(
                    [serialized_byte_size(i0) for i0 in input_list_tmp])
            else:
                input_list_tmp = input_list
                input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

            # create shared memory regions and copy data for input values
            ip_name = 'ip{}{}'.format(i, j)
            op_name = 'op{}{}_data'.format(i, j)
            if _test_system_shared_memory:
                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
            elif _test_cuda_shared_memory:
                shm_ip_handle = cudashm.create_shared_memory_region(
                    ip_name, input_byte_size, 0)
                shm_op_handle = cudashm.create_shared_memory_region(
                    op_name, output_byte_size, 0)
                cudashm.set_shared_memory_region(shm_ip_handle,
                                                 input_list_tmp)
                self.triton_client_.register_cuda_shared_memory(
                    ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                    input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
            shm_region_handles.append(
                (ip_name, input_byte_size, shm_ip_handle))
            shm_region_handles.append(
                (op_name, output_byte_size, shm_op_handle))
        return shm_region_handles
    else:
        return []