def _configure_sever(self):
    """Create, fill and register the four CUDA shared memory regions.

    Four 64-byte regions named ``input0_data``, ``input1_data``,
    ``output0_data`` and ``output1_data`` are created.  The first input
    region is filled with the int32 values 0..15 and the second with
    sixteen int32 ones.  All four regions are then registered with the
    server through a client matching the module-level ``_protocol``.

    Returns:
        list: the region handles ordered as
        ``[input0, input1, output0, output1]``.
    """
    region_names = ("input0_data", "input1_data", "output0_data",
                    "output1_data")

    # Create every region first, in the same order they are returned.
    handles = [
        cshm.create_shared_memory_region(region_name, 64, 0)
        for region_name in region_names
    ]

    # Only the two input regions receive data.
    cshm.set_shared_memory_region(
        handles[0], [np.arange(start=0, stop=16, dtype=np.int32)])
    cshm.set_shared_memory_region(handles[1],
                                  [np.ones(shape=16, dtype=np.int32)])

    client_module = httpclient if _protocol == "http" else grpcclient
    triton_client = client_module.InferenceServerClient(_url, verbose=True)

    for region_name, handle in zip(region_names, handles):
        triton_client.register_cuda_shared_memory(
            region_name, cshm.get_raw_handle(handle), 0, 64)

    return handles
def register_add_either_shm_regions(inputs, outputs, shm_region_prefix,
                                    shm_handles, io_num, input_byte_size,
                                    output_byte_size, use_system_shared_memory,
                                    use_cuda_shared_memory, triton_client):
    """(Re)register the shared memory regions of one input/output pair.

    Nothing happens unless at least one of the shared memory flags is set.
    Otherwise the regions ``<prefix><io_num>_data`` are unregistered and
    registered again with ``triton_client`` (system and/or CUDA, each flag
    is handled independently), and ``inputs[io_num]`` / ``outputs[io_num]``
    are pointed at them.

    Args:
        inputs: indexable collection; ``inputs[io_num]`` must provide
            ``set_shared_memory``.
        outputs: indexable collection; ``outputs[io_num]`` must provide
            ``set_shared_memory``.
        shm_region_prefix: pair ``(input_prefix, output_prefix)``.
        shm_handles: ``shm_handles[0][io_num]`` / ``shm_handles[1][io_num]``
            are the input / output handles; only read for CUDA regions.
        io_num: index of the input/output pair.
        input_byte_size: size registered for the input region.
        output_byte_size: size registered for the output region.
        use_system_shared_memory: register system shared memory regions.
        use_cuda_shared_memory: register CUDA shared memory regions.
        triton_client: client used for the (un)register calls.
    """
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return

    in_base = shm_region_prefix[0] + str(io_num)
    out_base = shm_region_prefix[1] + str(io_num)
    in_region = in_base + '_data'
    out_region = out_base + '_data'

    # Unregister then register required shared memory regions
    if use_system_shared_memory:
        for region in (in_region, out_region):
            triton_client.unregister_system_shared_memory(region)
        triton_client.register_system_shared_memory(in_region, '/' + in_base,
                                                    input_byte_size)
        triton_client.register_system_shared_memory(out_region,
                                                    '/' + out_base,
                                                    output_byte_size)

    if use_cuda_shared_memory:
        for region in (in_region, out_region):
            triton_client.unregister_cuda_shared_memory(region)
        triton_client.register_cuda_shared_memory(
            in_region, cudashm.get_raw_handle(shm_handles[0][io_num]), 0,
            input_byte_size)
        triton_client.register_cuda_shared_memory(
            out_region, cudashm.get_raw_handle(shm_handles[1][io_num]), 0,
            output_byte_size)

    # Point the request input/output at the freshly registered regions
    inputs[io_num].set_shared_memory(in_region, input_byte_size)
    outputs[io_num].set_shared_memory(out_region, output_byte_size)
def test_reregister_after_register(self):
    """Register the same CUDA shared memory region twice.

    An 8-byte region named ``dummy_data`` is created and registered, then
    registered a second time under the same name.  If the second
    registration raises, the error text must report that the region is
    already in the manager.  Either way the server must report exactly one
    registered CUDA region afterwards.

    NOTE: as in the original test, a second registration that does not
    raise is tolerated; only the resulting region count is enforced.
    """
    client_module = httpclient if _protocol == "http" else grpcclient
    triton_client = client_module.InferenceServerClient(_url, verbose=True)

    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    # Destroy the region even when a register call or an assertion fails,
    # so a failing test does not leak the allocation.
    try:
        triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
        try:
            triton_client.register_cuda_shared_memory(
                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
        except Exception as ex:
            self.assertIn(
                "shared memory region 'dummy_data' already in manager",
                str(ex))

        shm_status = triton_client.get_cuda_shared_memory_status()
        # The HTTP status is measured directly; the GRPC status exposes
        # the registered regions through its `regions` attribute.
        if _protocol == "http":
            self.assertEqual(len(shm_status), 1)
        else:
            self.assertEqual(len(shm_status.regions), 1)
    finally:
        cshm.destroy_shared_memory_region(shm_op0_handle)
def test_valid_create_set_register(self):
    """Create a CUDA shared memory region, fill it and register it.

    An 8-byte region named ``dummy_data`` is created, filled with two
    float32 values and registered with the server, which must then report
    exactly one registered CUDA region.
    """
    client_module = httpclient if _protocol == "http" else grpcclient
    triton_client = client_module.InferenceServerClient(_url, verbose=True)

    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    # Destroy the region even when a call or an assertion below fails, so
    # a failing test does not leak the allocation.
    try:
        cshm.set_shared_memory_region(
            shm_op0_handle, [np.array([1, 2], dtype=np.float32)])
        triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)

        shm_status = triton_client.get_cuda_shared_memory_status()
        # The HTTP status is measured directly; the GRPC status exposes
        # the registered regions through its `regions` attribute.
        if _protocol == "http":
            self.assertEqual(len(shm_status), 1)
        else:
            self.assertEqual(len(shm_status.regions), 1)
    finally:
        cshm.destroy_shared_memory_region(shm_op0_handle)
def test_too_big_shm(self):
    """Run inference with an input region larger than the model expects.

    A 128-byte region ``input2_data`` is registered and passed to
    ``_basic_inference`` as the second input.  Any error message collected
    by the inference must report the unexpected size for ``INPUT1``.

    NOTE: as in the original test, an inference that collects no error
    message is tolerated.
    """
    error_msg = []
    shm_handles = self._configure_sever()
    # Run the cleanup even when registration, inference or the assertion
    # fails, so the regions created above are always released.
    try:
        shm_ip2_handle = cshm.create_shared_memory_region(
            "input2_data", 128, 0)
        # Track the extra region right away so the cleanup covers it; the
        # inference below only reads indices 0, 2 and 3 of the list.
        shm_handles.append(shm_ip2_handle)

        client_module = httpclient if _protocol == "http" else grpcclient
        triton_client = client_module.InferenceServerClient(_url,
                                                            verbose=True)
        triton_client.register_cuda_shared_memory(
            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)

        self._basic_inference(shm_handles[0], shm_ip2_handle,
                              shm_handles[2], shm_handles[3], error_msg,
                              "input2_data", 128)
        if error_msg:
            self.assertIn(
                "unexpected size 128 for inference input 'INPUT1', expecting 64",
                error_msg[-1])
    finally:
        self._cleanup_server(shm_handles)
def test_register_after_inference(self):
    """Register an additional CUDA region after an inference has run.

    Runs ``_basic_inference`` on the four regions from
    ``_configure_sever``, then creates and registers a fifth 64-byte
    region ``input2_data`` and checks that the server reports five
    registered CUDA regions.

    Raises:
        Exception: if the inference collected any error message.
    """
    error_msg = []
    shm_handles = self._configure_sever()
    # Run the cleanup even when inference, registration or the assertion
    # fails, so the regions created above are always released.
    try:
        client_module = httpclient if _protocol == "http" else grpcclient
        triton_client = client_module.InferenceServerClient(_url,
                                                            verbose=True)

        self._basic_inference(shm_handles[0], shm_handles[1],
                              shm_handles[2], shm_handles[3], error_msg)
        if error_msg:
            raise Exception(str(error_msg))

        shm_ip2_handle = cshm.create_shared_memory_region(
            "input2_data", 64, 0)
        # Track the new region right away so the cleanup covers it.
        shm_handles.append(shm_ip2_handle)
        triton_client.register_cuda_shared_memory(
            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64)

        shm_status = triton_client.get_cuda_shared_memory_status()
        # The HTTP status is measured directly; the GRPC status exposes
        # the registered regions through its `regions` attribute.
        if _protocol == "http":
            self.assertEqual(len(shm_status), 5)
        else:
            self.assertEqual(len(shm_status.regions), 5)
    finally:
        self._cleanup_server(shm_handles)
def _shape_tensor_dummy_input(tensor_dtype, shape):
    """Return a random tensor of `shape` to send as a DUMMY_INPUT."""
    rtensor_dtype = _range_repr_dtype(tensor_dtype)
    # Compare against both the builtin and the NumPy scalar type: the
    # `np.bool` / `np.object` aliases are not available in every NumPy
    # release, so they must not be referenced here.
    if rtensor_dtype in (bool, np.bool_):
        dummy = np.random.choice(a=[False, True], size=shape)
    else:
        limits = np.iinfo(rtensor_dtype)
        dummy = np.random.randint(low=limits.min,
                                  high=limits.max,
                                  size=shape,
                                  dtype=rtensor_dtype)
    if tensor_dtype in (object, np.object_):
        # Object tensors carry the string form of every element.
        return np.array([str(x) for x in dummy.flatten()],
                        dtype=object).reshape(dummy.shape)
    return dummy.astype(tensor_dtype)


def _shape_tensor_check_output(tester, model_name, output_name,
                               dummy_output_name, out, dummy_out, expected):
    """Assert that one shape-tensor output and its dummy output agree."""
    # if out shape is 2D, it is batched
    if len(out.shape) == 2:
        # The shape of the dummy output should be equal to the shape values
        # specified in the shape tensor
        tester.assertTrue(
            np.array_equal(dummy_out.shape[1:], out[0]),
            "{}, {} shape, expected: {}, got {}".format(
                model_name, dummy_output_name, out[0], dummy_out.shape[1:]))
        for b in range(1, out.shape[0]):
            tester.assertTrue(
                np.array_equal(out[b - 1], out[b]),
                "expect shape tensor has consistent value, "
                "expected: {}, got {}".format(out[b - 1], out[b]))
        out = out[0]
    else:
        tester.assertTrue(
            np.array_equal(dummy_out.shape, out),
            "{}, {} shape, expected: {}, got {}".format(
                model_name, dummy_output_name, out, dummy_out.shape))
    tester.assertTrue(
        np.array_equal(out, expected),
        "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                              expected, out))


def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    """Run a shape-tensor model over each enabled protocol and check it.

    For every IO index a random ``DUMMY_INPUT<n>`` tensor and an int32
    ``INPUT<n>`` shape tensor are sent.  ``OUTPUT<n>`` must equal the shape
    tensor and the shape of ``DUMMY_OUTPUT<n>`` must equal its values.
    When a shared memory flag is set, only the shape tensor input/output
    go through shared memory; the dummy tensors are always sent inline.

    Args:
        tester: object providing ``assertTrue`` / ``assertEqual``.
        pf: model platform, must be ``"plan"`` or ``"plan_nobatch"``.
        tensor_dtype: dtype of the dummy tensors.
        input_shape_values: one list of shape values per IO index.
        dummy_input_shapes: one dummy tensor shape per IO index; must have
            the same length as ``input_shape_values``.
        use_http: run against ``localhost:8000`` over HTTP.
        use_grpc: run against ``localhost:8001`` over GRPC.
        use_streaming: run against ``localhost:8001`` over a GRPC stream.
        shm_suffix: suffix appended to the shared memory region names.
        use_system_shared_memory: place shape tensors in system shared
            memory.
        use_cuda_shared_memory: place shape tensors in CUDA shared memory.
        priority: forwarded to the inference call.
        timeout_us: forwarded to the inference call as ``timeout``.
        batch_size: multiplier applied to the output region byte size.

    Raises:
        ValueError: if both shared memory flags are set.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    # Each config is (url, protocol, streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)
    use_shm = use_system_shared_memory or use_cuda_shared_memory

    # For (cuda) shared memory, it's only set for shape tensor for simplicity.
    # Regular tensor with (cuda) shared memory should be well-tested in other
    # tests.
    # item is (handle, byte_size, is_cuda)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = {}

    # Everything that can fail after a region has been created runs inside
    # this try so the regions are destroyed on failure as well.
    try:
        # Prepare IO in advance
        for io_num in range(io_cnt):
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            # Prepare the dummy tensor
            dummy_input_list.append(
                _shape_tensor_dummy_input(tensor_dtype,
                                          dummy_input_shapes[io_num]))

            # Prepare shape input tensor
            in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
            input_list.append(in0)

            # Prepare the expected value for the output. Skip dummy output
            # as we only care about its shape (== value of OUTPUT*)
            expected_dict[output_name] = in0.copy()

            # Only need to create region once
            input_byte_size = in0.size * np.dtype(np.int32).itemsize
            output_byte_size = input_byte_size * batch_size
            if use_system_shared_memory:
                input_shm_handle_list.append(
                    (shm.create_shared_memory_region(
                        input_name + shm_suffix,
                        '/' + input_name + shm_suffix,
                        input_byte_size), input_byte_size, False))
                output_shm_handle_list.append(
                    (shm.create_shared_memory_region(
                        output_name + shm_suffix,
                        '/' + output_name + shm_suffix,
                        output_byte_size), output_byte_size, False))
                shm.set_shared_memory_region(input_shm_handle_list[-1][0],
                                             [in0])
            elif use_cuda_shared_memory:
                input_shm_handle_list.append(
                    (cudashm.create_shared_memory_region(
                        input_name + shm_suffix, input_byte_size, 0),
                     input_byte_size, True))
                output_shm_handle_list.append(
                    (cudashm.create_shared_memory_region(
                        output_name + shm_suffix, output_byte_size, 0),
                     output_byte_size, True))
                cudashm.set_shared_memory_region(
                    input_shm_handle_list[-1][0], [in0])

        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        # Run inference and check results for each config
        for url, protocol, streaming in configs:
            client_utils = grpcclient if protocol == "grpc" else httpclient
            triton_client = client_utils.InferenceServerClient(url,
                                                               verbose=True)
            inputs = []
            outputs = []
            # Region names registered through this client.  Tracked so that
            # exactly these regions are unregistered before the next config,
            # for every IO index and also when a check fails.
            registered_regions = []
            try:
                # Set IOs
                for io_num in range(io_cnt):
                    dummy_input_name = "DUMMY_INPUT{}".format(io_num)
                    input_name = "INPUT{}".format(io_num)
                    dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
                    output_name = "OUTPUT{}".format(io_num)
                    inputs.append(
                        client_utils.InferInput(
                            dummy_input_name, dummy_input_shapes[io_num],
                            np_to_triton_dtype(tensor_dtype)))
                    inputs.append(
                        client_utils.InferInput(input_name,
                                                input_list[io_num].shape,
                                                "INT32"))
                    outputs.append(
                        client_utils.InferRequestedOutput(dummy_output_name))
                    outputs.append(
                        client_utils.InferRequestedOutput(output_name))

                    # -2: dummy; -1: input
                    inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
                    if not use_shm:
                        inputs[-1].set_data_from_numpy(input_list[io_num])
                        continue

                    input_region = input_name + shm_suffix
                    output_region = output_name + shm_suffix
                    input_byte_size = input_shm_handle_list[io_num][1]
                    output_byte_size = output_shm_handle_list[io_num][1]
                    if use_system_shared_memory:
                        triton_client.register_system_shared_memory(
                            input_region, "/" + input_region,
                            input_byte_size)
                        registered_regions.append(input_region)
                        triton_client.register_system_shared_memory(
                            output_region, "/" + output_region,
                            output_byte_size)
                        registered_regions.append(output_region)
                    else:
                        triton_client.register_cuda_shared_memory(
                            input_region,
                            cudashm.get_raw_handle(
                                input_shm_handle_list[io_num][0]), 0,
                            input_byte_size)
                        registered_regions.append(input_region)
                        triton_client.register_cuda_shared_memory(
                            output_region,
                            cudashm.get_raw_handle(
                                output_shm_handle_list[io_num][0]), 0,
                            output_byte_size)
                        registered_regions.append(output_region)
                    inputs[-1].set_shared_memory(input_region,
                                                 input_byte_size)
                    outputs[-1].set_shared_memory(output_region,
                                                  output_byte_size)

                if streaming:
                    user_data = UserData()
                    triton_client.start_stream(
                        partial(completion_callback, user_data))
                    # The stream is stopped whether or not the request
                    # could be sent; the response is read afterwards.
                    try:
                        triton_client.async_stream_infer(model_name,
                                                         inputs,
                                                         outputs=outputs,
                                                         priority=priority,
                                                         timeout=timeout_us)
                    finally:
                        triton_client.stop_stream()
                    (results, error) = user_data._completed_requests.get()
                    if error is not None:
                        raise error
                else:
                    results = triton_client.infer(model_name,
                                                  inputs,
                                                  outputs=outputs,
                                                  priority=priority,
                                                  timeout=timeout_us)

                for io_num in range(io_cnt):
                    output_name = "OUTPUT{}".format(io_num)
                    dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
                    expected = expected_dict[output_name]

                    # get outputs as numpy array
                    dummy_out = results.as_numpy(dummy_output_name)
                    if not use_shm:
                        out = results.as_numpy(output_name)
                    else:
                        output = results.get_output(output_name)
                        if protocol == "grpc":
                            output_shape = output.shape
                        else:
                            output_shape = output["shape"]
                        if use_system_shared_memory:
                            out = shm.get_contents_as_numpy(
                                output_shm_handle_list[io_num][0], np.int32,
                                output_shape)
                        else:
                            out = cudashm.get_contents_as_numpy(
                                output_shm_handle_list[io_num][0], np.int32,
                                output_shape)

                    _shape_tensor_check_output(tester, model_name,
                                               output_name,
                                               dummy_output_name, out,
                                               dummy_out, expected)
            finally:
                # unregister shared memory region for next config
                if registered_regions:
                    if use_system_shared_memory:
                        unregister = (
                            triton_client.unregister_system_shared_memory)
                    else:
                        unregister = (
                            triton_client.unregister_cuda_shared_memory)
                    for region_name in registered_regions:
                        unregister(region_name)
    finally:
        for handle, _, is_cuda in (input_shm_handle_list +
                                   output_shm_handle_list):
            if is_cuda:
                cudashm.destroy_shared_memory_region(handle)
            else:
                shm.destroy_shared_memory_region(handle)
def register_add_shm_regions(inputs, outputs, shm_region_names,
                             precreated_shm_regions, shm_handles,
                             input0_byte_size, input1_byte_size,
                             output0_byte_size, output1_byte_size,
                             use_system_shared_memory, use_cuda_shared_memory,
                             triton_client):
    """(Re)register the two-input / two-output shared memory regions.

    Nothing happens unless at least one shared memory flag is set.
    Otherwise the two input regions are unregistered and registered again
    with ``triton_client``, the requested output regions are re-registered
    when ``precreated_shm_regions`` is None, and ``inputs[0]`` /
    ``inputs[1]`` are pointed at their regions.

    Args:
        inputs: ``inputs[0]`` and ``inputs[1]`` must provide
            ``set_shared_memory``.
        outputs: container tested for ``"OUTPUT0"`` / ``"OUTPUT1"``.
        shm_region_names: base region names; indices 0 and 1 are the
            inputs, output names follow from index 2 in request order (the
            OUTPUT1 name sits at index 2 when OUTPUT0 is not requested).
        precreated_shm_regions: when not None, output regions are left
            untouched.
        shm_handles: CUDA handles at fixed positions ``[input0, input1,
            output0, output1]``; only read for CUDA regions.
        input0_byte_size: size registered for the first input region.
        input1_byte_size: size registered for the second input region.
        output0_byte_size: size registered for the OUTPUT0 region.
        output1_byte_size: size registered for the OUTPUT1 region.
        use_system_shared_memory: register system shared memory regions.
        use_cuda_shared_memory: register CUDA shared memory regions.
        triton_client: client used for the (un)register calls.
    """
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return

    def region_key(index):
        # Name under which region `index` is registered with the server.
        return shm_region_names[index] + '_data'

    # (output name, fixed position in shm_handles, byte size)
    output_specs = (("OUTPUT0", 2, output0_byte_size),
                    ("OUTPUT1", 3, output1_byte_size))
    input_sizes = (input0_byte_size, input1_byte_size)

    # Unregister then register required shared memory regions
    if use_system_shared_memory:
        for index in (0, 1):
            triton_client.unregister_system_shared_memory(region_key(index))
        for index, byte_size in enumerate(input_sizes):
            triton_client.register_system_shared_memory(
                region_key(index), '/' + shm_region_names[index], byte_size)
        name_slot = 2
        for output_name, _, byte_size in output_specs:
            if output_name not in outputs:
                continue
            if precreated_shm_regions is None:
                triton_client.unregister_system_shared_memory(
                    region_key(name_slot))
                triton_client.register_system_shared_memory(
                    region_key(name_slot), '/' + shm_region_names[name_slot],
                    byte_size)
            name_slot += 1

    if use_cuda_shared_memory:
        for index in (0, 1):
            triton_client.unregister_cuda_shared_memory(region_key(index))
        for index, byte_size in enumerate(input_sizes):
            triton_client.register_cuda_shared_memory(
                region_key(index), cudashm.get_raw_handle(shm_handles[index]),
                0, byte_size)
        name_slot = 2
        for output_name, handle_slot, byte_size in output_specs:
            if output_name not in outputs:
                continue
            if precreated_shm_regions is None:
                triton_client.unregister_cuda_shared_memory(
                    region_key(name_slot))
                triton_client.register_cuda_shared_memory(
                    region_key(name_slot),
                    cudashm.get_raw_handle(shm_handles[handle_slot]), 0,
                    byte_size)
            name_slot += 1

    # Add shared memory regions to inputs
    inputs[0].set_shared_memory(region_key(0), input0_byte_size)
    inputs[1].set_shared_memory(region_key(1), input1_byte_size)
# Input tensors: the first holds the unique integers 0..15 and the second
# is all ones.
input0_data = np.arange(16, dtype=np.int32)
input1_data = np.ones(16, dtype=np.int32)

# Output regions are sized exactly like one input tensor.
input_byte_size = input0_data.nbytes
output_byte_size = input_byte_size

# Create Output0 and Output1 in Shared Memory and store shared memory handles
shm_op0_handle = cudashm.create_shared_memory_region("output0_data",
                                                     output_byte_size, 0)
shm_op1_handle = cudashm.create_shared_memory_region("output1_data",
                                                     output_byte_size, 0)

# Register Output0 and Output1 shared memory with Triton Server
for _region_name, _region_handle in (("output0_data", shm_op0_handle),
                                     ("output1_data", shm_op1_handle)):
    triton_client.register_cuda_shared_memory(
        _region_name, cudashm.get_raw_handle(_region_handle), 0,
        output_byte_size)

# Create Input0 and Input1 in Shared Memory and store shared memory handles
shm_ip0_handle = cudashm.create_shared_memory_region("input0_data",
                                                     input_byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region("input1_data",
                                                     input_byte_size, 0)

# Put input data values into shared memory
for _region_handle, _tensor in ((shm_ip0_handle, input0_data),
                                (shm_ip1_handle, input1_data)):
    cudashm.set_shared_memory_region(_region_handle, [_tensor])