def test_reregister_after_register(self):
    """Re-registering an already-registered region must be rejected.

    Creates a valid system shared memory region, registers it, then
    verifies that a second register call with the same name fails and
    that the server still reports exactly one registered region.
    """
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)
    # BUG FIX: the original try/except silently passed when no exception was
    # raised by the duplicate registration; assertRaisesRegex fails the test
    # in that case while still checking the server's error message.
    with self.assertRaisesRegex(
            Exception,
            "shared memory region 'dummy_data' already in manager"):
        triton_client.register_system_shared_memory("dummy_data",
                                                    "/dummy_data", 8)
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertTrue(len(shm_status) == 1)
    else:
        self.assertTrue(len(shm_status.regions) == 1)
    shm.destroy_shared_memory_region(shm_op0_handle)
def test_invalid_create_shm(self):
    """Creating a system shared memory region with a negative size must fail."""
    try:
        handle = shm.create_shared_memory_region("dummy_data", "/dummy_data",
                                                 -1)
        shm.destroy_shared_memory_region(handle)
    except Exception as ex:
        self.assertEqual(str(ex), "unable to initialize the size")
def cleanup_shm_regions(self, shm_handles):
    """Unregister all regions from the server, then destroy the local handles.

    Unregistering must happen before the shared memory itself is destroyed.
    """
    self.triton_client_.unregister_system_shared_memory()
    self.triton_client_.unregister_cuda_shared_memory()
    for handle in shm_handles:
        # Each entry is a tuple; index 2 holds the raw region handle.
        if _test_system_shared_memory:
            shm.destroy_shared_memory_region(handle[2])
        elif _test_cuda_shared_memory:
            cudashm.destroy_shared_memory_region(handle[2])
def test_unregister_before_register(self):
    """Unregistering a never-registered region leaves the server empty."""
    client_module = httpclient if _protocol == "http" else grpcclient
    triton_client = client_module.InferenceServerClient(_url, verbose=True)
    shm_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data",
                                                 8)
    # Unregister without ever having registered the region.
    triton_client.unregister_system_shared_memory("dummy_data")
    status = triton_client.get_system_shared_memory_status()
    region_count = len(status) if _protocol == "http" else len(status.regions)
    self.assertEqual(region_count, 0)
    shm.destroy_shared_memory_region(shm_handle)
def test_valid_create_set_register(self):
    """Create a region, fill it with data, and register it with the server."""
    client_module = httpclient if _protocol == "http" else grpcclient
    triton_client = client_module.InferenceServerClient(_url, verbose=True)
    shm_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data",
                                                 8)
    shm.set_shared_memory_region(shm_handle,
                                 [np.array([1, 2], dtype=np.float32)])
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)
    status = triton_client.get_system_shared_memory_status()
    region_count = len(status) if _protocol == "http" else len(status.regions)
    self.assertEqual(region_count, 1)
    shm.destroy_shared_memory_region(shm_handle)
def unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                   precreated_shm_regions, outputs,
                                   use_system_shared_memory,
                                   use_cuda_shared_memory):
    """Unregister and destroy the shared memory regions used by a test.

    The two input regions (indices 0 and 1) are always cleaned up; the
    output regions (indices 2 and 3) are cleaned up only when they were
    not pre-created by the caller (``precreated_shm_regions is None``).
    """
    # Nothing to do when no shared-memory mode is in use.
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    triton_client = httpclient.InferenceServerClient("localhost:8000")

    # Bind the right unregister/destroy pair once instead of branching
    # at every call site.
    if use_cuda_shared_memory:
        unregister = triton_client.unregister_cuda_shared_memory
        destroy = cudashm.destroy_shared_memory_region
    else:
        unregister = triton_client.unregister_system_shared_memory
        destroy = shm.destroy_shared_memory_region

    # Input regions are always cleaned up.
    for idx in (0, 1):
        unregister(shm_regions[idx] + '_data')
        destroy(shm_handles[idx])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            unregister(shm_regions[2] + '_data')
            destroy(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            # Region name index shifts by one when OUTPUT0 was also present.
            unregister(shm_regions[2 + i] + '_data')
            destroy(shm_handles[3])
# Read OUTPUT1 back out of its shared memory region.
output1 = results.get_output("OUTPUT1")
if output1 is not None:
    output1_data = shm.get_contents_as_numpy(
        shm_op1_handle, utils.triton_to_np_dtype(output1['datatype']),
        output1['shape'])
else:
    print("OUTPUT1 is missing in the response.")
    sys.exit(1)

# Validate every element of the sum/difference outputs.
for i in range(16):
    r0 = output0_data[0][i].decode("utf-8")
    r1 = output1_data[0][i].decode("utf-8")
    print(f"{str(input0_data[i])} + {str(input1_data[i])} = {r0}")
    print(f"{str(input0_data[i])} - {str(input1_data[i])} = {r1}")
    if expected_sum[i] != r0:
        print("shm infer error: incorrect sum")
        sys.exit(1)
    if expected_diff[i] != r1:
        print("shm infer error: incorrect difference")
        sys.exit(1)

# Show the registered regions, then unregister and destroy them all.
print(triton_client.get_system_shared_memory_status())
triton_client.unregister_system_shared_memory()
shm.destroy_shared_memory_region(shm_ip0_handle)
shm.destroy_shared_memory_region(shm_ip1_handle)
shm.destroy_shared_memory_region(shm_op0_handle)
shm.destroy_shared_memory_region(shm_op1_handle)
print('PASS: system shared memory')
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True, use_grpc=True,
               use_http_json_tensors=True, use_streaming=True,
               shm_region_name_prefix=None, use_system_shared_memory=False,
               use_cuda_shared_memory=False, priority=0, timeout_us=0):
    """Run an identity ("zero") model over every requested protocol config.

    For each IO pair, random input data is generated, optionally placed in
    system or CUDA shared memory, sent to the model, and the returned output
    is checked to be identical to the input. Returns the last `results`
    object so callers can inspect the final response.

    BUG FIX: the shared-memory cleanup at the end previously unregistered the
    same region name twice (prefix[0] twice in the CUDA branch, prefix[1]
    twice in the system branch), leaking one registration per IO. Each branch
    now unregisters both the input (prefix[0]) and output (prefix[1]) regions.
    """
    tester.assertTrue(
        use_http or use_grpc or use_http_json_tensors or use_streaming)
    # Each config: (url, protocol, streaming, http-binary-data)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        # JSON tensors cannot represent float16.
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Prepare inputs, expected outputs, and (optionally) shared memory
    # regions for every IO pair.
    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array([
                unicode(str(x), encoding='utf-8')
                for x in input_array.flatten()
            ], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory
        # must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)
        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            # Streaming gRPC: collect the response via the completion callback.
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name, inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority, timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name, inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()),
                priority=priority, timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)
        if model_version != "":
            tester.assertEqual(response_model_version, model_version)
        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                # Recover the IO index from the output name to pick the
                # matching shared memory handle.
                if pf == "libtorch" or pf == "libtorch_nobatch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            # Non-binary HTTP returns strings un-encoded; normalize to bytes.
            if (output_data.dtype == np.object) and (config[3] == False):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    # Final cleanup: unregister input AND output regions (the original code
    # unregistered the same prefix twice per IO) and destroy the handles.
    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
def infer_shape_tensor(tester, pf, tensor_dtype, input_shape_values,
                       dummy_input_shapes, use_http=True, use_grpc=True,
                       use_streaming=True, shm_suffix="",
                       use_system_shared_memory=False, priority=0,
                       timeout_us=0, batch_size=1):
    """Run a shape-tensor (TensorRT plan) model over every requested config.

    Each IO pair has a dummy data tensor plus an INT32 shape tensor; the
    model is expected to echo the shape values back through OUTPUT* and to
    produce DUMMY_OUTPUT* with the shape the shape tensor requested.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    # Each config: (url, protocol, streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()

    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size))
            output_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    output_name + shm_suffix,
                    '/' + output_name + shm_suffix,
                    output_byte_size),
                 output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0],
                                         [in0,])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix,
                    "/" + input_name + shm_suffix, input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix,
                    "/" + output_name + shm_suffix, output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            # Streaming gRPC path.
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name, inputs,
                                          outputs=outputs, priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

            # unregister shared memory region for next config
            # NOTE(review): input_name here still holds the value left over
            # from the last "Set IOs" iteration, so with io_cnt > 1 only the
            # last input region appears to be unregistered — verify intent.
            if use_system_shared_memory:
                triton_client.unregister_system_shared_memory(input_name +
                                                              shm_suffix)
                triton_client.unregister_system_shared_memory(output_name +
                                                              shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
def _cleanup_server(self, shm_handles):
    """Destroy every system shared memory region handle owned by the test."""
    for handle in shm_handles:
        shm.destroy_shared_memory_region(handle)