def run(self, client_metadata):
    trial = self.get_trial()
    model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
    triton_client = client_metadata[0]

    input_name = self.input_name_
    if "libtorch" in trial:
        input_name = "INPUT__0"

    # Create a tensor large enough (1 GB) that inference cannot finish
    # within the very small client timeout used below.
    tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                               np.dtype(self.input_dtype_).itemsize),)
    in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(self.input_dtype_)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Expect an exception for small timeout values.
    try:
        triton_client.infer(model_name, inputs, client_timeout=0.1)
        assert False, "expected inference failure from deadline exceeded"
    except Exception as ex:
        if "Deadline Exceeded" not in ex.message():
            assert False, "timeout_client failed {}".format(self.name_)

    # The timeout error is the expected, successful outcome
    return 1
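
# Illustrative usage sketch (not part of the original source): run() above is
# a method of a stress-scenario class. "TimeoutScenario" and its constructor
# arguments here are hypothetical stand-ins for however the suite builds the
# scenario; the only documented contract is that client_metadata[0] is a
# tritonclient.grpc client.
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")
scenario = TimeoutScenario("timeout", input_dtype=np.float32)  # hypothetical
scenario.run((client,))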
def test_custom(self):
    tensor_shape = (self.input_size_,)
    small_tensor_shape = (1,)

    # custom_zero_1_float32 is identity model with input shape [-1]
    for protocol, url in self.protocols_:
        model_name = tu.get_zero_model_name("custom", 1, self.data_type_)
        ctx = InferContext(url, protocol, model_name, None, True)
        self._test_helper(ctx, tensor_shape, small_tensor_shape)
def test_libtorch(self):
    tensor_shape = (self.input_size_,)
    small_tensor_shape = (1,)

    # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
    for protocol, url in self.protocols_:
        model_name = tu.get_zero_model_name("libtorch_nobatch", 1,
                                            self.data_type_)
        ctx = InferContext(url, protocol, model_name, None, True)
        self._test_helper(ctx, tensor_shape, small_tensor_shape,
                          'INPUT__0', 'OUTPUT__0')
def create_identity_ensemble_modelconfig(ensemble_test_type, models_dir,
                                         model_version, max_batch, dtype,
                                         input_shapes, input_model_shapes,
                                         output_shapes, output_model_shapes,
                                         predefined_schedule=None):
    io_cnt = len(input_shapes)

    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        # Use a different model name for the non-batching variant
        ensemble_prefix = "{}_{}".format(ensemble_type, ensemble_test_type)
        model_name = tu.get_zero_model_name(
            ensemble_prefix + ("_nobatch" if max_batch == 0 else ""), io_cnt,
            dtype)

        # [TODO] Temp fix for infer_zero
        ensemble_schedule = predefined_schedule
        if predefined_schedule is None:
            ensemble_schedule = IdentityEnsembleSchedule(
                ensemble_type, ensemble_test_type).get_schedule(
                    dtype, input_shapes, input_model_shapes, output_shapes,
                    output_model_shapes)

        config_dir = models_dir + "/" + model_name
        config = create_general_modelconfig(model_name, "ensemble", max_batch,
                                            repeat(dtype, io_cnt),
                                            input_shapes, input_model_shapes,
                                            repeat(dtype, io_cnt),
                                            output_shapes,
                                            output_model_shapes,
                                            repeat(None, io_cnt))
        config += ensemble_schedule

        try:
            os.makedirs(config_dir)
        except OSError as ex:
            pass  # ignore existing dir

        with open(config_dir + "/config.pbtxt", "w") as cfile:
            cfile.write(config)
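
# Illustrative usage sketch (not part of the original source): generating the
# ensemble configs for a single identity IO pair. The directory and shapes
# are hypothetical; BASIC_ENSEMBLE_TYPES and the helper functions above must
# already be in scope as in the surrounding file.
import numpy as np

create_identity_ensemble_modelconfig("zero", "/tmp/ensemble_models",
                                     model_version=1, max_batch=8,
                                     dtype=np.float32,
                                     input_shapes=([-1],),
                                     input_model_shapes=([-1],),
                                     output_shapes=([-1],),
                                     output_model_shapes=([-1],))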
def create_identity_ensemble_modelfile(ensemble_test_type, models_dir,
                                       model_version, max_batch, dtype,
                                       input_shapes, output_shapes):
    io_cnt = len(input_shapes)

    # Use a different model name for the non-batching variant
    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        ensemble_prefix = "{}_{}".format(ensemble_type, ensemble_test_type)
        model_name = tu.get_zero_model_name(
            ensemble_prefix + ("_nobatch" if max_batch == 0 else ""), io_cnt,
            dtype)
        model_version_dir = models_dir + "/" + model_name + "/" + str(
            model_version)

        try:
            os.makedirs(model_version_dir)
        except OSError as ex:
            pass  # ignore existing dir
def test_dynamic_different_shape_values(self):
    # Send two requests with sum of static batch sizes == preferred
    # size, but with different shape values. This should cause the
    # requests to not be batched. The first response will come back
    # immediately and the second delayed by the max batch queue delay.
    try:
        url = "localhost:8000"
        protocol = ProtocolType.HTTP
        model_name = tu.get_zero_model_name("plan", 1, np.float32)
        self.check_setup(url, protocol, model_name)
        self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

        threads = []
        threads.append(
            threading.Thread(target=self.check_response,
                             args=(3, (6000, None)),
                             kwargs={
                                 'shape_values': [[2, 2]],
                                 'dummy_input_shapes': [[16, 16]],
                                 'shm_suffix': '{}'.format(len(threads))
                             }))
        threads.append(
            threading.Thread(target=self.check_response,
                             args=(3, (_max_queue_delay_ms * 1.5,
                                       _max_queue_delay_ms)),
                             kwargs={
                                 'shape_values': [[4, 4]],
                                 'dummy_input_shapes': [[16, 16]],
                                 'shm_suffix': '{}'.format(len(threads))
                             }))
        threads[0].start()
        time.sleep(1)
        threads[1].start()
        for t in threads:
            t.join()
        self.check_deferred_exception()
        self.check_status(url, protocol, model_name, (3,), 2, 6)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def test_dynamic_identical_shape_values(self):
    # Send two requests with sum of static batch sizes == preferred
    # size and with identical shape values. This should cause the
    # requests to get batched. Both responses should come back
    # immediately.
    try:
        model_name = tu.get_zero_model_name("plan", 1, np.float32)
        self.check_setup(model_name)
        self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ)

        threads = []
        threads.append(
            threading.Thread(target=self.check_response,
                             args=(4, (6000, None)),
                             kwargs={
                                 'shape_values': [[4, 4]],
                                 'dummy_input_shapes': [[16, 16]],
                                 'shm_suffix': '{}'.format(len(threads))
                             }))
        threads.append(
            threading.Thread(target=self.check_response,
                             args=(2, (6000, None)),
                             kwargs={
                                 'shape_values': [[4, 4]],
                                 'dummy_input_shapes': [[16, 16]],
                                 'shm_suffix': '{}'.format(len(threads))
                             }))
        threads[0].start()
        time.sleep(1)
        threads[1].start()
        for t in threads:
            t.join()
        self.check_deferred_exception()
        self.check_status(model_name, {6: 1}, 1, 6)
    except Exception as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True,
               use_grpc=True, use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_list = list()
        expected_list = list()
        for b in range(batch_size):
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            if (rtensor_dtype != np.bool):
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num],
                                        dtype=rtensor_dtype)
            else:
                in0 = np.random.choice(a=[False, True],
                                       size=input_shapes[io_num])
            if tensor_dtype != np.object:
                in0 = in0.astype(tensor_dtype)
                expected0 = np.ndarray.copy(in0)
            else:
                expected0 = np.array(
                    [unicode(str(x), encoding='utf-8')
                     for x in in0.flatten()],
                    dtype=object)
                in0 = np.array([str(x) for x in in0.flatten()],
                               dtype=object).reshape(in0.shape)

            expected0 = expected0.reshape(output_shapes[io_num])

            input_list.append(in0)
            expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
            np.dtype(tensor_dtype).itemsize * batch_size
        output_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        shm_io_handle = su.create_register_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list, input_byte_size, output_byte_size,
            shared_memory_ctx, use_system_shared_memory,
            use_cuda_shared_memory)

        if len(shm_io_handle) != 0:
            shm_ip_handles.append(shm_io_handle[0])
            shm_op_handles.append(shm_io_handle[1])
            # Pass the per-IO shape with the handle
            input_dict[input_name] = (shm_ip_handles[io_num],
                                      input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[io_num])
        else:
            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            su.destroy_either_shm_region(shm_ip_handles[io_num],
                                         use_system_shared_memory,
                                         use_cuda_shared_memory)
            su.destroy_either_shm_region(shm_op_handles[io_num],
                                         use_system_shared_memory,
                                         use_cuda_shared_memory)

    return results
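
# Illustrative usage sketch (not part of the original suite): exercising the
# infer_zero helper above from a unittest.TestCase. The class name, model
# prefix, and shapes are hypothetical; the test assumes a running server
# with the matching plan_zero_1_float32 identity model loaded.
import unittest
import numpy as np

class ZeroIOTest(unittest.TestCase):
    def test_plan_single_io(self):
        # One input/output pair, batch of 8, system shared memory enabled
        infer_zero(self, "plan", 8, np.float32,
                   input_shapes=([16],), output_shapes=([16],),
                   use_system_shared_memory=True)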
def test_custom(self):
    # custom_zero_1_float32 is identity model with input shape [-1]
    for client in self._clients:
        model_name = tu.get_zero_model_name("custom", 1, self._data_type)
        self._test_helper(client, model_name)
def test_libtorch(self):
    # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
    for client in self._clients:
        model_name = tu.get_zero_model_name("libtorch_nobatch", 1,
                                            self._data_type)
        self._test_helper(client, model_name, 'INPUT__0', 'OUTPUT__0')
def test_plan(self):
    # plan_nobatch_zero_1_float32 is identity model with input shape [-1]
    for client in self._clients:
        model_name = tu.get_zero_model_name("plan_nobatch", 1,
                                            self._data_type)
        self._test_helper(client, model_name)
def test_onnx(self):
    # onnx_nobatch_zero_1_float32 is identity model with input shape [-1]
    for client in self._clients:
        model_name = tu.get_zero_model_name("onnx_nobatch", 1,
                                            self._data_type)
        self._test_helper(client, model_name)
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True,
               use_grpc=True, use_streaming=True):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8000", ProtocolType.HTTP, False, True))
        else:
            configs.append(
                ("localhost:8000", ProtocolType.HTTP, False, False))
    if use_grpc:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8001", ProtocolType.GRPC, False, True))
        else:
            configs.append(
                ("localhost:8001", ProtocolType.GRPC, False, False))
    if use_streaming:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8001", ProtocolType.GRPC, True, True))
        else:
            configs.append(("localhost:8001", ProtocolType.GRPC, True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
        input_dict = {}
        output_dict = {}
        expected_dict = {}

        if config[3]:
            # create and register shared memory region for inputs and outputs
            shm_ip_handles = list()
            shm_op_handles = list()
            shared_memory_ctx = SharedMemoryControlContext(config[0],
                                                           config[1],
                                                           verbose=True)
            for io_num in range(io_cnt):
                input0_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
                    np.dtype(tensor_dtype).itemsize * batch_size
                output0_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
                    np.dtype(tensor_dtype).itemsize * batch_size
                shm_ip_handles.append(
                    shm.create_shared_memory_region(
                        "input" + str(io_num) + "_data",
                        "/input" + str(io_num), input0_byte_size))
                shm_op_handles.append(
                    shm.create_shared_memory_region(
                        "output" + str(io_num) + "_data",
                        "/output" + str(io_num), output0_byte_size))

                shm.register(shm_ip_handles[io_num])
                shm.register(shm_op_handles[io_num])

            offset_input = 0
            offset_output = 0

        for io_num in range(io_cnt):
            if pf == "libtorch" or pf == "libtorch_nobatch":
                input_name = "INPUT__{}".format(io_num)
                output_name = "OUTPUT__{}".format(io_num)
            else:
                input_name = "INPUT{}".format(io_num)
                output_name = "OUTPUT{}".format(io_num)

            input_list = list()
            expected_list = list()
            for b in range(batch_size):
                rtensor_dtype = _range_repr_dtype(tensor_dtype)
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num],
                                        dtype=rtensor_dtype)
                if tensor_dtype != np.object:
                    in0 = in0.astype(tensor_dtype)
                    expected0 = np.ndarray.copy(in0)
                else:
                    expected0 = np.array([
                        unicode(str(x), encoding='utf-8')
                        for x in in0.flatten()
                    ], dtype=object)
                    in0 = np.array([str(x) for x in in0.flatten()],
                                   dtype=object).reshape(in0.shape)

                expected0 = expected0.reshape(output_shapes[io_num])

                input_list.append(in0)
                expected_list.append(expected0)

            expected_dict[output_name] = expected_list

            if config[3]:
                # copy data into shared memory region for input values
                shm.set_shared_memory_region(shm_ip_handles[io_num],
                                             input_list)
                input_dict[input_name] = shm_ip_handles[io_num]
                output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                            shm_op_handles[io_num])
            else:
                input_dict[input_name] = input_list
                output_dict[output_name] = InferContext.ResultFormat.RAW

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))

        if config[3]:
            for io_num in range(io_cnt):
                shared_memory_ctx.unregister(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shared_memory_ctx.unregister(shm_op_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True,
               use_grpc=True, use_http_json_tensors=True,
               use_streaming=True, shm_region_name_prefix=None,
               use_system_shared_memory=False, use_cuda_shared_memory=False,
               priority=0, timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array([
                unicode(str(x), encoding='utf-8')
                for x in input_array.flatten()
            ], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory
        # must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])

        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num,
                    input_byte_size, output_byte_size,
                    use_system_shared_memory, use_cuda_shared_memory,
                    triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()),
                priority=priority,
                timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)
        if model_version != "":
            tester.assertEqual(response_model_version, model_version)
        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)

                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object) and (config[3] == False):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(
                    model_name, result_name, expected, output_data))

        if len(shm_ip_handles) != 0:
            for io_num in range(io_cnt):
                if use_cuda_shared_memory:
                    # Unregister both the input and output regions
                    triton_client.unregister_cuda_shared_memory(
                        shm_region_name_prefix[0] + str(io_num) + '_data')
                    triton_client.unregister_cuda_shared_memory(
                        shm_region_name_prefix[1] + str(io_num) + '_data')
                    cudashm.destroy_shared_memory_region(
                        shm_ip_handles[io_num])
                    cudashm.destroy_shared_memory_region(
                        shm_op_handles[io_num])
                else:
                    triton_client.unregister_system_shared_memory(
                        shm_region_name_prefix[0] + str(io_num) + '_data')
                    triton_client.unregister_system_shared_memory(
                        shm_region_name_prefix[1] + str(io_num) + '_data')
                    shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                    shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
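
# Illustrative usage sketch (not part of the original suite): driving the
# tritonclient-based infer_zero above with CUDA shared memory. The test
# class name and shapes are hypothetical; a server with the matching
# onnx_zero_1_float32 identity model is assumed.
import unittest
import numpy as np

class ZeroSharedMemoryTest(unittest.TestCase):
    def test_onnx_cuda_shm(self):
        infer_zero(self, "onnx", 1, np.float32,
                   input_shapes=([1, 16],), output_shapes=([1, 16],),
                   use_cuda_shared_memory=True)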
def infer_shape_tensor(tester, pf, tensor_dtype, input_shape_values,
                       dummy_input_shapes, use_http=True, use_grpc=True,
                       use_streaming=True, shm_suffix="",
                       use_system_shared_memory=False, priority=0,
                       timeout_us=0, batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()

    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    input_name + shm_suffix, '/' + input_name + shm_suffix,
                    input_byte_size), input_byte_size))
            output_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    output_name + shm_suffix, '/' + output_name + shm_suffix,
                    output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    outputs=outputs,
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape
                # values specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(
                    model_name, output_name, expected, out))

        # unregister shared memory region for next config
        if use_system_shared_memory:
            triton_client.unregister_system_shared_memory(input_name +
                                                          shm_suffix)
            triton_client.unregister_system_shared_memory(output_name +
                                                          shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
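
# Illustrative usage sketch (not part of the original suite): checking that a
# TensorRT plan model reshapes its dummy output according to the shape
# tensor. The test class and values are hypothetical.
import unittest
import numpy as np

class ShapeTensorTest(unittest.TestCase):
    def test_plan_shape_tensor(self):
        infer_shape_tensor(self, "plan", np.float32,
                           input_shape_values=[[4, 4]],
                           dummy_input_shapes=[[1, 8, 8]],
                           use_system_shared_memory=True,
                           batch_size=1)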
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True,
               use_grpc=True, use_streaming=True):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
        input_dict = {}
        output_dict = {}
        expected_dict = {}
        for io_num in range(io_cnt):
            if pf == "libtorch" or pf == "libtorch_nobatch":
                input_name = "INPUT__{}".format(io_num)
                output_name = "OUTPUT__{}".format(io_num)
            else:
                input_name = "INPUT{}".format(io_num)
                output_name = "OUTPUT{}".format(io_num)

            input_list = list()
            expected_list = list()
            for b in range(batch_size):
                rtensor_dtype = _range_repr_dtype(tensor_dtype)
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num],
                                        dtype=rtensor_dtype)
                if tensor_dtype != np.object:
                    in0 = in0.astype(tensor_dtype)
                    expected0 = np.ndarray.copy(in0)
                else:
                    expected0 = np.array(
                        [unicode(str(x), encoding='utf-8')
                         for x in in0.flatten()],
                        dtype=object)
                    in0 = np.array([str(x) for x in in0.flatten()],
                                   dtype=object).reshape(in0.shape)

                expected0 = expected0.reshape(output_shapes[io_num])

                input_list.append(in0)
                expected_list.append(expected0)

            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            expected_dict[output_name] = expected_list

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))

    return results
def __init__(self, name, rng, sequence_trials, identity_trials,
             queue_latency_range_us=(10000, 100000), sequence_id_range=None,
             verbose=False, out_stream=sys.stdout):
    super().__init__(name, [], verbose, out_stream)
    self.rng_ = rng
    self.sequence_id_range_ = sequence_id_range

    # List of tuples
    # (model_name, max_concurrency, batch_size, list(more PA options),
    #  real_data_file)
    self.options_ = []

    # Add no-validation models
    self.options_.append(
        PerfAnalyzerScenario.ModelOption("resnet_v1_50_graphdef_def", 32,
                                         (1, 4, 1), queue_latency_range_us))
    for trial in sequence_trials:
        dtype = self.get_datatype(trial)
        # Skip string sequence model for now, it is hard for PA to
        # generate valid input
        if dtype == np.dtype(object):
            continue
        model_name = tu.get_sequence_model_name(trial, dtype)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us))
    for trial in identity_trials:
        dtype = np.float32
        model_name = tu.get_zero_model_name(trial, 1, dtype)
        if "libtorch" in trial:
            input_shapes = [("INPUT__0", "16")]
        else:
            input_shapes = [("INPUT0", "16")]
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_shapes))

    # Add output-validation versions of the models. Skip resnet as its
    # output data has variation which makes exact matching hard.
    for trial in sequence_trials:
        dtype = self.get_datatype(trial)
        model_name = tu.get_sequence_model_name(trial, dtype)
        data_file = os.path.join("validation_data",
                                 "{}.json".format(model_name))
        self.generate_sequence_data(trial, dtype, data_file)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_file=data_file))
    for trial in identity_trials:
        dtype = np.float32
        model_name = tu.get_zero_model_name(trial, 1, dtype)
        data_file = os.path.join("validation_data",
                                 "{}.json".format(model_name))
        self.generate_identity_data(trial, dtype, data_file)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_file=data_file))
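
# Illustrative usage sketch (not part of the original source): constructing
# the perf-analyzer scenario whose __init__ is shown above. The class name
# PerfAnalyzerScenario is taken from the body; the trial names, RNG, and
# sequence id range here are hypothetical.
import numpy as np

scenario = PerfAnalyzerScenario("perf_analyzer_stress",
                                np.random.RandomState(42),
                                sequence_trials=["onnx"],
                                identity_trials=["libtorch"],
                                sequence_id_range=(0, 1000000))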
def infer_shape_tensor(tester, pf, batch_size, tensor_dtype,
                       input_shape_values, dummy_input_shapes,
                       model_version=None, use_http=True, use_grpc=True,
                       use_streaming=True, shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False, priority=0,
                       timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    io_cnt = len(input_shape_values)

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        tester.assertTrue(pf == "plan" or pf == "plan_nobatch")

        input_name = "INPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)

        input_list = list()
        dummy_input_list = list()
        expected_list = list()
        for b in range(batch_size):
            # Prepare the dummy tensor
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            if (rtensor_dtype != np.bool):
                dummy_in0 = np.random.randint(
                    low=np.iinfo(rtensor_dtype).min,
                    high=np.iinfo(rtensor_dtype).max,
                    size=dummy_input_shapes[io_num],
                    dtype=rtensor_dtype)
            else:
                dummy_in0 = np.random.choice(a=[False, True],
                                             size=dummy_input_shapes[io_num])
            if tensor_dtype != np.object:
                dummy_in0 = dummy_in0.astype(tensor_dtype)
            else:
                dummy_in0 = np.array(
                    [str(x) for x in dummy_in0.flatten()],
                    dtype=object).reshape(dummy_in0.shape)
            dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor. Only one tensor per batch
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected list for the output
        expected0 = np.ndarray.copy(in0)
        expected_list.append(expected0)
        expected_dict[output_name] = expected_list

        # Shape tensors are INT32
        input_byte_size = len(in0) * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        dummy_input_byte_size = tu.shape_element_count(dummy_input_shapes[io_num]) *\
            np.dtype(tensor_dtype).itemsize * batch_size
        # The dimension of this tensor will be the value of the shape tensor
        dummy_output_byte_size = tu.shape_element_count(in0) *\
            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        if use_cuda_shared_memory:
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    input_byte_size, 0))
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    dummy_input_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    output_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    dummy_output_byte_size, 0))

            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                             input_list)
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                             dummy_input_list)
        elif use_system_shared_memory:
            shm_ip_handles.append(
                shm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    "/input" + str(io_num) + shm_suffix, input_byte_size))
            shm_ip_handles.append(
                shm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    "/dummy_input" + str(io_num) + shm_suffix,
                    dummy_input_byte_size))
            shm_op_handles.append(
                shm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    "/output" + str(io_num) + shm_suffix, output_byte_size))
            shm_op_handles.append(
                shm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    "/dummy_output" + str(io_num) + shm_suffix,
                    dummy_output_byte_size))

            shared_memory_ctx.register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.register(shm_op_handles[2 * io_num])
            shared_memory_ctx.register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                         input_list)
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                         dummy_input_list)

        if use_system_shared_memory or use_cuda_shared_memory:
            input_dict[input_name] = (shm_ip_handles[2 * io_num],
                                      [len(input_shape_values[0])])
            input_dict[dummy_input_name] = (shm_ip_handles[2 * io_num + 1],
                                            dummy_input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[2 * io_num])
            output_dict[dummy_output_name] = (InferContext.ResultFormat.RAW,
                                              shm_op_handles[2 * io_num + 1])
        else:
            input_dict[input_name] = input_list
            input_dict[dummy_input_name] = dummy_input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            output_dict[dummy_output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size,
                          priority=priority, timeout_us=timeout_us)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), 2 * io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            expected = expected_dict[output_name][0]
            for b in range(batch_size):
                if result_name == output_name:
                    tester.assertEqual(result_val[b].shape, expected.shape)
                    tester.assertTrue(
                        np.array_equal(result_val[b], expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))
                elif result_name == dummy_output_name:
                    # The shape of the dummy output should be equal to the
                    # shape values specified in the shape tensor
                    tester.assertTrue(
                        np.array_equal(result_val[b].shape, expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))

    if use_cuda_shared_memory or use_system_shared_memory:
        for io_num in range(2 * io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
existing_shm.close()
results = triton_client.infer(model_name, inputs)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--trial',
                        type=str,
                        required=True,
                        help='Set trial for the crashing client')
    FLAGS = parser.parse_args()
    trial = FLAGS.trial

    dtype = np.float32
    model_name = tu.get_zero_model_name(trial, 1, dtype)
    tensor_shape = (1,) if "nobatch" in trial else (1, 1)

    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                     verbose=True)

    # Shared-memory counter lets the parent observe the child's progress
    shm = shared_memory.SharedMemory(create=True, size=8)
    count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
    count[0] = 0

    p = Process(target=crashing_client,
                name="crashing_client",
                args=(
                    model_name,
                    dtype,
                    tensor_shape,
def test_savedmodel(self):
    # savedmodel_nobatch_zero_1_float32 is identity model with input shape [-1]
    for client in self.clients_:
        model_name = tu.get_zero_model_name("savedmodel_nobatch", 1,
                                            self.data_type_)
        self._test_helper(client, model_name)