def _infer_exact_helper(tester,
                        pf,
                        tensor_shape,
                        batch_size,
                        input_dtype,
                        output0_dtype,
                        output1_dtype,
                        output0_raw=True,
                        output1_raw=True,
                        model_version=None,
                        swap=False,
                        outputs=("OUTPUT0", "OUTPUT1"),
                        use_http=True,
                        use_grpc=True,
                        skip_request_id_check=False,
                        use_streaming=True,
                        correlation_id=0):
    for bs in (1, batch_size):
        # model that does not support batching
        if bs == 1:
            iu.infer_exact(tester,
                           pf + "_nobatch",
                           tensor_shape,
                           bs,
                           input_dtype,
                           output0_dtype,
                           output1_dtype,
                           output0_raw=output0_raw,
                           output1_raw=output1_raw,
                           model_version=model_version,
                           swap=swap,
                           outputs=outputs,
                           use_http=use_http,
                           use_grpc=use_grpc,
                           skip_request_id_check=skip_request_id_check,
                           use_streaming=use_streaming,
                           correlation_id=correlation_id,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

        # model that supports batching
        iu.infer_exact(tester,
                       pf, (bs,) + tensor_shape,
                       bs,
                       input_dtype,
                       output0_dtype,
                       output1_dtype,
                       output0_raw=output0_raw,
                       output1_raw=output1_raw,
                       model_version=model_version,
                       swap=swap,
                       outputs=outputs,
                       use_http=use_http,
                       use_grpc=use_grpc,
                       skip_request_id_check=skip_request_id_check,
                       use_streaming=use_streaming,
                       correlation_id=correlation_id,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

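# Minimal usage sketch (hypothetical; the model prefix, shape, batch size and
# dtypes below are assumptions, not taken from a test in this file): a test
# method would call the helper once and let it exercise both the non-batching
# ("<pf>_nobatch") and batching variants of the model, e.g.
#
#   def test_graphdef_int32(self):
#       _infer_exact_helper(self, 'graphdef', (16,), 8,
#                           np.int32, np.int32, np.int32)
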
def test_ensemble_add_sub(self):
    for bs in (1, 8):
        iu.infer_exact(self, "ensemble_add_sub", (bs, 16), bs, np.int32,
                       np.int32, np.int32)

    infer_count = self._get_infer_count_per_version("simple")
    # The two 'simple' versions should have the same infer count
    if infer_count[0] != infer_count[1]:
        self.assertTrue(
            False,
            "unexpected different infer count for different 'simple' versions")

def test_select_optimization_profile(self):
    # Different profiles have different optimized input shapes
    batch_size = 4
    tensor_shape = (16,)
    try:
        iu.infer_exact(self, self.model_name_, tensor_shape, batch_size,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

def test_ensemble_mix_platform(self):
    # Skip on CPU-only machines since a TensorRT model is used in this ensemble
    if CPU_ONLY:
        return
    for bs in (1, 8):
        iu.infer_exact(self,
                       "mix_platform", (bs, 16),
                       bs,
                       np.float32,
                       np.float32,
                       np.float32,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def check_response(self,
                   trial,
                   bs,
                   less_than,
                   threshold_ms,
                   requested_outputs=("OUTPUT0", "OUTPUT1")):
    global _check_exception
    try:
        input_size = 16
        start_ms = int(round(time.time() * 1000))

        if trial == "graphdef" or trial == "netdef":
            tensor_shape = (input_size,)
            iu.infer_exact(self,
                           trial,
                           tensor_shape,
                           bs,
                           True,
                           np.float32,
                           np.float32,
                           np.float32,
                           swap=True,
                           outputs=requested_outputs,
                           use_grpc=False,
                           skip_request_id_check=True)
        elif trial == "plan":
            tensor_shape = (input_size, 1, 1)
            iu.infer_exact(self,
                           trial,
                           tensor_shape,
                           bs,
                           True,
                           np.float32,
                           np.float32,
                           np.float32,
                           swap=True,
                           outputs=requested_outputs,
                           use_grpc=False,
                           skip_request_id_check=True)
        else:
            self.assertFalse(True, "unknown trial type: " + trial)

        end_ms = int(round(time.time() * 1000))

        if less_than:
            self.assertTrue((end_ms - start_ms) < threshold_ms,
                            "expected less than " + str(threshold_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
        else:
            self.assertTrue((end_ms - start_ms) > threshold_ms,
                            "expected greater than " + str(threshold_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
    except Exception as ex:
        _check_exception = ex

def test_ensemble_mix_ensemble(self):
    for bs in (1, 8):
        iu.infer_exact(self,
                       "mix_ensemble", (16,),
                       bs,
                       np.int32,
                       np.float32,
                       np.float32,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def test_ensemble_add_sub_one_output(self):
    for bs in (1, 8):
        iu.infer_exact(self,
                       "ensemble_add_sub", (bs, 16),
                       bs,
                       np.int32,
                       np.int32,
                       np.int32,
                       outputs=("OUTPUT0",))

    infer_count = self._get_infer_count_per_version("simple")
    # Only 'simple' version 2 should have a non-zero infer count
    # as it is in charge of producing OUTPUT0
    if infer_count[0] != 0:
        self.assertTrue(
            False, "unexpected non-zero infer count for 'simple' version 1")
    elif infer_count[1] == 0:
        self.assertTrue(
            False, "unexpected zero infer count for 'simple' version 2")

def _check_infer(self, tensor_shape, batch_size=1):
    try:
        iu.infer_exact(self,
                       self.model_name_,
                       tensor_shape,
                       batch_size,
                       self.dtype_,
                       self.dtype_,
                       self.dtype_,
                       model_version=1,
                       use_grpc=False,
                       use_streaming=False)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

def test_ensemble_label_lookup(self):
    if all(x in BACKENDS for x in ['graphdef', 'netdef', 'savedmodel']):
        # Ensemble needs to look up label from the actual model
        for bs in (1, 8):
            iu.infer_exact(self,
                           "mix_platform", (bs, 16),
                           bs,
                           np.float32,
                           np.float32,
                           np.float32,
                           output0_raw=False,
                           output1_raw=False,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

    if all(x in BACKENDS for x in ['graphdef', 'netdef', 'savedmodel']):
        # Label from the actual model will be passed along the nested ensemble
        for bs in (1, 8):
            iu.infer_exact(self,
                           "mix_ensemble", (bs, 16),
                           bs,
                           np.int32,
                           np.float32,
                           np.float32,
                           output0_raw=False,
                           output1_raw=False,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

    if "graphdef" in BACKENDS:
        # If a label file is provided, it will be used directly
        try:
            iu.infer_exact(self,
                           "wrong_label", (1, 16),
                           1,
                           np.int32,
                           np.float32,
                           np.float32,
                           output0_raw=False,
                           output1_raw=False,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except AssertionError:
            # Sanity check that infer_exact failed since this ensemble is
            # provided with unexpected labels
            pass

    if "graphdef" in BACKENDS:
        for bs in (1, 8):
            iu.infer_exact(self,
                           "label_override", (bs, 16),
                           bs,
                           np.int32,
                           np.float32,
                           np.float32,
                           output0_raw=False,
                           output1_raw=False,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def test_load_specific_optimization_profile(self):
    # Only OP 5 should be available, which only allows batch size 8
    tensor_shape = (1,)
    try:
        iu.infer_exact(self, self.model_name_, (1,) + tensor_shape, 1,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(
            "model expected the shape of dimension 0 to be between 6 and 8 but received 1"
            in ex.message())

    try:
        iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

def test_parse_error_modelfail(self):
    # --strict-readiness=true so server is live but not ready
    input_size = 16
    tensor_shape = (input_size,)

    # Server was started but with a model that fails to load
    try:
        for pair in [("localhost:8000", ProtocolType.HTTP),
                     ("localhost:8001", ProtocolType.GRPC)]:
            model_name = tu.get_model_name('graphdef', np.float32, np.float32,
                                           np.float32)
            ctx = ServerStatusContext(pair[0], pair[1], model_name, True)
            ss = ctx.get_server_status()
            self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
            self.assertEqual("inference:0", ss.id)
            self.assertEqual(server_status.SERVER_READY, ss.ready_state)
            uptime = ss.uptime_ns
            self.assertGreater(uptime, 0)

            self.assertEqual(len(ss.model_status), 1)
            self.assertTrue(model_name in ss.model_status,
                            "expected status for model " + model_name)
            for (k, v) in iteritems(
                    ss.model_status[model_name].version_status):
                self.assertEqual(v.ready_state,
                                 server_status.MODEL_UNAVAILABLE)

            hctx = ServerHealthContext(pair[0], pair[1], True)
            self.assertFalse(hctx.is_ready())
            self.assertTrue(hctx.is_live())

    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    try:
        iu.infer_exact(self, 'graphdef', tensor_shape, 1, True, np.float32,
                       np.float32, np.float32)
        self.assertTrue(False,
                        "expected error for unavailable model " + model_name)
    except InferenceServerException as ex:
        self.assertEqual("inference:0", ex.server_id())
        self.assertGreater(ex.request_id(), 0)
        self.assertTrue(ex.message().startswith(
            "Inference request for unknown model 'graphdef_float32_float32_float32'"
        ))

def test_load_specific_optimization_profile(self):
    # Only OP 5 should be available, which only allows batch size 8
    tensor_shape = (1,)
    try:
        iu.infer_exact(self, self.model_name_, tensor_shape, 1, self.dtype_,
                       self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertEqual("inference:0", ex.server_id())
        self.assertTrue(
            "The shape of dimension 0 is expected to be in range from 6 to 8, Got: 1"
            in ex.message())

    try:
        iu.infer_exact(self, self.model_name_, tensor_shape, 8, self.dtype_,
                       self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

def test_load_default_optimization_profile(self):
    # Only default OP (OP 0) has max tensor shape 33
    tensor_shape = (33,)
    try:
        iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    over_tensor_shape = (34,)
    try:
        iu.infer_exact(self, self.model_name_, (8,) + over_tensor_shape, 8,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(
            "model expected the shape of dimension 1 to be between 1 and 33 but received 34"
            in ex.message())

def test_load_default_optimization_profile(self):
    # Only default OP (OP 0) has max tensor shape 33
    tensor_shape = (33,)
    try:
        iu.infer_exact(self, self.model_name_, tensor_shape, 8, self.dtype_,
                       self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    over_tensor_shape = (34,)
    try:
        iu.infer_exact(self, self.model_name_, over_tensor_shape, 8,
                       self.dtype_, self.dtype_, self.dtype_)
    except InferenceServerException as ex:
        self.assertEqual("inference:0", ex.server_id())
        self.assertTrue(
            "The shape of dimension 1 is expected to be in range from 1 to 33, Got: 34"
            in ex.message())

def test_raw_version_specific_1_3(self):
    input_size = 16

    # There are 3 versions of *_float32_float32_float32 but only
    # versions 1 and 3 should be available.
    for platform in ('graphdef', 'savedmodel', 'netdef', 'plan'):
        if platform == 'plan' and CPU_ONLY:
            continue
        if platform not in BACKENDS:
            continue
        tensor_shape = (1, input_size)
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.float32,
                       np.float32,
                       np.float32,
                       model_version=1,
                       swap=False,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

        try:
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.float32,
                           np.float32,
                           np.float32,
                           model_version=2,
                           swap=True,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except InferenceServerException as ex:
            self.assertTrue(
                ex.message().startswith("Request for unknown model"))

        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.float32,
                       np.float32,
                       np.float32,
                       model_version=3,
                       swap=True,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def test_raw_version_latest_1(self):
    input_size = 16
    tensor_shape = (1, input_size)

    # There are 3 versions of *_int8_int8_int8 but
    # only version 3 should be available.
    for platform in ('graphdef', 'savedmodel'):
        if platform not in BACKENDS:
            continue
        try:
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.int8,
                           np.int8,
                           np.int8,
                           model_version=1,
                           swap=False,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except InferenceServerException as ex:
            self.assertTrue(
                ex.message().startswith("Request for unknown model"))

        try:
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.int8,
                           np.int8,
                           np.int8,
                           model_version=2,
                           swap=True,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except InferenceServerException as ex:
            self.assertTrue(
                ex.message().startswith("Request for unknown model"))

        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.int8,
                       np.int8,
                       np.int8,
                       model_version=3,
                       swap=True,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def test_ensemble_mix_batch_nobatch(self):
    base_names = ["batch_to_nobatch", "nobatch_to_batch"]
    for name in base_names:
        for bs in (1, 8):
            iu.infer_exact(self,
                           name, (bs, 16),
                           bs,
                           np.float32,
                           np.float32,
                           np.float32,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        iu.infer_exact(self,
                       name + "_nobatch", (8, 16),
                       1,
                       np.float32,
                       np.float32,
                       np.float32,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

    # batch -> nobatch -> batch
    for bs in (1, 8):
        iu.infer_exact(self,
                       "mix_nobatch_batch", (bs, 16),
                       bs,
                       np.float32,
                       np.float32,
                       np.float32,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

def test_raw_version_all(self):
    input_size = 16
    tensor_shape = (input_size,)

    # There are 3 versions of *_int32_int32_int32 and all should
    # be available.
    for platform in ('graphdef', 'savedmodel', 'netdef'):
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.int32,
                       np.int32,
                       np.int32,
                       model_version=1,
                       swap=False)
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.int32,
                       np.int32,
                       np.int32,
                       model_version=2,
                       swap=True)
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.int32,
                       np.int32,
                       np.int32,
                       model_version=3,
                       swap=True)

def check_response(self,
                   trial,
                   bs,
                   thresholds,
                   requested_outputs=("OUTPUT0", "OUTPUT1"),
                   input_size=16):
    global _check_exception
    try:
        start_ms = int(round(time.time() * 1000))

        if trial == "savedmodel" or trial == "graphdef" or trial == "netdef" \
                or trial == "custom" or trial == "libtorch" or trial == "onnx":
            tensor_shape = (input_size,)
            iu.infer_exact(self,
                           trial,
                           tensor_shape,
                           bs,
                           np.float32,
                           np.float32,
                           np.float32,
                           swap=False,
                           model_version=1,
                           outputs=requested_outputs,
                           use_grpc=False,
                           skip_request_id_check=True,
                           use_streaming=False)
        elif trial == "plan":
            tensor_shape = (input_size, 1, 1)
            iu.infer_exact(self,
                           trial,
                           tensor_shape,
                           bs,
                           np.float32,
                           np.float32,
                           np.float32,
                           swap=False,
                           model_version=1,
                           outputs=requested_outputs,
                           use_grpc=False,
                           skip_request_id_check=True,
                           use_streaming=False)
        else:
            self.assertFalse(True, "unknown trial type: " + trial)

        end_ms = int(round(time.time() * 1000))

        lt_ms = thresholds[0]
        gt_ms = thresholds[1]
        if lt_ms is not None:
            self.assertTrue((end_ms - start_ms) < lt_ms,
                            "expected less than " + str(lt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
        if gt_ms is not None:
            self.assertTrue((end_ms - start_ms) > gt_ms,
                            "expected greater than " + str(gt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
    except Exception as ex:
        _check_exception = ex

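# Hypothetical usage sketch (the trial name, batch size, and thresholds below
# are assumptions, not taken from a test in this file): a latency test could
# run this check on a worker thread and then surface any captured exception,
# e.g.
#
#   t = threading.Thread(target=self.check_response,
#                        args=("graphdef", 1, (3000, None)))
#   t.start()
#   t.join()
#   if _check_exception is not None:
#       raise _check_exception
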
def test_ensemble_label_lookup(self):
    # Ensemble needs to look up label from the actual model
    for bs in (1, 8):
        iu.infer_exact(self,
                       "mix_platform", (16,),
                       bs,
                       np.float32,
                       np.float32,
                       np.float32,
                       output0_raw=False,
                       output1_raw=False)

    # Label from the actual model will be passed along the nested ensemble
    for bs in (1, 8):
        iu.infer_exact(self,
                       "mix_ensemble", (16,),
                       bs,
                       np.int32,
                       np.float32,
                       np.float32,
                       output0_raw=False,
                       output1_raw=False)

    # If a label file is provided, it will be used directly
    try:
        iu.infer_exact(self,
                       "wrong_label", (16,),
                       1,
                       np.int32,
                       np.float32,
                       np.float32,
                       output0_raw=False,
                       output1_raw=False)
    except AssertionError:
        # Sanity check that infer_exact failed since this ensemble is provided
        # with unexpected labels
        pass

    for bs in (1, 8):
        iu.infer_exact(self,
                       "label_override", (16,),
                       bs,
                       np.int32,
                       np.float32,
                       np.float32,
                       output0_raw=False,
                       output1_raw=False)

def test_raw_version_specific_1(self):
    input_size = 16
    tensor_shape = (input_size,)

    # There are 3 versions of *_float16_float16_float16 but only
    # version 1 should be available.
    for platform in ('graphdef', 'savedmodel'):
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.float16,
                       np.float16,
                       np.float16,
                       model_version=1,
                       swap=False,
                       use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                       use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)

        try:
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.float16,
                           np.float16,
                           np.float16,
                           model_version=2,
                           swap=True,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(ex.message().startswith(
                "Inference request for unknown model"))

        try:
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.float16,
                           np.float16,
                           np.float16,
                           model_version=3,
                           swap=True,
                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(ex.message().startswith(
                "Inference request for unknown model"))

def test_ensemble_mix_batch_nobatch(self):
    base_names = ["batch_to_nobatch", "nobatch_to_batch"]
    for name in base_names:
        for bs in (1, 8):
            iu.infer_exact(self, name, (16,), bs, np.float32, np.float32,
                           np.float32)
        iu.infer_exact(self, name + "_nobatch", (8, 16), 1, np.float32,
                       np.float32, np.float32)

    # batch -> nobatch -> batch
    for bs in (1, 8):
        iu.infer_exact(self, "mix_nobatch_batch", (16,), bs, np.float32,
                       np.float32, np.float32)