Example #1
 def test_register_after_inference(self):
     # Register after inference
     error_msg = []
     shm_handles = self._configure_sever()
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url,
                                                          verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url,
                                                          verbose=True)
     self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                           shm_handles[3], error_msg)
     if len(error_msg) > 0:
         raise Exception(str(error_msg))
     shm_ip2_handle = shm.create_shared_memory_region(
         "input2_data", "/input2_data", 64)
     triton_client.register_system_shared_memory("input2_data",
                                                 "/input2_data", 64)
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 5)
     else:
         self.assertTrue(len(shm_status.regions) == 5)
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
Example #2
    def _configure_sever(self):
        shm_ip0_handle = cshm.create_shared_memory_region("input0_data", 64, 0)
        shm_ip1_handle = cshm.create_shared_memory_region("input1_data", 64, 0)
        shm_op0_handle = cshm.create_shared_memory_region(
            "output0_data", 64, 0)
        shm_op1_handle = cshm.create_shared_memory_region(
            "output1_data", 64, 0)

        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        cshm.set_shared_memory_region(shm_ip0_handle, [input0_data])
        cshm.set_shared_memory_region(shm_ip1_handle, [input1_data])
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url,
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(_url,
                                                             verbose=True)
        triton_client.register_cuda_shared_memory(
            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64)
        return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
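The tests in these excerpts also call self._cleanup_server(shm_handles), whose definition is not included above. A minimal sketch of what such a helper could look like, assuming it only needs to release the CUDA shared memory handles returned by _configure_sever (cshm being the CUDA shared memory utility module already used above):

    def _cleanup_server(self, shm_handles):
        # Hypothetical helper, not taken from the original test file: destroy
        # every CUDA shared memory region handle created in _configure_sever.
        for shm_handle in shm_handles:
            cshm.destroy_shared_memory_region(shm_handle)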
Example #3
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url,
                                                             verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url,
                                                             verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input0_data, binary_data=False)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all())
        except Exception as ex:
            error_msg.append(str(ex))
Example #4
 def test_reregister_after_register(self):
     # Create a valid system shared memory region and re-register after register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url,
                                                          verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url,
                                                          verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     triton_client.register_system_shared_memory("dummy_data",
                                                 "/dummy_data", 8)
     try:
         triton_client.register_system_shared_memory(
             "dummy_data", "/dummy_data", 8)
     except Exception as ex:
         self.assertTrue(
             "shared memory region 'dummy_data' already in manager" in str(
                 ex))
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 1)
     else:
         self.assertTrue(len(shm_status.regions) == 1)
     shm.destroy_shared_memory_region(shm_op0_handle)
Example #5
    def _full_exact(self, model_name, request_concurrency, shape):

        # Run async requests to make sure backend handles concurrent requests
        # correctly.
        client = httpclient.InferenceServerClient(
            "localhost:8000", concurrency=request_concurrency)
        input_datas = []
        requests = []
        for i in range(request_concurrency):
            input_data = (16384 * np.random.randn(*shape)).astype(np.float32)
            input_datas.append(input_data)
            inputs = [
                httpclient.InferInput("INPUT__0", input_data.shape, "FP32")
            ]
            inputs[0].set_data_from_numpy(input_data)
            requests.append(client.async_infer(model_name, inputs))

        for i in range(request_concurrency):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block until the server responds.
            results = requests[i].get_result()

            output_data = results.as_numpy("OUTPUT__0")
            self.assertIsNotNone(output_data,
                                 "error: expected 'OUTPUT__0' to be found")
            np.testing.assert_allclose(output_data, input_datas[i])
    def test_batch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 is the batching version of the model.
        # The request should succeed since the batch size dimension is
        # included in the shape.
        tensor_shape = (1, input_size)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(
                    url='localhost:8000', verbose=True)
                inputs.append(
                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(
                    url='localhost:8001', verbose=True)
                inputs.append(
                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs)
    def test_unknown_model(self):
        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                model_name = "foo"
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())
                server_metadata = triton_client.get_server_metadata()
                if pair[1] == "http":
                    self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                     server_metadata['version'])
                    self.assertEqual("triton", server_metadata['name'])
                else:
                    self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                     server_metadata.version)
                    self.assertEqual("triton", server_metadata.name)

                model_metadata = triton_client.get_model_metadata(model_name)
                self.assertTrue(False, "expected unknown model failure")
        except InferenceServerException as ex:
            self.assertTrue(ex.message().startswith(
                "Request for unknown model: 'foo' is not found"))
    def _addsub_infer(self, model_name):
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))

        # Initialize the data
        inputs[0].set_data_from_numpy(self.input0_, binary_data=False)
        inputs[1].set_data_from_numpy(self.input1_, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

        results = triton_client.infer(model_name, inputs, outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')
        output1_data = results.as_numpy('OUTPUT1')

        self.assertTrue(np.array_equal(self.expected_output0_, output0_data),
                        "incorrect sum")
        self.assertTrue(np.array_equal(self.expected_output1_, output1_data),
                        "incorrect difference")
Example #9
    def setUp(self):
        self._data_type = np.float32

        # A very large tensor will always fail for gRPC because Protobuf has a
        # hard 2 GB limit on the size of input tensors. All backends except
        # the Python and plan backends should be able to handle payloads
        # larger than 2 GB using HTTP.
        very_large_tensor_shape = (math.trunc(
            3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize), )
        self._very_large_in0 = np.random.random(
            very_large_tensor_shape).astype(self._data_type)

        # 1.9 GBs allows us to test gRPC with moderate sizes too.
        large_tensor_shape = (math.trunc(1.9 * (1024 * 1024 * 1024) //
                                         np.dtype(self._data_type).itemsize), )
        self._large_in0 = np.random.random(large_tensor_shape).astype(
            self._data_type)

        small_tensor_shape = (1, )
        self._small_in0 = np.random.random(small_tensor_shape).astype(
            self._data_type)

        self._clients = ((httpclient,
                          httpclient.InferenceServerClient('localhost:8000')),
                         (grpcclient,
                          grpcclient.InferenceServerClient('localhost:8001')))
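As a quick, illustrative check of the sizes chosen above (not part of the original test), the element counts translate into payloads on either side of Protobuf's 2 GB cap:

import numpy as np

# Back-of-the-envelope arithmetic for the tensor shapes used in setUp():
# element count * itemsize gives the payload size in bytes.
itemsize = np.dtype(np.float32).itemsize                   # 4 bytes
very_large_bytes = (3 * 1024**3 // itemsize) * itemsize    # ~3 GB
large_bytes = int(1.9 * 1024**3 // itemsize) * itemsize    # ~1.9 GB
two_gb = 2 * 1024**3
assert very_large_bytes > two_gb  # rejected by gRPC (Protobuf limit)
assert large_bytes < two_gb       # small enough for gRPC as well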
Example #10
    def _full_exact(self, batch_size, model_name, plugin_name):
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', [batch_size, 16],
                                            "FP32"))

        input0_data = np.random.randn(batch_size, 16).astype(np.float32)
        inputs[0].set_data_from_numpy(input0_data, binary_data=False)

        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

        results = triton_client.infer(model_name + '_' + plugin_name,
                                      inputs,
                                      outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')

        # Verify values of Leaky RELU (it uses 0.1 instead of the default 0.01)
        # and for CustomClipPlugin min_clip = 0.1, max_clip = 0.5
        for b in range(batch_size):
            if plugin_name == 'LReLU_TRT':
                test_input = np.where(input0_data > 0, input0_data,
                                      input0_data * 0.1)
                self.assertTrue(np.isclose(output0_data, test_input).all())
            else:
                # [TODO] Add test for CustomClip output
                test_input = np.clip(input0_data, 0.1, 0.5)
    def _full_exact(self, model_name, plugin_name, shape):
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32"))

        input0_data = np.ones(shape=shape).astype(np.float32)
        inputs[0].set_data_from_numpy(input0_data, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

        results = triton_client.infer(model_name + '_' + plugin_name,
                                      inputs,
                                      outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')

        # Verify values of Normalize and GELU
        if plugin_name == 'CustomGeluPluginDynamic':
            # Add bias
            input0_data += 1
            # Calculate Gelu activation
            test_output = (input0_data *
                           0.5) * (1 + np.tanh((0.797885 * input0_data) +
                                               (0.035677 * (input0_data**3))))
            self.assertTrue(np.isclose(output0_data, test_output).all())
        else:
            # L2 norm is sqrt(sum([1]*16))
            test_output = input0_data / np.sqrt(sum([1] * 16))
            self.assertTrue(np.isclose(output0_data, test_output).all())
    def test_infer_stats_no_model(self):
        # Test get_inference_statistics when no model/model_version is passed.
        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())

                # Returns infer stats for ALL models + ready versions
                infer_stats = triton_client.get_inference_statistics()
                if pair[1] == "http":
                    stats = infer_stats['model_stats']
                else:
                    stats = infer_stats.model_stats
                self.assertEqual(
                    len(stats), 207,
                    "expected 207 infer stats for all ready versions of all models"
                )

        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
Example #13
 def infer_unknown(self, model_name, tensor_shape):
     print("About to run the test")
     input_data = np.random.random_sample(tensor_shape).astype(np.float32)
     client = tritonhttpclient.InferenceServerClient('localhost:8000')
     inputs = [
         tritonhttpclient.InferInput("INPUT", input_data.shape,
                                     np_to_triton_dtype(input_data.dtype))
     ]
     inputs[0].set_data_from_numpy(input_data)
     results = client.infer(model_name, inputs)
     self.assertTrue(np.array_equal(results.as_numpy('OUTPUT'), input_data))
    def _no_streaming_helper(self, protocol):
        data_offset = 100
        repeat_count = 1
        delay_time = 1000
        wait_time = 2000

        input_data = np.arange(start=data_offset,
                               stop=data_offset + repeat_count,
                               dtype=np.int32)
        input_data = np.expand_dims(input_data, axis=0)
        delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time
        wait_data = np.array([[wait_time]], dtype=np.uint32)

        if protocol == "grpc":
            # Use the inputs and outputs from the setUp
            this_inputs = self.inputs_
            this_outputs = self.outputs_
        else:
            this_inputs = []
            this_inputs.append(
                httpclient.InferInput('IN', [1, repeat_count], "INT32"))
            this_inputs.append(httpclient.InferInput('DELAY', [1, 1],
                                                     "UINT32"))
            this_inputs.append(httpclient.InferInput('WAIT', [1, 1], "UINT32"))
            this_outputs = []
            this_outputs.append(httpclient.InferRequestedOutput('OUT'))

        # Initialize data for IN
        this_inputs[0].set_shape([1, repeat_count])
        this_inputs[0].set_data_from_numpy(input_data)

        # Initialize data for DELAY
        this_inputs[1].set_shape([1, repeat_count])
        this_inputs[1].set_data_from_numpy(delay_data)

        # Initialize data for WAIT
        this_inputs[2].set_data_from_numpy(wait_data)

        if protocol == "grpc":
            triton_client = grpcclient.InferenceServerClient(
                url="localhost:8001", verbose=True)
        else:
            triton_client = httpclient.InferenceServerClient(
                url="localhost:8000", verbose=True)
        try:
            triton_client.infer(model_name=self.model_name_,
                                inputs=this_inputs,
                                outputs=this_outputs)
            self.assertTrue(False, "expected to fail for decoupled models")
        except InferenceServerException as ex:
            self.assertTrue(
                "doesn't support models with decoupled transaction policy" in
                ex.message())
Example #15
def triton_init(url="localhost:8000"):
    """Initializes the triton client to point at the specified URL

    Parameters
    ----------
    url : str
        The URL on which to address the Triton server, defaults to
        localhost:8000
    """
    global triton_client
    triton_client = tritonhttpclient.InferenceServerClient(url)
    return triton_client
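A possible way to use the helper above (illustration only; it assumes a Triton server is listening on localhost:8000 and relies on the standard is_server_live() call of the tritonclient HTTP API):

# Hypothetical usage sketch, not part of the original example.
client = triton_init("localhost:8000")
assert client.is_server_live()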
Example #16
 def setUp(self):
     self.dtype_ = np.float32
     self.inputs = []
     # 4 sets of inputs with shapes [2], [4], [1], [3]
     for value in [2, 4, 1, 3]:
         self.inputs.append([
             tritonhttpclient.InferInput('RAGGED_INPUT', [1, value], "FP32")
         ])
         self.inputs[-1][0].set_data_from_numpy(
             np.full([1, value], value, np.float32))
     self.client = tritonhttpclient.InferenceServerClient(
         url="localhost:8000", concurrency=len(self.inputs))
 def test_unregister_before_register(self):
     # Create a valid cuda shared memory region and unregister before register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
     triton_client.unregister_cuda_shared_memory("dummy_data")
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 0)
     else:
         self.assertEqual(len(shm_status.regions), 0)
     cshm.destroy_shared_memory_region(shm_op0_handle)
    def setUp(self):
        self.data_type_ = np.float32
        # 6 GB divided by the element size gives the tensor shape
        tensor_shape = (math.trunc(6 * (1024 * 1024 * 1024) /
                                   np.dtype(self.data_type_).itemsize), )
        self.in0_ = np.random.random(tensor_shape).astype(self.data_type_)

        small_tensor_shape = (1, )
        self.sin0_ = np.random.random(small_tensor_shape).astype(
            self.data_type_)

        self.clients_ = ((httpclient,
                          httpclient.InferenceServerClient('localhost:8000')),
                         (grpcclient,
                          grpcclient.InferenceServerClient('localhost:8001')))
    def test_http_infer(self):

        self._prepare_request("http")

        # The model is configured to take three seconds to send the
        # response. Expect an exception for small timeout values.
        with self.assertRaises(socket.timeout) as cm:
            triton_client = httpclient.InferenceServerClient(
                url="localhost:8000", verbose=True, network_timeout=2.0)
            result = triton_client.infer(model_name=self.model_name_,
                                         inputs=self.inputs_,
                                         outputs=self.outputs_)
        self.assertIn("timed out", str(cm.exception))

        # Expect to successfully pass with sufficiently large timeout
        triton_client = httpclient.InferenceServerClient(
            url="localhost:8000", verbose=True, connection_timeout=10.0)

        result = triton_client.infer(model_name=self.model_name_,
                                     inputs=self.inputs_,
                                     outputs=self.outputs_)

        output0_data = result.as_numpy('OUTPUT0')
        self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 def _get_infer_count_per_version(self, model_name):
     triton_client = tritonhttpclient.InferenceServerClient(
         "localhost:8000", verbose=True)
     stats = triton_client.get_inference_statistics(model_name)
     self.assertEqual(len(stats["model_stats"]), 2)
     infer_count = [0, 0]
     for model_stat in stats["model_stats"]:
         self.assertEqual(model_stat["name"], model_name,
                          "expected stats for model " + model_name)
         model_version = model_stat['version']
         if model_version == "1":
             infer_count[0] = model_stat["inference_stats"]["success"]["count"]
         elif model_version == "2":
             infer_count[1] = model_stat["inference_stats"]["success"]["count"]
         else:
             self.assertTrue(
                 False, "unexpected version {} for model {}".format(
                     model_version, model_name))
     return infer_count
Example #21
    def test_batch_item_shape(self):
        # Use 3 sets of inputs with shapes [2, 1, 2], [1, 1, 2], [1, 2, 2]
        # Note that the test only checks the formation of "BATCH_INPUT" where
        # the value of "RAGGED_INPUT" is irrelevant, only the shape matters
        inputs = []
        for value in [[2, 1, 2], [1, 1, 2], [1, 2, 2]]:
            inputs.append(
                [tritonhttpclient.InferInput('RAGGED_INPUT', value, "FP32")])
            inputs[-1][0].set_data_from_numpy(
                np.full(value, value[0], np.float32))
        client = tritonhttpclient.InferenceServerClient(
            url="localhost:8000", concurrency=len(inputs))

        expected_outputs = [
            np.array([[1.0, 2.0], [1.0, 2.0]]),
            np.array([[1.0, 2.0]]),
            np.array([[2.0, 2.0]]),
        ]

        model_name = "batch_item"

        output_name = 'BATCH_OUTPUT'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for request_inputs in inputs:
                # Asynchronous inference call.
                async_requests.append(
                    client.async_infer(model_name=model_name,
                                       inputs=request_inputs,
                                       outputs=outputs))

            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertTrue(
                    np.allclose(output_data, expected_outputs[idx]),
                    "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}"
                    .format(expected_outputs[idx], output_data,
                            np.isclose(expected_outputs[idx], output_data)))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
 def test_valid_create_set_register(self):
     # Create a valid cuda shared memory region, fill data in it and register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
     cshm.set_shared_memory_region(shm_op0_handle,
                                   [np.array([1, 2], dtype=np.float32)])
     triton_client.register_cuda_shared_memory(
         "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 1)
     else:
         self.assertEqual(len(shm_status.regions), 1)
     cshm.destroy_shared_memory_region(shm_op0_handle)
Example #23
 def test_unregister_after_register(self):
     # Create a valid system shared memory region and unregister after register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url,
                                                          verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url,
                                                          verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     triton_client.register_system_shared_memory("dummy_data",
                                                 "/dummy_data", 8)
     triton_client.unregister_system_shared_memory("dummy_data")
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 0)
     else:
         self.assertTrue(len(shm_status.regions) == 0)
     shm.destroy_shared_memory_region(shm_op0_handle)
 def test_unregisterall(self):
     # Unregister all shared memory blocks
     shm_handles = self._configure_sever()
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     status_before = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(status_before), 4)
     else:
         self.assertEqual(len(status_before.regions), 4)
     triton_client.unregister_cuda_shared_memory()
     status_after = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(status_after), 0)
     else:
         self.assertEqual(len(status_after.regions), 0)
     self._cleanup_server(shm_handles)
 def test_too_big_shm(self):
     # Shared memory input region larger than needed - Throws error
     error_msg = []
     shm_handles = self._configure_sever()
     shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0)
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     triton_client.register_cuda_shared_memory(
         "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)
     self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
                           shm_handles[3], error_msg, "input2_data", 128)
     if len(error_msg) > 0:
         self.assertIn(
             "unexpected total byte size 128 for input 'INPUT1', expecting 64",
             error_msg[-1])
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
 def test_unregister_after_inference(self):
     # Unregister after inference
     error_msg = []
     shm_handles = self._configure_sever()
     self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                           shm_handles[3], error_msg)
     if len(error_msg) > 0:
         raise Exception(str(error_msg))
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     triton_client.unregister_cuda_shared_memory("output0_data")
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 3)
     else:
         self.assertEqual(len(shm_status.regions), 3)
     self._cleanup_server(shm_handles)
Example #27
def unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                   precreated_shm_regions, outputs,
                                   use_system_shared_memory,
                                   use_cuda_shared_memory):
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    triton_client = httpclient.InferenceServerClient("localhost:8000")

    if use_cuda_shared_memory:
        triton_client.unregister_cuda_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_cuda_shared_memory(shm_regions[1] + '_data')
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        triton_client.unregister_system_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_system_shared_memory(shm_regions[1] + '_data')
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2 +
                                                                        i] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[3])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2 +
                                                                          i] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[3])
Example #28
    def test_basic(self):
        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                model_name = "graphdef_int32_int8_int8"
                extensions = [
                    'classification', 'sequence', 'model_repository',
                    'schedule_policy', 'model_configuration',
                    'system_shared_memory', 'cuda_shared_memory',
                    'binary_tensor_data', 'statistics'
                ]
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())
                server_metadata = triton_client.get_server_metadata()
                model_metadata = triton_client.get_model_metadata(model_name)

                if pair[1] == "http":
                    self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                     server_metadata['version'])
                    self.assertEqual("triton", server_metadata['name'])
                    for ext in extensions:
                        self.assertTrue(ext in server_metadata['extensions'])

                    self.assertEqual(model_name, model_metadata['name'])
                else:
                    self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                     server_metadata.version)
                    self.assertEqual("triton", server_metadata.name)
                    for ext in extensions:
                        self.assertTrue(ext in server_metadata.extensions)

                    self.assertEqual(model_name, model_metadata.name)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def _test_helper(self, modelVersion, tag, sig_def):
        shape = [self.dims]
        model_name = self.base_model_name + str(modelVersion)
        # The multiplier is defined during model creation. See server/qa/common/gen_tag_sigdef.py
        # for details
        multiplier = modelVersion + 1
        output_name = "OUTPUT"
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)
        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT', shape, "FP32"))
        input_data = np.ones(shape=shape).astype(np.float32)
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))
        results = triton_client.infer(model_name, inputs, outputs=outputs)
        output_data = results.as_numpy(output_name)
        test_output = input_data * multiplier
        self.assertTrue(np.isclose(output_data, test_output).all())
 def test_reregister_after_register(self):
     # Create a valid cuda shared memory region and re-register after register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
     triton_client.register_cuda_shared_memory(
         "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     try:
         triton_client.register_cuda_shared_memory(
             "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     except Exception as ex:
         self.assertIn(
             "shared memory region 'dummy_data' already in manager", str(ex))
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 1)
     else:
         self.assertEqual(len(shm_status.regions), 1)
     cshm.destroy_shared_memory_region(shm_op0_handle)