def _configure_server(self):
     shm_ip0_handle = shm.create_shared_memory_region(
         "input0_data", "/input0_data", 64)
     shm_ip1_handle = shm.create_shared_memory_region(
         "input1_data", "/input1_data", 64)
     shm_op0_handle = shm.create_shared_memory_region(
         "output0_data", "/output0_data", 64)
     shm_op1_handle = shm.create_shared_memory_region(
         "output1_data", "/output1_data", 64)
     input0_data = np.arange(start=0, stop=16, dtype=np.int32)
     input1_data = np.ones(shape=16, dtype=np.int32)
     shm.set_shared_memory_region(shm_ip0_handle, [input0_data])
     shm.set_shared_memory_region(shm_ip1_handle, [input1_data])
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     triton_client.register_system_shared_memory("input0_data",
                                                 "/input0_data", 64)
     triton_client.register_system_shared_memory("input1_data",
                                                 "/input1_data", 64)
     triton_client.register_system_shared_memory("output0_data",
                                                 "/output0_data", 64)
     triton_client.register_system_shared_memory("output1_data",
                                                 "/output1_data", 64)
     return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
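The tests further down call self._cleanup_server(shm_handles), which is not
shown on this page. A minimal sketch of such a helper, assuming the same
_protocol, _url, and client imports used above (the body is illustrative,
not the project's actual implementation):

 def _cleanup_server(self, shm_handles):
     # Illustrative sketch: unregister every system shared memory region
     # from the server, then free the local handles
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     # Calling unregister with no region name removes all registered regions
     triton_client.unregister_system_shared_memory()
     for shm_handle in shm_handles:
         shm.destroy_shared_memory_region(shm_handle)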
Example #2
def create_set_either_shm_region(shm_region_names, input_list, input_byte_size,
                                 output_byte_size, use_system_shared_memory,
                                 use_cuda_shared_memory):
    if use_cuda_shared_memory and use_system_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return []

    if use_cuda_shared_memory:
        shm_ip_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + "_data", input_byte_size, 0)
        shm_op_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + "_data", output_byte_size, 0)
        cudashm.set_shared_memory_region(shm_ip_handle, input_list)
    elif use_system_shared_memory:
        shm_ip_handle = shm.create_shared_memory_region(
            shm_region_names[0] + "_data", "/" + shm_region_names[0],
            input_byte_size)
        shm_op_handle = shm.create_shared_memory_region(
            shm_region_names[1] + "_data", "/" + shm_region_names[1],
            output_byte_size)
        shm.set_shared_memory_region(shm_ip_handle, input_list)

    return [shm_ip_handle, shm_op_handle]
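A hedged usage sketch of the helper above, with illustrative region names and
a single 16-element INT32 input (64 bytes):

# Illustrative call: system shared memory, one input and one output region
input0 = np.arange(16, dtype=np.int32)
handles = create_set_either_shm_region(["input0", "output0"], [input0],
                                       input_byte_size=input0.nbytes,
                                       output_byte_size=input0.nbytes,
                                       use_system_shared_memory=True,
                                       use_cuda_shared_memory=False)
# handles == [shm_ip_handle, shm_op_handle]; both must later be freed
# with shm.destroy_shared_memory_region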
Example #3
 def test_invalid_create_shm(self):
     # Creating a system shared memory region with an invalid
     # (negative) size must raise an error
     with self.assertRaises(Exception) as cm:
         shm_op0_handle = shm.create_shared_memory_region(
             "dummy_data", "/dummy_data", -1)
         shm.destroy_shared_memory_region(shm_op0_handle)
     self.assertEqual(str(cm.exception), "unable to initialize the size")
Example #4
 def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np):
     byte_size = tensor_np.size * tensor_np.dtype.itemsize
     self.shm_handles.append(
         shm.create_shared_memory_region(name, name, byte_size))
     # Put data values into shared memory
     shm.set_shared_memory_region(self.shm_handles[-1], [tensor_np])
     # Register shared memory with Triton Server
     self.triton_client.register_system_shared_memory(name, name, byte_size)
     # Set the parameters to use data from shared memory
     tensor.set_shared_memory(name, byte_size)
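For context, an illustrative way this helper might be driven from within the
same test class; the tensor name, shape, and region name are assumptions:

# Hypothetical driver: wrap a numpy tensor in an InferInput, then let the
# helper create, fill, and register the backing shared memory region
tensor_np = np.ones((1, 16), dtype=np.float32)
tensor = httpclient.InferInput("INPUT0", list(tensor_np.shape), "FP32")
self.add_reformat_free_data_as_shared_memory("input0_data", tensor, tensor_np)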
Example #5
 def test_unregister_before_register(self):
     # Create a valid system shared memory region and unregister before register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     triton_client.unregister_system_shared_memory("dummy_data")
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 0)
     else:
         self.assertEqual(len(shm_status.regions), 0)
     shm.destroy_shared_memory_region(shm_op0_handle)
Example #6
 def test_valid_create_set_register(self):
     # Create a valid system shared memory region, fill data in it and register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     shm.set_shared_memory_region(shm_op0_handle,
                                  [np.array([1, 2], dtype=np.float32)])
     triton_client.register_system_shared_memory("dummy_data", "/dummy_data",
                                                 8)
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 1)
     else:
         self.assertEqual(len(shm_status.regions), 1)
     shm.destroy_shared_memory_region(shm_op0_handle)
Example #7
 def test_too_big_shm(self):
     # Shared memory input region larger than needed - raises an error
     error_msg = []
     shm_handles = self._configure_server()
     shm_ip2_handle = shm.create_shared_memory_region(
         "input2_data", "/input2_data", 128)
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     triton_client.register_system_shared_memory("input2_data",
                                                 "/input2_data", 128)
     self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
                           shm_handles[3], error_msg, "input2_data", 128)
     if len(error_msg) > 0:
         self.assertIn(
             "unexpected total byte size 128 for input 'INPUT1', expecting 64",
             error_msg[-1])
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
Example #8
 def test_register_after_inference(self):
     # Register after inference
     error_msg = []
     shm_handles = self._configure_server()
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                           shm_handles[3], error_msg)
     if len(error_msg) > 0:
         raise Exception(str(error_msg))
     shm_ip2_handle = shm.create_shared_memory_region(
         "input2_data", "/input2_data", 64)
     triton_client.register_system_shared_memory("input2_data",
                                                 "/input2_data", 64)
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 5)
     else:
         self.assertEqual(len(shm_status.regions), 5)
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
Example #9
 def test_reregister_after_register(self):
     # Create and register a valid system shared memory region, then try
     # to register it again
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     triton_client.register_system_shared_memory("dummy_data", "/dummy_data",
                                                 8)
     with self.assertRaises(Exception) as cm:
         triton_client.register_system_shared_memory("dummy_data",
                                                     "/dummy_data", 8)
     self.assertIn("shared memory region 'dummy_data' already in manager",
                   str(cm.exception))
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertEqual(len(shm_status), 1)
     else:
         self.assertEqual(len(shm_status.regions), 1)
     shm.destroy_shared_memory_region(shm_op0_handle)
Example #10
def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertIn(pf, ("plan", "plan_nobatch"))
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()
    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object_:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size))
            output_shm_handle_list.append((shm.create_shared_memory_region(
                output_name + shm_suffix, '/' + output_name + shm_suffix,
                output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

            # unregister shared memory region for next config
            if use_system_shared_memory:
                triton_client.unregister_system_shared_memory(input_name +
                                                              shm_suffix)
                triton_client.unregister_system_shared_memory(output_name +
                                                              shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
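A hedged example invocation, assuming a unittest.TestCase (self), the tu
test-util module imported above, and a matching plan model served on
localhost:

# Illustrative call: one shape-tensor IO pair over system shared memory
infer_shape_tensor(self, "plan", np.float32,
                   input_shape_values=[[4, 4]],
                   dummy_input_shapes=[[1, 4, 4]],
                   use_http=True, use_grpc=False, use_streaming=False,
                   use_system_shared_memory=True)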
Example #11
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    input0_byte_size = sum([i0.nbytes for i0 in input0_list])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
Example #12
    def precreate_register_dynaseq_shape_tensor_regions(self,
                                                        value_list,
                                                        dtype,
                                                        i,
                                                        batch_size=1,
                                                        tensor_shape=(1,)):
        self.assertFalse(_test_cuda_shared_memory,
                         "Shape tensors does not support CUDA shared memory")
        if _test_system_shared_memory:
            shm_region_handles = []
            for j, (shape_value, value) in enumerate(value_list):
                input_list = list()
                shape_input_list = list()
                dummy_input_list = list()

                for b in range(batch_size):
                    if dtype == np.object_:
                        dummy_in0 = np.full(tensor_shape, value, dtype=np.int32)
                        dummy_in0n = np.array(
                            [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                            dtype=object)
                        dummy_in0 = dummy_in0n.reshape(tensor_shape)
                    else:
                        dummy_in0 = np.full(tensor_shape, value, dtype=dtype)
                    dummy_input_list.append(dummy_in0)
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                    input_list.append(in0)

                # Only one shape tensor input per batch
                shape_input_list.append(
                    np.full(tensor_shape, shape_value, dtype=np.int32))

                if dtype == np.object_:
                    input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                    input_byte_size = sum(
                        [serialized_byte_size(i0) for i0 in input_list_tmp])
                else:
                    input_list_tmp = input_list
                    input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

                dummy_input_byte_size = sum(
                    [i0.nbytes for i0 in dummy_input_list])

                shape_input_byte_size = sum(
                    [i0.nbytes for i0 in shape_input_list])
                shape_output_byte_size = shape_input_byte_size
                output_byte_size = np.dtype(np.int32).itemsize + 2
                resized_output_byte_size = 32 * shape_value

                # create shared memory regions and copy data for input values
                ip_name = 'ip{}{}'.format(i, j)
                shape_ip_name = 'shape_ip{}{}'.format(i, j)
                dummy_ip_name = 'dummy_ip{}{}'.format(i, j)
                shape_op_name = 'shape_op{}{}'.format(i, j)
                op_name = 'op{}{}'.format(i, j)
                resized_op_name = 'resized_op{}{}'.format(i, j)

                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_shape_ip_handle = shm.create_shared_memory_region(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                shm_dummy_ip_handle = shm.create_shared_memory_region(
                    dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
                shm_shape_op_handle = shm.create_shared_memory_region(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm_resized_op_handle = shm.create_shared_memory_region(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                shm.set_shared_memory_region(shm_shape_ip_handle,
                                             shape_input_list)
                shm.set_shared_memory_region(shm_dummy_ip_handle,
                                             dummy_input_list)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)

                shm_region_handles.append(
                    (ip_name, input_byte_size, shm_ip_handle))
                shm_region_handles.append(
                    (shape_ip_name, shape_input_byte_size, shm_shape_ip_handle))
                shm_region_handles.append(
                    (dummy_ip_name, dummy_input_byte_size, shm_dummy_ip_handle))
                shm_region_handles.append(
                    (shape_op_name, shape_output_byte_size,
                     shm_shape_op_handle))
                shm_region_handles.append(
                    (op_name, output_byte_size, shm_op_handle))
                shm_region_handles.append(
                    (resized_op_name, resized_output_byte_size,
                     shm_resized_op_handle))
            return shm_region_handles
        else:
            return []
Example #13
    # We use a simple model that takes 2 input tensors of 16 integers
    # each and returns 2 output tensors of 16 integers each. One
    # output tensor is the element-wise sum of the inputs and one
    # output is the element-wise difference.
    model_name = "simple"
    model_version = ""

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all ones.
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)

    input_byte_size = input0_data.size * input0_data.itemsize
    output_byte_size = input_byte_size

    # Create shared memory region for output and store shared memory handle
    shm_op_handle = shm.create_shared_memory_region("output_data",
                                                    "/output_simple",
                                                    output_byte_size * 2)

    # Register shared memory region for outputs with Triton Server
    triton_client.register_system_shared_memory("output_data",
                                                "/output_simple",
                                                output_byte_size * 2)

    # Create shared memory region for input and store shared memory handle
    shm_ip_handle = shm.create_shared_memory_region("input_data",
                                                    "/input_simple",
                                                    input_byte_size * 2)

    # Put input data values into shared memory
    shm.set_shared_memory_region(shm_ip_handle, [input0_data])
    shm.set_shared_memory_region(shm_ip_handle, [input1_data],
                                 offset=input_byte_size)
Example #14
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    # Lazy shm imports...
    if use_system_shared_memory:
        import tritonclient.utils.shared_memory as shm
    if use_cuda_shared_memory:
        import tritonclient.utils.cuda_shared_memory as cudashm

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    if input0_list[0].dtype == np.object_:
        input0_byte_size = sum(
            [serialized_byte_size(i0) for i0 in input0_list])
    else:
        input0_byte_size = sum([i0.nbytes for i0 in input0_list])

    if input1_list[0].dtype == np.object_:
        input1_byte_size = sum(
            [serialized_byte_size(i1) for i1 in input1_list])
    else:
        input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)
        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
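The four handles returned above must eventually be freed by the caller; a
minimal cleanup sketch under the same flags (the function name is
illustrative, and precreated output regions may need to outlive this call):

def cleanup_shm_regions(shm_handles, use_system_shared_memory,
                        use_cuda_shared_memory):
    # Illustrative sketch: destroy whichever kind of region was created.
    # Output handles may be None when that output was not requested.
    for handle in shm_handles:
        if handle is None:
            continue
        if use_cuda_shared_memory:
            cudashm.destroy_shared_memory_region(handle)
        elif use_system_shared_memory:
            shm.destroy_shared_memory_region(handle)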
Example #15
    def check_sequence(self,
                       trial,
                       model_name,
                       input_dtype,
                       correlation_id,
                       sequence_thresholds,
                       values,
                       expected_result,
                       protocol,
                       batch_size=1,
                       sequence_name="<unknown>",
                       tensor_shape=(1,)):
        """Perform sequence of inferences. The 'values' holds a list of
        tuples, one for each inference with format:

        (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms))

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
            self.fail("unknown trial type: " + trial)

        # A request in a stateful sequence can be sent exactly once, so
        # only a single protocol config may be used per call.
        configs = []
        if protocol == "http":
            configs.append((f"{_tritonserver_ipaddr}:8000", "http", False))
        if protocol == "grpc":
            configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", False))
        if protocol == "streaming":
            configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", True))

        self.assertFalse(
            _test_system_shared_memory and _test_cuda_shared_memory,
            "Cannot set both System and CUDA shared memory flags to 1")

        self.assertEqual(len(configs), 1)

        full_shape = tensor_shape if "nobatch" in trial else (
            batch_size,) + tensor_shape

        # create and register shared memory output region in advance,
        # knowing that this function will not be called concurrently.
        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            output_byte_size = 512
            if _test_system_shared_memory:
                shm_op_handle = shm.create_shared_memory_region(
                    "output_data", "/output", output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    "output_data", "/output", output_byte_size)
            elif _test_cuda_shared_memory:
                shm_op_handle = cudashm.create_shared_memory_region(
                    "output_data", output_byte_size, 0)
                self.triton_client_.register_cuda_shared_memory(
                    "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
            shm_ip_handles = []

        for config in configs:
            client_utils = grpcclient if config[1] == "grpc" else httpclient

            triton_client = client_utils.InferenceServerClient(config[0],
                                                               verbose=True)
            if config[2]:
                user_data = UserData()
                triton_client.start_stream(
                    partial(completion_callback, user_data))
            # Execute the sequence of inference...
            try:
                seq_start_ms = int(round(time.time() * 1000))

                INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
                OUTPUT = "OUTPUT__0" if trial.startswith(
                    "libtorch") else "OUTPUT"
                for flag_str, value, thresholds, delay_ms in values:
                    if _test_valgrind or _test_jetson:
                        if delay_ms is not None:
                            delay_ms[0] = max(_valgrind_delay_ms, delay_ms[0])
                            delay_ms[1] = max(_valgrind_delay_ms, delay_ms[1])
                        else:
                            delay_ms = (_valgrind_delay_ms, _valgrind_delay_ms)

                    if delay_ms is not None:
                        time.sleep(delay_ms[0] / 1000.0)

                    seq_start = False
                    seq_end = False
                    if flag_str is not None:
                        seq_start = ("start" in flag_str)
                        seq_end = ("end" in flag_str)

                    # Construct request IOs
                    inputs = []
                    outputs = []
                    inputs.append(
                        client_utils.InferInput(
                            INPUT, full_shape, np_to_triton_dtype(input_dtype)))
                    outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                    if input_dtype == np.object_:
                        in0 = np.full(full_shape, value, dtype=np.int32)
                        in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                        dtype=object)
                        in0 = in0n.reshape(full_shape)
                    else:
                        in0 = np.full(full_shape, value, dtype=input_dtype)

                    # create input shared memory and copy input data values into it
                    if _test_system_shared_memory or _test_cuda_shared_memory:
                        if input_dtype == np.object_:
                            input_list_tmp = iu.serialize_byte_tensor_list(
                                [in0])
                            input_byte_size = sum([
                                serialized_byte_size(i0)
                                for i0 in input_list_tmp
                            ])
                        else:
                            input_list_tmp = [in0]
                            input_byte_size = sum(
                                [i0.nbytes for i0 in input_list_tmp])
                        ip_name = "ip{}".format(len(shm_ip_handles))
                        if _test_system_shared_memory:
                            shm_ip_handles.append(
                                shm.create_shared_memory_region(
                                    ip_name, "/" + ip_name, input_byte_size))
                            shm.set_shared_memory_region(
                                shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_system_shared_memory(
                                ip_name, "/" + ip_name, input_byte_size)
                        elif _test_cuda_shared_memory:
                            shm_ip_handles.append(
                                cudashm.create_shared_memory_region(
                                    ip_name, input_byte_size, 0))
                            cudashm.set_shared_memory_region(
                                shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_cuda_shared_memory(
                                ip_name,
                                cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                                input_byte_size)

                        inputs[0].set_shared_memory(ip_name, input_byte_size)
                        outputs[0].set_shared_memory("output_data",
                                                     output_byte_size)
                    else:
                        inputs[0].set_data_from_numpy(in0)

                    start_ms = int(round(time.time() * 1000))

                    if config[2]:
                        triton_client.async_stream_infer(
                            model_name,
                            inputs,
                            outputs=outputs,
                            sequence_id=correlation_id,
                            sequence_start=seq_start,
                            sequence_end=seq_end)
                        (results, error) = user_data._completed_requests.get()
                        if error is not None:
                            raise error
                    else:
                        results = triton_client.infer(
                            model_name,
                            inputs,
                            outputs=outputs,
                            sequence_id=correlation_id,
                            sequence_start=seq_start,
                            sequence_end=seq_end)

                    end_ms = int(round(time.time() * 1000))

                    # Get value of "OUTPUT", for shared memory, need to get it via
                    # shared memory utils
                    if (not _test_system_shared_memory) and (
                            not _test_cuda_shared_memory):
                        out = results.as_numpy(OUTPUT)
                    else:
                        output = results.get_output(OUTPUT)
                        if config[1] == "http":
                            output_shape = output["shape"]
                        else:
                            output_shape = output.shape
                        output_type = input_dtype
                        if _test_system_shared_memory:
                            out = shm.get_contents_as_numpy(
                                shm_op_handle, output_type, output_shape)
                        else:
                            out = cudashm.get_contents_as_numpy(
                                shm_op_handle, output_type, output_shape)
                    result = out[0] if "nobatch" in trial else out[0][0]
                    print("{}: {}".format(sequence_name, result))

                    if thresholds is not None:
                        lt_ms = thresholds[0]
                        gt_ms = thresholds[1]
                        if lt_ms is not None:
                            self.assertTrue((end_ms - start_ms) < lt_ms,
                                            "expected less than " + str(lt_ms) +
                                            "ms response time, got " +
                                            str(end_ms - start_ms) + " ms")
                        if gt_ms is not None:
                            self.assertTrue(
                                (end_ms - start_ms) > gt_ms,
                                "expected greater than " + str(gt_ms) +
                                "ms response time, got " +
                                str(end_ms - start_ms) + " ms")
                    if delay_ms is not None:
                        time.sleep(delay_ms[1] / 1000.0)

                seq_end_ms = int(round(time.time() * 1000))

                if input_dtype == np.object_:
                    self.assertEqual(int(result), expected_result)
                else:
                    self.assertEqual(result, expected_result)

                if sequence_thresholds is not None:
                    lt_ms = sequence_thresholds[0]
                    gt_ms = sequence_thresholds[1]
                    if lt_ms is not None:
                        if _test_jetson:
                            lt_ms *= _jetson_slowdown_factor
                        self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                        "sequence expected less than " +
                                        str(lt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                        "sequence expected greater than " +
                                        str(gt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
            except Exception as ex:
                self.add_deferred_exception(ex)
            if config[2]:
                triton_client.stop_stream()

        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            destroy_func = (shm.destroy_shared_memory_region
                            if _test_system_shared_memory else
                            cudashm.destroy_shared_memory_region)
            destroy_func(shm_op_handle)
            for shm_ip_handle in shm_ip_handles:
                destroy_func(shm_ip_handle)
Example #16
    expected_diff = np.array(
        [str(x).encode('utf-8') for x in np.subtract(in0, in1).flatten()],
        dtype=object)
    expected_sum_serialized = utils.serialize_byte_tensor(expected_sum)
    expected_diff_serialized = utils.serialize_byte_tensor(expected_diff)

    input0_data_serialized = utils.serialize_byte_tensor(input0_data)
    input1_data_serialized = utils.serialize_byte_tensor(input1_data)
    input0_byte_size = utils.serialized_byte_size(input0_data_serialized)
    input1_byte_size = utils.serialized_byte_size(input1_data_serialized)
    output0_byte_size = utils.serialized_byte_size(expected_sum_serialized)
    output1_byte_size = utils.serialized_byte_size(expected_diff_serialized)
    output_byte_size = max(input0_byte_size, input1_byte_size) + 1

    # Create Output0 and Output1 in Shared Memory and store shared memory handles
    shm_op0_handle = shm.create_shared_memory_region("output0_data",
                                                     "/output0_simple",
                                                     output0_byte_size)
    shm_op1_handle = shm.create_shared_memory_region("output1_data",
                                                     "/output1_simple",
                                                     output1_byte_size)

    # Register Output0 and Output1 shared memory with Triton Server
    triton_client.register_system_shared_memory("output0_data",
                                                "/output0_simple",
                                                output0_byte_size)
    triton_client.register_system_shared_memory("output1_data",
                                                "/output1_simple",
                                                output1_byte_size)

    # Create Input0 and Input1 in Shared Memory and store shared memory handles
    shm_ip0_handle = shm.create_shared_memory_region("input0_data",
Example #17
    def __init__(self,
                 url="localhost:8001",
                 model_name="yolov5",
                 model_version="",
                 verbose=False) -> None:
        self.triton_client = grpcclient.InferenceServerClient(url=url,
                                                              verbose=verbose)

        # To make sure no shared memory regions are registered with the server.
        self.triton_client.unregister_system_shared_memory()
        self.triton_client.unregister_cuda_shared_memory()

        # The YOLO model takes one input tensor of dims [1, 3, 640, 640]
        # and returns 4 output tensors:
        # dims: [1,25200,6]
        # dims: [1,3,80,80,6]
        # dims: [1,3,40,40,6]
        # dims: [1,3,20,20,6]
        self.model_name = model_name
        self.model_version = model_version

        # Create the data for the input and the 4 output tensors
        input_images = np.zeros((1, 3, 640, 640), dtype=np.float32)
        output = np.zeros((1, 25200, 6), dtype=np.float32)
        output_397 = np.zeros((1, 3, 80, 80, 6), dtype=np.float32)
        output_458 = np.zeros((1, 3, 40, 40, 6), dtype=np.float32)
        output_519 = np.zeros((1, 3, 20, 20, 6), dtype=np.float32)

        # Calculate input/output tensor sizes
        input_images_byte_size = input_images.size * input_images.itemsize
        output_byte_size = output.size * output.itemsize
        output_397_byte_size = output_397.size * output_397.itemsize
        output_458_byte_size = output_458.size * output_458.itemsize
        output_519_byte_size = output_519.size * output_519.itemsize

        # Create outputs in Shared Memory and store shared memory handles
        self.output_handle = shm.create_shared_memory_region(
            "output", "/output", output_byte_size)
        self.output_397_handle = shm.create_shared_memory_region(
            "output_397", "/output_397", output_397_byte_size)
        self.output_458_handle = shm.create_shared_memory_region(
            "output_458", "/output_458", output_458_byte_size)
        self.output_519_handle = shm.create_shared_memory_region(
            "output_519", "/output_519", output_519_byte_size)

        # Register outputs shared memory with Triton Server
        self.triton_client.register_system_shared_memory(
            "output", "/output", output_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_397", "/output_397", output_397_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_458", "/output_458", output_458_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_519", "/output_519", output_519_byte_size)

        # Create inputs in Shared Memory and store shared memory handles
        self.input_images_handle = shm.create_shared_memory_region(
            "images", "/images", input_images_byte_size)
        # Register inputs shared memory with Triton Server
        self.triton_client.register_system_shared_memory(
            "images", "/images", input_images_byte_size)

        # Set the parameters to use data from shared memory
        self.inputs = []
        self.inputs.append(
            grpcclient.InferInput('images', [1, 3, 640, 640], "FP32"))
        self.inputs[-1].set_shared_memory("images", input_images_byte_size)

        self.outputs = []
        self.outputs.append(grpcclient.InferRequestedOutput('output'))
        self.outputs[-1].set_shared_memory("output", output_byte_size)

        self.predict(input_images)
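self.predict is not shown in this snippet; a hypothetical sketch of what it
might look like given the registrations above (it reads back only the first
output region):

    def predict(self, input_images):
        # Hypothetical sketch: copy the batch into the registered input
        # region, run inference, then read 'output' back from shared memory
        shm.set_shared_memory_region(self.input_images_handle, [input_images])
        results = self.triton_client.infer(model_name=self.model_name,
                                           inputs=self.inputs,
                                           outputs=self.outputs)
        output = results.get_output("output")
        return shm.get_contents_as_numpy(self.output_handle, np.float32,
                                         output.shape)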
Example #18
    def precreate_register_regions(self,
                                   value_list,
                                   dtype,
                                   i,
                                   batch_size=1,
                                   tensor_shape=(1,)):
        if _test_system_shared_memory or _test_cuda_shared_memory:
            shm_region_handles = []
            for j, value in enumerate(value_list):
                # For strings we can't know the size of the output in
                # advance, so we conservatively assume 64 bytes for each
                # element of the output
                if dtype == np.object_:
                    output_byte_size = 4  # 4-byte length prefix of an empty string
                else:
                    output_byte_size = 0

                # create data
                input_list = list()
                for b in range(batch_size):
                    if dtype == np.object_:
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                        in0n = np.array([
                            str(x).encode('utf-8')
                            for x in in0.reshape(in0.size)
                        ],
                                        dtype=object)
                        in0 = in0n.reshape(tensor_shape)
                        output_byte_size += 64 * in0.size
                    else:
                        in0 = np.full(tensor_shape, value, dtype=dtype)
                        output_byte_size += np.dtype(dtype).itemsize * in0.size
                    input_list.append(in0)

                if dtype == np.object_:
                    input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                    input_byte_size = sum(
                        [serialized_byte_size(i0) for i0 in input_list_tmp])
                else:
                    input_list_tmp = input_list
                    input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

                # create shared memory regions and copy data for input values
                ip_name = 'ip{}{}'.format(i, j)
                op_name = 'op{}{}_data'.format(i, j)
                if _test_system_shared_memory:
                    shm_ip_handle = shm.create_shared_memory_region(
                        ip_name, '/' + ip_name, input_byte_size)
                    shm_op_handle = shm.create_shared_memory_region(
                        op_name, '/' + op_name, output_byte_size)
                    shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                    self.triton_client_.register_system_shared_memory(
                        ip_name, '/' + ip_name, input_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        op_name, '/' + op_name, output_byte_size)
                elif _test_cuda_shared_memory:
                    shm_ip_handle = cudashm.create_shared_memory_region(
                        ip_name, input_byte_size, 0)
                    shm_op_handle = cudashm.create_shared_memory_region(
                        op_name, output_byte_size, 0)
                    cudashm.set_shared_memory_region(shm_ip_handle,
                                                     input_list_tmp)
                    self.triton_client_.register_cuda_shared_memory(
                        ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                        input_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                        output_byte_size)
                shm_region_handles.append(
                    (ip_name, input_byte_size, shm_ip_handle))
                shm_region_handles.append(
                    (op_name, output_byte_size, shm_op_handle))
            return shm_region_handles
        else:
            return []