Example #1
    def cleanup_shm_regions(self, shm_handles):
        # Make sure the regions are unregistered from the server before the
        # shared memory itself is destroyed
        if _test_system_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
        if _test_cuda_shared_memory:
            self.triton_client_.unregister_cuda_shared_memory()
        for shm_tmp_handle in shm_handles:
            if _test_system_shared_memory:
                shm.destroy_shared_memory_region(shm_tmp_handle[2])
            elif _test_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_tmp_handle[2])
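A minimal sketch (not from the original file) of the setup side this helper assumes: each entry of shm_handles is a tuple whose index 2 holds the local shared memory handle, matching the shm_tmp_handle[2] access above. The region name input0_data, the /input0_data key and the 64-byte size are illustrative assumptions; _test_system_shared_memory and triton_client_ only mirror the names the helper references.

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

_test_system_shared_memory = True
triton_client_ = httpclient.InferenceServerClient("localhost:8000")

# Create a 64-byte system shared memory region, copy data into it and
# register it with the server under the same name.
byte_size = 64
handle = shm.create_shared_memory_region("input0_data", "/input0_data",
                                         byte_size)
shm.set_shared_memory_region(handle, [np.zeros(16, dtype=np.float32)])
triton_client_.register_system_shared_memory("input0_data", "/input0_data",
                                             byte_size)

# Keep (name, byte_size, handle) tuples so that index 2 is the handle that
# cleanup_shm_regions() destroys.
shm_handles = [("input0_data", byte_size, handle)]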
Example #2
def unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                   precreated_shm_regions, outputs,
                                   use_system_shared_memory,
                                   use_cuda_shared_memory):
    # Import the shared memory utilities lazily, only for the modes in use
    if use_system_shared_memory:
        import tritonclient.utils.shared_memory as shm
    if use_cuda_shared_memory:
        import tritonclient.utils.cuda_shared_memory as cudashm

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    triton_client = httpclient.InferenceServerClient(
        f"{_tritonserver_ipaddr}:8000")

    if use_cuda_shared_memory:
        triton_client.unregister_cuda_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_cuda_shared_memory(shm_regions[1] + '_data')
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        triton_client.unregister_system_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_system_shared_memory(shm_regions[1] + '_data')
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2 +
                                                                        i] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[3])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2 +
                                                                          i] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[3])
Example #3
    def test_http_out_of_shared_memory(self):
        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
        inputs = []
        inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        # Set up CUDA shared memory regions that are too small for the
        # outputs; expect the query to return the default value
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
        shm_op0_handle = cudashm.create_shared_memory_region(
            "output0_data", 1, 0)
        shm_op1_handle = cudashm.create_shared_memory_region(
            "output1_data", 1, 0)
        triton_client.register_cuda_shared_memory(
            "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
        triton_client.register_cuda_shared_memory(
            "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
        outputs = []
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs[-1].set_shared_memory("output0_data", 1)

        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
        outputs[-1].set_shared_memory("output1_data", 1)

        try:
            triton_client.infer(model_name="query",
                                inputs=inputs,
                                outputs=outputs)
            self.assertTrue(False, "expect error with query information")
        except InferenceServerException as ex:
            self.assertTrue("OUTPUT0 CPU 0" in ex.message())
            self.assertTrue("OUTPUT1 CPU 0" in ex.message())

        cudashm.destroy_shared_memory_region(shm_op0_handle)
        cudashm.destroy_shared_memory_region(shm_op1_handle)
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
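For reference, a small hedged sketch (not part of the original test) showing how the registration state exercised above can be inspected; the 1-byte region and its name mirror the test, the rest is illustrative.

import tritonclient.http as tritonhttpclient
import tritonclient.utils.cuda_shared_memory as cudashm

triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")

# Register a 1-byte CUDA shared memory region on device 0.
handle = cudashm.create_shared_memory_region("output0_data", 1, 0)
triton_client.register_cuda_shared_memory("output0_data",
                                          cudashm.get_raw_handle(handle), 0, 1)

# The status call reports the regions the server currently has registered,
# with their device id and byte size.
print(triton_client.get_cuda_shared_memory_status())

# Unregister from the server before destroying the local region.
triton_client.unregister_cuda_shared_memory("output0_data")
cudashm.destroy_shared_memory_region(handle)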
Example #4
def unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                   precreated_shm_regions, outputs,
                                   use_system_shared_memory,
                                   use_cuda_shared_memory):
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    triton_client = httpclient.InferenceServerClient("localhost:8000")

    if use_cuda_shared_memory:
        triton_client.unregister_cuda_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_cuda_shared_memory(shm_regions[1] + '_data')
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        triton_client.unregister_system_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_system_shared_memory(shm_regions[1] + '_data')
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2 +
                                                                        i] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[3])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2 +
                                                                          i] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[3])
Example #5
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != bool:
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [str(x) for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # String tensors must be serialized manually when using shared memory
        if tensor_dtype == object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority,
                                          timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if output_data.dtype == object and not config[3]:
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
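A hedged sketch of how infer_zero() above is typically driven from a unittest test case; the infer_util module name, the savedmodel backend tag and the [16] shapes are assumptions for illustration, and the matching *_zero_1_float32 model would have to be loaded in the server.

import unittest

import numpy as np

import infer_util as iu  # assumed name of the module containing infer_zero()


class InferZeroShmSketch(unittest.TestCase):

    def test_zero_system_shared_memory(self):
        # One input/output pair of shape [16], exchanged over system shared
        # memory instead of the request body.
        iu.infer_zero(self, 'savedmodel', 1, np.float32,
                      input_shapes=[[16]], output_shapes=[[16]],
                      use_system_shared_memory=True)


if __name__ == '__main__':
    unittest.main()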
Example #6
    if output1 is not None:
        output1_data = cudashm.get_contents_as_numpy(
            shm_op1_handle, utils.triton_to_np_dtype(output1['datatype']),
            output1['shape'])
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[i] + input1_data[i]) != output0_data[0][i]:
            print("cudashm infer error: incorrect sum")
            sys.exit(1)
        if (input0_data[i] - input1_data[i]) != output1_data[0][i]:
            print("cudashm infer error: incorrect difference")
            sys.exit(1)

    print(triton_client.get_cuda_shared_memory_status())
    triton_client.unregister_cuda_shared_memory()
    cudashm.destroy_shared_memory_region(shm_ip0_handle)
    cudashm.destroy_shared_memory_region(shm_ip1_handle)
    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)

    print('PASS: cuda shared memory')
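The fragment above is only the tail of a CUDA shared memory client. Below is a hedged sketch of the setup it assumes, modeled on Triton's standard add/sub example; the model name "simple", the [1, 16] INT32 tensors and the localhost:8000 URL are assumptions, while the variable names match the fragment.

import sys

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils as utils
import tritonclient.utils.cuda_shared_memory as cudashm

triton_client = httpclient.InferenceServerClient("localhost:8000")

input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)
byte_size = input0_data.size * input0_data.itemsize

# Create CUDA shared memory regions on device 0 and copy the inputs in.
shm_ip0_handle = cudashm.create_shared_memory_region("input0_data", byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region("input1_data", byte_size, 0)
shm_op0_handle = cudashm.create_shared_memory_region("output0_data", byte_size, 0)
shm_op1_handle = cudashm.create_shared_memory_region("output1_data", byte_size, 0)
cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
cudashm.set_shared_memory_region(shm_ip1_handle, [input1_data])

# Every region must be registered with the server before a request can
# reference it.
for name, handle in [("input0_data", shm_ip0_handle),
                     ("input1_data", shm_ip1_handle),
                     ("output0_data", shm_op0_handle),
                     ("output1_data", shm_op1_handle)]:
    triton_client.register_cuda_shared_memory(name,
                                              cudashm.get_raw_handle(handle),
                                              0, byte_size)

# Point the request inputs and outputs at the registered regions instead of
# sending tensor data in the request body.
inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32"),
          httpclient.InferInput("INPUT1", [1, 16], "INT32")]
inputs[0].set_shared_memory("input0_data", byte_size)
inputs[1].set_shared_memory("input1_data", byte_size)
outputs = [httpclient.InferRequestedOutput("OUTPUT0", binary_data=True),
           httpclient.InferRequestedOutput("OUTPUT1", binary_data=True)]
outputs[0].set_shared_memory("output0_data", byte_size)
outputs[1].set_shared_memory("output1_data", byte_size)

results = triton_client.infer(model_name="simple", inputs=inputs, outputs=outputs)

# Read OUTPUT0 back out of CUDA shared memory; the fragment above does the
# same for OUTPUT1 and then checks the sums and differences.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    output0_data = cudashm.get_contents_as_numpy(
        shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
        output0['shape'])
else:
    print("OUTPUT0 is missing in the response.")
    sys.exit(1)
output1 = results.get_output("OUTPUT1")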