def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input1_data, binary_data=True)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = utils.triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all(),
                "Model output does not match expected output")
        except Exception as ex:
            error_msg.append(str(ex))
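The method above assumes that four 64-byte regions named "input0_data", "input1_data", "output0_data" and "output1_data" were created, filled and registered before it runs (that setup is not part of this snippet). A minimal sketch of the setup, assuming system shared memory, the same module aliases (np, shm, httpclient) and hypothetical "/..." shared memory keys:

# Sketch only: create, fill and register the regions _basic_inference expects.
triton_client = httpclient.InferenceServerClient(_url)
shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_data", 64)
shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_data", 64)
shm_op0_handle = shm.create_shared_memory_region("output0_data", "/output0_data", 64)
shm_op1_handle = shm.create_shared_memory_region("output1_data", "/output1_data", 64)
shm.set_shared_memory_region(shm_ip0_handle, [np.arange(16, dtype=np.int32)])
shm.set_shared_memory_region(shm_ip1_handle, [np.ones(16, dtype=np.int32)])
for name in ("input0_data", "input1_data", "output0_data", "output1_data"):
    triton_client.register_system_shared_memory(name, "/" + name, 64)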
Example #2
    def predict(self, input_images):
        # Put input data values into shared memory
        shm.set_shared_memory_region(self.input_images_handle, [input_images])

        results = self.triton_client.infer(model_name=self.model_name,
                                           inputs=self.inputs,
                                           outputs=self.outputs)
        # Read results from the shared memory.
        output = results.get_output("output")
        output_data = shm.get_contents_as_numpy(
            self.output_handle, utils.triton_to_np_dtype(output.datatype),
            output.shape)

        return output_data
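predict() only refreshes the input region and reads the output region; the client, shared memory handles and request descriptions are prepared once elsewhere in the class. A hypothetical constructor sketch, assuming gRPC, system shared memory, and placeholder tensor/region names and shapes (none of these are taken from the original example):

    def __init__(self, url, model_name, input_byte_size, output_byte_size):
        # Sketch only: the byte sizes must match the tensor shapes used below.
        self.model_name = model_name
        self.triton_client = grpcclient.InferenceServerClient(url)

        # Create and register one region for the input images and one for the output.
        self.input_images_handle = shm.create_shared_memory_region(
            "input_images", "/input_images", input_byte_size)
        self.output_handle = shm.create_shared_memory_region(
            "output", "/output", output_byte_size)
        self.triton_client.register_system_shared_memory(
            "input_images", "/input_images", input_byte_size)
        self.triton_client.register_system_shared_memory(
            "output", "/output", output_byte_size)

        # Describe the request once; predict() only rewrites the region contents.
        self.inputs = [grpcclient.InferInput("input", [1, 3, 224, 224], "FP32")]
        self.inputs[0].set_shared_memory("input_images", input_byte_size)
        self.outputs = [grpcclient.InferRequestedOutput("output")]
        self.outputs[0].set_shared_memory("output", output_byte_size)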
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        print(utils.triton_to_np_dtype(output0.datatype))
        output0_data = shm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
            output0.shape)
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(
            shm_op1_handle, utils.triton_to_np_dtype(output1.datatype),
            output1.shape)
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    for i in range(16):
Example #4
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
               (output0_dtype != np.float16) and (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # Outputs are the sum and difference of the inputs, so cap the input
    # values so that they cannot overflow the outputs. This allows an
    # exact match on the results. For float types use the int8/int16/int32
    # ranges for fp16/fp32/fp64 respectively. When requesting class
    # outputs, the value/probability is returned as a float, so the fp32
    # range must be used in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object_:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object_:
        output0_array = np.array(
            [str(x) for x in output0_array.flatten()],
            dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object_:
        output1_array = np.array(
            [str(x) for x in output1_array.flatten()],
            dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object_:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object_:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object_:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object_:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size,
        outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1, ) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1, ) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT0,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT1,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw)
                    or (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if output_data.dtype == np.object_ and not config[3]:
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                             for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
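The listings above and below call a serialize_byte_tensor_list() helper that is not shown here. A plausible sketch, assuming it simply applies tritonclient.utils.serialize_byte_tensor to each tensor so BYTES/string data can be copied into shared memory:

# Assumed helper: serialize each string tensor with tritonclient.utils.
def serialize_byte_tensor_list(tensor_values):
    tensor_list = []
    for tensor in tensor_values:
        tensor_list.append(serialize_byte_tensor(tensor))
    return tensor_list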
Example #5
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object_:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [str(x) for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == np.object_:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority,
                                          timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if output_data.dtype == np.object_ and not config[3]:
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
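A hypothetical invocation of infer_zero() from a unittest.TestCase method; the platform string, dtype and shapes are placeholders for whatever identity ("zero") models exist in the test model repository:

# Run one input/output pair through an identity model, carrying both the
# request and the response over system shared memory.
results = infer_zero(self, "graphdef", 1, np.int32,
                     input_shapes=[(16,)], output_shapes=[(16,)],
                     use_system_shared_memory=True)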
Example #6
def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()
    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object_:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size))
            output_shm_handle_list.append((shm.create_shared_memory_region(
                output_name + shm_suffix, '/' + output_name + shm_suffix,
                output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

            # unregister shared memory region for next config
            if use_system_shared_memory:
                triton_client.unregister_system_shared_memory(input_name +
                                                              shm_suffix)
                triton_client.unregister_system_shared_memory(output_name +
                                                              shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
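A hypothetical call to infer_shape_tensor(); it only accepts TensorRT ("plan" or "plan_nobatch") models, and the shape values and dummy shapes below are placeholders:

# One IO pair: the shape tensor carries the value [8, 8], the dummy input has
# shape [1, 4, 4], and the shape input/output travel through system shared memory.
infer_shape_tensor(self, "plan", np.float32,
                   input_shape_values=[[8, 8]],
                   dummy_input_shapes=[[1, 4, 4]],
                   use_system_shared_memory=True)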
Example #7
    def check_sequence_shape_tensor_io(self,
                                       model_name,
                                       input_dtype,
                                       correlation_id,
                                       sequence_thresholds,
                                       values,
                                       expected_result,
                                       shm_region_handles,
                                       using_dynamic_batcher=False,
                                       sequence_name="<unknown>"):
        """Perform sequence of inferences using async run. The 'values' holds
        a list of tuples, one for each inference with format:

        (flag_str, shape_value, value, pre_delay_ms)

        """
        tensor_shape = (1, 1)
        # shape tensor is 1-D tensor that doesn't contain batch size as first value
        shape_tensor_shape = (1,)
        self.assertFalse(_test_cuda_shared_memory,
                         "Shape tensors do not support CUDA shared memory")

        client_utils = grpcclient
        triton_client = client_utils.InferenceServerClient(f"{_tritonserver_ipaddr}:8001",
                                                           verbose=True)
        user_data = UserData()
        triton_client.start_stream(partial(completion_callback, user_data))
        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            sent_count = 0
            shape_values = list()
            for flag_str, shape_value, value, pre_delay_ms in values:
                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)

                # Construct request IOs
                inputs = []
                outputs = []
                # input order: input, shape(, dummy)
                inputs.append(
                    client_utils.InferInput(
                        "INPUT", tensor_shape,
                        np_to_triton_dtype(np.int32 if using_dynamic_batcher
                                           else input_dtype)))
                inputs.append(
                    client_utils.InferInput("SHAPE_INPUT", shape_tensor_shape,
                                            np_to_triton_dtype(np.int32)))
                if using_dynamic_batcher:
                    inputs.append(
                        client_utils.InferInput(
                            "DUMMY_INPUT", tensor_shape,
                            np_to_triton_dtype(input_dtype)))
                # output order: shape, output, resized
                outputs.append(
                    client_utils.InferRequestedOutput("SHAPE_OUTPUT"))
                outputs.append(client_utils.InferRequestedOutput("OUTPUT"))
                outputs.append(
                    client_utils.InferRequestedOutput("RESIZED_OUTPUT"))

                # Set IO values
                shape_values.append(
                    np.full(shape_tensor_shape, shape_value, dtype=np.int32))
                if not _test_system_shared_memory:
                    if using_dynamic_batcher:
                        if input_dtype == np.object_:
                            dummy_in0 = np.full(tensor_shape,
                                                value,
                                                dtype=np.int32)
                            dummy_in0n = np.array(
                                [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                                dtype=object)
                            dummy_in0 = dummy_in0n.reshape(tensor_shape)
                        else:
                            dummy_in0 = np.full(tensor_shape,
                                                value,
                                                dtype=input_dtype)
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                    else:
                        if input_dtype == np.object_:
                            in0 = np.full(tensor_shape, value, dtype=np.int32)
                            in0n = np.array(
                                [str(x) for x in in0.reshape(in0.size)],
                                dtype=object)
                            in0 = in0n.reshape(tensor_shape)
                        else:
                            in0 = np.full(tensor_shape,
                                          value,
                                          dtype=input_dtype)

                    inputs[0].set_data_from_numpy(in0)
                    inputs[1].set_data_from_numpy(shape_values[-1])
                    if using_dynamic_batcher:
                        inputs[2].set_data_from_numpy(dummy_in0)
                else:
                    if using_dynamic_batcher:
                        input_offset = 6 * sent_count
                        output_offset = 6 * sent_count + 3
                    else:
                        input_offset = 5 * sent_count
                        output_offset = 5 * sent_count + 2
                    for i in range(len(inputs)):
                        inputs[i].set_shared_memory(
                            shm_region_handles[input_offset + i][0],
                            shm_region_handles[input_offset + i][1])
                    for i in range(len(outputs)):
                        outputs[i].set_shared_memory(
                            shm_region_handles[output_offset + i][0],
                            shm_region_handles[output_offset + i][1])

                if pre_delay_ms is not None:
                    time.sleep(pre_delay_ms / 1000.0)

                triton_client.async_stream_infer(model_name,
                                                 inputs,
                                                 outputs=outputs,
                                                 sequence_id=correlation_id,
                                                 sequence_start=seq_start,
                                                 sequence_end=seq_end)

                sent_count += 1

            # Wait for the results in the order sent
            result = None
            processed_count = 0
            while processed_count < sent_count:
                (results, error) = user_data._completed_requests.get()
                if error is not None:
                    raise error
                # Get value of "OUTPUT", for shared memory, need to get it via
                # shared memory utils
                if (not _test_system_shared_memory):
                    out = results.as_numpy("OUTPUT")
                else:
                    output = results.get_output("OUTPUT")
                    output_offset = 6 * processed_count + 4 if using_dynamic_batcher else 5 * processed_count + 3
                    output_shape = output.shape
                    output_type = np.int32 if using_dynamic_batcher else np.float32
                    out = shm.get_contents_as_numpy(
                        shm_region_handles[output_offset][2], output_type,
                        output_shape)
                result = out[0][0]

                # Validate the (debatched) shape of the resized output matches
                # with the shape input values
                resized_shape = results.get_output("RESIZED_OUTPUT").shape[1:]
                self.assertTrue(
                    np.array_equal(resized_shape,
                                   shape_values[processed_count]),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, "RESIZED_OUTPUT", processed_count,
                        shape_values[processed_count], resized_shape))
                print("{}: {}".format(sequence_name, result))
                processed_count += 1

            seq_end_ms = int(round(time.time() * 1000))

            if input_dtype == np.object_:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    if _test_jetson:
                        lt_ms *= _jetson_slowdown_factor
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            self.add_deferred_exception(ex)
        triton_client.stop_stream()
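The 'values' argument follows the (flag_str, shape_value, value, pre_delay_ms) format described in the docstring above. A hypothetical three-request sequence in which every inference uses shape value 4 and no pre-delay:

# Hypothetical sequence description for check_sequence_shape_tensor_io().
values = [
    ("start", 4, 1, None),  # first request opens the sequence
    (None, 4, 2, None),     # middle request carries no flags
    ("end", 4, 3, None),    # last request closes the sequence
]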
Example #8
    outputs.append(httpclient.InferRequestedOutput('OUTPUT1',
                                                   binary_data=True))
    outputs[-1].set_shared_memory("output_data",
                                  output_byte_size,
                                  offset=output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op_handle, utils.triton_to_np_dtype(output0['datatype']),
            output0['shape'])
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(shm_op_handle,
                                                 utils.triton_to_np_dtype(
                                                     output1['datatype']),
                                                 output1['shape'],
                                                 offset=output_byte_size)
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)
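This snippet reads both outputs from a single region named "output_data", with OUTPUT1 placed at offset=output_byte_size. A minimal sketch of the setup it assumes, with a hypothetical "/output_data" key:

# Sketch only: one region large enough for both outputs, registered under the
# name referenced by the requests above.
shm_op_handle = shm.create_shared_memory_region("output_data", "/output_data",
                                                output_byte_size * 2)
triton_client.register_system_shared_memory("output_data", "/output_data",
                                            output_byte_size * 2)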
Example #9
    def check_sequence_async(self,
                             trial,
                             model_name,
                             input_dtype,
                             correlation_id,
                             sequence_thresholds,
                             values,
                             expected_result,
                             shm_region_handles,
                             batch_size=1,
                             sequence_name="<unknown>",
                             tensor_shape=(1,)):
        """Perform sequence of inferences using stream async run.
        The 'values' holds a list of tuples, one for each inference with format:

        (flag_str, value, pre_delay_ms)

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
            self.fail("unknown trial type: " + trial)

        self.assertFalse(
            _test_system_shared_memory and _test_cuda_shared_memory,
            "Cannot set both System and CUDA shared memory flags to 1")

        full_shape = tensor_shape if "nobatch" in trial else (
            batch_size,) + tensor_shape

        client_utils = grpcclient
        triton_client = client_utils.InferenceServerClient(f"{_tritonserver_ipaddr}:8001",
                                                           verbose=True)
        user_data = UserData()
        triton_client.start_stream(partial(completion_callback, user_data))
        # Execute the sequence of inferences...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
            OUTPUT = "OUTPUT__0" if trial.startswith("libtorch") else "OUTPUT"
            sent_count = 0
            for flag_str, value, pre_delay_ms in values:
                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)

                # Construct request IOs
                inputs = []
                outputs = []
                inputs.append(
                    client_utils.InferInput(INPUT, full_shape,
                                            np_to_triton_dtype(input_dtype)))
                outputs.append(client_utils.InferRequestedOutput(OUTPUT))

                if not (_test_system_shared_memory or _test_cuda_shared_memory):
                    if input_dtype == np.object_:
                        in0 = np.full(full_shape, value, dtype=np.int32)
                        in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                        dtype=object)
                        in0 = in0n.reshape(full_shape)
                    else:
                        in0 = np.full(full_shape, value, dtype=input_dtype)
                    inputs[0].set_data_from_numpy(in0)
                else:
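                    # shm_region_handles is assumed to interleave per-request
                    # regions as [in0, out0, in1, out1, ...]; fields [0]/[1]
                    # of each entry hold the region name and byte size.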
                    offset = 2 * sent_count
                    inputs[0].set_shared_memory(shm_region_handles[offset][0],
                                                shm_region_handles[offset][1])
                    outputs[0].set_shared_memory(
                        shm_region_handles[offset + 1][0],
                        shm_region_handles[offset + 1][1])

                if pre_delay_ms is not None:
                    time.sleep(pre_delay_ms / 1000.0)

                triton_client.async_stream_infer(model_name,
                                                 inputs,
                                                 outputs=outputs,
                                                 sequence_id=correlation_id,
                                                 sequence_start=seq_start,
                                                 sequence_end=seq_end)
                sent_count += 1

            # Wait for the results in the order sent
            result = None
            processed_count = 0
            while processed_count < sent_count:
                (results, error) = user_data._completed_requests.get()
                if error is not None:
                    raise error
                # Get the value of "OUTPUT"; when shared memory is in use it
                # must be read back via the shared memory utils
                if (not _test_system_shared_memory) and (
                        not _test_cuda_shared_memory):
                    out = results.as_numpy(OUTPUT)
                else:
                    output = results.get_output(OUTPUT)
                    offset = 2 * processed_count + 1
                    output_shape = output.shape
                    output_type = input_dtype
                    if _test_system_shared_memory:
                        out = shm.get_contents_as_numpy(
                            shm_region_handles[offset][2], output_type,
                            output_shape)
                    else:
                        out = cudashm.get_contents_as_numpy(
                            shm_region_handles[offset][2], output_type,
                            output_shape)
                result = out[0] if "nobatch" in trial else out[0][0]
                print("{}: {}".format(sequence_name, result))
                processed_count += 1

            seq_end_ms = int(round(time.time() * 1000))

            if input_dtype == np.object_:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    if _test_jetson:
                        lt_ms *= _jetson_slowdown_factor
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            self.add_deferred_exception(ex)
        triton_client.stop_stream()
Example #10
    def check_sequence(self,
                       trial,
                       model_name,
                       input_dtype,
                       correlation_id,
                       sequence_thresholds,
                       values,
                       expected_result,
                       protocol,
                       batch_size=1,
                       sequence_name="<unknown>",
                       tensor_shape=(1,)):
        """Perform sequence of inferences. The 'values' holds a list of
        tuples, one for each inference with format:

        (flag_str, value, (ls_ms, gt_ms), (pre_delay_ms, post_delay_ms)

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
            self.fail("unknown trial type: " + trial)

        # The model is a stateful sequence model, so the sequence can be sent
        # exactly once; hence only a single protocol config is allowed.
        configs = []
        if protocol == "http":
            configs.append((f"{_tritonserver_ipaddr}:8000", "http", False))
        if protocol == "grpc":
            configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", False))
        if protocol == "streaming":
            configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", True))

        self.assertFalse(
            _test_system_shared_memory and _test_cuda_shared_memory,
            "Cannot set both System and CUDA shared memory flags to 1")

        self.assertEqual(len(configs), 1)

        full_shape = tensor_shape if "nobatch" in trial else (
            batch_size,) + tensor_shape

        # Create and register the shared memory output region in advance,
        # relying on this function never being called concurrently.
        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            output_byte_size = 512
            if _test_system_shared_memory:
                shm_op_handle = shm.create_shared_memory_region(
                    "output_data", "/output", output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    "output_data", "/output", output_byte_size)
            elif _test_cuda_shared_memory:
                shm_op_handle = cudashm.create_shared_memory_region(
                    "output_data", output_byte_size, 0)
                self.triton_client_.register_cuda_shared_memory(
                    "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
            shm_ip_handles = []

        for config in configs:
            client_utils = grpcclient if config[1] == "grpc" else httpclient

            triton_client = client_utils.InferenceServerClient(config[0],
                                                               verbose=True)
            if config[2]:
                user_data = UserData()
                triton_client.start_stream(
                    partial(completion_callback, user_data))
            # Execute the sequence of inferences...
            try:
                seq_start_ms = int(round(time.time() * 1000))

                INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
                OUTPUT = "OUTPUT__0" if trial.startswith(
                    "libtorch") else "OUTPUT"
                for flag_str, value, thresholds, delay_ms in values:
                    if _test_valgrind or _test_jetson:
                        if delay_ms is not None:
                            delay_ms[0] = max(_valgrind_delay_ms, delay_ms[0])
                            delay_ms[1] = max(_valgrind_delay_ms, delay_ms[1])
                        else:
                            delay_ms = (_valgrind_delay_ms, _valgrind_delay_ms)

                    if delay_ms is not None:
                        time.sleep(delay_ms[0] / 1000.0)

                    seq_start = False
                    seq_end = False
                    if flag_str is not None:
                        seq_start = ("start" in flag_str)
                        seq_end = ("end" in flag_str)

                    # Construct request IOs
                    inputs = []
                    outputs = []
                    inputs.append(
                        client_utils.InferInput(
                            INPUT, full_shape, np_to_triton_dtype(input_dtype)))
                    outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                    if input_dtype == np.object_:
                        in0 = np.full(full_shape, value, dtype=np.int32)
                        in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                        dtype=object)
                        in0 = in0n.reshape(full_shape)
                    else:
                        in0 = np.full(full_shape, value, dtype=input_dtype)

                    # Create the input shared memory region and copy the input
                    # data values into it
                    if _test_system_shared_memory or _test_cuda_shared_memory:
                        if input_dtype == np.object_:
                            input_list_tmp = iu.serialize_byte_tensor_list(
                                [in0])
                            input_byte_size = sum([
                                serialized_byte_size(i0)
                                for i0 in input_list_tmp
                            ])
                        else:
                            input_list_tmp = [in0]
                            input_byte_size = sum(
                                [i0.nbytes for i0 in input_list_tmp])
                        ip_name = "ip{}".format(len(shm_ip_handles))
                        if _test_system_shared_memory:
                            shm_ip_handles.append(
                                shm.create_shared_memory_region(
                                    ip_name, "/" + ip_name, input_byte_size))
                            shm.set_shared_memory_region(
                                shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_system_shared_memory(
                                ip_name, "/" + ip_name, input_byte_size)
                        elif _test_cuda_shared_memory:
                            shm_ip_handles.append(
                                cudashm.create_shared_memory_region(
                                    ip_name, input_byte_size, 0))
                            cudashm.set_shared_memory_region(
                                shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_cuda_shared_memory(
                                ip_name,
                                cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                                input_byte_size)

                        inputs[0].set_shared_memory(ip_name, input_byte_size)
                        outputs[0].set_shared_memory("output_data",
                                                     output_byte_size)
                    else:
                        inputs[0].set_data_from_numpy(in0)

                    start_ms = int(round(time.time() * 1000))

                    if config[2]:
                        triton_client.async_stream_infer(
                            model_name,
                            inputs,
                            outputs=outputs,
                            sequence_id=correlation_id,
                            sequence_start=seq_start,
                            sequence_end=seq_end)
                        (results, error) = user_data._completed_requests.get()
                        if error is not None:
                            raise error
                    else:
                        results = triton_client.infer(
                            model_name,
                            inputs,
                            outputs=outputs,
                            sequence_id=correlation_id,
                            sequence_start=seq_start,
                            sequence_end=seq_end)

                    end_ms = int(round(time.time() * 1000))

                    # Get the value of "OUTPUT"; when shared memory is in use
                    # it must be read back via the shared memory utils
                    if (not _test_system_shared_memory) and (
                            not _test_cuda_shared_memory):
                        out = results.as_numpy(OUTPUT)
                    else:
                        output = results.get_output(OUTPUT)
                        if config[1] == "http":
                            output_shape = output["shape"]
                        else:
                            output_shape = output.shape
                        output_type = input_dtype
                        if _test_system_shared_memory:
                            out = shm.get_contents_as_numpy(
                                shm_op_handle, output_type, output_shape)
                        else:
                            out = cudashm.get_contents_as_numpy(
                                shm_op_handle, output_type, output_shape)
                    result = out[0] if "nobatch" in trial else out[0][0]
                    print("{}: {}".format(sequence_name, result))

                    if thresholds is not None:
                        lt_ms = thresholds[0]
                        gt_ms = thresholds[1]
                        if lt_ms is not None:
                            self.assertTrue((end_ms - start_ms) < lt_ms,
                                            "expected less than " + str(lt_ms) +
                                            "ms response time, got " +
                                            str(end_ms - start_ms) + " ms")
                        if gt_ms is not None:
                            self.assertTrue(
                                (end_ms - start_ms) > gt_ms,
                                "expected greater than " + str(gt_ms) +
                                "ms response time, got " +
                                str(end_ms - start_ms) + " ms")
                    if delay_ms is not None:
                        time.sleep(delay_ms[1] / 1000.0)

                seq_end_ms = int(round(time.time() * 1000))

                if input_dtype == np.object_:
                    self.assertEqual(int(result), expected_result)
                else:
                    self.assertEqual(result, expected_result)

                if sequence_thresholds is not None:
                    lt_ms = sequence_thresholds[0]
                    gt_ms = sequence_thresholds[1]
                    if lt_ms is not None:
                        if _test_jetson:
                            lt_ms *= _jetson_slowdown_factor
                        self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                        "sequence expected less than " +
                                        str(lt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                        "sequence expected greater than " +
                                        str(gt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
            except Exception as ex:
                self.add_deferred_exception(ex)
            if config[2]:
                triton_client.stop_stream()

        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            destroy_func = shm.destroy_shared_memory_region if _test_system_shared_memory else cudashm.destroy_shared_memory_region
            destroy_func(shm_op_handle)
            for shm_ip_handle in shm_ip_handles:
                destroy_func(shm_ip_handle)
Example #11
    outputs.append(httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        print(utils.triton_to_np_dtype(output0['datatype']))
        output0_data = shm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
            output0['shape'])
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(
            shm_op1_handle, utils.triton_to_np_dtype(output1['datatype']),
            output1['shape'])
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    # (Validation loop body assumed: it mirrors the loop in the next example
    # and relies on input0_data/input1_data from the truncated setup above.)
    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[i] + input1_data[i]) != output0_data[0][i]:
            print("shm infer error: incorrect sum")
            sys.exit(1)
        if (input0_data[i] - input1_data[i]) != output1_data[0][i]:
            print("shm infer error: incorrect difference")
            sys.exit(1)

Example #12
def infer_and_validata(use_shared_memory, orig_input0_data, orig_input1_data):
    if use_shared_memory:
        input0_data = orig_input0_data
        input1_data = orig_input1_data
        byte_size = input0_data.size * input0_data.itemsize
        inputs[0].set_shared_memory("input0_data", byte_size)
        inputs[1].set_shared_memory("input1_data", byte_size)
        outputs[0].set_shared_memory("output0_data", byte_size)
        outputs[1].set_shared_memory("output1_data", byte_size)
    else:
        input0_data = orig_input0_data
        input1_data = orig_input1_data * 2
        inputs[0].set_data_from_numpy(np.expand_dims(input0_data, axis=0))
        inputs[1].set_data_from_numpy(np.expand_dims(input1_data, axis=0))
        outputs[0].unset_shared_memory()
        outputs[1].unset_shared_memory()

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
                    output0.shape)
            else:
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle,
                    utils.triton_to_np_dtype(output0['datatype']),
                    output0['shape'])
        else:
            output0_data = results.as_numpy('OUTPUT0')
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle, utils.triton_to_np_dtype(output1.datatype),
                    output1.shape)
            else:
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle,
                    utils.triton_to_np_dtype(output1['datatype']),
                    output1['shape'])
        else:
            output1_data = results.as_numpy('OUTPUT1')
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    if use_shared_memory:
        print("\n\n======== SHARED_MEMORY ========\n")
    else:
        print("\n\n======== NO_SHARED_MEMORY ========\n")
    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[i] + input1_data[i]) != output0_data[0][i]:
            print("shm infer error: incorrect sum")
            sys.exit(1)
        if (input0_data[i] - input1_data[i]) != output1_data[0][i]:
            print("shm infer error: incorrect difference")
            sys.exit(1)
    print("\n======== END ========\n\n")
Example #13
    outputs[-1].set_shared_memory("output_data", output_byte_size)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output_data",
                                  output_byte_size,
                                  offset=output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op_handle, utils.triton_to_np_dtype(output0.datatype),
            output0.shape)
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(shm_op_handle,
                                                 utils.triton_to_np_dtype(
                                                     output1.datatype),
                                                 output1.shape,
                                                 offset=output_byte_size)
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)