Example #1
    def run(self, client_metadata):
        trial = self.get_trial()
        model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
        triton_client = client_metadata[0]
        input_name = self.input_name_
        if "librotch" in trial:
            input_name = "INPUT__0"

        tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                                   np.dtype(self.input_dtype_).itemsize), )
        in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
        inputs = [
            grpcclient.InferInput(input_name, tensor_shape,
                                  np_to_triton_dtype(self.input_dtype_)),
        ]
        inputs[0].set_data_from_numpy(in0)

        # Expect an exception for small timeout values.
        try:
            triton_client.infer(model_name, inputs, client_timeout=0.1)
            assert False, "expected inference failure from deadline exceeded"
        except Exception as ex:
            if "Deadline Exceeded" not in ex.message():
                assert False, "timeout_client failed {}".format(self.name_)
            # Expect timeout error as success case
            return 1
    def test_custom(self):
        tensor_shape = (self.input_size_, )
        small_tensor_shape = (1, )

        # custom_zero_1_float32 is identity model with input shape [-1]
        for protocol, url in self.protocols_:
            model_name = tu.get_zero_model_name("custom", 1, self.data_type_)
            ctx = InferContext(url, protocol, model_name, None, True)
            self._test_helper(ctx, tensor_shape, small_tensor_shape)
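Every example below builds its model name through tu.get_zero_model_name. A minimal sketch of what that helper presumably does, inferred only from the comments in these snippets (e.g. "libtorch_nobatch_zero_1_float32"); the real implementation lives in the shared test utilities imported as tu and may differ:

import numpy as np

def get_zero_model_name(pf, io_cnt, dtype):
    # Assumed naming scheme: "<platform>_zero_<io count>_<numpy dtype name>",
    # e.g. get_zero_model_name("libtorch_nobatch", 1, np.float32)
    #      -> "libtorch_nobatch_zero_1_float32"
    return "{}_zero_{}_{}".format(pf, io_cnt, np.dtype(dtype).name)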
Example #3
    def test_libtorch(self):
        tensor_shape = (self.input_size_,)
        small_tensor_shape = (1,)

        # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
        for protocol, url in self.protocols_:
            model_name = tu.get_zero_model_name("libtorch_nobatch", 1, self.data_type_)
            ctx = InferContext(url, protocol, model_name, None, True)
            self._test_helper(ctx, tensor_shape, small_tensor_shape,
                              'INPUT__0', 'OUTPUT__0')
Example #4
def create_identity_ensemble_modelconfig(ensemble_test_type,
                                         models_dir,
                                         model_version,
                                         max_batch,
                                         dtype,
                                         input_shapes,
                                         input_model_shapes,
                                         output_shapes,
                                         output_model_shapes,
                                         predefined_schedule=None):
    io_cnt = len(input_shapes)

    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        # Use a different model name for the non-batching variant
        ensemble_prefix = "{}_{}".format(ensemble_type, ensemble_test_type)
        model_name = tu.get_zero_model_name(
            ensemble_prefix + ("_nobatch" if max_batch == 0 else ""), io_cnt,
            dtype)

        # [TODO] Temp fix for infer_zero
        ensemble_schedule = predefined_schedule
        if predefined_schedule is None:
            ensemble_schedule = IdentityEnsembleSchedule(
                ensemble_type,
                ensemble_test_type).get_schedule(dtype, input_shapes,
                                                 input_model_shapes,
                                                 output_shapes,
                                                 output_model_shapes)

        config_dir = models_dir + "/" + model_name
        config = create_general_modelconfig(model_name, "ensemble", max_batch,
                                            repeat(dtype, io_cnt),
                                            input_shapes, input_model_shapes,
                                            repeat(dtype, io_cnt),
                                            output_shapes, output_model_shapes,
                                            repeat(None, io_cnt))
        config += ensemble_schedule

        try:
            os.makedirs(config_dir)
        except OSError as ex:
            pass  # ignore existing dir

        with open(config_dir + "/config.pbtxt", "w") as cfile:
            cfile.write(config)
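A hypothetical invocation of create_identity_ensemble_modelconfig for a single dynamic-shape identity I/O; the repository path, version, batch size, and shapes are illustrative only, and numpy is assumed to be imported as np by the surrounding generation script:

create_identity_ensemble_modelconfig("zero",
                                     models_dir="/tmp/qa_ensemble_model_repository",
                                     model_version=1,
                                     max_batch=8,
                                     dtype=np.float32,
                                     input_shapes=[[-1]],
                                     input_model_shapes=[[-1]],
                                     output_shapes=[[-1]],
                                     output_model_shapes=[[-1]])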
def create_identity_ensemble_modelfile(ensemble_test_type, models_dir,
                                       model_version, max_batch, dtype,
                                       input_shapes, output_shapes):
    io_cnt = len(input_shapes)

    # Use a different model name for the non-batching variant
    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        ensemble_prefix = "{}_{}".format(ensemble_type, ensemble_test_type)
        model_name = tu.get_zero_model_name(
            ensemble_prefix + ("_nobatch" if max_batch == 0 else ""), io_cnt,
            dtype)
        model_version_dir = models_dir + "/" + model_name + "/" + str(
            model_version)

        try:
            os.makedirs(model_version_dir)
        except OSError as ex:
            pass  # ignore existing dir
    def test_dynamic_different_shape_values(self):
        # Send two requests with sum of static batch sizes ==
        # preferred size, but with different shape values. This
        # should cause the requests to not be batched. The first
        # response will come back immediately and the second
        # delayed by the max batch queue delay
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_zero_model_name("plan", 1, np.float32)
            self.check_setup(url, protocol, model_name)
            self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(3, (6000, None)),
                                 kwargs={
                                     'shape_values': [[2, 2]],
                                     'dummy_input_shapes': [[16, 16]],
                                     'shm_suffix': '{}'.format(len(threads))
                                 }))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(3, (_max_queue_delay_ms * 1.5,
                                           _max_queue_delay_ms)),
                                 kwargs={
                                     'shape_values': [[4, 4]],
                                     'dummy_input_shapes': [[16, 16]],
                                     'shm_suffix': '{}'.format(len(threads))
                                 }))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (3, ), 2, 6)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
Example #7
    def test_dynamic_identical_shape_values(self):
        # Send two requests with sum of static batch sizes ==
        # preferred size, but with identical shape values. This
        # should cause the requests to get batched. Both
        # responses should come back immediately.
        try:
            model_name = tu.get_zero_model_name("plan", 1, np.float32)
            self.check_setup(model_name)
            self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(4, (6000, None)),
                                 kwargs={
                                     'shape_values': [[4, 4]],
                                     'dummy_input_shapes': [[16, 16]],
                                     'shm_suffix': '{}'.format(len(threads))
                                 }))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(2, (6000, None)),
                                 kwargs={
                                     'shape_values': [[4, 4]],
                                     'dummy_input_shapes': [[16, 16]],
                                     'shm_suffix': '{}'.format(len(threads))
                                 }))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(model_name, {6: 1}, 1, 6)
        except Exception as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
Example #8
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_list = list()
        expected_list = list()
        for b in range(batch_size):
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            if (rtensor_dtype != np.bool):
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num],
                                        dtype=rtensor_dtype)
            else:
                in0 = np.random.choice(a=[False, True],
                                       size=input_shapes[io_num])
            if tensor_dtype != np.object:
                in0 = in0.astype(tensor_dtype)
                expected0 = np.ndarray.copy(in0)
            else:
                expected0 = np.array(
                    [unicode(str(x), encoding='utf-8') for x in in0.flatten()],
                    dtype=object)
                in0 = np.array([str(x) for x in in0.flatten()],
                               dtype=object).reshape(in0.shape)

            expected0 = expected0.reshape(output_shapes[io_num])

            input_list.append(in0)
            expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        output_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        # create and register shared memory region for inputs and outputs
        shm_io_handle = su.create_register_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list, input_byte_size, output_byte_size,
            shared_memory_ctx, use_system_shared_memory,
            use_cuda_shared_memory)
        if len(shm_io_handle) != 0:
            shm_ip_handles.append(shm_io_handle[0])
            shm_op_handles.append(shm_io_handle[1])
            input_dict[input_name] = (shm_ip_handles[io_num], input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[io_num])
        else:
            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            su.destroy_either_shm_region(shm_ip_handles[io_num],
                                         use_system_shared_memory,
                                         use_cuda_shared_memory)
            su.destroy_either_shm_region(shm_op_handles[io_num],
                                         use_system_shared_memory,
                                         use_cuda_shared_memory)

    return results
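A hedged sketch of how this infer_zero variant is typically driven from a test case; "self" stands for the unittest.TestCase running the check, and the platform and shapes are illustrative (the identity "zero" models echo their input, so input and output shapes match):

infer_zero(self, "graphdef", batch_size=8, tensor_dtype=np.float32,
           input_shapes=[(16,)], output_shapes=[(16,)],
           use_system_shared_memory=True)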
    def test_custom(self):
        # custom_zero_1_float32 is identity model with input shape [-1]
        for client in self._clients:
            model_name = tu.get_zero_model_name("custom", 1, self._data_type)
            self._test_helper(client, model_name)
Example #10
    def test_libtorch(self):
        # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
        for client in self._clients:
            model_name = tu.get_zero_model_name("libtorch_nobatch", 1,
                                                self._data_type)
            self._test_helper(client, model_name, 'INPUT__0', 'OUTPUT__0')
Example #11
    def test_plan(self):
        # plan_nobatch_zero_1_float32 is identity model with input shape [-1]
        for client in self._clients:
            model_name = tu.get_zero_model_name("plan_nobatch", 1,
                                                self._data_type)
            self._test_helper(client, model_name)
Example #12
    def test_onnx(self):
        # onnx_nobatch_zero_1_float32 is identity model with input shape [-1]
        for client in self._clients:
            model_name = tu.get_zero_model_name("onnx_nobatch", 1,
                                                self._data_type)
            self._test_helper(client, model_name)
Example #13
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_streaming=True):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8000", ProtocolType.HTTP, False, True))
        else:
            configs.append(("localhost:8000", ProtocolType.HTTP, False, False))
    if use_grpc:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8001", ProtocolType.GRPC, False, True))
        else:
            configs.append(("localhost:8001", ProtocolType.GRPC, False, False))
    if use_streaming:
        if TEST_SHARED_MEMORY:
            configs.append(("localhost:8001", ProtocolType.GRPC, True, True))
        else:
            configs.append(("localhost:8001", ProtocolType.GRPC, True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
        input_dict = {}
        output_dict = {}
        expected_dict = {}

        if config[3]:
            # create and register shared memory region for inputs and outputs
            shm_ip_handles = list()
            shm_op_handles = list()
            shared_memory_ctx = SharedMemoryControlContext(config[0],
                                                           config[1],
                                                           verbose=True)
            for io_num in range(io_cnt):
                input0_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
                                    np.dtype(tensor_dtype).itemsize * batch_size
                output0_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
                                    np.dtype(tensor_dtype).itemsize * batch_size
                shm_ip_handles.append(shm.create_shared_memory_region("input"+str(io_num)+"_data",\
                                            "/input"+str(io_num), input0_byte_size))
                shm_op_handles.append(shm.create_shared_memory_region("output"+str(io_num)+"_data",\
                                            "/output"+str(io_num), output0_byte_size))

                shared_memory_ctx.register(shm_ip_handles[io_num])
                shared_memory_ctx.register(shm_op_handles[io_num])

            offset_input = 0
            offset_output = 0

        for io_num in range(io_cnt):
            if pf == "libtorch" or pf == "libtorch_nobatch":
                input_name = "INPUT__{}".format(io_num)
                output_name = "OUTPUT__{}".format(io_num)
            else:
                input_name = "INPUT{}".format(io_num)
                output_name = "OUTPUT{}".format(io_num)

            input_list = list()
            expected_list = list()
            for b in range(batch_size):
                rtensor_dtype = _range_repr_dtype(tensor_dtype)
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num],
                                        dtype=rtensor_dtype)
                if tensor_dtype != np.object:
                    in0 = in0.astype(tensor_dtype)
                    expected0 = np.ndarray.copy(in0)
                else:
                    expected0 = np.array([
                        unicode(str(x), encoding='utf-8')
                        for x in in0.flatten()
                    ],
                                         dtype=object)
                    in0 = np.array([str(x) for x in in0.flatten()],
                                   dtype=object).reshape(in0.shape)

                expected0 = expected0.reshape(output_shapes[io_num])

                input_list.append(in0)
                expected_list.append(expected0)

            expected_dict[output_name] = expected_list
            if config[3]:
                # copy data into shared memory region for input values
                shm.set_shared_memory_region(shm_ip_handles[io_num],
                                             input_list)
                input_dict[input_name] = shm_ip_handles[io_num]
                output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                            shm_op_handles[io_num])
            else:
                input_dict[input_name] = input_list
                output_dict[output_name] = InferContext.ResultFormat.RAW

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))
        if config[3]:
            for io_num in range(io_cnt):
                shared_memory_ctx.unregister(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shared_memory_ctx.unregister(shm_op_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
Example #14
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array([
                unicode(str(x), encoding='utf-8')
                for x in input_array.flatten()
            ],
                                      dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority,
                                          timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object) and (config[3] == False):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
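This variant streams some requests through async_stream_infer and collects results with a UserData queue and completion_callback, which are defined elsewhere in the test utilities. A sketch of the usual shape of that plumbing in the Triton QA tests, not necessarily the exact definitions used here:

import queue
from functools import partial

class UserData:
    def __init__(self):
        # completion_callback pushes (result, error) tuples here
        self._completed_requests = queue.Queue()

def completion_callback(user_data, result, error):
    # Bound via partial(completion_callback, user_data) so the stream callback
    # keeps its (result, error) signature.
    user_data._completed_requests.put((result, error))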
Example #15
def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()
    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size))
            output_shm_handle_list.append((shm.create_shared_memory_region(
                output_name + shm_suffix, '/' + output_name + shm_suffix,
                output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

            # unregister shared memory region for next config
            if use_system_shared_memory:
                triton_client.unregister_system_shared_memory(input_name +
                                                              shm_suffix)
                triton_client.unregister_system_shared_memory(output_name +
                                                              shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
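The dynamic-batcher tests above (test_dynamic_different_shape_values / test_dynamic_identical_shape_values) reach this helper through their check_response wrapper; a hedged sketch of an equivalent direct call with the same shape values, where "self" again stands for the TestCase:

infer_shape_tensor(self, "plan", np.float32,
                   input_shape_values=[[4, 4]],
                   dummy_input_shapes=[[16, 16]],
                   use_system_shared_memory=True,
                   shm_suffix="_0",
                   batch_size=1)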
Example #16
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes, output_shapes,
               model_version=None, use_http=True, use_grpc=True,
               use_streaming=True):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))

    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
        input_dict = {}
        output_dict = {}
        expected_dict = {}

        for io_num in range(io_cnt):
            if pf == "libtorch" or pf == "libtorch_nobatch":
                input_name = "INPUT__{}".format(io_num)
                output_name = "OUTPUT__{}".format(io_num)
            else:
                input_name = "INPUT{}".format(io_num)
                output_name = "OUTPUT{}".format(io_num)

            input_list = list()
            expected_list = list()
            for b in range(batch_size):
                rtensor_dtype = _range_repr_dtype(tensor_dtype)
                in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                        high=np.iinfo(rtensor_dtype).max,
                                        size=input_shapes[io_num], dtype=rtensor_dtype)
                if tensor_dtype != np.object:
                    in0 = in0.astype(tensor_dtype)
                    expected0 = np.ndarray.copy(in0)
                else:
                    expected0 = np.array([unicode(str(x), encoding='utf-8')
                                    for x in in0.flatten()], dtype=object)
                    in0 = np.array([str(x) for x in in0.flatten()],
                                   dtype=object).reshape(in0.shape)

                expected0 = expected0.reshape(output_shapes[io_num])

                input_list.append(in0)
                expected_list.append(expected0)

            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            expected_dict[output_name] = expected_list

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(), model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(np.array_equal(result_val[b], expected),
                                  "{}, {}, slot {}, expected: {}, got {}".format(
                                      model_name, result_name, b, expected, result_val[b]))

    return results
Example #17
    def __init__(self,
                 name,
                 rng,
                 sequence_trials,
                 identity_trials,
                 queue_latency_range_us=(10000, 100000),
                 sequence_id_range=None,
                 verbose=False,
                 out_stream=sys.stdout):
        super().__init__(name, [], verbose, out_stream)
        self.rng_ = rng
        self.sequence_id_range_ = sequence_id_range
        # List of tuples
        # (model_name, max_concurrency, batch_size, list(more PA options),
        #  real_data_file),
        self.options_ = []

        # Add no validation models
        self.options_.append(
            PerfAnalyzerScenario.ModelOption("resnet_v1_50_graphdef_def", 32,
                                             (1, 4, 1),
                                             queue_latency_range_us))
        for trial in sequence_trials:
            dtype = self.get_datatype(trial)
            # Skip string sequence model for now, it is hard for PA to generate
            # valid input
            if dtype == np.dtype(object):
                continue
            model_name = tu.get_sequence_model_name(trial, dtype)
            self.options_.append(
                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                                 queue_latency_range_us))
        for trial in identity_trials:
            dtype = np.float32
            model_name = tu.get_zero_model_name(trial, 1, dtype)
            if "libtorch" in trial:
                input_shapes = [("INPUT__0", "16")]
            else:
                input_shapes = [("INPUT0", "16")]
            self.options_.append(
                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                                 queue_latency_range_us,
                                                 input_shapes))

        # Add output validation version of the models
        # Skip resnet as the output data has variation which makes exact
        # matching hard
        for trial in sequence_trials:
            dtype = self.get_datatype(trial)
            model_name = tu.get_sequence_model_name(trial, dtype)
            data_file = os.path.join("validation_data",
                                     "{}.json".format(model_name))
            self.generate_sequence_data(trial, dtype, data_file)
            self.options_.append(
                PerfAnalyzerScenario.ModelOption(model_name,
                                                 1, (1, 4, 1),
                                                 queue_latency_range_us,
                                                 input_file=data_file))
        for trial in identity_trials:
            dtype = np.float32
            model_name = tu.get_zero_model_name(trial, 1, dtype)
            data_file = os.path.join("validation_data",
                                     "{}.json".format(model_name))
            self.generate_identity_data(trial, dtype, data_file)
            self.options_.append(
                PerfAnalyzerScenario.ModelOption(model_name,
                                                 1, (1, 4, 1),
                                                 queue_latency_range_us,
                                                 input_file=data_file))
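generate_identity_data is not shown in this snippet. A minimal sketch of what it might write for one of the identity ("zero") models, assuming perf_analyzer's documented real-input JSON layout with a "data" section and an optional "validation_data" section; the helper name, shape, and output naming below are assumptions:

import json
import numpy as np

def generate_identity_data_sketch(input_name, data_file, shape=(16,)):
    # For an identity model the expected output equals the input, so the same
    # values can double as validation data.
    values = np.random.rand(*shape).astype(np.float32).flatten().tolist()
    doc = {
        "data": [{input_name: {"content": values, "shape": list(shape)}}],
        "validation_data": [{
            input_name.replace("INPUT", "OUTPUT"): {
                "content": values,
                "shape": list(shape)
            }
        }],
    }
    with open(data_file, "w") as f:
        json.dump(doc, f)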
Example #18
def infer_shape_tensor(tester,
                       pf,
                       batch_size,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       model_version=None,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False,
                       priority=0,
                       timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    io_cnt = len(input_shape_values)

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        tester.assertTrue(pf == "plan" or pf == "plan_nobatch")

        input_name = "INPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)

        input_list = list()
        dummy_input_list = list()
        expected_list = list()
        for b in range(batch_size):
            # Prepare the dummy tensor
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            if (rtensor_dtype != np.bool):
                dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                              high=np.iinfo(rtensor_dtype).max,
                                              size=dummy_input_shapes[io_num],
                                              dtype=rtensor_dtype)
            else:
                dummy_in0 = np.random.choice(a=[False, True],
                                             size=dummy_input_shapes[io_num])
            if tensor_dtype != np.object:
                dummy_in0 = dummy_in0.astype(tensor_dtype)
            else:
                dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                     dtype=object).reshape(dummy_in0.shape)

            dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor. Only one tensor per batch
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected list for the output
        expected0 = np.ndarray.copy(in0)
        expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = len(in0) * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        dummy_input_byte_size = tu.shape_element_count(dummy_input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        # The dimension of this tensor will be the value of the shape tensor
        dummy_output_byte_size = tu.shape_element_count(in0) *\
                            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        if use_cuda_shared_memory:
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    input_byte_size, 0))
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    dummy_input_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    output_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    dummy_output_byte_size, 0))

            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                             input_list)
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                             dummy_input_list)
        elif use_system_shared_memory:
            shm_ip_handles.append(shm.create_shared_memory_region("input"+str(io_num)+"_data"+shm_suffix,\
                                        "/input"+str(io_num)+shm_suffix, input_byte_size))
            shm_ip_handles.append(shm.create_shared_memory_region("dumy_input"+str(io_num)+"_data"+shm_suffix,\
                                        "/dummy_input"+str(io_num)+shm_suffix, dummy_input_byte_size))
            shm_op_handles.append(shm.create_shared_memory_region("output"+str(io_num)+"_data"+shm_suffix,\
                                        "/output"+str(io_num)+shm_suffix, output_byte_size))
            shm_op_handles.append(shm.create_shared_memory_region("dummy_output"+str(io_num)+"_data"+shm_suffix,\
                                        "/dummy_output"+str(io_num)+shm_suffix, dummy_output_byte_size))
            shared_memory_ctx.register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.register(shm_op_handles[2 * io_num])
            shared_memory_ctx.register(shm_op_handles[2 * io_num + 1])
            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                         input_list)
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                         dummy_input_list)
        if use_system_shared_memory or use_cuda_shared_memory:
            input_dict[input_name] = (shm_ip_handles[2 * io_num],
                                      [len(input_shape_values[0])])
            input_dict[dummy_input_name] = (shm_ip_handles[2 * io_num + 1],
                                            dummy_input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[2 * io_num])
            output_dict[dummy_output_name] = (InferContext.ResultFormat.RAW,
                                              shm_op_handles[2 * io_num + 1])
        else:
            input_dict[input_name] = input_list
            input_dict[dummy_input_name] = dummy_input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            output_dict[dummy_output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict,
                          output_dict,
                          batch_size,
                          priority=priority,
                          timeout_us=timeout_us)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), 2 * io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            expected = expected_dict[output_name][0]
            for b in range(batch_size):
                if result_name == output_name:
                    tester.assertEqual(result_val[b].shape, expected.shape)
                    tester.assertTrue(
                        np.array_equal(result_val[b], expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))
                elif result_name == dummy_output_name:
                    # The shape of the dummy output should be equal to the shape values
                    # specified in the shape tensor
                    tester.assertTrue(
                        np.array_equal(result_val[b].shape, expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))

    if use_cuda_shared_memory or use_system_shared_memory:
        for io_num in range(2 * io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
Example #19
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--trial',
                        type=str,
                        required=True,
                        help='Set trial for the crashing client')
    FLAGS = parser.parse_args()
    trial = FLAGS.trial

    dtype = np.float32
    model_name = tu.get_zero_model_name(trial, 1, dtype)
    tensor_shape = (1, ) if "nobatch" in trial else (1, 1)

    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                     verbose=True)

    shm = shared_memory.SharedMemory(create=True, size=8)
    count = np.ndarray((1, ), dtype=np.int32, buffer=shm.buf)
    count[0] = 0

    p = Process(target=crashing_client,
                name="crashing_client",
                args=(
                    model_name,
                    dtype,
                    tensor_shape,
    def test_savedmodel(self):
        # savedmodel_nobatch_zero_1_float32 is identity model with input shape [-1]
        for client in self.clients_:
            model_name = tu.get_zero_model_name("savedmodel_nobatch", 1,
                                                self.data_type_)
            self._test_helper(client, model_name)