Example #1
    def preprocess(cls, img):
        """
        Pre-process an image to meet the size, type and format
        requirements specified by the parameters.

        :param img: Pillow image

        :returns:
            - model_input: input as required by the model
            - extra_data: dict of data that is needed by the postprocess function
        """
        extra_data = {}
        # Careful, Pillow has (w,h) format but most models expect (h,w)
        w, h = img.size
        extra_data["original_image_size"] = (h, w)

        if cls.SHAPE[2] == 1:
            img = img.convert("L")
        else:
            img = img.convert("RGB")

        logger.info(f"Original image size: {img.size}")

        # convert to cv2
        img = np.array(img)
        img = img[:, :, ::-1].copy()

        img = image_resize(img, cls.SHAPE[1:])
        img = image_preprocess(img)

        npdtype = triton_to_np_dtype(cls.DTYPE)
        img = img.astype(npdtype)

        return img, extra_data
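The cast above relies on triton_to_np_dtype from tritonclient.utils, which maps a Triton datatype string to the matching NumPy dtype. A minimal sketch of that mapping (the assertions reflect the standard correspondence, e.g. "FP32" to np.float32):

import numpy as np
from tritonclient.utils import triton_to_np_dtype

# Triton datatype strings resolve to NumPy dtypes
assert triton_to_np_dtype("FP32") == np.float32
assert triton_to_np_dtype("INT32") == np.int32
assert triton_to_np_dtype("UINT8") == np.uint8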
Example #2
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        # np.array is a factory function, not a type; check for an ndarray instance instead
        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input0_data, binary_data=True)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = utils.triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all(),
                "Model output does not match expected output")
        except Exception as ex:
            error_msg.append(str(ex))
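The helper above assumes that 64-byte system shared-memory regions named "input0_data", "input1_data", "output0_data" and "output1_data" have already been created, filled and registered with the server. A sketch of that setup, assuming triton_client is an already constructed client and using tritonclient.utils.shared_memory:

import numpy as np
import tritonclient.utils.shared_memory as shm

byte_size = 64  # 16 INT32 elements x 4 bytes

# Create the regions (Triton name, system shared-memory key, size in bytes)
shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_data", byte_size)
shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_data", byte_size)
shm_op0_handle = shm.create_shared_memory_region("output0_data", "/output0_data", byte_size)
shm_op1_handle = shm.create_shared_memory_region("output1_data", "/output1_data", byte_size)

# Copy the input tensors into their regions
shm.set_shared_memory_region(shm_ip0_handle, [np.arange(16, dtype=np.int32)])
shm.set_shared_memory_region(shm_ip1_handle, [np.ones(16, dtype=np.int32)])

# Register every region with the server before calling infer
for name, key in [("input0_data", "/input0_data"), ("input1_data", "/input1_data"),
                  ("output0_data", "/output0_data"), ("output1_data", "/output1_data")]:
    triton_client.register_system_shared_memory(name, key, byte_size)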
Example #3
    def get_embedding(self, face_img):
        if not isinstance(face_img, list):
            face_img = [face_img]

        face_img = np.stack(face_img)

        input_size = tuple(face_img[0].shape[0:2][::-1])
        blob = cv2.dnn.blobFromImages(
            face_img,
            1.0 / self.input_std,
            input_size, (self.input_mean, self.input_mean, self.input_mean),
            swapRB=True)

        blob = blob.astype(triton_to_np_dtype(self.dtype))

        inputs = []
        inputs.append(
            grpcclient.InferInput(self.input_name,
                                  [blob.shape[0], self.c, self.h, self.w],
                                  "FP32"))
        # inputs[0].set_data_from_numpy(face_img)

        cudashm.set_shared_memory_region(self.in_handle, [blob])
        input_bytesize = 12 * blob.shape[0] * self.w * self.h
        inputs[-1].set_shared_memory(self.in_handle_name, input_bytesize)

        outputs = []
        out_bytesize = 12 * 512 * self.max_batch_size
        outputs.append(grpcclient.InferRequestedOutput(self.output_name[0]))
        outputs[-1].set_shared_memory(self.out_handle_name, out_bytesize)

        out = self.triton_client.infer(self.model_name,
                                       inputs,
                                       model_version=self.model_version,
                                       outputs=outputs)

        out = [
            cudashm.get_contents_as_numpy(self.out_handle,
                                          triton_to_np_dtype(self.dtype),
                                          [blob.shape[0], 512])
        ]
        # out = [out.as_numpy(e) for e in self.output_name]

        return out[0]
Example #4
    def predict(self, input_images):
        # Put input data values into shared memory
        shm.set_shared_memory_region(self.input_images_handle, [input_images])

        results = self.triton_client.infer(model_name=self.model_name,
                                           inputs=self.inputs,
                                           outputs=self.outputs)
        # Read results from the shared memory.
        output = results.get_output("output")
        output_data = shm.get_contents_as_numpy(
            self.output_handle, utils.triton_to_np_dtype(output.datatype),
            output.shape)

        return output_data
Example #5
    def predict(self, deployment_name, df):
        single_input_np = None
        if isinstance(df, np.ndarray):
            single_input_np = df

        inputs = []
        if single_input_np is not None:
            model_metadata = self.triton_client.get_model_metadata(
                deployment_name)
            raise MlflowException("Unnamed input is not currently supported")
        else:
            if isinstance(df, pd.DataFrame):
                model_metadata = self.triton_client.get_model_metadata(
                    deployment_name)
                input_dtype = {}
                for input in model_metadata["inputs"]:
                    input_dtype[input["name"]] = triton_to_np_dtype(
                        input["datatype"])
                # Sanity check
                if len(df.columns) != 1:
                    raise MlflowException(
                        "Expect Pandas DataFrame has only 1 column")
                col = df.columns[0]
                for row in df.index:
                    val = df[col][row]
                    # Need to form numpy array of the data type expected
                    if type(df[col][row]) != np.ndarray:
                        val = np.array(val, dtype=input_dtype[row])
                    inputs.append(
                        tritonhttpclient.InferInput(
                            row, val.shape, np_to_triton_dtype(val.dtype)))
                    inputs[-1].set_data_from_numpy(val)
            else:
                # assume df is a dict-like mapping of input names to numpy arrays
                for key, val in df.items():
                    inputs.append(
                        tritonhttpclient.InferInput(
                            key, val.shape, np_to_triton_dtype(val.dtype)))
                    inputs[-1].set_data_from_numpy(val)

        try:
            resp = self.triton_client.infer(model_name=deployment_name,
                                            inputs=inputs)
            res = {}
            for output in resp.get_response()['outputs']:
                res[output['name']] = resp.as_numpy(output['name'])
            return {"outputs": res}
        except InferenceServerException as ex:
            raise MlflowException(str(ex))
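A hypothetical call to this predict method, passing named inputs as a dict of NumPy arrays; the deployment name, tensor name and shape below are placeholders, and client stands for an instance of the class above:

import numpy as np

named_inputs = {"INPUT0": np.ones((1, 16), dtype=np.float32)}
result = client.predict("simple", named_inputs)
print(result["outputs"])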
Example #6
    def get_embedding(self, face_img):
        face_img = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
        face_img = np.transpose(face_img, (2, 0, 1))
        face_img = np.expand_dims(face_img, axis=0)
        face_img = face_img.astype(triton_to_np_dtype(self.dtype))
        inputs = []
        inputs.append(httpclient.InferInput(self.input_name, [1, self.c, self.h, self.w], "FP32"))
        inputs[0].set_data_from_numpy(face_img)

        out = self.triton_client.infer(self.model_name,
                            inputs,
                            model_version=self.model_version,
                            outputs=None)
        out = [out.as_numpy(e)[0] for e in self.output_name]
        #print(output.get_output(self.output_name)['data'])
        return out
Example #7
def preprocess(img, format, dtype, c, h, w, scaling, protocol):
    """
    Pre-process an image to meet the size, type and format
    requirements specified by the parameters.
    """
    # np.set_printoptions(threshold='nan')

    if c == 1:
        sample_img = img.convert('L')
    else:
        sample_img = img.convert('RGB')

    resized_img = sample_img.resize((w, h), Image.BILINEAR)
    resized = np.array(resized_img)
    if resized.ndim == 2:
        resized = resized[:, :, np.newaxis]

    npdtype = triton_to_np_dtype(dtype)
    typed = resized.astype(npdtype)

    if scaling == 'INCEPTION':
        scaled = (typed / 127.5) - 1
    elif scaling == 'VGG':
        if c == 1:
            scaled = typed - np.asarray((128,), dtype=npdtype)
        else:
            scaled = typed - np.asarray((123, 117, 104), dtype=npdtype)
    else:
        scaled = typed

    # Swap to CHW if necessary
    if protocol == "grpc":
        if format == mc.ModelInput.FORMAT_NCHW:
            ordered = np.transpose(scaled, (2, 0, 1))
        else:
            ordered = scaled
    else:
        if format == "FORMAT_NCHW":
            ordered = np.transpose(scaled, (2, 0, 1))
        else:
            ordered = scaled

    # Channels are in RGB order. Currently model configuration data
    # doesn't provide any information as to other channel orderings
    # (like BGR) so we just assume RGB.
    return ordered
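An illustrative call to this preprocess function over HTTP for a 3-channel 224x224 FP32 model in NCHW layout; the file name and sizes are placeholders:

import numpy as np
from PIL import Image

img = Image.open("example.jpg")
image_data = preprocess(img, "FORMAT_NCHW", "FP32", 3, 224, 224, "INCEPTION", "http")
# Add a batch dimension if the model expects one
batched = np.expand_dims(image_data, axis=0)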
Example #8
    def __iter__(self):
        client = InferenceServerClient(self._server_url, verbose=self._verbose)
        error = self._verify_triton_state(client)
        if error:
            raise RuntimeError(
                f"Could not communicate to Triton Server: {error}")

        LOGGER.debug(
            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} "
            f"are up and ready!")

        model_config = client.get_model_config(self._model_name,
                                               self._model_version)
        model_metadata = client.get_model_metadata(self._model_name,
                                                   self._model_version)
        LOGGER.info(f"Model config {model_config}")
        LOGGER.info(f"Model metadata {model_metadata}")

        inputs = {tm.name: tm for tm in model_metadata.inputs}
        outputs = {tm.name: tm for tm in model_metadata.outputs}
        output_names = list(outputs)
        outputs_req = [InferRequestedOutput(name) for name in outputs]

        for ids, x, y_real in self._dataloader:
            infer_inputs = []
            for name in inputs:
                data = x[name]
                infer_input = InferInput(name, data.shape,
                                         inputs[name].datatype)

                target_np_dtype = client_utils.triton_to_np_dtype(
                    inputs[name].datatype)
                data = data.astype(target_np_dtype)

                infer_input.set_data_from_numpy(data)
                infer_inputs.append(infer_input)

            results = client.infer(
                model_name=self._model_name,
                model_version=self._model_version,
                inputs=infer_inputs,
                outputs=outputs_req,
                client_timeout=self._response_wait_t,
            )
            y_pred = {name: results.as_numpy(name) for name in output_names}
            yield ids, x, y_pred, y_real
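Consuming the iterator above might look like the following sketch; runner is just a placeholder for an instance of the surrounding class, constructed elsewhere:

for ids, x, y_pred, y_real in runner:
    # y_pred maps each output name to a NumPy array returned by the server
    for name, pred in y_pred.items():
        print(name, pred.shape)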
Example #9
    def _pre_process_edgetpu(cls, img, dims):
        """
        set image file dimensions to 224x224 by resizing and cropping
        image from center

        :param img: image as array in HWC format
        :param dims: dims as tuple in HWC order
        """
        output_height, output_width, _ = dims
        img = cls._resize_with_aspectratio(img,
                                           output_height,
                                           output_width,
                                           inter_pol=cv2.INTER_LINEAR)
        img = cls._center_crop(img, output_height, output_width)
        npdtype = triton_to_np_dtype(cls.DTYPE)
        img = np.asarray(img, dtype=npdtype)
        # converts jpg pixel value from [0 - 255] to float array [-1.0 - 1.0]
        img -= [127.0, 127.0, 127.0]
        img /= [128.0, 128.0, 128.0]
        return img
Example #10
    def prepare(self, ctx_id=0):
        concurrency = 2
        # Make sure the model matches our requirements, and get some
        # properties of the model that we need for preprocessing

        try:
            model_metadata = self.triton_client.get_model_metadata(
                model_name=self.model_name, model_version=self.model_version)
        except InferenceServerException as e:
            print("failed to retrieve the metadata: " + str(e))
            sys.exit(1)

        logging.info(model_metadata)

        try:
            model_config = self.triton_client.get_model_config(
                model_name=self.model_name, model_version=self.model_version)
        except InferenceServerException as e:
            print("failed to retrieve the config: " + str(e))
            sys.exit(1)

        (self.max_batch_size, self.input_name, self.output_name, self.c, self.h,
         self.w, self.format, self.dtype, self.out_shapes) = parse_model_grpc(
             model_metadata, model_config.config)

        self.input_shape = (1, self.c, self.h, self.w)
        self.input_dtype = triton_to_np_dtype(self.dtype)

        self.in_handle_name = f'{self.model_name}_data_{os.getpid()}'

        if self.max_batch_size <= 0:
            self.max_batch_size = 1
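        # 12 bytes per pixel below presumably corresponds to 3 FP32 channels (3 x 4 bytes)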
        self.input_bytesize = 12 * self.w * self.h * 1

        self.in_handle = cudashm.create_shared_memory_region(
            self.in_handle_name, self.input_bytesize, 0)

        self.triton_client.unregister_cuda_shared_memory(self.in_handle_name)
        self.triton_client.register_cuda_shared_memory(
            self.in_handle_name, cudashm.get_raw_handle(self.in_handle), 0,
            self.input_bytesize)
Example #11
    def preprocess(cls, img):
        """
        Pre-process an image to meet the size, type and format
        requirements specified by the parameters.
        https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/yolov4

        :param img: Pillow image

        :returns:
            - model_input: input as required by the model
            - extra_data: dict of data that is needed by the postprocess function
        """
        extra_data = {}
        # Careful, Pillow has (w,h) format but most models expect (h,w)
        w, h = img.size
        extra_data["original_image_size"] = (h, w)

        if cls.SHAPE[2] == 1:
            sample_img = img.convert("L")
        else:
            sample_img = img.convert("RGB")

        logger.info(f"Original image size: {sample_img.size}")

        # convert to cv2
        open_cv_image = np.array(sample_img)
        open_cv_image = open_cv_image[:, :, ::-1].copy()

        image = image_preprocess(open_cv_image, (cls.SHAPE[0], cls.SHAPE[1]))

        npdtype = triton_to_np_dtype(cls.DTYPE)
        image = image.astype(npdtype)

        # channels first if needed (transpose the preprocessed array, not the Pillow image)
        if cls.CHANNEL_FIRST:
            image = np.transpose(image, (2, 0, 1))

        return image, extra_data
Example #12
    def inputs_outputs_generator(self, raw_inputs):
        """
        Generate input and output blobs for Triton client inference
        :param raw_inputs: list of raw numpy inputs
        :return: inputs and outputs data
        """
        inputs = []
        for input_specs, raw_input in zip(self.inputs_specs, raw_inputs):
            # parse data type
            raw_input = raw_input.astype(
                triton_to_np_dtype(input_specs.datatype))
            infer_input = grpcclient.InferInput(input_specs.name,
                                                raw_input.shape,
                                                input_specs.datatype)
            infer_input.set_data_from_numpy(raw_input)
            inputs.append(infer_input)

        outputs = []
        for output_specs in self.outputs_specs:
            outputs.append(
                grpcclient.InferRequestedOutput(output_specs.name,
                                                class_count=0))
        return inputs, outputs
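A hedged usage sketch for the generator above; wrapper stands for an instance of the surrounding class, assumed to hold a grpcclient.InferenceServerClient as triton_client, and the model name and input shape are placeholders:

import numpy as np

raw_inputs = [np.random.rand(1, 3, 224, 224).astype(np.float32)]
inputs, outputs = wrapper.inputs_outputs_generator(raw_inputs)
response = wrapper.triton_client.infer("my_model", inputs, outputs=outputs)
result = response.as_numpy(wrapper.outputs_specs[0].name)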
Example #13
# TODO: Make it easily configurable
MODEL = "mnist_tf_savedmodel"
MODEL_VER = "1"
URL_HTTP = "localhost:8000"
URL_GRPC = "localhost:8001"

INPUT_SHAPE = (28, 28)
DATA = "data/7.png"

# pre-processing
img = Image.open(DATA).convert('L')
img = img.resize(INPUT_SHAPE)
imgArr = np.asarray(img) / 255
imgArr = np.expand_dims(imgArr[:, :, np.newaxis], 0)
imgArr = imgArr.astype(triton_to_np_dtype('FP32'))

# Client-Server GRPC
print("Using GRPC ... ")
triton_client = grpcclient.InferenceServerClient(url=URL_GRPC, verbose=0)
inputs = []
inputs.append(grpcclient.InferInput('flatten_1_input', imgArr.shape, 'FP32'))
inputs[0].set_data_from_numpy(imgArr)
outputs = []
outputs.append(grpcclient.InferRequestedOutput('dense_3', class_count=0))
responses = []
responses.append(triton_client.infer(MODEL,
                                     inputs,
                                     request_id=str(1),
                                     model_version=MODEL_VER,
                                     outputs=outputs))
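Reading the prediction back from the gRPC response can then be done with as_numpy; 'dense_3' matches the output requested above:

result = responses[0].as_numpy('dense_3')
print("predicted digit:", int(np.argmax(result)))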
Example #14
    outputs.append(httpclient.InferRequestedOutput('OUTPUT1',
                                                   binary_data=True))
    outputs[-1].set_shared_memory("output_data",
                                  output_byte_size,
                                  offset=output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op_handle, utils.triton_to_np_dtype(output0['datatype']),
            output0['shape'])
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(shm_op_handle,
                                                 utils.triton_to_np_dtype(
                                                     output1['datatype']),
                                                 output1['shape'],
                                                 offset=output_byte_size)
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)
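Once the outputs have been copied out of shared memory, a typical teardown unregisters the regions from the server and destroys them locally; a sketch using the output handle named in this snippet:

triton_client.unregister_system_shared_memory()
shm.destroy_shared_memory_region(shm_op_handle)
# input regions (handles not shown in this excerpt) would be destroyed the same way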
Example #15
    def test_buffer_attributes(self):
        model_name = 'bls'

        # Infer
        clients = [
            httpclient.InferenceServerClient(url='localhost:8000'),
            grpcclient.InferenceServerClient(url='localhost:8001')
        ]
        triton_clients = [httpclient, grpcclient]
        for i, client in enumerate(clients):

            # To make sure no shared memory regions are registered with the
            # server.
            client.unregister_system_shared_memory()
            client.unregister_cuda_shared_memory()

            triton_client = triton_clients[i]
            inputs = []
            outputs = []
            inputs.append(
                triton_client.InferInput('INPUT0', [1, 1000], "INT32"))

            input0_data = np.arange(start=0, stop=1000, dtype=np.int32)
            input0_data = np.expand_dims(input0_data, axis=0)

            input_byte_size = input0_data.size * input0_data.itemsize
            output_byte_size = input_byte_size

            shm_ip0_handle = cudashm.create_shared_memory_region(
                "input0_data", input_byte_size, 0)
            shm_op0_handle = cudashm.create_shared_memory_region(
                "output0_data", output_byte_size, 0)

            client.register_cuda_shared_memory(
                "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0,
                input_byte_size)
            client.register_cuda_shared_memory(
                "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0,
                input_byte_size)

            cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
            inputs[0].set_shared_memory("input0_data", input_byte_size)

            if triton_client is grpcclient:
                outputs.append(triton_client.InferRequestedOutput('OUTPUT0'))
                outputs[0].set_shared_memory("output0_data", output_byte_size)
            else:
                outputs.append(
                    triton_client.InferRequestedOutput('OUTPUT0',
                                                       binary_data=True))
                outputs[0].set_shared_memory("output0_data", output_byte_size)

            results = client.infer(model_name=model_name,
                                   inputs=inputs,
                                   outputs=outputs)

            output0 = results.get_output("OUTPUT0")
            self.assertIsNotNone(output0)
            if triton_client is grpcclient:
                output0_data = cudashm.get_contents_as_numpy(
                    shm_op0_handle, triton_to_np_dtype(output0.datatype),
                    output0.shape)
            else:
                output0_data = cudashm.get_contents_as_numpy(
                    shm_op0_handle, triton_to_np_dtype(output0['datatype']),
                    output0['shape'])
            self.assertTrue(np.all(output0_data == input0_data))
    inputs[-1].set_shared_memory("input1_data", input1_byte_size)

    outputs = []
    outputs.append(httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    print(utils.triton_to_np_dtype(output0['datatype']))
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
            output0['shape'])
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(
            shm_op1_handle, utils.triton_to_np_dtype(output1['datatype']),
            output1['shape'])
    else:
        print("OUTPUT1 is missing in the response.")
Example #17
    outputs[-1].set_shared_memory("output_data", output_byte_size)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output_data",
                                  output_byte_size,
                                  offset=output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op_handle, utils.triton_to_np_dtype(output0.datatype),
            output0.shape)
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(shm_op_handle,
                                                 utils.triton_to_np_dtype(
                                                     output1.datatype),
                                                 output1.shape,
                                                 offset=output_byte_size)
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)
    inputs[-1].set_shared_memory("input1_data", input1_byte_size)

    outputs = []
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    print(utils.triton_to_np_dtype(output0.datatype))
    if output0 is not None:
        output0_data = shm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
            output0.shape)
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        output1_data = shm.get_contents_as_numpy(
            shm_op1_handle, utils.triton_to_np_dtype(output1.datatype),
            output1.shape)
    else:
        print("OUTPUT1 is missing in the response.")
    try:
        model_config = triton_client.get_model_config(
            model_name=FLAGS.model_name, model_version=FLAGS.model_version)
    except InferenceServerException as e:
        print("failed to retrieve the config: " + str(e))
        sys.exit(1)

    if FLAGS.protocol.lower() == "grpc":
        max_batch_size, input_name, output_name, dtype = parse_model_grpc(
            model_metadata, model_config.config)
    else:
        max_batch_size, input_name, output_name, dtype = parse_model_http(
            model_metadata, model_config)

    input_data = np.zeros([FLAGS.batch_size, FLAGS.shape],
                          dtype=triton_to_np_dtype(dtype))

    # --------------------------- Warm-Up --------------------------------------------------------
    for i in range(FLAGS.warmup_count):
        inputs, outputs = requestGenerator(input_name, input_data, output_name,
                                           dtype, FLAGS.protocol.lower())
        triton_client.infer(FLAGS.model_name,
                            inputs,
                            model_version=FLAGS.model_version,
                            outputs=outputs)

    latencies = []

    # --------------------------- Start Load --------------------------------------------------------

    start_time = time.time()
    def req_loop(self):
        client = InferenceServerClient(self._server_url, verbose=self._verbose)
        self._errors = self._verify_triton_state(client)
        if self._errors:
            return

        LOGGER.debug(
            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} "
            f"are up and ready!")

        model_config = client.get_model_config(self._model_name,
                                               self._model_version)
        model_metadata = client.get_model_metadata(self._model_name,
                                                   self._model_version)
        LOGGER.info(f"Model config {model_config}")
        LOGGER.info(f"Model metadata {model_metadata}")

        inputs = {tm.name: tm for tm in model_metadata.inputs}
        outputs = {tm.name: tm for tm in model_metadata.outputs}
        output_names = list(outputs)
        outputs_req = [InferRequestedOutput(name) for name in outputs]

        self._num_waiting_for = 0

        for ids, x, y_real in self._dataloader:
            infer_inputs = []
            for name in inputs:
                data = x[name]
                infer_input = InferInput(name, data.shape,
                                         inputs[name].datatype)

                target_np_dtype = client_utils.triton_to_np_dtype(
                    inputs[name].datatype)
                data = data.astype(target_np_dtype)

                infer_input.set_data_from_numpy(data)
                infer_inputs.append(infer_input)

            with self._sync:

                def _check_can_send():
                    return self._num_waiting_for < self._max_unresp_reqs

                can_send = self._sync.wait_for(_check_can_send,
                                               timeout=self._response_wait_t)
                if not can_send:
                    error_msg = f"Runner could not send new requests for {self._response_wait_t}s"
                    self._errors.append(error_msg)
                    break

                callback = functools.partial(AsyncGRPCTritonRunner._on_result,
                                             self, ids, x, y_real,
                                             output_names)
                client.async_infer(
                    model_name=self._model_name,
                    model_version=self._model_version,
                    inputs=infer_inputs,
                    outputs=outputs_req,
                    callback=callback,
                )
                self._num_waiting_for += 1

        # wait till receive all requested data
        with self._sync:

            def _all_processed():
                LOGGER.debug(
                    f"wait for {self._num_waiting_for} unprocessed jobs")
                return self._num_waiting_for == 0

            self._processed_all = self._sync.wait_for(
                _all_processed, self.DEFAULT_MAX_FINISH_WAIT_S)
            if not self._processed_all:
                error_msg = f"Runner {self._response_wait_t}s timeout received while waiting for results from server"
                self._errors.append(error_msg)
        LOGGER.debug("Finished request thread")
def infer_and_validata(use_shared_memory, orig_input0_data, orig_input1_data):
    if use_shared_memory:
        input0_data = orig_input0_data
        input1_data = orig_input1_data
        byte_size = input0_data.size * input0_data.itemsize
        inputs[0].set_shared_memory("input0_data", byte_size)
        inputs[1].set_shared_memory("input1_data", byte_size)
        outputs[0].set_shared_memory("output0_data", byte_size)
        outputs[1].set_shared_memory("output1_data", byte_size)
    else:
        input0_data = orig_input0_data
        input1_data = orig_input1_data * 2
        inputs[0].set_data_from_numpy(np.expand_dims(input0_data, axis=0))
        inputs[1].set_data_from_numpy(np.expand_dims(input1_data, axis=0))
        outputs[0].unset_shared_memory()
        outputs[1].unset_shared_memory()

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
                    output0.shape)
            else:
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle,
                    utils.triton_to_np_dtype(output0['datatype']),
                    output0['shape'])
        else:
            output0_data = results.as_numpy('OUTPUT0')
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle, utils.triton_to_np_dtype(output1.datatype),
                    output1.shape)
            else:
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle,
                    utils.triton_to_np_dtype(output1['datatype']),
                    output1['shape'])
        else:
            output1_data = results.as_numpy('OUTPUT1')
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    if use_shared_memory:
        print("\n\n======== SHARED_MEMORY ========\n")
    else:
        print("\n\n======== NO_SHARED_MEMORY ========\n")
    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[i] + input1_data[i]) != output0_data[0][i]:
            print("shm infer error: incorrect sum")
            sys.exit(1)
        if (input0_data[i] - input1_data[i]) != output1_data[0][i]:
            print("shm infer error: incorrect difference")
            sys.exit(1)
    print("\n======== END ========\n\n")