Example #1
    def execute(self, requests):
        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        responses = []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # If both tensors are on CPU, use NumPy for BYTES/object dtypes;
            # otherwise stay in PyTorch via DLPack.
            if in_0.is_cpu() and in_1.is_cpu():
                if in_0.as_numpy().dtype.type is np.bytes_ or in_0.as_numpy(
                ).dtype == np.object_:
                    out_0, out_1 = (in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32),\
                        in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32))
                    out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                                   out_0.astype(output0_dtype))
                    out_tensor_1 = pb_utils.Tensor("OUTPUT1",
                                                   out_1.astype(output1_dtype))
                else:
                    in_0_pytorch, in_1_pytorch = from_dlpack(
                        in_0.to_dlpack()), from_dlpack(in_1.to_dlpack())
                    out_0, out_1 = (in_0_pytorch - in_1_pytorch,
                                    in_0_pytorch + in_1_pytorch)

                    if self.output0_dtype == np.object_:
                        out_tensor_0 = pb_utils.Tensor(
                            "OUTPUT0",
                            out_0.numpy().astype(output0_dtype))
                    else:
                        out_0 = out_0.type(
                            self.numpy_to_pytorch_dtype[output0_dtype])
                        out_tensor_0 = pb_utils.Tensor.from_dlpack(
                            "OUTPUT0", to_dlpack(out_0))

                    if self.output1_dtype == np.object_:
                        out_tensor_1 = pb_utils.Tensor(
                            "OUTPUT1",
                            out_1.numpy().astype(output1_dtype))
                    else:
                        out_1 = out_1.type(
                            self.numpy_to_pytorch_dtype[output1_dtype])
                        out_tensor_1 = pb_utils.Tensor.from_dlpack(
                            "OUTPUT1", to_dlpack(out_1))

            else:
                in_0_pytorch, in_1_pytorch = from_dlpack(
                    in_0.to_dlpack()).cuda(), from_dlpack(
                        in_1.to_dlpack()).cuda()
                out_0, out_1 = (in_0_pytorch - in_1_pytorch,
                                in_0_pytorch + in_1_pytorch)
                out_tensor_0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(out_0))
                out_tensor_1 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT1", to_dlpack(out_1))

            responses.append(
                pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]))

        return responses
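Example #1 relies on attributes set up in `initialize` (the output dtypes and a NumPy-to-PyTorch dtype map) and on the DLPack helpers from torch. The sketch below shows what that setup might look like; the config field names and the dtype table are assumptions based on the usual add_sub configuration, not part of the original snippet.

# Sketch only: assumed initialize() for Example #1; adjust names to the
# actual model configuration.
import json

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import from_dlpack, to_dlpack


class TritonPythonModel:

    def initialize(self, args):
        model_config = json.loads(args['model_config'])
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT0")
        output1_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT1")
        self.output0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])
        self.output1_dtype = pb_utils.triton_string_to_numpy(
            output1_config['data_type'])
        # NumPy -> PyTorch dtype map used on the DLPack path in execute().
        self.numpy_to_pytorch_dtype = {
            np.bool_: torch.bool,
            np.uint8: torch.uint8,
            np.int8: torch.int8,
            np.int16: torch.int16,
            np.int32: torch.int32,
            np.int64: torch.int64,
            np.float16: torch.float16,
            np.float32: torch.float32,
            np.float64: torch.float64,
        }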
Example #2
    def execute(self, requests):
        """Model supporting optional inputs. If the input is not provided, an
        input tensor of size 1 containing scalar 5 will be used."""
        responses = []
        for request in requests:
            input0_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            input1_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            if input0_tensor is not None:
                input0_numpy = input0_tensor.as_numpy()
            else:
                input0_numpy = np.array([5], dtype=np.int32)

            if input1_tensor is not None:
                input1_numpy = input1_tensor.as_numpy()
            else:
                input1_numpy = np.array([5], dtype=np.int32)

            output0_tensor = pb_utils.Tensor("OUTPUT0",
                                             input0_numpy + input1_numpy)
            output1_tensor = pb_utils.Tensor("OUTPUT1",
                                             input0_numpy - input1_numpy)
            responses.append(
                pb_utils.InferenceResponse([output0_tensor, output1_tensor]))

        return responses
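The optional-input behavior in Example #2 only works if the model configuration marks INPUT0 and INPUT1 as optional. A rough config.pbtxt sketch is shown below; the model name, shapes, and data types are assumptions rather than part of the original.

name: "optional_add_sub"
backend: "python"
max_batch_size: 0
input [
  {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "OUTPUT1"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]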
Example #3
def bls_add_sub(_=None):
    input0_np = np.random.randn(*[16])
    input0_np = input0_np.astype(np.float32)
    input1_np = np.random.randn(*[16])
    input1_np = input1_np.astype(np.float32)
    input0 = pb_utils.Tensor('INPUT0', input0_np)
    input1 = pb_utils.Tensor('INPUT1', input1_np)
    infer_request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    expected_output_0 = input0.as_numpy() + input1.as_numpy()
    expected_output_1 = input0.as_numpy() - input1.as_numpy()

    if not np.all(expected_output_0 == output0.as_numpy()):
        return False

    if not np.all(expected_output_1 == output1.as_numpy()):
        return False

    return True
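Example #3 is a BLS helper rather than a complete model; in practice it would be called from `execute` and its boolean result returned as an output tensor. A minimal wrapper sketch follows, in the same spirit as Example #30; the output name and dtype are assumptions.

    def execute(self, requests):
        responses = []
        for _ in requests:
            # Run the BLS check against the 'add_sub' model and report the result.
            result = bls_add_sub()
            responses.append(
                pb_utils.InferenceResponse([
                    pb_utils.Tensor('OUTPUT0',
                                    np.array([result], dtype=np.bool_))
                ]))
        return responses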
Example #4
    def execute(self, requests):
        """ This function is called on inference request.
        """

        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        responses = []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")
            if in_0.as_numpy().dtype.type is np.bytes_ or \
                    in_0.as_numpy().dtype == np.object_:
                out_0, out_1 = (in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32),
                                in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32))
            else:
                out_0, out_1 = (in_0.as_numpy() - in_1.as_numpy(),
                                in_0.as_numpy() + in_1.as_numpy())

            out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                           out_0.astype(output0_dtype))
            out_tensor_1 = pb_utils.Tensor("OUTPUT1",
                                           out_1.astype(output1_dtype))
            responses.append(
                pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]))
        return responses
Example #5
File: model.py  Project: spnettec/kura
    def execute(self, requests):
        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        responses = []

        for request in requests:
            THRESHOLD = 0.20

            # Get input
            x_recon = pb_utils.get_input_tensor_by_name(
                request, "RECONSTR0").as_numpy()
            x_orig = pb_utils.get_input_tensor_by_name(request,
                                                       "ORIG0").as_numpy()

            # Mean squared error between the reconstructed and the original input
            reconstruction_score = np.mean((x_orig - x_recon)**2, axis=1)

            anomaly = reconstruction_score > THRESHOLD

            # Create output tensors
            out_tensor_0 = pb_utils.Tensor(
                "ANOMALY_SCORE0", reconstruction_score.astype(output0_dtype))
            out_tensor_1 = pb_utils.Tensor("ANOMALY0",
                                           anomaly.astype(output1_dtype))

            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0, out_tensor_1])
            responses.append(inference_response)

        return responses
Example #6
    def execute(self, requests):
        """ Create a response sender object and use that
        for sending the response.
        """

        # This model does not support batching, so `requests` should always
        # contain exactly one request.
        if len(requests) != 1:
            raise pb_utils.TritonModelException("unsupported batch size " +
                                                str(len(requests)))

        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        response_sender = requests[0].get_response_sender()
        in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0")
        in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1")
        out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(),
                        in_0.as_numpy() - in_1.as_numpy())

        out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
        out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype))
        response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])

        # Send the response and mark it as the final one for this request.
        response_sender.send(
            response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
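The response-sender pattern in Example #6 requires the model to be declared decoupled; otherwise Triton expects `execute` to return a list of responses. A sketch of the config.pbtxt addition (assuming the rest of the config is already in place):

# In config.pbtxt, the decoupled transaction policy enables the
# response_sender API used above.
model_transaction_policy {
  decoupled: true
}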
Example #7
    def execute(self, requests):
        """`execute` MUST be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(),
                            in_0.as_numpy() - in_1.as_numpy())

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                           out_0.astype(output0_dtype))
            out_tensor_1 = pb_utils.Tensor("OUTPUT1",
                                           out_1.astype(output1_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0, out_tensor_1])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
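For context, the add_sub model in Example #7 would be exercised from a client with the standard Triton client library. The sketch below uses the HTTP client; the server URL, tensor shapes, and FP32 dtypes are assumptions rather than values taken from the original.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

input0 = np.random.rand(16).astype(np.float32)
input1 = np.random.rand(16).astype(np.float32)

inputs = [
    httpclient.InferInput("INPUT0", list(input0.shape), "FP32"),
    httpclient.InferInput("INPUT1", list(input1.shape), "FP32"),
]
inputs[0].set_data_from_numpy(input0)
inputs[1].set_data_from_numpy(input1)

# In this example OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1.
result = client.infer(model_name="add_sub", inputs=inputs)
print(result.as_numpy("OUTPUT0"))
print(result.as_numpy("OUTPUT1"))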
Example #8
    def _send_bls_sequence_requests(self, correlation_id):
        # Start request
        try:
            input = pb_utils.Tensor('INPUT', np.array([1000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START,
                correlation_id=correlation_id)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START)
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], input.as_numpy()[0])

            for i in range(10):
                input = pb_utils.Tensor('INPUT', np.array([i], dtype=np.int32))
                infer_request = pb_utils.InferenceRequest(
                    model_name='onnx_nobatch_sequence_int32',
                    inputs=[input],
                    requested_output_names=['OUTPUT'],
                    correlation_id=correlation_id)
                infer_response = infer_request.exec()
                self.assertFalse(infer_response.has_error())

                # The new output is the previous output + the current input
                expected_output = output.as_numpy()[0] + i
                output = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT')
                self.assertEqual(output.as_numpy()[0], expected_output)

            # Final request
            input = pb_utils.Tensor('INPUT', np.array([2000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                correlation_id=correlation_id)
            infer_request.set_flags(
                pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)

            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            expected_output = output.as_numpy()[0] + input.as_numpy()[0]
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], expected_output)
        except Exception as e:
            self.add_deferred_exception(e)
Example #9
 def execute(self, requests):
     responses = []
     new_shape = [64, 2, 32, 55, 84]
     shape_reorder = [1, 0, 4, 2, 3]
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
         input_numpy = input_tensor.as_numpy()
         output0 = pb_utils.Tensor("OUTPUT0", input_numpy.reshape(new_shape))
         # Transpose the tensor to create a non-contiguous tensor.
         output1 = pb_utils.Tensor("OUTPUT1", input_numpy.T)
         output2 = pb_utils.Tensor("OUTPUT2",
                                   np.transpose(input_numpy, shape_reorder))
         responses.append(
             pb_utils.InferenceResponse([output0, output1, output2]))
     return responses
Example #10
    def response_thread(self, response_sender, input0, gpu_output):
        # Sleep 5 seconds to make sure the main thread has exited.
        time.sleep(5)

        if input0.is_cpu():
            if not gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                      input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))
        else:
            if gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                      input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))

        next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])
        infer_response = pb_utils.InferenceResponse([output0, next_gpu_output])

        # Number of times to repeat the response
        response_repeat = 2
        for _ in range(response_repeat):
            response_sender.send(infer_response)

        response_sender.send(
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
Example #11
File: model.py  Project: bnookala/server
    def execute(self, requests):
        responses = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            print('ISCPU', input0.is_cpu())
            gpu_output = pb_utils.get_input_tensor_by_name(
                request, "GPU_OUTPUT").as_numpy()

            if input0.is_cpu():
                if not gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))
            else:
                if gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))

            next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT",
                                              gpu_output[1:])
            responses.append(
                pb_utils.InferenceResponse([output0, next_gpu_output]))

        return responses
Example #12
    def execute(self, requests):

        output0_dtype = self.output0_dtype

        responses = []

        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            input_smiles = in_0.as_numpy()[0].decode()
            print('processing', input_smiles)
            generated_smiles, neighboring_embeddings, pad_mask = \
                self.find_similars_smiles_list(input_smiles,
                                               num_requested=10,
                                               force_unique=True)

            out_0 = np.array(generated_smiles).astype(np.object_)

            out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                           out_0.astype(output0_dtype))

            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0])
            responses.append(inference_response)

        return responses
Example #13
    def response_thread(self, response_sender, index, in_input):
        # The response_sender is used to send response(s) associated with the
        # corresponding request.  The first request will send errors and the
        # other requests will send the responses.  The number of responses per
        # request is the number of elements in the input tensor.

        in_value = in_input
        out_output = pb_utils.Tensor("OUT", in_value)

        if index == 0:
            error = pb_utils.TritonError('An error occurred during execution')
            response = pb_utils.InferenceResponse(output_tensors=[out_output],
                                                  error=error)
        else:
            response = pb_utils.InferenceResponse(output_tensors=[out_output])
        response_sender.send(response)

        # We must close the response sender to indicate to Triton that we are
        # done sending responses for the corresponding request. We can't use the
        # response sender after closing it.
        response_sender.send(
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
Example #14
    def execute(self, requests):
        responses = []
        for _ in requests:
            if self._index % 2 == 0:
                out_tensor_0 = pb_utils.Tensor(
                    "OUTPUT0",
                    np.array(['123456'], dtype=self._dtypes[self._index % 3]))
            else:
                # Test sending strings with no elements
                out_tensor_0 = pb_utils.Tensor(
                    "OUTPUT0", np.array([],
                                        dtype=self._dtypes[self._index % 3]))

            self._index += 1
            responses.append(pb_utils.InferenceResponse([out_tensor_0]))
        return responses
Example #15
 def execute(self, requests):
     responses = []
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
         out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
         error = pb_utils.TritonError('An error occurred during execution')
         responses.append(pb_utils.InferenceResponse([out_tensor], error))
     return responses
Example #16
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            input_ids = pb_utils.get_input_tensor_by_name(
                request, "input_ids").to_dlpack()
            attention_mask = pb_utils.get_input_tensor_by_name(
                request, "attention_mask").to_dlpack()

            # TODO: Set environment variable to prevent to(self.device)
            input_ids = from_dlpack(input_ids).long().to(self.device)
            attention_mask = from_dlpack(attention_mask).long().to(self.device)

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask)
                conf, preds = torch.max(outputs, dim=1)
                preds = preds.int()

            out_tensor_0 = pb_utils.Tensor("preds", preds.cpu().numpy())

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
Example #17
 def _send_identity_tensor(self, size):
     tensor_size = [1, size]
     input0_np = np.random.randn(*tensor_size)
     input0 = pb_utils.Tensor('INPUT0', input0_np.astype(np.float32))
     infer_request = pb_utils.InferenceRequest(
         model_name='identity_fp32',
         inputs=[input0],
         requested_output_names=['OUTPUT0'])
     return input0_np, infer_request.exec()
Example #18
    def test_bls_wrong_inputs(self):
        input0 = pb_utils.Tensor('INPUT0', np.random.randn(*[1, 16]))

        infer_request = pb_utils.InferenceRequest(
            model_name='add_sub',
            inputs=[input0],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertTrue(infer_response.has_error())
Example #19
File: model.py  Project: bnookala/server
 def execute(self, requests):
     responses = []
     for _ in requests:
         out_tensor_0 = pb_utils.Tensor(
             "OUTPUT0", np.array(['123456'],
                                 dtype=self._dtypes[self._index]))
         self._index += 1
         responses.append(pb_utils.InferenceResponse([out_tensor_0]))
     return responses
Example #20
 def execute(self, requests):
     """ This function is called on inference request.
     """
     responses = []
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
         out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
         responses.append(pb_utils.InferenceResponse([out_tensor]))
     return responses
Example #21
    def response_thread(self, response_sender, in_input):
        # The response_sender is used to send response(s) associated with the
        # corresponding request.
        # Sleep 5 seconds to make sure the main thread has exited.
        time.sleep(5)

        status = self.execute_gpu_bls()
        if not status:
            infer_response = pb_utils.InferenceResponse(
                error="GPU BLS test failed.")
            response_sender.send(infer_response)
        else:
            in_value = in_input
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', in_input)])
            infer_response = infer_request.exec()
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if infer_response.has_error():
                response = pb_utils.InferenceResponse(
                    error=infer_response.error().message())
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            elif np.any(in_input != output0.as_numpy()):
                error_message = (
                    "BLS Request input and BLS response output do not match."
                    f" {in_value} != {output0.as_numpy()}")
                response = pb_utils.InferenceResponse(error=error_message)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            else:
                output_tensors = [pb_utils.Tensor('OUT', in_value)]
                response = pb_utils.InferenceResponse(
                    output_tensors=output_tensors)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
Example #22
 def execute(self, requests):
     """
     Identity model in Python backend.
     """
     responses = []
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
         out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
         responses.append(pb_utils.InferenceResponse([out_tensor]))
     return responses
Example #23
 def execute(self, requests):
     """ This function is called on inference request.
     """
     responses = []
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
         out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
         error = pb_utils.TritonError('An error occurred during execution')
         responses.append(pb_utils.InferenceResponse([out_tensor], error))
     return responses
Example #24
File: model.py  Project: luvwinnie/server
 def execute(self, requests):
     responses = []
     for request in requests:
         in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
         out_tensor_0 = pb_utils.Tensor(
             "OUTPUT0",
             in_0.as_numpy().astype(self._dtypes[self._index]))
         self._index += 1
         responses.append(pb_utils.InferenceResponse([out_tensor_0]))
     return responses
Example #25
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.
        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest
        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []
        # print("num:", len(requests), flush=True)
        for request in requests:
            data = pb_utils.get_input_tensor_by_name(request,
                                                     self.input_names[0])
            data = data.as_numpy()
            data = [i[0].decode('utf-8') for i in data]
            data = self.tokenizer(data,
                                  max_length=128,
                                  padding=True,
                                  truncation=True)
            input_ids = np.array(data["input_ids"], dtype=self.output_dtype[0])
            token_type_ids = np.array(data["token_type_ids"],
                                      dtype=self.output_dtype[1])

            # print("input_ids:", input_ids)
            # print("token_type_ids:", token_type_ids)

            out_tensor1 = pb_utils.Tensor(self.output_names[0], input_ids)
            out_tensor2 = pb_utils.Tensor(self.output_names[1], token_type_ids)
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor1, out_tensor2])
            responses.append(inference_response)
        return responses
Example #26
    def test_dlpack_string_tensor(self):
        np_object = np.array(['An Example String'], dtype=np.object_)
        pb_tensor = pb_utils.Tensor('test_tensor', np_object)

        with self.assertRaises(Exception) as e:
            pb_tensor.to_dlpack()

        self.assertTrue(
            str(e.exception) ==
            'TYPE_BYTES tensors cannot be converted to DLPack.')
Example #27
File: model.py  Project: luvwinnie/server
    def test_dlpack_string_tensor(self):
        np_object = np.array(['An Example String'], dtype=np.object_)
        pb_tensor = pb_utils.Tensor('test_tensor', np_object)

        with self.assertRaises(Exception) as e:
            pb_tensor.to_dlpack()

        self.assertTrue(
            str(e.exception) ==
            'DLPack does not have support for string tensors.')
Example #28
File: model.py  Project: spnettec/kura
    def execute(self, requests):
        output0_dtype = self.output0_dtype

        responses = []

        for request in requests:
            acc_x = pb_utils.get_input_tensor_by_name(request,
                                                      "ACC_X").as_numpy()
            acc_y = pb_utils.get_input_tensor_by_name(request,
                                                      "ACC_Y").as_numpy()
            acc_z = pb_utils.get_input_tensor_by_name(request,
                                                      "ACC_Z").as_numpy()
            gyro_x = pb_utils.get_input_tensor_by_name(request,
                                                       "GYRO_X").as_numpy()
            gyro_y = pb_utils.get_input_tensor_by_name(request,
                                                       "GYRO_Y").as_numpy()
            gyro_z = pb_utils.get_input_tensor_by_name(request,
                                                       "GYRO_Z").as_numpy()
            humidity = pb_utils.get_input_tensor_by_name(
                request, "HUMIDITY").as_numpy()
            pressure = pb_utils.get_input_tensor_by_name(
                request, "PRESSURE").as_numpy()
            temp_hum = pb_utils.get_input_tensor_by_name(
                request, "TEMP_HUM").as_numpy()
            temp_press = pb_utils.get_input_tensor_by_name(
                request, "TEMP_PRESS").as_numpy()

            out_0 = np.array([
                acc_y, acc_x, acc_z, pressure, temp_press, temp_hum, humidity,
                gyro_x, gyro_y, gyro_z
            ]).transpose()

            #                  ACC_Y     ACC_X     ACC_Z    PRESSURE   TEMP_PRESS   TEMP_HUM   HUMIDITY    GYRO_X    GYRO_Y    GYRO_Z
            min_vals = np.array([
                -0.132551, -0.049693, 0.759847, 976.001709, 38.724998,
                40.220890, 13.003981, -1.937896, -0.265019, -0.250647
            ])
            max_vals = np.array([
                0.093099, 0.150289, 1.177543, 1007.996338, 46.093750,
                48.355824, 23.506138, 1.923712, 0.219204, 0.671759
            ])

            # MinMax scaling (avoid shadowing the built-in min/max)
            out_0_scaled = (out_0 - min_vals) / (max_vals - min_vals)

            # Create output tensor
            out_tensor_0 = pb_utils.Tensor("INPUT0",
                                           out_0_scaled.astype(output0_dtype))

            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0])
            responses.append(inference_response)

        return responses
Example #29
 def execute(self, requests):
     """
     The body of this model doesn't matter. The main purpose of this model is
     to test correct handling of Python errors in the `finalize` function.
     """
     responses = []
     for request in requests:
         input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
         out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
         error = pb_utils.TritonError('An error occurred during execution')
         responses.append(pb_utils.InferenceResponse([out_tensor], error))
     return responses
Example #30
File: model.py  Project: luvwinnie/server
 async def execute(self, requests):
     responses = []
     for _ in requests:
         # Run the unittest and store the results in InferenceResponse.
         result = await test_bls_out_of_memory()
         responses.append(
             pb_utils.InferenceResponse([
                 pb_utils.Tensor('OUTPUT0',
                                 np.array([result], dtype=np.float16))
             ]))
     return responses
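Example #30 awaits a helper that is not shown here. For reference, an asynchronous BLS call from inside a coroutine-based `execute` uses `async_exec`; the sketch below is illustrative only, with the target model name, tensor shape, and helper name assumed.

    async def _check_identity_bls(self):
        # Hypothetical helper: issue an async BLS request and verify the echo.
        input0_np = np.random.randn(16).astype(np.float32)
        infer_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            inputs=[pb_utils.Tensor('INPUT0', input0_np)],
            requested_output_names=['OUTPUT0'])
        # async_exec() returns an awaitable, so other requests can be served
        # while this BLS call is in flight.
        infer_response = await infer_request.async_exec()
        if infer_response.has_error():
            return False
        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        return np.allclose(output0.as_numpy(), input0_np)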