Example #1
    def initialize(self, args):
        self.args = args
        if args['model_name'] != 'init_args' or args[
                'model_instance_name'] != 'init_args_0':
            raise pb_utils.TritonModelException(
                'model_instance_name/model_name does not contain correct value.'
            )
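
For context, here is a sketch of the `args` dictionary that the Python backend passes to `initialize`. The keys are the ones documented for the Triton Python backend; the values below are placeholders, not taken from the example above.

# Illustrative contents of `args` for a model named "init_args"
# (placeholder values only).
args = {
    "model_config": "{...}",              # serialized JSON model configuration
    "model_instance_kind": "CPU",         # or "GPU"
    "model_instance_device_id": "0",
    "model_instance_name": "init_args_0",
    "model_name": "init_args",
    "model_version": "1",
    "model_repository": "/models/init_args",
}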
Example #2
    def execute(self, requests):
        """ Create a response sender object and use that
        for sending the response.
        """

        # This model does not support batching, so 'request_count' should always be 1.
        if len(requests) != 1:
            raise pb_utils.TritonModelException("unsupported batch size " +
                                                len(requests))

        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        response_sender = requests[0].get_response_sender()
        in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0")
        in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1")
        out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(),
                        in_0.as_numpy() - in_1.as_numpy())

        out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
        out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype))
        response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])

        response_sender.send(response)
        response_sender.send(
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
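
The response is delivered first and the completion flag is sent last. The same can also be expressed in a single call; a minimal sketch of that variant, reusing the `response` and `response_sender` objects from the example above:

        # Equivalent single-call form: deliver the response and mark the
        # request complete in one send().
        response_sender.send(
            response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)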
Example #3
    def execute(self, requests):
        """ This function is called on inference request.
        """

        # For every request, issue a synchronous BLS call, verify the result,
        # and then hand the response off to a background thread.
        for i, request in enumerate(requests):
            request_input = pb_utils.get_input_tensor_by_name(request, 'IN')

            # Sync BLS request
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())])
            infer_response = infer_request.exec()
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    f"BLS Response has an error: {infer_response.error().message()}"
                )

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if np.any(output0.as_numpy() != request_input.as_numpy()):
                raise pb_utils.TritonModelException(
                    f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}"
                )

            thread1 = threading.Thread(target=self.response_thread,
                                       args=(request.get_response_sender(),
                                             request_input.as_numpy()))
            thread1.daemon = True
            with self.inflight_thread_count_lck:
                self.inflight_thread_count += 1
            thread1.start()

        return None
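
The execute above starts a `response_thread` worker that is not shown. Below is a minimal sketch of what such a method could look like for a decoupled model; the body is an assumption based on the attributes used here and initialized in Example #5 (`self.out_dtype`, `self.inflight_thread_count`, `self.inflight_thread_count_lck`), not the original implementation.

    def response_thread(self, response_sender, in_value):
        # Hypothetical worker: send one response built from the input tensor,
        # close the response stream, then decrement the in-flight counter.
        out_tensor = pb_utils.Tensor('OUT', in_value.astype(self.out_dtype))
        response = pb_utils.InferenceResponse(output_tensors=[out_tensor])
        response_sender.send(response)
        response_sender.send(
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1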
Example #4
    def initialize(self, args):
        self.model_config = model_config = json.loads(args['model_config'])

        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)
        if not using_decoupled:
            raise pb_utils.TritonModelException(
                """the model `{}` can generate any number of responses per request,
                enable decoupled transaction policy in model configuration to 
                serve this model""".format(args['model_name']))

        output0_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT0")
        output1_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT1")

        self.output0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])
        self.output1_dtype = pb_utils.triton_string_to_numpy(
            output1_config['data_type'])
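
For reference, a sketch of the parsed model_config fragment that `get_output_config_by_name` returns and `triton_string_to_numpy` converts; the name, dims, and data type below are placeholders, not taken from a real configuration.

# Illustrative entry from model_config["output"] (placeholder values).
output0_config = {
    "name": "OUTPUT0",
    "data_type": "TYPE_FP32",   # triton_string_to_numpy maps this to np.float32
    "dims": [4],
}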
Example #5
    def initialize(self, args):
        # You must parse model_config yourself; the JSON string is not
        # parsed for you.
        self.model_config = model_config = json.loads(args['model_config'])

        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)
        if not using_decoupled:
            raise pb_utils.TritonModelException(
                """the model `{}` can generate any number of responses per request,
                enable decoupled transaction policy in model configuration to
                serve this model""".format(args['model_name']))

        # Get OUT configuration
        out_config = pb_utils.get_output_config_by_name(model_config, "OUT")

        # Convert Triton types to numpy types
        self.out_dtype = pb_utils.triton_string_to_numpy(
            out_config['data_type'])

        self.inflight_thread_count = 0
        self.inflight_thread_count_lck = threading.Lock()
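
Models that hand responses off to background threads, as in Examples #3 and #5, typically also wait for those threads in `finalize` before unloading. A minimal sketch of that pattern, assuming the same `inflight_thread_count` counter and that the standard `time` module is imported:

    def finalize(self):
        # Wait until every thread started from execute() has finished.
        inflight_threads = True
        while inflight_threads:
            with self.inflight_thread_count_lck:
                inflight_threads = (self.inflight_thread_count != 0)
            if inflight_threads:
                time.sleep(0.1)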
Example #6
    def execute(self, requests):
        responses = []
        for request in requests:
            # Get INPUT0
            input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0')
            infer_request = pb_utils.InferenceRequest(
                model_name='identity',
                requested_output_names=["OUTPUT0"],
                inputs=[input0])
            infer_response = infer_request.exec()

            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
            ])
            responses.append(inference_response)

        return responses
Example #7
    async def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []
        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # List of awaitables containing inflight inference responses.
            inference_response_awaits = []
            for model_name in ['pytorch', 'add_sub']:
                # Create inference request object
                infer_request = pb_utils.InferenceRequest(
                    model_name=model_name,
                    requested_output_names=["OUTPUT0", "OUTPUT1"],
                    inputs=[in_0, in_1])

                # Store the awaitable inside the array. We don't need
                # the inference response immediately so we do not `await`
                # here.
                inference_response_awaits.append(infer_request.async_exec())

            # Wait for all the inference requests to finish. The execution
            # of the Python script will be blocked until all the awaitables
            # are resolved.
            inference_responses = await asyncio.gather(
                *inference_response_awaits)

            for infer_response in inference_responses:
                # Make sure that the inference response doesn't have an error.
                # If it has an error and you can't proceed with your model
                # execution you can raise an exception.
                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

            # Get the OUTPUT0 from the "pytorch" model inference resposne
            pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[0], "OUTPUT0")

            # Get the OUTPUT1 from the "addsub" model inference resposne
            addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[1], "OUTPUT1")

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occured"))
            #
            # Because the infer_response of the models contains the final
            # outputs with correct output names, we can just pass the list
            # of outputs to the InferenceResponse object.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[pytorch_output0_tensor, addsub_output1_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
Example #8
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate through the list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.

        batch_encoder_out, batch_encoder_lens = [], []
        batch_log_probs, batch_log_probs_idx = [], []
        batch_count = []
        batch_root = TrieVector()
        batch_start = []
        root_dict = {}

        encoder_max_len = 0
        hyps_max_len = 0
        total = 0
        for request in requests:
            # Gather the input tensors from every request in the batch.
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            in_2 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs")
            in_3 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs_idx")

            batch_encoder_out.append(in_0.as_numpy())
            encoder_max_len = max(encoder_max_len,
                                  batch_encoder_out[-1].shape[1])

            cur_b_lens = in_1.as_numpy()
            batch_encoder_lens.append(cur_b_lens)
            cur_batch = cur_b_lens.shape[0]
            batch_count.append(cur_batch)

            cur_b_log_probs = in_2.as_numpy()
            cur_b_log_probs_idx = in_3.as_numpy()
            for i in range(cur_batch):
                cur_len = cur_b_lens[i]
                cur_probs = cur_b_log_probs[i][
                    0:cur_len, :].tolist()  # T x Beam
                cur_idx = cur_b_log_probs_idx[i][
                    0:cur_len, :].tolist()  # T x Beam
                batch_log_probs.append(cur_probs)
                batch_log_probs_idx.append(cur_idx)
                root_dict[total] = PathTrie()
                batch_root.append(root_dict[total])
                batch_start.append(True)
                total += 1

        score_hyps = ctc_beam_search_decoder_batch(
            batch_log_probs,
            batch_log_probs_idx,
            batch_root,
            batch_start,
            self.beam_size,
            min(total, self.num_processes),
            blank_id=self.blank_id,
            space_id=-2,
            cutoff_prob=self.cutoff_prob,
            ext_scorer=self.lm)
        all_hyps = []
        all_ctc_score = []
        max_seq_len = 0
        for seq_cand in score_hyps:
            # If there are fewer candidates than beam_size, pad the list.
            if len(seq_cand) != self.beam_size:
                seq_cand = list(seq_cand)
                seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"),
                                                                 (0, ))]

            for score, hyps in seq_cand:
                all_hyps.append(list(hyps))
                all_ctc_score.append(score)
                max_seq_len = max(len(hyps), max_seq_len)

        beam_size = self.beam_size
        feature_size = self.feature_size
        hyps_max_len = max_seq_len + 2
        in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
        in_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
        if self.bidecoder:
            in_r_hyps_pad_sos_eos = np.ones(
                (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

        in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

        in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                                  dtype=self.data_type)
        in_encoder_out_lens = np.zeros(total, dtype=np.int32)
        st = 0
        for b in batch_count:
            t = batch_encoder_out.pop(0)
            in_encoder_out[st:st + b, 0:t.shape[1]] = t
            in_encoder_out_lens[st:st + b] = batch_encoder_lens.pop(0)
            for i in range(b):
                for j in range(beam_size):
                    cur_hyp = all_hyps.pop(0)
                    cur_len = len(cur_hyp) + 2
                    in_hyp = [self.sos] + cur_hyp + [self.eos]
                    in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                    in_hyps_lens_sos[st + i][j] = cur_len - 1
                    if self.bidecoder:
                        r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                        in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                    in_ctc_score[st + i][j] = all_ctc_score.pop(0)
            st += b
        in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)
        in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
        in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
        in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
        in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
        input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
        if self.bidecoder:
            in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                          in_r_hyps_pad_sos_eos)
            input_tensors.append(in_tensor_4)
        in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
        input_tensors.append(in_tensor_5)

        inference_request = pb_utils.InferenceRequest(
            model_name='decoder',
            requested_output_names=['best_index'],
            inputs=input_tensors)

        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(
                inference_response.error().message())
        else:
            # Extract the output tensors from the inference response.
            best_index = pb_utils.get_output_tensor_by_name(
                inference_response, 'best_index')
            best_index = best_index.as_numpy()
            hyps = []
            idx = 0
            for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos):
                best_idx = best_index[idx][0]
                best_cand_len = cand_lens[best_idx] - 1  # remove sos
                best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
                hyps.append(best_cand)
                idx += 1

            hyps = map_batch(
                hyps, self.vocabulary,
                min(multiprocessing.cpu_count(), len(in_ctc_score)))
            st = 0
            for b in batch_count:
                sents = np.array(hyps[st:st + b])
                out0 = pb_utils.Tensor("OUTPUT0",
                                       sents.astype(self.out0_dtype))
                inference_response = pb_utils.InferenceResponse(
                    output_tensors=[out0])
                responses.append(inference_response)
                st += b
        return responses
Example #9
    def execute(self, requests):
        responses = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            gpu_output = pb_utils.get_input_tensor_by_name(
                request, "GPU_OUTPUT").as_numpy()

            if input0.is_cpu():
                if not gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))
            else:
                if gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))

            next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT",
                                              gpu_output[1:])

            # Do not perform BLS inference if it is the first
            # model in the pipeline.
            if self._model_name != 'dlpack_io_identity_1':
                infer_request = pb_utils.InferenceRequest(
                    model_name='dlpack_io_identity_1',
                    inputs=[
                        input0,
                        pb_utils.get_input_tensor_by_name(
                            request, "GPU_OUTPUT")
                    ],
                    requested_output_names=['OUTPUT0'])
                infer_response = infer_request.exec()

                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

                bls_output0 = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT0')
                if not output0.is_cpu():
                    bls_output0 = from_dlpack(
                        bls_output0.to_dlpack()).detach().cpu().numpy()
                else:
                    bls_output0 = bls_output0.as_numpy()

                if not input0.is_cpu():
                    input0 = from_dlpack(
                        input0.to_dlpack()).detach().cpu().numpy()
                else:
                    input0 = input0.as_numpy()

                if not np.allclose(bls_output0, input0):
                    raise pb_utils.TritonModelException(
                        'BLS input and output tensors are not equal')

            responses.append(
                pb_utils.InferenceResponse([output0, next_gpu_output]))

        return responses
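
The execute above relies on `self._model_name`, which is not shown in the snippet. A plausible `initialize` for it, sketched here as an assumption (only the attribute name comes from the code above):

    def initialize(self, args):
        # Assumed setup: remember this model's name so execute() can tell
        # whether it is the first model in the pipeline.
        self._model_name = args['model_name']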
Example #10
    def initialize(self, args):
        if file1.FILE_NAME != 'FILE1' or file2.FILE_NAME != 'FILE2':
            raise pb_utils.TritonModelException('Imports do not work')
Example #11
    def initialize(self, args):
        # Make sure that environment variables are correctly propagated
        # to the Python models
        if "MY_ENV" not in os.environ or os.environ["MY_ENV"] != 'MY_ENV':
            raise pb_utils.TritonModelException(
                "MY_ENV doesn't exist or contains an incorrect value")
Example #12
    def initialize(self, args):
        if "MY_ENV" not in os.environ or os.environ["MY_ENV"] != 'MY_ENV':
            raise pb_utils.TritonModelException(
                "MY_ENV doesn't exist or contains an incorrect value")

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []
        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # Get Model Name
            model_name = pb_utils.get_input_tensor_by_name(
                request, "MODEL_NAME")

            # Model Name string
            model_name_string = model_name.as_numpy()[0]

            # Create inference request object
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name_string,
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[in_0, in_1])

            # Perform synchronous blocking inference request
            infer_response = infer_request.exec()

            # Make sure that the inference response doesn't have an error. If
            # it has an error and you can't proceed with your model execution
            # you can raise an exception.
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occured"))
            #
            # Because the infer_response of the models contains the final
            # outputs with correct output names, we can just pass the list
            # of outputs to the InferenceResponse object.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=infer_response.output_tensors())
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
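
To exercise the BLS model above from the client side, the target model name is passed in as the MODEL_NAME input tensor. Below is a hedged client sketch using `tritonclient.http`; the server URL, input shapes, and the `bls` / `add_sub` model names are assumptions for illustration, not part of the original example.

import numpy as np
import tritonclient.http as httpclient

# Assumed server address.
client = httpclient.InferenceServerClient(url="localhost:8000")

inputs = [
    httpclient.InferInput("INPUT0", [4], "FP32"),
    httpclient.InferInput("INPUT1", [4], "FP32"),
    httpclient.InferInput("MODEL_NAME", [1], "BYTES"),
]
inputs[0].set_data_from_numpy(np.random.rand(4).astype(np.float32))
inputs[1].set_data_from_numpy(np.random.rand(4).astype(np.float32))
# Name of the model the BLS model should call internally (assumed).
inputs[2].set_data_from_numpy(np.array([b"add_sub"], dtype=np.object_))

result = client.infer("bls", inputs)  # "bls" is an assumed model name
print(result.as_numpy("OUTPUT0"), result.as_numpy("OUTPUT1"))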