Example #1
    def test_bls_out_of_memory(self):
        tensor_size = 1024 * 1024 * 1024
        input0_np, infer_response = self._send_identity_tensor(tensor_size)
        out_of_memory_message = "Failed to increase the shared memory pool size for key"

        if infer_response.has_error():
            self.assertIn(out_of_memory_message,
                          infer_response.error().message())
        else:
            self.assertFalse(infer_response.has_error())
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            self.assertIsNotNone(output0)
            self.assertTrue(np.allclose(output0.as_numpy(), input0_np))

        tensor_size = 50 * 1024 * 1024
        for _ in range(4):
            input0_np, infer_response = self._send_identity_tensor(tensor_size)
            if infer_response.has_error():
                self.assertIn(out_of_memory_message,
                              infer_response.error().message())
            else:
                self.assertFalse(infer_response.has_error())
                output0 = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT0')
                self.assertIsNotNone(output0)
                self.assertTrue(np.allclose(output0.as_numpy(), input0_np))
Example #2
def bls_add_sub(_=None):
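    # Issue a synchronous BLS request to the 'add_sub' model with two random
    # FP32 vectors and verify that OUTPUT0/OUTPUT1 contain their sum and
    # difference.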
    input0_np = np.random.randn(*[16])
    input0_np = input0_np.astype(np.float32)
    input1_np = np.random.randn(*[16])
    input1_np = input1_np.astype(np.float32)
    input0 = pb_utils.Tensor('INPUT0', input0_np)
    input1 = pb_utils.Tensor('INPUT1', input1_np)
    infer_request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    expected_output_0 = input0.as_numpy() + input1.as_numpy()
    expected_output_1 = input0.as_numpy() - input1.as_numpy()

    if not np.all(expected_output_0 == output0.as_numpy()):
        return False

    if not np.all(expected_output_1 == output1.as_numpy()):
        return False

    return True
Example #3
async def test_bls_out_of_memory():
    tensor_size = 1024 * 1024 * 1024
    input0_np, infer_response = await _send_identity_tensor(tensor_size)
    out_of_memory_message = "Failed to increase the shared memory pool size for key"

    if infer_response.has_error():
        if not (out_of_memory_message in infer_response.error().message()):
            return False
    else:
        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        if output0 is None:
            return False
        if not np.allclose(output0.as_numpy(), input0_np):
            return False

    tensor_size = 50 * 1024 * 1024
    for _ in range(4):
        input0_np, infer_response = await _send_identity_tensor(tensor_size)
        if infer_response.has_error():
            if not (out_of_memory_message in infer_response.error().message()):
                return False
        else:
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            if output0 is None:
                return False
            if not np.allclose(output0.as_numpy(), input0_np):
                return False

    return True
Example #4
    def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
        """
        This function is created to test that the DLPack container works
        properly when the inference response and outputs go out of scope.

        Returns a tuple of DLPack capsules for OUTPUT0 and OUTPUT1 on success,
        or False on failure.
        """
        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        if infer_response.has_error():
            return False

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        if output0 is None or output1 is None:
            return False

        # When one of the inputs is on GPU, the outputs returned by the model
        # must be on GPU; otherwise, the outputs will be on CPU.
        if not input0_pb.is_cpu() or not input1_pb.is_cpu():
            if output0.is_cpu() or output1.is_cpu():
                return False
        else:
            if (not output0.is_cpu()) or (not output1.is_cpu()):
                return False

        # Make sure that the reference count is increased by one when DLPack
        # representation is created.
        rc_before_dlpack_output0 = sys.getrefcount(output0)
        rc_before_dlpack_output1 = sys.getrefcount(output1)

        output0_dlpack = output0.to_dlpack()
        output1_dlpack = output1.to_dlpack()

        rc_after_dlpack_output0 = sys.getrefcount(output0)
        rc_after_dlpack_output1 = sys.getrefcount(output1)

        if rc_after_dlpack_output0 - rc_before_dlpack_output0 != 1:
            return False

        if rc_after_dlpack_output1 - rc_before_dlpack_output1 != 1:
            return False

        # Make sure that reference count decreases after destroying the DLPack
        output0_dlpack = None
        output1_dlpack = None
        rc_after_del_dlpack_output0 = sys.getrefcount(output0)
        rc_after_del_dlpack_output1 = sys.getrefcount(output1)
        if rc_after_del_dlpack_output0 - rc_after_dlpack_output0 != -1:
            return False

        if rc_after_del_dlpack_output1 - rc_after_dlpack_output1 != -1:
            return False

        return output0.to_dlpack(), output1.to_dlpack()
Example #5
    def _send_bls_sequence_requests(self, correlation_id):
        # Start request
        try:
            input = pb_utils.Tensor('INPUT', np.array([1000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START,
                correlation_id=correlation_id)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START)
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], input.as_numpy()[0])

            for i in range(10):
                input = pb_utils.Tensor('INPUT', np.array([i], dtype=np.int32))
                infer_request = pb_utils.InferenceRequest(
                    model_name='onnx_nobatch_sequence_int32',
                    inputs=[input],
                    requested_output_names=['OUTPUT'],
                    correlation_id=correlation_id)
                infer_response = infer_request.exec()
                self.assertFalse(infer_response.has_error())

                # The new output is the previous output + the current input
                expected_output = output.as_numpy()[0] + i
                output = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT')
                self.assertEqual(output.as_numpy()[0], expected_output)

            # Final request
            input = pb_utils.Tensor('INPUT', np.array([2000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                correlation_id=correlation_id)
            infer_request.set_flags(
                pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)

            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            expected_output = output.as_numpy()[0] + input.as_numpy()[0]
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], expected_output)
        except Exception as e:
            self.add_deferred_exception(e)
Example #6
    def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
        """
        This function is created to test that the DLPack container works
        properly when the inference response and outputs go out of scope.
        """
        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        self.assertIsNotNone(output0)
        self.assertIsNotNone(output1)

        # When one of the inputs is on GPU, the outputs returned by the model
        # must be on GPU; otherwise, the outputs will be on CPU.
        if not input0_pb.is_cpu() or not input1_pb.is_cpu():
            self.assertTrue((not output0.is_cpu()) and (not output1.is_cpu()))
        else:
            self.assertTrue((output0.is_cpu()) and (output1.is_cpu()))

        # Make sure that the reference count is increased by one when DLPack
        # representation is created.
        rc_before_dlpack_output0 = sys.getrefcount(output0)
        rc_before_dlpack_output1 = sys.getrefcount(output1)

        output0_dlpack = output0.to_dlpack()
        output1_dlpack = output1.to_dlpack()

        rc_after_dlpack_output0 = sys.getrefcount(output0)
        rc_after_dlpack_output1 = sys.getrefcount(output1)

        self.assertEqual(rc_after_dlpack_output0 - rc_before_dlpack_output0, 1)
        self.assertEqual(rc_after_dlpack_output1 - rc_before_dlpack_output1, 1)

        # Make sure that reference count decreases after destroying the DLPack
        output0_dlpack = None
        output1_dlpack = None
        rc_after_del_dlpack_output0 = sys.getrefcount(output0)
        rc_after_del_dlpack_output1 = sys.getrefcount(output1)
        self.assertEqual(rc_after_del_dlpack_output0 - rc_after_dlpack_output0,
                         -1)
        self.assertEqual(rc_after_del_dlpack_output1 - rc_after_dlpack_output1,
                         -1)

        return output0.to_dlpack(), output1.to_dlpack()
Example #7
def verify_add_sub_results(input0, input1, infer_response):
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')

    if (output0 is None) or (output1 is None):
        return False

    if not input0.is_cpu():
        input0 = from_dlpack(
            input0.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        input0 = input0.as_numpy()

    if not input1.is_cpu():
        input1 = from_dlpack(
            input1.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        input1 = input1.as_numpy()

    if not output0.is_cpu():
        output0 = from_dlpack(
            output0.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        output0 = output0.as_numpy()

    if not output1.is_cpu():
        output1 = from_dlpack(
            output1.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        output1 = output1.as_numpy()

    expected_output_0 = input0 + input1
    expected_output_1 = input0 - input1

    if not np.all(expected_output_0 == output0):
        print(f'For OUTPUT0 expected {expected_output_0} found {output0}')
        return False

    if not np.all(expected_output_1 == output1):
        print(f'For OUTPUT1 expected {expected_output_1} found {output1}')
        return False

    return True
Example #8
    def test_zero_length_io(self):
        model_name = 'identity_fp32'
        input0 = np.zeros([1, 0], dtype=np.float32)
        input0_pb = pb_utils.Tensor('INPUT0', input0)
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input0_pb],
            requested_output_names=['OUTPUT0'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        self.assertTrue(np.all(output0.as_numpy() == input0))
Example #9
    def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
        input0 = torch.rand(16)
        input1 = torch.rand(16)

        if is_input0_gpu:
            input0 = input0.to('cuda')

        if is_input1_gpu:
            input1 = input1.to('cuda')

        input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
        input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1))

        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        self.assertIsNotNone(output0)
        self.assertIsNotNone(output1)

        expected_output_0 = from_dlpack(
            input0_pb.to_dlpack()).to('cpu') + from_dlpack(
                input1_pb.to_dlpack()).to('cpu')
        expected_output_1 = from_dlpack(
            input0_pb.to_dlpack()).to('cpu') - from_dlpack(
                input1_pb.to_dlpack()).to('cpu')

        self.assertTrue(
            torch.all(expected_output_0 == from_dlpack(output0.to_dlpack()).to(
                'cpu')))
        self.assertTrue(
            torch.all(expected_output_1 == from_dlpack(output1.to_dlpack()).to(
                'cpu')))
Example #10
    def response_thread(self, response_sender, in_input):
        # The response_sender is used to send response(s) associated with the
        # corresponding request.
        # Sleep 5 seconds to make sure the main thread has exited.
        time.sleep(5)

        status = self.execute_gpu_bls()
        if not status:
            infer_response = pb_utils.InferenceResponse(
                error="GPU BLS test failed.")
            response_sender.send(infer_response)
        else:
            in_value = in_input
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', in_input)])
            infer_response = infer_request.exec()
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if infer_response.has_error():
                response = pb_utils.InferenceResponse(
                    error=infer_response.error().message())
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            elif np.any(in_input != output0.as_numpy()):
                error_message = (
                    "BLS Request input and BLS response output do not match."
                    f" {in_value} != {output0.as_numpy()}")
                response = pb_utils.InferenceResponse(error=error_message)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            else:
                output_tensors = [pb_utils.Tensor('OUT', in_value)]
                response = pb_utils.InferenceResponse(
                    output_tensors=output_tensors)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
Example #11
    def execute(self, requests):
        responses = []
        for request in requests:
            # Get INPUT0
            input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0')
            infer_request = pb_utils.InferenceRequest(
                model_name='identity',
                requested_output_names=["OUTPUT0"],
                inputs=[input0])
            infer_response = infer_request.exec()

            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
            ])
            responses.append(inference_response)

        return responses
Example #12
    def execute(self, requests):
        """ This function is called on inference request.
        """

        # Only generate the error for the first request
        for i, request in enumerate(requests):
            request_input = pb_utils.get_input_tensor_by_name(request, 'IN')

            # Sync BLS request
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())])
            infer_response = infer_request.exec()
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    f"BLS Response has an error: {infer_response.error().message()}"
                )

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if np.any(output0.as_numpy() != request_input.as_numpy()):
                raise pb_utils.TritonModelException(
                    f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}"
                )

            thread1 = threading.Thread(target=self.response_thread,
                                       args=(request.get_response_sender(),
                                             pb_utils.get_input_tensor_by_name(
                                                 request, 'IN').as_numpy()))
            thread1.daemon = True
            with self.inflight_thread_count_lck:
                self.inflight_thread_count += 1
            thread1.start()

        return None
Example #13
    async def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []
        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # List of awaitables containing inflight inference responses.
            inference_response_awaits = []
            for model_name in ['pytorch', 'add_sub']:
                # Create inference request object
                infer_request = pb_utils.InferenceRequest(
                    model_name=model_name,
                    requested_output_names=["OUTPUT0", "OUTPUT1"],
                    inputs=[in_0, in_1])

                # Store the awaitable inside the array. We don't need
                # the inference response immediately so we do not `await`
                # here.
                inference_response_awaits.append(infer_request.async_exec())

            # Wait for all the inference requests to finish. The execution
            # of the Python script will be blocked until all the awaitables
            # are resolved.
            inference_responses = await asyncio.gather(
                *inference_response_awaits)

            for infer_response in inference_responses:
                # Make sure that the inference response doesn't have an error.
                # If it has an error and you can't proceed with your model
                # execution you can raise an exception.
                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

            # Get the OUTPUT0 from the "pytorch" model inference resposne
            pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[0], "OUTPUT0")

            # Get the OUTPUT1 from the "addsub" model inference resposne
            addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[1], "OUTPUT1")

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            #
            # Because the infer_response of the models contains the final
            # outputs with correct output names, we can just pass the list
            # of outputs to the InferenceResponse object.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[pytorch_output0_tensor, addsub_output1_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
Example #14
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate through the list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.

        batch_encoder_out, batch_encoder_lens = [], []
        batch_log_probs, batch_log_probs_idx = [], []
        batch_count = []
        batch_root = TrieVector()
        batch_start = []
        root_dict = {}

        encoder_max_len = 0
        hyps_max_len = 0
        total = 0
        for request in requests:
            # Perform inference on the request and append it to responses list...
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            in_2 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs")
            in_3 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs_idx")

            batch_encoder_out.append(in_0.as_numpy())
            encoder_max_len = max(encoder_max_len,
                                  batch_encoder_out[-1].shape[1])

            cur_b_lens = in_1.as_numpy()
            batch_encoder_lens.append(cur_b_lens)
            cur_batch = cur_b_lens.shape[0]
            batch_count.append(cur_batch)

            cur_b_log_probs = in_2.as_numpy()
            cur_b_log_probs_idx = in_3.as_numpy()
            for i in range(cur_batch):
                cur_len = cur_b_lens[i]
                cur_probs = cur_b_log_probs[i][
                    0:cur_len, :].tolist()  # T X Beam
                cur_idx = cur_b_log_probs_idx[i][
                    0:cur_len, :].tolist()  # T x Beam
                batch_log_probs.append(cur_probs)
                batch_log_probs_idx.append(cur_idx)
                root_dict[total] = PathTrie()
                batch_root.append(root_dict[total])
                batch_start.append(True)
                total += 1

        score_hyps = ctc_beam_search_decoder_batch(
            batch_log_probs,
            batch_log_probs_idx,
            batch_root,
            batch_start,
            self.beam_size,
            min(total, self.num_processes),
            blank_id=self.blank_id,
            space_id=-2,
            cutoff_prob=self.cutoff_prob,
            ext_scorer=self.lm)
        all_hyps = []
        all_ctc_score = []
        max_seq_len = 0
        for seq_cand in score_hyps:
            # if candidates less than beam size
            if len(seq_cand) != self.beam_size:
                seq_cand = list(seq_cand)
                seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"),
                                                                 (0, ))]

            for score, hyps in seq_cand:
                all_hyps.append(list(hyps))
                all_ctc_score.append(score)
                max_seq_len = max(len(hyps), max_seq_len)

        beam_size = self.beam_size
        feature_size = self.feature_size
        hyps_max_len = max_seq_len + 2
        in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
        in_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
        if self.bidecoder:
            in_r_hyps_pad_sos_eos = np.ones(
                (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

        in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

        in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                                  dtype=self.data_type)
        in_encoder_out_lens = np.zeros(total, dtype=np.int32)
        st = 0
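        # Scatter each request's padded encoder output into the batched input
        # and pad every hypothesis with sos/eos (plus its reverse when a
        # bidirectional decoder is used) before calling the 'decoder' model.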
        for b in batch_count:
            t = batch_encoder_out.pop(0)
            in_encoder_out[st:st + b, 0:t.shape[1]] = t
            in_encoder_out_lens[st:st + b] = batch_encoder_lens.pop(0)
            for i in range(b):
                for j in range(beam_size):
                    cur_hyp = all_hyps.pop(0)
                    cur_len = len(cur_hyp) + 2
                    in_hyp = [self.sos] + cur_hyp + [self.eos]
                    in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                    in_hyps_lens_sos[st + i][j] = cur_len - 1
                    if self.bidecoder:
                        r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                        in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                    in_ctc_score[st + i][j] = all_ctc_score.pop(0)
            st += b
        in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)
        in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
        in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
        in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
        in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
        input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
        if self.bidecoder:
            in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                          in_r_hyps_pad_sos_eos)
            input_tensors.append(in_tensor_4)
        in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
        input_tensors.append(in_tensor_5)

        inference_request = pb_utils.InferenceRequest(
            model_name='decoder',
            requested_output_names=['best_index'],
            inputs=input_tensors)

        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(
                inference_response.error().message())
        else:
            # Extract the output tensors from the inference response.
            best_index = pb_utils.get_output_tensor_by_name(
                inference_response, 'best_index')
            best_index = best_index.as_numpy()
            hyps = []
            idx = 0
            for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos):
                best_idx = best_index[idx][0]
                best_cand_len = cand_lens[best_idx] - 1  # remove sos
                best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
                hyps.append(best_cand)
                idx += 1

            hyps = map_batch(
                hyps, self.vocabulary,
                min(multiprocessing.cpu_count(), len(in_ctc_score)))
            st = 0
            for b in batch_count:
                sents = np.array(hyps[st:st + b])
                out0 = pb_utils.Tensor("OUTPUT0",
                                       sents.astype(self.out0_dtype))
                inference_response = pb_utils.InferenceResponse(
                    output_tensors=[out0])
                responses.append(inference_response)
                st += b
        return responses
Example #15
    def execute(self, requests):
        responses = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            gpu_output = pb_utils.get_input_tensor_by_name(
                request, "GPU_OUTPUT").as_numpy()

            if input0.is_cpu():
                if not gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))
            else:
                if gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))

            next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT",
                                              gpu_output[1:])

            # Do not perform BLS inference if it is the first
            # model in the pipeline.
            if self._model_name != 'dlpack_io_identity_1':
                infer_request = pb_utils.InferenceRequest(
                    model_name='dlpack_io_identity_1',
                    inputs=[
                        input0,
                        pb_utils.get_input_tensor_by_name(
                            request, "GPU_OUTPUT")
                    ],
                    requested_output_names=['OUTPUT0'])
                infer_response = infer_request.exec()

                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

                bls_output0 = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT0')
                if not output0.is_cpu():
                    bls_output0 = from_dlpack(
                        bls_output0.to_dlpack()).detach().cpu().numpy()
                else:
                    bls_output0 = bls_output0.as_numpy()

                if not input0.is_cpu():
                    input0 = from_dlpack(
                        input0.to_dlpack()).detach().cpu().numpy()
                else:
                    input0 = input0.as_numpy()

                if not np.allclose(bls_output0, input0):
                    raise pb_utils.TritonModelException(
                        'BLS input and output tensors are not equal')

            responses.append(
                pb_utils.InferenceResponse([output0, next_gpu_output]))

        return responses
Example #16
    def batch_rescoring(self, score_hyps, hist_enc, hist_mask_len, max_len):
        """
        score_hyps: [((ctc_score, (id1, id2, id3, ....)), (), ...), ....]
        hist_enc: [len1xF, len2xF, .....]
        hist_mask_len: [len1, len2, ...]
        Returns best_index with shape (bz,).
        """
        bz = len(hist_enc)
        f = hist_enc[0].shape[-1]
        beam_size = self.beam_size
        encoder_lens = np.zeros((bz, 1), dtype=np.int32)
        encoder_out = torch.zeros((bz, max_len, f), dtype=self.dtype)
        hyps = []
        ctc_score = torch.zeros((bz, beam_size), dtype=self.dtype)
        max_seq_len = 0
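        # Copy each utterance's encoder output into the padded batch and pad
        # its candidate list up to beam_size with placeholder (-10000) entries.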
        for i in range(bz):
            cur_len = hist_enc[i].shape[0]
            encoder_out[i, 0:cur_len] = hist_enc[i]
            encoder_lens[i, 0] = hist_mask_len[i]

            # process candidate
            if len(score_hyps[i]) < beam_size:
                to_append = (beam_size - len(score_hyps[i])) * [(-10000, ())]
                score_hyps[i] = list(score_hyps[i]) + to_append
            for idx, c in enumerate(score_hyps[i]):
                score, idlist = c
                if score < -10000:
                    score = -10000
                ctc_score[i][idx] = score
                hyps.append(list(idlist))
                if len(hyps[-1]) > max_seq_len:
                    max_seq_len = len(hyps[-1])

        max_seq_len += 2
        hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
        hyps_pad_sos_eos = hyps_pad_sos_eos * self.eos  # fill eos
        if self.bidecoder:
            r_hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
            r_hyps_pad_sos_eos = r_hyps_pad_sos_eos * self.eos

        hyps_lens_sos = np.ones((bz, beam_size), dtype=np.int32)
        bz_id = 0
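        # Pad every hypothesis with sos/eos (and its reverse for the
        # bidirectional decoder) and record its length including sos.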
        for idx, cand in enumerate(hyps):
            bz_id = idx // beam_size
            length = len(cand) + 2
            bz_offset = idx % beam_size
            pad_cand = [self.sos] + cand + [self.eos]
            hyps_pad_sos_eos[bz_id][bz_offset][0 : length] = pad_cand
            if self.bidecoder:
                r_pad_cand = [self.sos] + cand[::-1] + [self.eos]
                r_hyps_pad_sos_eos[bz_id][bz_offset][0:length] = r_pad_cand
            hyps_lens_sos[bz_id][idx % beam_size] = len(cand) + 1
        in0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
        in1 = pb_utils.Tensor("encoder_out_lens", encoder_lens)
        in2 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
        in3 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
        input_tensors = [in0, in1, in2, in3]
        if self.bidecoder:
            in4 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
            input_tensors.append(in4)
        in5 = pb_utils.Tensor.from_dlpack("ctc_score", to_dlpack(ctc_score))
        input_tensors.append(in5)
        request = pb_utils.InferenceRequest(model_name='decoder',
                                            requested_output_names=['best_index'],
                                            inputs=input_tensors)
        response = request.exec()
        best_index = pb_utils.get_output_tensor_by_name(response, 'best_index')
        best_index = from_dlpack(best_index.to_dlpack()).clone()
        best_index = best_index.numpy()[:, 0]
        return best_index
Example #17
    def test_bls_tensor_lifecycle(self):
        model_name = 'dlpack_identity'

        # A tensor with 10M float32 elements (~40 MB).
        input_size = 10 * 1024 * 1024

        # Sending the tensor 50 times to test whether the deallocation is
        # happening correctly. If the deallocation doesn't happen correctly,
        # there will be an out of shared memory error.
        for _ in range(50):
            input0 = np.ones([1, input_size], dtype=np.float32)
            input0_pb = pb_utils.Tensor('INPUT0', input0)
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                inputs=[input0_pb],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            np.testing.assert_equal(output0.as_numpy(), input0,
                                    "BLS CPU memory lifecycle failed.")

        # Checking the same with the GPU tensors.
        for index in range(50):
            input0 = None
            infer_request = None
            input0_pb = None

            torch.cuda.empty_cache()
            free_memory, _ = torch.cuda.mem_get_info()
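            # After the first iteration the amount of free GPU memory should
            # stay constant; any drift indicates that BLS output tensors from
            # previous iterations were not released.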
            if index == 1:
                recorded_memory = free_memory

            if index > 1:
                self.assertEqual(free_memory, recorded_memory,
                                 "GPU memory lifecycle test failed.")

            input0 = torch.ones([1, input_size],
                                dtype=torch.float32).to('cuda')
            input0_pb = pb_utils.Tensor.from_dlpack('INPUT0',
                                                    to_dlpack(input0))
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                inputs=[input0_pb],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            output0_pytorch = from_dlpack(output0.to_dlpack())

            # Set output0 and infer_response to None to make sure that the
            # DLPack tensor (output0_pytorch) remains valid after they are
            # released.
            output0 = None
            infer_response = None
            self.assertTrue(
                torch.all(output0_pytorch == input0),
                f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model."
            )