def test_bls_out_of_memory(self):
    tensor_size = 1024 * 1024 * 1024
    input0_np, infer_response = self._send_identity_tensor(tensor_size)
    out_of_memory_message = "Failed to increase the shared memory pool size for key"
    if infer_response.has_error():
        self.assertIn(out_of_memory_message,
                      infer_response.error().message())
    else:
        self.assertFalse(infer_response.has_error())
        output0 = pb_utils.get_output_tensor_by_name(
            infer_response, 'OUTPUT0')
        self.assertIsNotNone(output0)
        self.assertTrue(np.allclose(output0.as_numpy(), input0_np))

    tensor_size = 50 * 1024 * 1024
    for _ in range(4):
        input0_np, infer_response = self._send_identity_tensor(tensor_size)
        if infer_response.has_error():
            self.assertIn(out_of_memory_message,
                          infer_response.error().message())
        else:
            self.assertFalse(infer_response.has_error())
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            self.assertIsNotNone(output0)
            self.assertTrue(np.allclose(output0.as_numpy(), input0_np))
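# Note: the out-of-memory tests above and below call a `_send_identity_tensor`
# helper that is not part of these snippets. The sketch below is an assumption
# of what such a helper could look like: the `identity_fp32` model name and the
# [1, size] input shape are illustrative, not taken from the original code. An
# async variant (as used by the module-level test) would `await
# infer_request.async_exec()` instead of calling `exec()`.
def _send_identity_tensor(self, size):
    # Build a random FP32 tensor with `size` elements, send it through an
    # identity model via BLS, and return both the input array and the
    # inference response so the caller can inspect them.
    input0_np = np.random.randn(1, size).astype(np.float32)
    input0 = pb_utils.Tensor('INPUT0', input0_np)
    infer_request = pb_utils.InferenceRequest(
        model_name='identity_fp32',  # assumed model name
        inputs=[input0],
        requested_output_names=['OUTPUT0'])
    return input0_np, infer_request.exec()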
def bls_add_sub(_=None):
    input0_np = np.random.randn(16).astype(np.float32)
    input1_np = np.random.randn(16).astype(np.float32)
    input0 = pb_utils.Tensor('INPUT0', input0_np)
    input1 = pb_utils.Tensor('INPUT1', input1_np)
    infer_request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    expected_output_0 = input0.as_numpy() + input1.as_numpy()
    expected_output_1 = input0.as_numpy() - input1.as_numpy()
    if not np.all(expected_output_0 == output0.as_numpy()):
        return False
    if not np.all(expected_output_1 == output1.as_numpy()):
        return False

    return True
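# Note: a minimal sketch of how `bls_add_sub` might be driven from a model's
# `execute` method. The output name 'OUTPUT0' and the BOOL output dtype are
# assumptions for illustration only, not taken from the original code.
def execute(self, requests):
    responses = []
    for _ in requests:
        # Run the BLS check once per request and report the boolean result
        # as a single-element output tensor.
        result = bls_add_sub()
        out_tensor = pb_utils.Tensor('OUTPUT0',
                                     np.array([result], dtype=np.bool_))
        responses.append(
            pb_utils.InferenceResponse(output_tensors=[out_tensor]))
    return responses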
async def test_bls_out_of_memory():
    tensor_size = 1024 * 1024 * 1024
    input0_np, infer_response = await _send_identity_tensor(tensor_size)
    out_of_memory_message = "Failed to increase the shared memory pool size for key"
    if infer_response.has_error():
        if out_of_memory_message not in infer_response.error().message():
            return False
    else:
        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        if output0 is None:
            return False
        if not np.allclose(output0.as_numpy(), input0_np):
            return False

    tensor_size = 50 * 1024 * 1024
    for _ in range(4):
        input0_np, infer_response = await _send_identity_tensor(tensor_size)
        if infer_response.has_error():
            if out_of_memory_message not in infer_response.error().message():
                return False
        else:
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            if output0 is None:
                return False
            if not np.allclose(output0.as_numpy(), input0_np):
                return False

    return True
def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
    """
    This function is created to test that the DLPack container works
    properly when the inference response and outputs go out of scope.
    Returns the DLPack-encoded outputs on success and False on failure.
    """
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    # When one of the inputs is on GPU, the outputs returned by the model
    # must be on GPU as well; otherwise the outputs must be on CPU.
    if not input0_pb.is_cpu() or not input1_pb.is_cpu():
        if output0.is_cpu() or output1.is_cpu():
            return False
    else:
        if (not output0.is_cpu()) or (not output1.is_cpu()):
            return False

    # Make sure that the reference count is increased by one when the DLPack
    # representation is created.
    rc_before_dlpack_output0 = sys.getrefcount(output0)
    rc_before_dlpack_output1 = sys.getrefcount(output1)
    output0_dlpack = output0.to_dlpack()
    output1_dlpack = output1.to_dlpack()
    rc_after_dlpack_output0 = sys.getrefcount(output0)
    rc_after_dlpack_output1 = sys.getrefcount(output1)
    if rc_after_dlpack_output0 - rc_before_dlpack_output0 != 1:
        return False
    if rc_after_dlpack_output1 - rc_before_dlpack_output1 != 1:
        return False

    # Make sure that the reference count decreases after destroying the DLPack.
    output0_dlpack = None
    output1_dlpack = None
    rc_after_del_dlpack_output0 = sys.getrefcount(output0)
    rc_after_del_dlpack_output1 = sys.getrefcount(output1)
    if rc_after_del_dlpack_output0 - rc_after_dlpack_output0 != -1:
        return False
    if rc_after_del_dlpack_output1 - rc_after_dlpack_output1 != -1:
        return False

    return output0.to_dlpack(), output1.to_dlpack()
def _send_bls_sequence_requests(self, correlation_id):
    # Start request
    try:
        input = pb_utils.Tensor('INPUT', np.array([1000], dtype=np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name='onnx_nobatch_sequence_int32',
            inputs=[input],
            requested_output_names=['OUTPUT'],
            flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START,
            correlation_id=correlation_id)
        self.assertEqual(infer_request.flags(),
                         pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START)
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        output = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT')
        self.assertEqual(output.as_numpy()[0], input.as_numpy()[0])

        for i in range(10):
            input = pb_utils.Tensor('INPUT', np.array([i], dtype=np.int32))
            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                correlation_id=correlation_id)
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            # The new output is the previous output + the current input.
            expected_output = output.as_numpy()[0] + i
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], expected_output)

        # Final request
        input = pb_utils.Tensor('INPUT', np.array([2000], dtype=np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name='onnx_nobatch_sequence_int32',
            inputs=[input],
            requested_output_names=['OUTPUT'],
            correlation_id=correlation_id)
        infer_request.set_flags(
            pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
        self.assertEqual(infer_request.flags(),
                         pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        expected_output = output.as_numpy()[0] + input.as_numpy()[0]
        output = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT')
        self.assertEqual(output.as_numpy()[0], expected_output)
    except Exception as e:
        self.add_deferred_exception(e)
def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
    """
    This function is created to test that the DLPack container works
    properly when the inference response and outputs go out of scope.
    """
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    self.assertFalse(infer_response.has_error())
    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    self.assertIsNotNone(output0)
    self.assertIsNotNone(output1)

    # When one of the inputs is on GPU, the outputs returned by the model
    # must be on GPU as well; otherwise the outputs must be on CPU.
    if not input0_pb.is_cpu() or not input1_pb.is_cpu():
        self.assertTrue((not output0.is_cpu()) and (not output1.is_cpu()))
    else:
        self.assertTrue(output0.is_cpu() and output1.is_cpu())

    # Make sure that the reference count is increased by one when the DLPack
    # representation is created.
    rc_before_dlpack_output0 = sys.getrefcount(output0)
    rc_before_dlpack_output1 = sys.getrefcount(output1)
    output0_dlpack = output0.to_dlpack()
    output1_dlpack = output1.to_dlpack()
    rc_after_dlpack_output0 = sys.getrefcount(output0)
    rc_after_dlpack_output1 = sys.getrefcount(output1)
    self.assertEqual(rc_after_dlpack_output0 - rc_before_dlpack_output0, 1)
    self.assertEqual(rc_after_dlpack_output1 - rc_before_dlpack_output1, 1)

    # Make sure that the reference count decreases after destroying the DLPack.
    output0_dlpack = None
    output1_dlpack = None
    rc_after_del_dlpack_output0 = sys.getrefcount(output0)
    rc_after_del_dlpack_output1 = sys.getrefcount(output1)
    self.assertEqual(rc_after_del_dlpack_output0 - rc_after_dlpack_output0, -1)
    self.assertEqual(rc_after_del_dlpack_output1 - rc_after_dlpack_output1, -1)

    return output0.to_dlpack(), output1.to_dlpack()
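# Note: a minimal sketch of how the DLPack capsules returned by
# `_get_gpu_bls_outputs` above might be consumed. The method name and the
# add/sub verification are assumptions for illustration; it relies on the same
# module-level imports (torch, from_dlpack, pb_utils) as the other snippets.
def _assert_gpu_bls_add_sub(self, input0_pb, input1_pb):
    output0_dlpack, output1_dlpack = self._get_gpu_bls_outputs(
        input0_pb, input1_pb)

    # Convert the DLPack capsules to torch tensors on CPU and verify the
    # add/sub results against the original inputs.
    output0 = from_dlpack(output0_dlpack).to('cpu')
    output1 = from_dlpack(output1_dlpack).to('cpu')
    input0 = from_dlpack(input0_pb.to_dlpack()).to('cpu')
    input1 = from_dlpack(input1_pb.to_dlpack()).to('cpu')
    self.assertTrue(torch.all(output0 == input0 + input1))
    self.assertTrue(torch.all(output1 == input0 - input1))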
def verify_add_sub_results(input0, input1, infer_response):
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if (output0 is None) or (output1 is None):
        return False

    if not input0.is_cpu():
        input0 = from_dlpack(
            input0.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        input0 = input0.as_numpy()

    if not input1.is_cpu():
        input1 = from_dlpack(
            input1.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        input1 = input1.as_numpy()

    if not output0.is_cpu():
        output0 = from_dlpack(
            output0.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        output0 = output0.as_numpy()

    if not output1.is_cpu():
        output1 = from_dlpack(
            output1.to_dlpack()).to('cpu').cpu().detach().numpy()
    else:
        output1 = output1.as_numpy()

    expected_output_0 = input0 + input1
    expected_output_1 = input0 - input1

    if not np.all(expected_output_0 == output0):
        print(f'For OUTPUT0 expected {expected_output_0} found {output0}')
        return False

    if not np.all(expected_output_1 == output1):
        print(f'For OUTPUT1 expected {expected_output_1} found {output1}')
        return False

    return True
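# Note: a minimal sketch of a driver for `verify_add_sub_results` above. The
# `dlpack_add_sub` model name matches the other snippets here, but the driver
# itself (its name, shapes, and CPU-only inputs) is an assumption for
# illustration only.
def check_add_sub_once():
    # Build two CPU input tensors, run the BLS request, and hand everything
    # to the verifier above.
    input0 = pb_utils.Tensor('INPUT0', np.random.rand(16).astype(np.float32))
    input1 = pb_utils.Tensor('INPUT1', np.random.rand(16).astype(np.float32))
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    return verify_add_sub_results(input0, input1, infer_request.exec())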
def test_zero_length_io(self):
    model_name = 'identity_fp32'
    input0 = np.zeros([1, 0], dtype=np.float32)
    input0_pb = pb_utils.Tensor('INPUT0', input0)
    infer_request = pb_utils.InferenceRequest(
        model_name=model_name,
        inputs=[input0_pb],
        requested_output_names=['OUTPUT0'])
    infer_response = infer_request.exec()
    self.assertFalse(infer_response.has_error())
    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    self.assertTrue(np.all(output0.as_numpy() == input0))
def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
    input0 = torch.rand(16)
    input1 = torch.rand(16)

    if is_input0_gpu:
        input0 = input0.to('cuda')

    if is_input1_gpu:
        input1 = input1.to('cuda')

    input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
    input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1))
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    self.assertFalse(infer_response.has_error())
    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    self.assertIsNotNone(output0)
    self.assertIsNotNone(output1)

    expected_output_0 = from_dlpack(
        input0_pb.to_dlpack()).to('cpu') + from_dlpack(
            input1_pb.to_dlpack()).to('cpu')
    expected_output_1 = from_dlpack(
        input0_pb.to_dlpack()).to('cpu') - from_dlpack(
            input1_pb.to_dlpack()).to('cpu')

    self.assertTrue(
        torch.all(expected_output_0 == from_dlpack(
            output0.to_dlpack()).to('cpu')))
    self.assertTrue(
        torch.all(expected_output_1 == from_dlpack(
            output1.to_dlpack()).to('cpu')))
def response_thread(self, response_sender, in_input):
    # The response_sender is used to send response(s) associated with the
    # corresponding request.
    # Sleep 5 seconds to make sure the main thread has exited.
    time.sleep(5)

    status = self.execute_gpu_bls()
    if not status:
        infer_response = pb_utils.InferenceResponse(
            error="GPU BLS test failed.")
        response_sender.send(infer_response)
    else:
        in_value = in_input
        infer_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            requested_output_names=["OUTPUT0"],
            inputs=[pb_utils.Tensor('INPUT0', in_input)])
        infer_response = infer_request.exec()
        output0 = pb_utils.get_output_tensor_by_name(
            infer_response, "OUTPUT0")
        if infer_response.has_error():
            response = pb_utils.InferenceResponse(
                error=infer_response.error().message())
            response_sender.send(
                response,
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        elif np.any(in_input != output0.as_numpy()):
            error_message = (
                "BLS Request input and BLS response output do not match."
                f" {in_value} != {output0.as_numpy()}")
            response = pb_utils.InferenceResponse(error=error_message)
            response_sender.send(
                response,
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        else:
            output_tensors = [pb_utils.Tensor('OUT', in_value)]
            response = pb_utils.InferenceResponse(
                output_tensors=output_tensors)
            response_sender.send(
                response,
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

    with self.inflight_thread_count_lck:
        self.inflight_thread_count -= 1
def execute(self, requests):
    responses = []
    for request in requests:
        # Get INPUT0
        input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0')

        infer_request = pb_utils.InferenceRequest(
            model_name='identity',
            requested_output_names=["OUTPUT0"],
            inputs=[input0])
        infer_response = infer_request.exec()

        if infer_response.has_error():
            raise pb_utils.TritonModelException(
                infer_response.error().message())

        inference_response = pb_utils.InferenceResponse(output_tensors=[
            pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        ])
        responses.append(inference_response)

    return responses
def execute(self, requests):
    """This function is called on inference request."""
    for request in requests:
        request_input = pb_utils.get_input_tensor_by_name(request, 'IN')

        # Sync BLS request
        infer_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            requested_output_names=["OUTPUT0"],
            inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())])
        infer_response = infer_request.exec()
        if infer_response.has_error():
            raise pb_utils.TritonModelException(
                f"BLS Response has an error: {infer_response.error().message()}"
            )

        output0 = pb_utils.get_output_tensor_by_name(
            infer_response, "OUTPUT0")
        if np.any(output0.as_numpy() != request_input.as_numpy()):
            raise pb_utils.TritonModelException(
                f"BLS Request input and BLS response output do not match. "
                f"{request_input.as_numpy()} != {output0.as_numpy()}")

        # Send the actual response from a separate thread so that this
        # decoupled model can return from `execute` before the response
        # is complete.
        thread1 = threading.Thread(
            target=self.response_thread,
            args=(request.get_response_sender(),
                  pb_utils.get_input_tensor_by_name(request,
                                                    'IN').as_numpy()))
        thread1.daemon = True
        with self.inflight_thread_count_lck:
            self.inflight_thread_count += 1
        thread1.start()

    return None
async def execute(self, requests):
    """`execute` must be implemented in every Python model. The `execute`
    function receives a list of pb_utils.InferenceRequest as its only
    argument. This function is called when an inference request is made
    for this model. Depending on the batching configuration (e.g. Dynamic
    Batching) used, `requests` may contain multiple requests. Every Python
    model must create one pb_utils.InferenceResponse for every
    pb_utils.InferenceRequest in `requests`. If there is an error, you can
    set the error argument when creating a pb_utils.InferenceResponse.

    Parameters
    ----------
    requests : list
        A list of pb_utils.InferenceRequest

    Returns
    -------
    list
        A list of pb_utils.InferenceResponse. The length of this list must
        be the same as `requests`
    """
    responses = []

    # Every Python backend must iterate over every one of the requests and
    # create a pb_utils.InferenceResponse for each of them.
    for request in requests:
        # Get INPUT0
        in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

        # Get INPUT1
        in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

        # List of awaitables containing inflight inference responses.
        inference_response_awaits = []
        for model_name in ['pytorch', 'add_sub']:
            # Create inference request object
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[in_0, in_1])

            # Store the awaitable inside the array. We don't need the
            # inference response immediately so we do not `await` here.
            inference_response_awaits.append(infer_request.async_exec())

        # Wait for all the inference requests to finish. The execution of
        # the Python script will be blocked until all the awaitables are
        # resolved.
        inference_responses = await asyncio.gather(
            *inference_response_awaits)

        for infer_response in inference_responses:
            # Make sure that the inference response doesn't have an error.
            # If it has an error and you can't proceed with your model
            # execution, you can raise an exception.
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

        # Get OUTPUT0 from the "pytorch" model inference response.
        pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
            inference_responses[0], "OUTPUT0")

        # Get OUTPUT1 from the "add_sub" model inference response.
        addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
            inference_responses[1], "OUTPUT1")

        # Create InferenceResponse. You can set an error here in case
        # there was a problem with handling this inference request.
        # Below is an example of how you can set errors in inference
        # response:
        #
        # pb_utils.InferenceResponse(
        #     output_tensors=..., TritonError("An error occurred"))
        #
        # Because the infer_response of the models contains the final
        # outputs with correct output names, we can just pass the list
        # of outputs to the InferenceResponse object.
        inference_response = pb_utils.InferenceResponse(
            output_tensors=[pytorch_output0_tensor, addsub_output1_tensor])
        responses.append(inference_response)

    # You should return a list of pb_utils.InferenceResponse. The length
    # of this list must match the length of the `requests` list.
    return responses
def execute(self, requests):
    """`execute` must be implemented in every Python model. The `execute`
    function receives a list of pb_utils.InferenceRequest as its only
    argument. This function is called when an inference is requested
    for this model.

    Parameters
    ----------
    requests : list
        A list of pb_utils.InferenceRequest

    Returns
    -------
    list
        A list of pb_utils.InferenceResponse. The length of this list must
        be the same as `requests`
    """
    responses = []

    # Every Python backend must iterate through the list of requests and
    # create an instance of pb_utils.InferenceResponse class for each of
    # them. You should avoid storing any of the input Tensors in the class
    # attributes as they will be overridden in subsequent inference
    # requests. You can make a copy of the underlying NumPy array and store
    # it if it is required.
    batch_encoder_out, batch_encoder_lens = [], []
    batch_log_probs, batch_log_probs_idx = [], []
    batch_count = []
    batch_root = TrieVector()
    batch_start = []
    root_dict = {}
    encoder_max_len = 0
    hyps_max_len = 0
    total = 0
    for request in requests:
        # Perform inference on the request and append it to responses list...
        in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
        in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens")
        in_2 = pb_utils.get_input_tensor_by_name(request, "batch_log_probs")
        in_3 = pb_utils.get_input_tensor_by_name(request,
                                                 "batch_log_probs_idx")

        batch_encoder_out.append(in_0.as_numpy())
        encoder_max_len = max(encoder_max_len,
                              batch_encoder_out[-1].shape[1])

        cur_b_lens = in_1.as_numpy()
        batch_encoder_lens.append(cur_b_lens)
        cur_batch = cur_b_lens.shape[0]
        batch_count.append(cur_batch)

        cur_b_log_probs = in_2.as_numpy()
        cur_b_log_probs_idx = in_3.as_numpy()
        for i in range(cur_batch):
            cur_len = cur_b_lens[i]
            cur_probs = cur_b_log_probs[i][0:cur_len, :].tolist()  # T x Beam
            cur_idx = cur_b_log_probs_idx[i][0:cur_len, :].tolist()  # T x Beam
            batch_log_probs.append(cur_probs)
            batch_log_probs_idx.append(cur_idx)
            root_dict[total] = PathTrie()
            batch_root.append(root_dict[total])
            batch_start.append(True)
            total += 1

    score_hyps = ctc_beam_search_decoder_batch(
        batch_log_probs,
        batch_log_probs_idx,
        batch_root,
        batch_start,
        self.beam_size,
        min(total, self.num_processes),
        blank_id=self.blank_id,
        space_id=-2,
        cutoff_prob=self.cutoff_prob,
        ext_scorer=self.lm)

    all_hyps = []
    all_ctc_score = []
    max_seq_len = 0
    for seq_cand in score_hyps:
        # if candidates less than beam size
        if len(seq_cand) != self.beam_size:
            seq_cand = list(seq_cand)
            seq_cand += (self.beam_size - len(seq_cand)) * [
                (-float("INF"), (0, ))
            ]

        for score, hyps in seq_cand:
            all_hyps.append(list(hyps))
            all_ctc_score.append(score)
            max_seq_len = max(len(hyps), max_seq_len)

    beam_size = self.beam_size
    feature_size = self.feature_size
    hyps_max_len = max_seq_len + 2
    in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
    in_hyps_pad_sos_eos = np.ones(
        (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
    if self.bidecoder:
        in_r_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

    in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

    in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                              dtype=self.data_type)
    in_encoder_out_lens = np.zeros(total, dtype=np.int32)
    st = 0
    for b in batch_count:
        t = batch_encoder_out.pop(0)
        in_encoder_out[st:st + b, 0:t.shape[1]] = t
        in_encoder_out_lens[st:st + b] = batch_encoder_lens.pop(0)
        for i in range(b):
            for j in range(beam_size):
                cur_hyp = all_hyps.pop(0)
                cur_len = len(cur_hyp) + 2
                in_hyp = [self.sos] + cur_hyp + [self.eos]
                in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                in_hyps_lens_sos[st + i][j] = cur_len - 1
                if self.bidecoder:
                    r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                    in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                in_ctc_score[st + i][j] = all_ctc_score.pop(0)
        st += b

    in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)
    in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
    in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
    in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
    in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
    input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
    if self.bidecoder:
        in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                      in_r_hyps_pad_sos_eos)
        input_tensors.append(in_tensor_4)
    in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
    input_tensors.append(in_tensor_5)

    inference_request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['best_index'],
        inputs=input_tensors)

    inference_response = inference_request.exec()
    if inference_response.has_error():
        raise pb_utils.TritonModelException(
            inference_response.error().message())
    else:
        # Extract the output tensors from the inference response.
        best_index = pb_utils.get_output_tensor_by_name(
            inference_response, 'best_index')
        best_index = best_index.as_numpy()
        hyps = []
        idx = 0
        for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos):
            best_idx = best_index[idx][0]
            best_cand_len = cand_lens[best_idx] - 1  # remove sos
            best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
            hyps.append(best_cand)
            idx += 1

        hyps = map_batch(
            hyps, self.vocabulary,
            min(multiprocessing.cpu_count(), len(in_ctc_score)))
        st = 0
        for b in batch_count:
            sents = np.array(hyps[st:st + b])
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out0])
            responses.append(inference_response)
            st += b
    return responses
def execute(self, requests):
    responses = []
    for request in requests:
        input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        gpu_output = pb_utils.get_input_tensor_by_name(
            request, "GPU_OUTPUT").as_numpy()

        if input0.is_cpu():
            if not gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))
        else:
            if gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))

        next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])

        # Do not perform BLS inference if it is the first model in the
        # pipeline.
        if self._model_name != 'dlpack_io_identity_1':
            infer_request = pb_utils.InferenceRequest(
                model_name='dlpack_io_identity_1',
                inputs=[
                    input0,
                    pb_utils.get_input_tensor_by_name(request, "GPU_OUTPUT")
                ],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()

            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            bls_output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            if not output0.is_cpu():
                bls_output0 = from_dlpack(
                    bls_output0.to_dlpack()).detach().cpu().numpy()
            else:
                bls_output0 = bls_output0.as_numpy()

            if not input0.is_cpu():
                input0 = from_dlpack(
                    input0.to_dlpack()).detach().cpu().numpy()
            else:
                input0 = input0.as_numpy()

            if not np.allclose(bls_output0, input0):
                raise pb_utils.TritonModelException(
                    'BLS input and output tensors are not equal')

        responses.append(
            pb_utils.InferenceResponse([output0, next_gpu_output]))

    return responses
def batch_rescoring(self, score_hyps, hist_enc, hist_mask_len, max_len):
    """
    score_hyps: [((ctc_score, (id1, id2, id3, ....)), (), ...), ....]
    hist_enc: [len1xF, len2xF, .....]
    hist_mask: [1x1xlen1, 1x1xlen2]
    return bzx1 best_index
    """
    bz = len(hist_enc)
    f = hist_enc[0].shape[-1]
    beam_size = self.beam_size
    encoder_lens = np.zeros((bz, 1), dtype=np.int32)
    encoder_out = torch.zeros((bz, max_len, f), dtype=self.dtype)
    hyps = []
    ctc_score = torch.zeros((bz, beam_size), dtype=self.dtype)
    max_seq_len = 0
    for i in range(bz):
        cur_len = hist_enc[i].shape[0]
        encoder_out[i, 0:cur_len] = hist_enc[i]
        encoder_lens[i, 0] = hist_mask_len[i]

        # process candidate
        if len(score_hyps[i]) < beam_size:
            to_append = (beam_size - len(score_hyps[i])) * [(-10000, ())]
            score_hyps[i] = list(score_hyps[i]) + to_append
        for idx, c in enumerate(score_hyps[i]):
            score, idlist = c
            if score < -10000:
                score = -10000
            ctc_score[i][idx] = score
            hyps.append(list(idlist))
            if len(hyps[-1]) > max_seq_len:
                max_seq_len = len(hyps[-1])

    max_seq_len += 2
    hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
    hyps_pad_sos_eos = hyps_pad_sos_eos * self.eos  # fill eos
    if self.bidecoder:
        r_hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len),
                                     dtype=np.int64)
        r_hyps_pad_sos_eos = r_hyps_pad_sos_eos * self.eos

    hyps_lens_sos = np.ones((bz, beam_size), dtype=np.int32)
    bz_id = 0
    for idx, cand in enumerate(hyps):
        bz_id = idx // beam_size
        length = len(cand) + 2
        bz_offset = idx % beam_size
        pad_cand = [self.sos] + cand + [self.eos]
        hyps_pad_sos_eos[bz_id][bz_offset][0:length] = pad_cand
        if self.bidecoder:
            r_pad_cand = [self.sos] + cand[::-1] + [self.eos]
            r_hyps_pad_sos_eos[bz_id][bz_offset][0:length] = r_pad_cand
        hyps_lens_sos[bz_id][idx % beam_size] = len(cand) + 1

    in0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
    in1 = pb_utils.Tensor("encoder_out_lens", encoder_lens)
    in2 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
    in3 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
    input_tensors = [in0, in1, in2, in3]
    if self.bidecoder:
        in4 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
        input_tensors.append(in4)
    in5 = pb_utils.Tensor.from_dlpack("ctc_score", to_dlpack(ctc_score))
    input_tensors.append(in5)

    request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['best_index'],
        inputs=input_tensors)
    response = request.exec()
    best_index = pb_utils.get_output_tensor_by_name(response, 'best_index')
    best_index = from_dlpack(best_index.to_dlpack()).clone()
    best_index = best_index.numpy()[:, 0]
    return best_index
def test_bls_tensor_lifecycle(self):
    model_name = 'dlpack_identity'

    # A tensor with 10M FP32 elements (about 40 MB of data).
    input_size = 10 * 1024 * 1024

    # Sending the tensor 50 times to test whether the deallocation is
    # happening correctly. If the deallocation doesn't happen correctly,
    # there will be an out of shared memory error.
    for _ in range(50):
        input0 = np.ones([1, input_size], dtype=np.float32)
        input0_pb = pb_utils.Tensor('INPUT0', input0)
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input0_pb],
            requested_output_names=['OUTPUT0'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        output0 = pb_utils.get_output_tensor_by_name(
            infer_response, 'OUTPUT0')
        np.testing.assert_equal(output0.as_numpy(), input0,
                                "BLS CPU memory lifecycle failed.")

    # Checking the same with GPU tensors.
    for index in range(50):
        input0 = None
        infer_request = None
        input0_pb = None
        torch.cuda.empty_cache()
        free_memory, _ = torch.cuda.mem_get_info()
        if index == 1:
            recorded_memory = free_memory

        if index > 1:
            self.assertEqual(free_memory, recorded_memory,
                             "GPU memory lifecycle test failed.")

        input0 = torch.ones([1, input_size], dtype=torch.float32).to('cuda')
        input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input0_pb],
            requested_output_names=['OUTPUT0'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        output0 = pb_utils.get_output_tensor_by_name(
            infer_response, 'OUTPUT0')
        output0_pytorch = from_dlpack(output0.to_dlpack())

        # Set the inference response and output0 to None, to make sure
        # that the DLPack representation of the output is still valid.
        output0 = None
        infer_response = None
        self.assertTrue(
            torch.all(output0_pytorch == input0),
            f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model."
        )