def initialize(self, args):
    self.args = args
    if args['model_name'] != 'init_args' or args[
            'model_instance_name'] != 'init_args_0':
        raise pb_utils.TritonModelException(
            'model_instance_name/model_name does not contain correct value.')
def execute(self, requests): """ Create a response sender object and use that for sending the response. """ # This model does not support batching, so 'request_count' should always be 1. if len(requests) != 1: raise pb_utils.TritonModelException("unsupported batch size " + len(requests)) output0_dtype = self.output0_dtype output1_dtype = self.output1_dtype response_sender = requests[0].get_response_sender() in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1") out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), in_0.as_numpy() - in_1.as_numpy()) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]) response_sender.send( flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) response_sender.send(response)
def execute(self, requests): """ This function is called on inference request. """ # Only generate the error for the first request for i, request in enumerate(requests): request_input = pb_utils.get_input_tensor_by_name(request, 'IN') # Sync BLS request infer_request = pb_utils.InferenceRequest( model_name='identity_fp32', requested_output_names=["OUTPUT0"], inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())]) infer_response = infer_request.exec() if infer_response.has_error(): raise pb_utils.TritonModelException( f"BLS Response has an error: {infer_response.error().message()}" ) output0 = pb_utils.get_output_tensor_by_name( infer_response, "OUTPUT0") if np.any(output0.as_numpy() != request_input.as_numpy()): raise pb_utils.TritonModelException( f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}" ) thread1 = threading.Thread(target=self.response_thread, args=(request.get_response_sender(), pb_utils.get_input_tensor_by_name( request, 'IN').as_numpy())) thread1.daemon = True with self.inflight_thread_count_lck: self.inflight_thread_count += 1 thread1.start() return None
def initialize(self, args):
    self.model_config = model_config = json.loads(args['model_config'])

    using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
        model_config)
    if not using_decoupled:
        raise pb_utils.TritonModelException(
            """the model `{}` can generate any number of responses per request,
            enable decoupled transaction policy in model configuration to
            serve this model""".format(args['model_name']))

    output0_config = pb_utils.get_output_config_by_name(
        model_config, "OUTPUT0")
    output1_config = pb_utils.get_output_config_by_name(
        model_config, "OUTPUT1")

    self.output0_dtype = pb_utils.triton_string_to_numpy(
        output0_config['data_type'])
    self.output1_dtype = pb_utils.triton_string_to_numpy(
        output1_config['data_type'])
def initialize(self, args):
    # `args['model_config']` contains the model configuration as a JSON
    # string; it is not parsed for you, so parse it here.
    self.model_config = model_config = json.loads(args['model_config'])

    using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
        model_config)
    if not using_decoupled:
        raise pb_utils.TritonModelException(
            """the model `{}` can generate any number of responses per request,
            enable decoupled transaction policy in model configuration to
            serve this model""".format(args['model_name']))

    # Get OUT configuration
    out_config = pb_utils.get_output_config_by_name(model_config, "OUT")

    # Convert Triton types to numpy types
    self.out_dtype = pb_utils.triton_string_to_numpy(out_config['data_type'])

    self.inflight_thread_count = 0
    self.inflight_thread_count_lck = threading.Lock()
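# `initialize` above tracks inflight response threads with a counter and a
# lock, but the matching `finalize` is not shown in this section. The sketch
# below illustrates the usual pattern of waiting for that counter to reach
# zero before the model is unloaded; the poll interval and the absence of
# logging are assumptions, and `time` is assumed to be imported at module
# level.
def finalize(self):
    # Block until all inflight response threads have finished.
    inflight_threads = True
    while inflight_threads:
        with self.inflight_thread_count_lck:
            inflight_threads = (self.inflight_thread_count != 0)
        if inflight_threads:
            time.sleep(0.1)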
def execute(self, requests):
    responses = []
    for request in requests:
        # Get INPUT0
        input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0')

        # Forward the input to the 'identity' model via a BLS request.
        infer_request = pb_utils.InferenceRequest(
            model_name='identity',
            requested_output_names=["OUTPUT0"],
            inputs=[input0])
        infer_response = infer_request.exec()
        if infer_response.has_error():
            raise pb_utils.TritonModelException(
                infer_response.error().message())

        # Pass the BLS output through as this model's response.
        inference_response = pb_utils.InferenceResponse(output_tensors=[
            pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        ])
        responses.append(inference_response)

    return responses
async def execute(self, requests):
    """`execute` must be implemented in every Python model. `execute`
    function receives a list of pb_utils.InferenceRequest as the only
    argument. This function is called when an inference request is made
    for this model. Depending on the batching configuration (e.g. Dynamic
    Batching) used, `requests` may contain multiple requests. Every
    Python model must create one pb_utils.InferenceResponse for every
    pb_utils.InferenceRequest in `requests`. If there is an error, you can
    set the error argument when creating a pb_utils.InferenceResponse.

    Parameters
    ----------
    requests : list
      A list of pb_utils.InferenceRequest

    Returns
    -------
    list
      A list of pb_utils.InferenceResponse. The length of this list must
      be the same as `requests`
    """

    responses = []

    # Every Python backend must iterate over every one of the requests
    # and create a pb_utils.InferenceResponse for each of them.
    for request in requests:
        # Get INPUT0
        in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

        # Get INPUT1
        in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

        # List of awaitables containing inflight inference responses.
        inference_response_awaits = []
        for model_name in ['pytorch', 'add_sub']:
            # Create inference request object
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[in_0, in_1])

            # Store the awaitable inside the array. We don't need the
            # inference response immediately so we do not `await` here.
            inference_response_awaits.append(infer_request.async_exec())

        # Wait for all the inference requests to finish. The execution
        # of the Python script will be blocked until all the awaitables
        # are resolved.
        inference_responses = await asyncio.gather(
            *inference_response_awaits)

        for infer_response in inference_responses:
            # Make sure that the inference response doesn't have an error.
            # If it has an error and you can't proceed with your model
            # execution you can raise an exception.
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

        # Get the OUTPUT0 from the "pytorch" model inference response
        pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
            inference_responses[0], "OUTPUT0")

        # Get the OUTPUT1 from the "add_sub" model inference response
        addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
            inference_responses[1], "OUTPUT1")

        # Create InferenceResponse. You can set an error here in case
        # there was a problem with handling this inference request.
        # Below is an example of how you can set errors in inference
        # response:
        #
        # pb_utils.InferenceResponse(
        #    output_tensors=..., TritonError("An error occurred"))
        #
        # Because the infer_response of the models contains the final
        # outputs with correct output names, we can just pass the list
        # of outputs to the InferenceResponse object.
        inference_response = pb_utils.InferenceResponse(
            output_tensors=[pytorch_output0_tensor, addsub_output1_tensor])
        responses.append(inference_response)

    # You should return a list of pb_utils.InferenceResponse. Length
    # of this list must match the length of `requests` list.
    return responses
def execute(self, requests): """`execute` must be implemented in every Python model. `execute` function receives a list of pb_utils.InferenceRequest as the only argument. This function is called when an inference is requested for this model. Parameters ---------- requests : list A list of pb_utils.InferenceRequest Returns ------- list A list of pb_utils.InferenceResponse. The length of this list must be the same as `requests` """ responses = [] # Every Python backend must iterate through list of requests and create # an instance of pb_utils.InferenceResponse class for each of them. You # should avoid storing any of the input Tensors in the class attributes # as they will be overridden in subsequent inference requests. You can # make a copy of the underlying NumPy array and store it if it is # required. batch_encoder_out, batch_encoder_lens = [], [] batch_log_probs, batch_log_probs_idx = [], [] batch_count = [] batch_root = TrieVector() batch_start = [] root_dict = {} encoder_max_len = 0 hyps_max_len = 0 total = 0 for request in requests: # Perform inference on the request and append it to responses list... in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out") in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens") in_2 = pb_utils.get_input_tensor_by_name(request, "batch_log_probs") in_3 = pb_utils.get_input_tensor_by_name(request, "batch_log_probs_idx") batch_encoder_out.append(in_0.as_numpy()) encoder_max_len = max(encoder_max_len, batch_encoder_out[-1].shape[1]) cur_b_lens = in_1.as_numpy() batch_encoder_lens.append(cur_b_lens) cur_batch = cur_b_lens.shape[0] batch_count.append(cur_batch) cur_b_log_probs = in_2.as_numpy() cur_b_log_probs_idx = in_3.as_numpy() for i in range(cur_batch): cur_len = cur_b_lens[i] cur_probs = cur_b_log_probs[i][ 0:cur_len, :].tolist() # T X Beam cur_idx = cur_b_log_probs_idx[i][ 0:cur_len, :].tolist() # T x Beam batch_log_probs.append(cur_probs) batch_log_probs_idx.append(cur_idx) root_dict[total] = PathTrie() batch_root.append(root_dict[total]) batch_start.append(True) total += 1 score_hyps = ctc_beam_search_decoder_batch( batch_log_probs, batch_log_probs_idx, batch_root, batch_start, self.beam_size, min(total, self.num_processes), blank_id=self.blank_id, space_id=-2, cutoff_prob=self.cutoff_prob, ext_scorer=self.lm) all_hyps = [] all_ctc_score = [] max_seq_len = 0 for seq_cand in score_hyps: # if candidates less than beam size if len(seq_cand) != self.beam_size: seq_cand = list(seq_cand) seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"), (0, ))] for score, hyps in seq_cand: all_hyps.append(list(hyps)) all_ctc_score.append(score) max_seq_len = max(len(hyps), max_seq_len) beam_size = self.beam_size feature_size = self.feature_size hyps_max_len = max_seq_len + 2 in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type) in_hyps_pad_sos_eos = np.ones( (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos if self.bidecoder: in_r_hyps_pad_sos_eos = np.ones( (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32) in_encoder_out = np.zeros((total, encoder_max_len, feature_size), dtype=self.data_type) in_encoder_out_lens = np.zeros(total, dtype=np.int32) st = 0 for b in batch_count: t = batch_encoder_out.pop(0) in_encoder_out[st:st + b, 0:t.shape[1]] = t in_encoder_out_lens[st:st + b] = batch_encoder_lens.pop(0) for i in range(b): for j in range(beam_size): cur_hyp = all_hyps.pop(0) cur_len = len(cur_hyp) + 2 in_hyp = [self.sos] + cur_hyp + 
[self.eos] in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp in_hyps_lens_sos[st + i][j] = cur_len - 1 if self.bidecoder: r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos] in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp in_ctc_score[st + i][j] = all_ctc_score.pop(0) st += b in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1) in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out) in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens) in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos) in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos) input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3] if self.bidecoder: in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos", in_r_hyps_pad_sos_eos) input_tensors.append(in_tensor_4) in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score) input_tensors.append(in_tensor_5) inference_request = pb_utils.InferenceRequest( model_name='decoder', requested_output_names=['best_index'], inputs=input_tensors) inference_response = inference_request.exec() if inference_response.has_error(): raise pb_utils.TritonModelException( inference_response.error().message()) else: # Extract the output tensors from the inference response. best_index = pb_utils.get_output_tensor_by_name( inference_response, 'best_index') best_index = best_index.as_numpy() hyps = [] idx = 0 for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos): best_idx = best_index[idx][0] best_cand_len = cand_lens[best_idx] - 1 # remove sos best_cand = cands[best_idx][1:1 + best_cand_len].tolist() hyps.append(best_cand) idx += 1 hyps = map_batch( hyps, self.vocabulary, min(multiprocessing.cpu_count(), len(in_ctc_score))) st = 0 for b in batch_count: sents = np.array(hyps[st:st + b]) out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype)) inference_response = pb_utils.InferenceResponse( output_tensors=[out0]) responses.append(inference_response) st += b return responses
def execute(self, requests):
    responses = []
    for request in requests:
        input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        gpu_output = pb_utils.get_input_tensor_by_name(
            request, "GPU_OUTPUT").as_numpy()

        # Move the input to CPU or GPU memory depending on the first
        # GPU_OUTPUT flag, exchanging data through DLPack.
        if input0.is_cpu():
            if not gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))
        else:
            if gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))

        next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])

        # Do not perform BLS inference if it is the first
        # model in the pipeline.
        if self._model_name != 'dlpack_io_identity_1':
            infer_request = pb_utils.InferenceRequest(
                model_name='dlpack_io_identity_1',
                inputs=[
                    input0,
                    pb_utils.get_input_tensor_by_name(request, "GPU_OUTPUT")
                ],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            bls_output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            if not output0.is_cpu():
                bls_output0 = from_dlpack(
                    bls_output0.to_dlpack()).detach().cpu().numpy()
            else:
                bls_output0 = bls_output0.as_numpy()

            if not input0.is_cpu():
                input0 = from_dlpack(
                    input0.to_dlpack()).detach().cpu().numpy()
            else:
                input0 = input0.as_numpy()

            if not np.allclose(bls_output0, input0):
                raise pb_utils.TritonModelException(
                    'BLS input and output tensors are not equal')

        responses.append(
            pb_utils.InferenceResponse([output0, next_gpu_output]))

    return responses
def initialize(self, args):
    if file1.FILE_NAME != 'FILE1' or file2.FILE_NAME != 'FILE2':
        raise pb_utils.TritonModelException('Imports do not work')
def initialize(self, args):
    # Make sure that environment variables are correctly propagated
    # to the Python models
    if "MY_ENV" not in os.environ or os.environ["MY_ENV"] != 'MY_ENV':
        raise pb_utils.TritonModelException(
            "MY_ENV doesn't exist or contains an incorrect value")
def initialize(self, args): if "MY_ENV" not in os.environ or os.environ["MY_ENV"] != 'MY_ENV': raise pb_utils.TritonModelException( "MY_ENV doesn't exists or contains incorrect value")
def execute(self, requests): """`execute` must be implemented in every Python model. `execute` function receives a list of pb_utils.InferenceRequest as the only argument. This function is called when an inference request is made for this model. Depending on the batching configuration (e.g. Dynamic Batching) used, `requests` may contain multiple requests. Every Python model, must create one pb_utils.InferenceResponse for every pb_utils.InferenceRequest in `requests`. If there is an error, you can set the error argument when creating a pb_utils.InferenceResponse Parameters ---------- requests : list A list of pb_utils.InferenceRequest Returns ------- list A list of pb_utils.InferenceResponse. The length of this list must be the same as `requests` """ responses = [] # Every Python backend must iterate over everyone of the requests # and create a pb_utils.InferenceResponse for each of them. for request in requests: # Get INPUT0 in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") # Get INPUT1 in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") # Get Model Name model_name = pb_utils.get_input_tensor_by_name( request, "MODEL_NAME") # Model Name string model_name_string = model_name.as_numpy()[0] # Create inference request object infer_request = pb_utils.InferenceRequest( model_name=model_name_string, requested_output_names=["OUTPUT0", "OUTPUT1"], inputs=[in_0, in_1]) # Perform synchronous blocking inference request infer_response = infer_request.exec() # Make sure that the inference response doesn't have an error. If # it has an error and you can't proceed with your model execution # you can raise an exception. if infer_response.has_error(): raise pb_utils.TritonModelException( infer_response.error().message()) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. # Below is an example of how you can set errors in inference # response: # # pb_utils.InferenceResponse( # output_tensors=..., TritonError("An error occured")) # # Because the infer_response of the models contains the final # outputs with correct output names, we can just pass the list # of outputs to the InferenceResponse object. inference_response = pb_utils.InferenceResponse( output_tensors=infer_response.output_tensors()) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. return responses