def setUp(self):
    global _deferred_exceptions
    _deferred_exceptions = []
    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
    self.model_name_ = 'identity_2_float32'
    # This will not be changed even when an ensemble is under test,
    # as the dynamic batching is performed within the composing model.
    self.check_status_model = 'identity_2_float32'
    self.tensor_shape_ = (1, 1)
    self.inputs_ = {
        "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
        "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
    }
    self.input_data_ = {
        "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
        "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
    }
    self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
    self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
    # Requested outputs, keyed by the input that feeds each one.
    self.outputs_ = {
        "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
        "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
    }
def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    # When a raw ndarray is given for INPUT1, send it over the wire
    # instead of through shared memory. Note only the HTTP client's
    # set_data_from_numpy accepts a binary_data argument.
    if isinstance(shm_ip1_handle, np.ndarray):
        if _protocol == "http":
            inputs[1].set_data_from_numpy(input1_data, binary_data=True)
        else:
            inputs[1].set_data_from_numpy(input1_data)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = utils.triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue(
            (output_data[0] == (input0_data + input1_data)).all(),
            "Model output does not match expected output")
    except Exception as ex:
        error_msg.append(str(ex))
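# Setup sketch for _basic_inference: the regions named "input0_data",
# "input1_data", "output0_data" and "output1_data" must already be created
# and registered with the server before the function runs. This is a minimal
# version using system shared memory; the "/<name>" shm keys and this helper
# itself are assumptions, not part of the original test.
import tritonclient.utils.shared_memory as shm


def _register_regions(triton_client):
    handles = {}
    for name in ("input0_data", "input1_data", "output0_data",
                 "output1_data"):
        # 64 bytes holds the 16 INT32 elements each tensor uses above.
        handles[name] = shm.create_shared_memory_region(name, "/" + name, 64)
        triton_client.register_system_shared_memory(name, "/" + name, 64)
    # The input regions still need their contents copied in, e.g.:
    # shm.set_shared_memory_region(handles["input0_data"], [input0_data])
    return handles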
def simple_string_inference(triton_client):
    model_name = 'simple_string'
    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all ones.
    in0 = np.arange(start=0, stop=16, dtype=np.int32)
    in0 = np.expand_dims(in0, axis=0)
    in1 = np.ones(shape=(1, 16), dtype=np.int32)
    expected_sum = np.add(in0, in1)
    expected_diff = np.subtract(in0, in1)

    # The 'simple_string' model expects 2 BYTES tensors where each
    # element in those tensors is the utf-8 string representation of
    # an integer. The BYTES tensors must be represented by a numpy
    # array with dtype=np.object_.
    in0n = np.array([str(x).encode('utf-8') for x in in0.reshape(in0.size)],
                    dtype=np.object_)
    input0_data = in0n.reshape(in0.shape)
    in1n = np.array([str(x).encode('utf-8') for x in in1.reshape(in1.size)],
                    dtype=np.object_)
    input1_data = in1n.reshape(in1.shape)

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data)
    inputs[1].set_data_from_numpy(input1_data)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Get the output arrays from the results
    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    for i in range(16):
        print(
            str(input0_data[0][i]) + " + " + str(input1_data[0][i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[0][i]) + " - " + str(input1_data[0][i]) + " = " +
            str(output1_data[0][i]))

        # Convert result from string to int to check result
        r0 = int(output0_data[0][i])
        r1 = int(output1_data[0][i])
        if expected_sum[0][i] != r0:
            print("error: incorrect sum")
            sys.exit(1)
        if expected_diff[0][i] != r1:
            print("error: incorrect difference")
            sys.exit(1)
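# Usage sketch for simple_string_inference: assumes a Triton server with the
# 'simple_string' model listening on localhost:8001 (both the address and the
# model's availability are assumptions beyond the function itself).
client = grpcclient.InferenceServerClient("localhost:8001")
simple_string_inference(client)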
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        # Without a client there is nothing to infer with, so fail fast
        # instead of continuing with an undefined 'triton_client'.
        print("channel creation failed: " + str(e))
        raise

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))

    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)

    outputs.append(grpcclient.InferRequestedOutput('output0'))
    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
def run(self, client_metadata):
    trial = self.get_trial()
    model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
    triton_client = client_metadata[0]
    input_name = self.input_name_
    if "libtorch" in trial:
        input_name = "INPUT__0"
    # Allocate a ~1 GiB input tensor so the request cannot complete
    # within the very small client timeout below.
    tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                               np.dtype(self.input_dtype_).itemsize), )
    in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(self.input_dtype_)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Expect an exception for small timeout values.
    try:
        triton_client.infer(model_name, inputs, client_timeout=0.1)
        assert False, "expected inference failure from deadline exceeded"
    except Exception as ex:
        if "Deadline Exceeded" not in ex.message():
            assert False, "timeout_client failed {}".format(self.name_)

    # Expect timeout error as success case
    return 1
def infer(self, _need_tensor_check=False, **_input_tensor):
    self.check_ready()
    inputs = []
    assert _input_tensor.keys() == set(self.all_inputs.keys()), \
        f'{self.model_name} input tensors do not match the model inputs'
    for m_name, m_tensor_info in self.all_inputs.items():
        m_tensor = _input_tensor[m_name]
        if not (isinstance(m_tensor, np.ndarray) and
                m_tensor.dtype.name in self.numpy_data_type_mapper):
            raise InferenceTensorCheckFailException(
                f'tensor {m_name} is not a supported numpy array')
        if _need_tensor_check:
            check_status, check_result = m_tensor_info.tensor_check(m_tensor)
            if not check_status:
                raise InferenceTensorCheckFailException(check_result)
        # Normalize to CHW layout while keeping the original dtype.
        m_normalized_tensor = m_tensor_info.normalize(
            m_tensor, _tensor_format='chw').astype(m_tensor.dtype)
        m_infer_input = grpcclient.InferInput(
            m_name, m_normalized_tensor.shape,
            self.numpy_data_type_mapper[m_normalized_tensor.dtype.name])
        m_infer_input.set_data_from_numpy(m_normalized_tensor)
        inputs.append(m_infer_input)
    results = self.triton_client.infer(model_name=self.model_name,
                                       model_version=self.model_version,
                                       inputs=inputs)
    to_return_result = dict()
    for m_result_name in self.all_outputs.keys():
        to_return_result[m_result_name] = results.as_numpy(m_result_name)
    return to_return_result
def test_decoupled_bls(self):
    # Test combinations of BLS and decoupled API in Python backend.
    model_name = "decoupled_bls"
    shape = [1, 2]
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
        triton_client.start_stream(callback=partial(callback, user_data))
        input_datas = []
        input_data = np.random.randn(*shape).astype(np.float32)
        input_datas.append(input_data)
        inputs = [
            grpcclient.InferInput("IN", input_data.shape,
                                  np_to_triton_dtype(input_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data)
        triton_client.async_stream_infer(model_name=model_name,
                                         inputs=inputs)

        # Check the results of the decoupled model using BLS
        def check_result(result):
            # Make sure the result is not an exception
            self.assertIsNot(type(result), InferenceServerException)
            output_data = result.as_numpy("OUT")
            self.assertIsNotNone(output_data, "error: expected 'OUT'")
            self.assertTrue(
                np.array_equal(output_data, input_data),
                "error: expected output {} to match input {}".format(
                    output_data, input_data))

        result = user_data._completed_requests.get()
        check_result(result)
def requestGenerator(batched_image_data, input_name, output_name, dtype,
                     FLAGS):
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=True,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
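# Usage sketch for requestGenerator above: drain the generator and issue each
# request. 'send_requests' and the tensor names "input"/"output" are
# hypothetical; 'FLAGS' is assumed to be an argparse namespace carrying the
# protocol, classes, model_name and model_version fields the generator reads.
def send_requests(triton_client, batched_image_data, FLAGS):
    for inputs, outputs, model_name, model_version in requestGenerator(
            batched_image_data, "input", "output", "FP32", FLAGS):
        # Works for either protocol since the generator built matching blobs.
        response = triton_client.infer(model_name,
                                       inputs,
                                       model_version=model_version,
                                       outputs=outputs)
        print(response.get_response())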
def identity_inference(triton_client, np_array):
    model_name = "simple_identity"
    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array)
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    if np_array.dtype == np.object_:
        print(results.as_numpy('OUTPUT0'))
        if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
            print(results.as_numpy('OUTPUT0'))
            print("error: incorrect output")
            sys.exit(1)
    else:
        # Re-encode the returned strings before comparing against a
        # fixed-width bytes (np.bytes_) input array.
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            print("error: incorrect output")
            sys.exit(1)
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(grpcclient.InferInput('INPUT', value_data.shape,
                                            "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
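# Usage sketch for sync_send: streams the values 0..9 through a stateful
# sequence model. The server address, the model name 'simple_sequence', and
# the sequence id are illustrative assumptions.
client = grpcclient.InferenceServerClient("localhost:8001")
collected = []
sync_send(client,
          collected,
          list(range(10)),
          batch_size=1,
          sequence_id=1000,
          model_name="simple_sequence",
          model_version="")
print("received {} responses".format(len(collected)))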
def detector(self, frames):
    infer_inputs = [
        triton.InferInput('input_1', (len(frames), 3, *self.resize[::-1]),
                          "FP32")
    ]
    frames = np.array(frames, dtype=np.float32)
    frames = np.transpose(frames, (0, 3, 1, 2))  # NHWC -> NCHW
    infer_inputs[0].set_data_from_numpy(frames)
    result = self.triton_client.infer('retinanet', infer_inputs)
    scores = result.as_numpy('scores').reshape((-1, 100))
    boxes = result.as_numpy('boxes').reshape((-1, 100, 4))
    classes = result.as_numpy('classes').reshape((-1, 100))

    # Calculate embeddings for all the detected subjects
    embs = []
    scores_filtered = []
    boxes_filtered = []
    for i in range(len(frames)):
        # Only care about 'person' (class 0) with score > 0.4.
        mask = (scores[i] > 0.4) & (classes[i] == 0)
        scores_i = scores[i, mask]
        boxes_i = boxes[i, mask]
        scores_i, boxes_i = self.bbox_filter(scores_i, boxes_i)
        img = frames[i].astype(np.uint8)  # (3, 800, 1280)
        embs_i = []
        boxes_i = boxes_i.astype(int)
        for j in range(len(boxes_i)):
            # Crop the detection, convert back to HWC and resize for the
            # re-identification model.
            imp = img[:, boxes_i[j, 1]:boxes_i[j, 3],
                      boxes_i[j, 0]:boxes_i[j, 2]]
            imp = np.transpose(imp, (1, 2, 0))
            imp = Image.fromarray(imp)
            data = [
                np.asarray(transforms.Resize(size=(256, 128))(imp)).astype(
                    np.float32)
            ]
            inputs = []
            inputs.append(
                tritongrpcclient.InferInput('image',
                                            [len(data), 256, 128, 3],
                                            "FP32"))
            # Initialize the data
            inputs[0].set_data_from_numpy(np.asarray(data))
            outputs = []
            outputs.append(tritongrpcclient.InferRequestedOutput('features'))
            results = self.triton_client.infer('osnet_ensemble',
                                               inputs,
                                               outputs=outputs)
            emb = np.squeeze(results.as_numpy('features'))
            embs_i.append(emb / np.linalg.norm(emb))
        embs.append(embs_i)
        scores_filtered.append(scores_i)
        boxes_filtered.append(boxes_i)
    return np.asarray(scores_filtered), np.asarray(boxes_filtered), \
        np.asarray(embs)
def test_grpc(self):
    triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
    inputs = []
    inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    try:
        triton_client.infer(model_name="query", inputs=inputs)
        self.assertTrue(False, "expect error with query information")
    except InferenceServerException as ex:
        self.assertTrue("OUTPUT0 CPU 0" in ex.message())
        self.assertTrue("OUTPUT1 CPU 0" in ex.message())
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):
    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]
    time_start = dt.datetime.now()
    response = client.infer(model_name,
                            inputs,
                            request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"
    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
def setUp(self):
    global _deferred_exceptions
    _deferred_exceptions = []
    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
    self.model_name_ = 'identity_2_float32'
    self.tensor_shape_ = (1, 1)
    self.inputs_ = {
        "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
        "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
    }
    self.input_data_ = {
        "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
        "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
    }
    self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
    self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
    self.outputs_ = {
        "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
        "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
    }
def set_inputs(self, inputs_node: list):
    """
    Args:
        inputs_node: a list of NodeInfo

    Returns:
        a list of input tensors
    """
    inputs_tensor = []
    for node in inputs_node:
        input_tensor = grpcclient.InferInput(node.node_name,
                                             node.node_data.shape,
                                             node.node_type)
        input_tensor.set_data_from_numpy(node.node_data)
        inputs_tensor.append(input_tensor)
    return inputs_tensor
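# Usage sketch for set_inputs. 'NodeInfo' is only described by the docstring,
# so this stand-in dataclass (with the node_name, node_data and node_type
# fields inferred from the loop body) is an assumption for illustration.
from dataclasses import dataclass

import numpy as np


@dataclass
class NodeInfo:
    node_name: str
    node_data: np.ndarray
    node_type: str


nodes = [NodeInfo('INPUT0', np.zeros((1, 3, 224, 224), np.float32), "FP32")]
# inputs_tensor = wrapper.set_inputs(nodes)  # 'wrapper' owns set_inputs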
def test_decoupled_send_after_close_error(self):
    model_name = "decoupled_send_after_close_error"
    shape = [16]
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        client.start_stream(callback=partial(callback, user_data))
        input_data_0 = np.random.random(shape).astype(np.float32)
        input_data_1 = np.random.random(shape).astype(np.float32)
        inputs = [
            grpcclient.InferInput("INPUT0", input_data_0.shape,
                                  np_to_triton_dtype(input_data_0.dtype)),
            grpcclient.InferInput("INPUT1", input_data_1.shape,
                                  np_to_triton_dtype(input_data_1.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data_0)
        inputs[1].set_data_from_numpy(input_data_1)
        client.async_stream_infer(model_name=model_name, inputs=inputs)

        # Because the model has closed the response sender there is no
        # way to deliver the error message to the client. The error
        # will be logged on the server side.
        time.sleep(4)
        self.assertEqual(user_data._completed_requests.qsize(), 0,
                         "The completed request size must be zero.")
def test_decoupled_return_response_error(self):
    model_name = "decoupled_return_response_error"
    shape = [16]
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        client.start_stream(callback=partial(callback, user_data))
        input_data_0 = np.random.random(shape).astype(np.float32)
        input_data_1 = np.random.random(shape).astype(np.float32)
        inputs = [
            grpcclient.InferInput("INPUT0", input_data_0.shape,
                                  np_to_triton_dtype(input_data_0.dtype)),
            grpcclient.InferInput("INPUT1", input_data_1.shape,
                                  np_to_triton_dtype(input_data_1.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data_0)
        inputs[1].set_data_from_numpy(input_data_1)
        client.async_stream_infer(model_name=model_name, inputs=inputs)
        data_item = user_data._completed_requests.get()
        if type(data_item) == InferenceServerException:
            self.assertEqual(
                data_item.message(),
                "Python model 'decoupled_return_response_error_0' is using "
                "the decoupled mode and the execute function must return "
                "None.", "Exception message didn't match.")
def run(self, client_metadata):
    triton_client = client_metadata[0]
    inputs = [
        grpcclient.InferInput("input", self.image_data_.shape, "FP32")
    ]
    inputs[0].set_data_from_numpy(self.image_data_)
    outputs = [
        grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
                                        class_count=1)
    ]
    res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
    self.postprocess(res)
    return self.batch_size_
def _run_test(self):
    model_name = "ensemble_io"
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        input0 = np.random.random([1000]).astype(np.float32)
        client.start_stream(callback=partial(callback, user_data))
        for model_1_in_gpu in [True, False]:
            for model_2_in_gpu in [True, False]:
                for model_3_in_gpu in [True, False]:
                    gpu_output = np.asarray(
                        [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu],
                        dtype=bool)
                    inputs = [
                        grpcclient.InferInput(
                            "INPUT0", input0.shape,
                            np_to_triton_dtype(input0.dtype)),
                        grpcclient.InferInput(
                            "GPU_OUTPUT", gpu_output.shape,
                            np_to_triton_dtype(gpu_output.dtype))
                    ]
                    inputs[0].set_data_from_numpy(input0)
                    inputs[1].set_data_from_numpy(gpu_output)
                    client.async_stream_infer(model_name=model_name,
                                              inputs=inputs)
                    if TRIAL == 'default':
                        result = user_data._completed_requests.get()
                        output0 = result.as_numpy('OUTPUT0')
                        self.assertIsNotNone(output0)
                        self.assertTrue(np.all(output0 == input0))
                    else:
                        response_repeat = 2
                        for _ in range(response_repeat):
                            result = user_data._completed_requests.get()
                            output0 = result.as_numpy('OUTPUT0')
                            self.assertIsNotNone(output0)
                            self.assertTrue(np.all(output0 == input0))
def _prepare_request(self, protocol):
    if protocol == "grpc":
        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    else:
        self.inputs_ = []
        self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

    self.inputs_[0].set_data_from_numpy(self.input0_data_)
def requestGenerator(batched_image_data,
                     input_name,
                     output_name,
                     dtype,
                     model_name,
                     model_version,
                     classes=1):
    # Set the input data
    inputs = []
    inputs.append(
        grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
    inputs[0].set_data_from_numpy(batched_image_data)

    outputs = []
    outputs.append(
        grpcclient.InferRequestedOutput(output_name, class_count=classes))

    yield inputs, outputs, model_name, model_version
def get_embedding(self, face_img):
    if not isinstance(face_img, list):
        face_img = [face_img]
    face_img = np.stack(face_img)

    input_size = tuple(face_img[0].shape[0:2][::-1])
    blob = cv2.dnn.blobFromImages(
        face_img,
        1.0 / self.input_std,
        input_size,
        (self.input_mean, self.input_mean, self.input_mean),
        swapRB=True)
    blob = blob.astype(triton_to_np_dtype(self.dtype))

    inputs = []
    inputs.append(
        grpcclient.InferInput(self.input_name,
                              [blob.shape[0], self.c, self.h, self.w],
                              "FP32"))
    # inputs[0].set_data_from_numpy(face_img)
    cudashm.set_shared_memory_region(self.in_handle, [blob])
    # 12 bytes per pixel: 3 channels x 4-byte FP32 elements.
    input_bytesize = 12 * blob.shape[0] * self.w * self.h
    inputs[-1].set_shared_memory(self.in_handle_name, input_bytesize)

    outputs = []
    out_bytesize = 12 * 512 * self.max_batch_size
    outputs.append(grpcclient.InferRequestedOutput(self.output_name[0]))
    outputs[-1].set_shared_memory(self.out_handle_name, out_bytesize)

    out = self.triton_client.infer(self.model_name,
                                   inputs,
                                   model_version=self.model_version,
                                   outputs=outputs)
    # Read the 512-float embeddings back out of the CUDA shared memory
    # region rather than from the response body.
    out = [
        cudashm.get_contents_as_numpy(self.out_handle,
                                      triton_to_np_dtype(self.dtype),
                                      [blob.shape[0], 512])
    ]
    # out = [out.as_numpy(e) for e in self.output_name]
    return out[0]
def run(self, input):
    inputs = []
    outputs = [
        grpcclient.InferRequestedOutput(e) for e in self.output_order
    ]
    inputs.append(
        grpcclient.InferInput(self.input_name,
                              [1, self.c, self.h, self.w], self.dtype))
    # inputs[0].set_data_from_numpy(input)
    cudashm.set_shared_memory_region(self.in_handle, [input])
    inputs[-1].set_shared_memory(self.in_handle_name, self.input_bytesize)

    out = self.triton_client.infer(self.model_name,
                                   inputs,
                                   model_version=self.model_version,
                                   outputs=outputs)
    out = [out.as_numpy(e) for e in self.output_order]
    return out
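# Setup sketch for the CUDA shared-memory regions the two methods above rely
# on: self.in_handle / self.in_handle_name (and the output region) must be
# created and registered before inference. This helper and its parameters are
# assumptions; the client calls themselves mirror the tritonclient API used
# in test_grpc_out_of_shared_memory below.
import tritonclient.utils.cuda_shared_memory as cudashm


def _register_cuda_region(triton_client, name, byte_size, device_id=0):
    handle = cudashm.create_shared_memory_region(name, byte_size, device_id)
    triton_client.register_cuda_shared_memory(
        name, cudashm.get_raw_handle(handle), device_id, byte_size)
    return handle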
def _to_trt(self, inputs, model_name, model_version='1'):
    tt_inputs = []
    if self.protocol == 'http':
        input_metadata = self.metadata[model_name][model_version]['inputs']
        for input_, metadata in zip([inputs], input_metadata):
            tt_input = httpclient.InferInput(metadata['name'],
                                             list(input_.shape),
                                             metadata['datatype'])
            tt_input.set_data_from_numpy(input_)
            tt_inputs.append(tt_input)
    elif self.protocol == 'grpc':
        input_metadata = self.metadata[model_name][model_version].inputs
        for input_, metadata in zip([inputs], input_metadata):
            tt_input = grpcclient.InferInput(metadata.name,
                                             list(input_.shape),
                                             metadata.datatype)
            tt_input.set_data_from_numpy(input_)
            tt_inputs.append(tt_input)
    return tt_inputs
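# Sketch (not from the original class) of how self.metadata could be filled
# so _to_trt finds the fields it reads: get_model_metadata returns a dict
# from the HTTP client and a protobuf message from the gRPC client, which
# matches the two access styles above. 'self.client' and the nesting by
# model name and version are assumptions.
def _load_metadata(self, model_name, model_version='1'):
    meta = self.client.get_model_metadata(model_name, model_version)
    self.metadata.setdefault(model_name, {})[model_version] = meta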
def requestGenerator(input_name, input_data, output_name, dtype, protocol):
    # Set the input data
    inputs = []
    if protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, input_data.shape, dtype))
        inputs[0].set_data_from_numpy(input_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, input_data.shape, dtype))
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs = []
    if protocol.lower() == "grpc":
        outputs.append(grpcclient.InferRequestedOutput(output_name))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))

    return inputs, outputs
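# Usage sketch for requestGenerator above: build the request for either
# protocol and send it. The client addresses, the model name 'simple', and
# the tensor names are illustrative assumptions.
protocol = "grpc"
if protocol.lower() == "grpc":
    client = grpcclient.InferenceServerClient("localhost:8001")
else:
    client = httpclient.InferenceServerClient("localhost:8000")
data = np.zeros((1, 16), dtype=np.float32)
inputs, outputs = requestGenerator("INPUT0", data, "OUTPUT0", "FP32",
                                   protocol)
result = client.infer("simple", inputs, outputs=outputs)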
def test_decoupled_execute_error(self):
    # The decoupled_execute_error model returns an error for the first
    # request and successfully processes the second request. This makes
    # sure that an error in a single request does not completely fail the
    # batch.
    model_name = "decoupled_execute_error"
    shape = [2, 2]
    number_of_requests = 2
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
        triton_client.start_stream(callback=partial(callback, user_data))
        input_datas = []
        for i in range(number_of_requests):
            input_data = np.random.randn(*shape).astype(np.float32)
            input_datas.append(input_data)
            inputs = [
                grpcclient.InferInput("IN", input_data.shape,
                                      np_to_triton_dtype(input_data.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data)
            triton_client.async_stream_infer(model_name=model_name,
                                             inputs=inputs)

        for i in range(number_of_requests):
            result = user_data._completed_requests.get()
            if i == 0:
                self.assertIs(type(result), InferenceServerException)
                continue

            print(result)
            output_data = result.as_numpy("OUT")
            self.assertIsNotNone(output_data, "error: expected 'OUT'")
            self.assertTrue(
                np.array_equal(output_data, input_datas[i]),
                "error: expected output {} to match input {}".format(
                    output_data, input_datas[i]))
def test_grpc_out_of_shared_memory(self):
    triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
    inputs = []
    inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    # Set up too small CUDA shared memory for outputs, expect query
    # returns default value
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
    shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0)
    shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0)
    triton_client.register_cuda_shared_memory(
        "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
    triton_client.register_cuda_shared_memory(
        "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
    outputs = []
    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", 1)
    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", 1)

    try:
        triton_client.infer(model_name="query",
                            inputs=inputs,
                            outputs=outputs)
        self.assertTrue(False, "expect error with query information")
    except InferenceServerException as ex:
        self.assertTrue("OUTPUT0 CPU 0" in ex.message())
        self.assertTrue("OUTPUT1 CPU 0" in ex.message())

    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]
    # read in a batch of data to get transforms for
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # convert the batch to triton inputs
    columns = [(col, batch[col]) for col in col_names]
    inputs = []

    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # placeholder variables for the output
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))

    # make the request against the HTTP endpoint (port 8000 by default;
    # 8001 is the gRPC port)
    with httpclient.InferenceServerClient("localhost:8000") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)
    assert (diff < err_tol).all()
def inputs_outputs_generator(self, raw_inputs):
    """
    Generate input and output blobs for triton client inference
    :param raw_inputs: list of raw numpy inputs
    :return: inputs and outputs data
    """
    inputs = []
    for input_specs, raw_input in zip(self.inputs_specs, raw_inputs):
        # Cast the raw input to the data type the model expects.
        raw_input = raw_input.astype(
            triton_to_np_dtype(input_specs.datatype))
        infer_input = grpcclient.InferInput(input_specs.name,
                                            raw_input.shape,
                                            input_specs.datatype)
        infer_input.set_data_from_numpy(raw_input)
        inputs.append(infer_input)

    outputs = []
    for output_specs in self.outputs_specs:
        outputs.append(
            grpcclient.InferRequestedOutput(output_specs.name,
                                            class_count=0))
    return inputs, outputs
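# Usage sketch for inputs_outputs_generator (an add-on, not in the original
# class): build the blobs from one raw array per input spec, then run the
# inference. 'self.triton_client' and 'self.model_name' are assumed
# attributes of the surrounding class.
def run_inference(self, raw_inputs):
    inputs, outputs = self.inputs_outputs_generator(raw_inputs)
    return self.triton_client.infer(model_name=self.model_name,
                                    inputs=inputs,
                                    outputs=outputs)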
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that
    # the inference will not have completed when being terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1, ), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        triton_client.infer(model_name, inputs)
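# Launch sketch for crashing_client: the parent process owns a 4-byte shared
# counter (one int32) and can later kill the child mid-inference. This relies
# on the 'fork' start method (the Linux default), since the gRPC client
# object is not picklable; the model name and tensor shape are illustrative
# assumptions.
from multiprocessing import Process, shared_memory

shm_block = shared_memory.SharedMemory(create=True, size=4)
client = grpcclient.InferenceServerClient("localhost:8001")
p = Process(target=crashing_client,
            args=("custom_zero_1_float32", np.float32, (1, 16),
                  shm_block.name, client))
p.start()
# ... later: p.terminate(); shm_block.close(); shm_block.unlink()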