def _test_helper(self,
                 client,
                 model_name,
                 input_name='INPUT0',
                 output_name='OUTPUT0'):
    try:
        inputs = [
            client[0].InferInput(input_name, self.in0_.shape,
                                 np_to_triton_dtype(self.data_type_))
        ]
        inputs[0].set_data_from_numpy(self.in0_)
        results = client[1].infer(model_name, inputs)
        # if the inference is completed, examine results to ensure that
        # the framework and protocol do support large payload
        self.assertTrue(
            np.array_equal(self.in0_, results.as_numpy(output_name)),
            "output is different from input")
    except InferenceServerException as ex:
        # if the inference failed, inference server should return error
        # gracefully. In addition to this, send a small payload to
        # verify if the server is still functional
        inputs = [
            client[0].InferInput(input_name, self.sin0_.shape,
                                 np_to_triton_dtype(self.data_type_))
        ]
        inputs[0].set_data_from_numpy(self.sin0_)
        results = client[1].infer(model_name, inputs)
        self.assertTrue(
            np.array_equal(self.sin0_, results.as_numpy(output_name)),
            "output is different from input")
def _test_helper(self,
                 client,
                 model_name,
                 input_name='INPUT0',
                 output_name='OUTPUT0'):
    # plan does not support large batch sizes.
    if not model_name.startswith('plan'):
        inputs = [
            client[0].InferInput(input_name, self._large_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._large_in0)
        results = client[1].infer(model_name, inputs)
        # if the inference is completed, examine results to ensure that
        # the framework and protocol do support large payload
        self.assertTrue(
            np.array_equal(self._large_in0, results.as_numpy(output_name)),
            "output is different from input")

    if client[0] == httpclient:
        # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
        inputs = [
            client[0].InferInput(input_name, self._very_large_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._very_large_in0)
        with self.assertRaises(InferenceServerException):
            results = client[1].infer(model_name, inputs)

    # FIXME The test is terminated by a libprotobuf FATAL error when GRPC
    # sends a second request with input tensors larger than 1.3GB. GRPC is
    # therefore currently exempted from the very large tensor (3GB) case.
    # Uncomment once the GRPC issue is resolved. See DLIS-2474.
    # if client[0] == grpcclient:
    #     inputs = [
    #         client[0].InferInput(input_name, self._very_large_in0.shape,
    #                              np_to_triton_dtype(self._data_type))
    #     ]
    #     inputs[0].set_data_from_numpy(self._very_large_in0)
    #     # GRPC must fail for large payloads because of a 2GB protobuf limit
    #     with self.assertRaises(InferenceServerException):
    #         results = client[1].infer(model_name, inputs)

    # Send a small payload to verify if the server is still functional
    inputs = [
        client[0].InferInput(input_name, self._small_in0.shape,
                             np_to_triton_dtype(self._data_type))
    ]
    inputs[0].set_data_from_numpy(self._small_in0)
    results = client[1].infer(model_name, inputs)
    self.assertTrue(
        np.array_equal(self._small_in0, results.as_numpy(output_name)),
        "output is different from input")
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that
    # the inference will not have completed when being terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1,), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
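# A minimal usage sketch for crashing_client above; the process handling,
# model name, shape, and URL are assumptions rather than details taken from
# the test. The client is launched in a separate process so it can be
# terminated while an inference may still be in flight, after which the
# server's health can be checked. Assumes a fork-based multiprocessing start
# method so the gRPC client object can be inherited by the child process.
from multiprocessing import Process, shared_memory
import time

import numpy as np
import tritonclient.grpc as grpcclient

if __name__ == "__main__":
    # Shared counter that crashing_client increments before every request.
    shm = shared_memory.SharedMemory(create=True, size=4)
    np.ndarray((1,), dtype=np.int32, buffer=shm.buf)[0] = 0

    triton_client = grpcclient.InferenceServerClient("localhost:8001")
    p = Process(target=crashing_client,
                args=("custom_identity", np.float32, (1, 16), shm.name,
                      triton_client))
    p.start()
    time.sleep(2)   # let at least one inference begin
    p.terminate()   # kill the client mid-inference
    p.join()

    requests_started = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)[0]
    print("requests started before termination:", requests_started)
    shm.close()
    shm.unlink()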
# We use identity string models that take 1 input tensor of a single string
# and return 1 output tensor of a single string. The output tensor is the
# same as the input tensor.
batch_size = 1

# Create the data for the input tensor. It contains a null character in
# the middle of the string.
tmp_str = "abc\0def"
input0_data = np.array([tmp_str], dtype=object)

# Send inference request to the inference server. Get results for
# output tensor.
inputs = [
    client_util.InferInput("INPUT0", input0_data.shape,
                           np_to_triton_dtype(np.object_))
]
inputs[0].set_data_from_numpy(input0_data)

results = client.infer(FLAGS.model_name, inputs)

# We expect there to be 1 result (with batch-size 1). Compare the input
# and output tensor calculated by the model. They must be the same.
output0_data = results.as_numpy('OUTPUT0')

# The element type returned differs between the HTTP and GRPC clients:
# the former returns str and the latter bytes.
output0_data2 = np.array([
    output0_data[0] if type(output0_data[0]) == str else
    output0_data[0].decode('utf8')
],
                         dtype=object)
# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

# Input tensor will be the raw content of the image file.
image_path = FLAGS.image_filename
with open(image_path, "rb") as fd:
    input_data = np.array([[fd.read()]], dtype=bytes)

expected_res_path = FLAGS.preprocessed_filename
with open(expected_res_path, "r") as fd:
    expected_data = np.fromfile(fd, np.float32)

inputs = [
    client_util.InferInput("INPUT", input_data.shape,
                           np_to_triton_dtype(input_data.dtype))
]
inputs[0].set_data_from_numpy(input_data)

results = client.infer(model_name, inputs)

output = results.as_numpy("OUTPUT")
if output is None:
    print("error: expected 'OUTPUT'")
    sys.exit(1)

if output.shape[0] != 1:
    print("error: expected 1 output result, got {}".format(output.shape[0]))
    sys.exit(1)
def _test_helper(self,
                 client,
                 model_name,
                 input_name='INPUT0',
                 output_name='OUTPUT0'):
    # FIXME libtorch seems to have an issue with handling large batch sizes.
    # See DLIS-1770.
    if model_name.startswith('libtorch'):
        try:
            inputs = [
                client[0].InferInput(input_name, self._large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._large_in0)
            results = client[1].infer(model_name, inputs)
            # if the inference is completed, examine results to ensure that
            # the framework and protocol do support large payload
            self.assertTrue(
                np.array_equal(self._large_in0,
                               results.as_numpy(output_name)),
                "output is different from input")
        except InferenceServerException as ex:
            self.assertTrue(
                ex.message() ==
                "OUTPUT__0: failed to perform CUDA copy: invalid argument")
    # plan does not support large batch sizes.
    elif not model_name.startswith('plan'):
        inputs = [
            client[0].InferInput(input_name, self._large_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._large_in0)
        results = client[1].infer(model_name, inputs)
        # if the inference is completed, examine results to ensure that
        # the framework and protocol do support large payload
        self.assertTrue(
            np.array_equal(self._large_in0, results.as_numpy(output_name)),
            "output is different from input")

    if client[0] == httpclient:
        # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
        inputs = [
            client[0].InferInput(input_name, self._very_large_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._very_large_in0)
        with self.assertRaises(InferenceServerException):
            results = client[1].infer(model_name, inputs)

    if client[0] == grpcclient:
        inputs = [
            client[0].InferInput(input_name, self._very_large_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._very_large_in0)
        # GRPC must fail for large payloads because of a 2GB protobuf limit
        with self.assertRaises(InferenceServerException):
            results = client[1].infer(model_name, inputs)

    # Send a small payload to verify that the server is still functional
    inputs = [
        client[0].InferInput(input_name, self._small_in0.shape,
                             np_to_triton_dtype(self._data_type))
    ]
    inputs[0].set_data_from_numpy(self._small_in0)
    results = client[1].infer(model_name, inputs)
    self.assertTrue(
        np.array_equal(self._small_in0, results.as_numpy(output_name)),
        "output is different from input")
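# A minimal sketch of how a test case might drive _test_helper above. The
# setUp attribute names match those the helper expects, but the tensor sizes,
# server URLs, and model name are assumptions, not values from the test. The
# helper takes a (client_module, client_object) pair so it can build
# InferInput objects and run inference over either protocol.
import unittest

import numpy as np
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient


class LargePayloadExample(unittest.TestCase):
    # _test_helper is assumed to be defined on this class, as shown above.

    def setUp(self):
        self._data_type = np.float32
        # Sizes chosen only to roughly match the payloads discussed in the
        # comments above (small, >1.3GB, ~3GB); the real test may differ.
        self._small_in0 = np.ones([1, 1024], dtype=self._data_type)
        self._large_in0 = np.ones([1, 384 * 1024 * 1024],
                                  dtype=self._data_type)
        self._very_large_in0 = np.ones([1, 768 * 1024 * 1024],
                                       dtype=self._data_type)
        self._clients = (
            (httpclient, httpclient.InferenceServerClient("localhost:8000")),
            (grpcclient, grpcclient.InferenceServerClient("localhost:8001")),
        )

    def test_graphdef(self):
        # Placeholder model name; any identity model with matching I/O works.
        for client in self._clients:
            self._test_helper(client, "graphdef_identity")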
model_name = FLAGS.model
shape = (3, 5)
dtype = np.float32

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

# Create the data for the two input tensors.
input_data = []
input_data.append(np.ones((3, 5), dtype=np.float32))
input_data.append(np.ones((3, 5), dtype=np.float32))

inputs = []
for i in range(len(input_data)):
    inputs.append(
        client_util.InferInput("input_{}".format(i + 1), shape,
                               np_to_triton_dtype(dtype)))
    inputs[i].set_data_from_numpy(input_data[i])

results = client.infer(model_name, inputs)

# We expect one output tensor that is the elementwise sum of the two inputs.
output_data = results.as_numpy('output')
if output_data is None:
    print("error: expected 'output'")
    sys.exit(1)

for i in range(3):
    for j in range(5):
        print(
            str(input_data[0][i][j]) + " + " + str(input_data[1][i][j]) +
            " = " + str(output_data[i][j]))
# Run the custom_modulo model, which depends on a custom mod operation.
model_name = FLAGS.model
elements = 10

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

# Create the data for the two input tensors.
input_data = []
input_data.append(np.arange(start=1, stop=1 + elements, dtype=np.float32))
input_data.append(np.array([2] * elements, dtype=np.float32))

inputs = []
for i in range(len(input_data)):
    inputs.append(
        client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
                               np_to_triton_dtype(input_data[0].dtype)))
    inputs[i].set_data_from_numpy(input_data[i])

results = client.infer(model_name, inputs)

# We expect 1 result of size 10 with alternating 1 and 0.
output_data = results.as_numpy('OUTPUT__0')
if output_data is None:
    print("error: expected 'OUTPUT__0'")
    sys.exit(1)

for i in range(elements):
    print(
        str(i) + ": " + str(input_data[0][i]) + " % " +
        str(input_data[1][i]) + " = " + str(output_data[i]))
    if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
        print("error: incorrect value")
        sys.exit(1)
def check_sequence_async(client_metadata,
                         trial,
                         model_name,
                         input_dtype,
                         steps,
                         timeout_ms=DEFAULT_TIMEOUT_MS,
                         sequence_name="<unknown>"):
    """Perform sequence of inferences using async run.

    The 'steps' holds a list of tuples, one for each inference with format:

        (flag_str, value, expected_result, delay_ms)
    """
    if (("savedmodel" in trial) or ("graphdef" in trial) or
        ("custom" in trial) or ("plan" in trial)):
        tensor_shape = (1, 1)
    else:
        assert False, "unknown trial type: " + trial

    triton_client = client_metadata[0]
    sequence_id = client_metadata[1]

    # Execute the sequence of inference...
    seq_start_ms = int(round(time.time() * 1000))
    user_data = UserData()

    # Ensure there is no running stream
    triton_client.stop_stream()
    triton_client.start_stream(partial(completion_callback, user_data))

    sent_count = 0
    for flag_str, value, expected_result, delay_ms in steps:
        seq_start = False
        seq_end = False
        if flag_str is not None:
            seq_start = ("start" in flag_str)
            seq_end = ("end" in flag_str)

        if input_dtype == np.object_:
            in0 = np.full(tensor_shape, value, dtype=np.int32)
            in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
            in0 = in0n.reshape(tensor_shape)
        else:
            in0 = np.full(tensor_shape, value, dtype=input_dtype)

        inputs = [
            grpcclient.InferInput("INPUT", tensor_shape,
                                  np_to_triton_dtype(input_dtype)),
        ]
        inputs[0].set_data_from_numpy(in0)

        triton_client.async_stream_infer(model_name,
                                         inputs,
                                         sequence_id=sequence_id,
                                         sequence_start=seq_start,
                                         sequence_end=seq_end)
        sent_count += 1

        if delay_ms is not None:
            time.sleep(delay_ms / 1000.0)

    # Process the results in the order that they were sent
    result = None
    processed_count = 0
    while processed_count < sent_count:
        (results, error) = user_data._completed_requests.get()
        if error is not None:
            raise error

        (_, value, expected, _) = steps[processed_count]
        processed_count += 1
        if timeout_ms is not None:
            now_ms = int(round(time.time() * 1000))
            if (now_ms - seq_start_ms) > timeout_ms:
                raise TimeoutException(
                    "Timeout expired for {}".format(sequence_name))

        result = results.as_numpy("OUTPUT")[0][0]
        if FLAGS.verbose:
            print("{} {}: + {} = {}".format(sequence_name, sequence_id, value,
                                            result))

        if expected is not None:
            if input_dtype == np.object_:
                assert int(result) == expected, \
                    "{}: expected result {}, got {}".format(
                        sequence_name, expected, int(result))
            else:
                assert result == expected, \
                    "{}: expected result {}, got {}".format(
                        sequence_name, expected, result)
    triton_client.stop_stream()
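# A minimal usage sketch for check_sequence_async above. The sequence id,
# model name, and expected value are assumptions, not taken from the test.
# Each step is a (flag_str, value, expected_result, delay_ms) tuple; the
# "start"/"end" flags mark the sequence boundaries and delay_ms sleeps
# between requests.
import numpy as np
import tritonclient.grpc as grpcclient

triton_client = grpcclient.InferenceServerClient("localhost:8001")
client_metadata = (triton_client, 1001)  # (client, sequence_id); id assumed

steps = [
    ("start", 1, None, None),  # opens the sequence with value 1
    (None, 2, None, 100),      # intermediate request, then wait 100 ms
    ("end", 3, 6, None),       # closes it; 6 assumes an accumulating model
]

check_sequence_async(client_metadata,
                     "graphdef",                 # trial type accepted above
                     "graphdef_sequence_model",  # placeholder model name
                     np.int32,
                     steps,
                     sequence_name="example_sequence")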
# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

# We use identity string models that take 1 input tensor of a single string
# and return 1 output tensor of a single string. The output tensor is the
# same as the input tensor.
batch_size = 1

# Create the data for the input tensor. It contains a null character in
# the middle of the string.
tmp_str = "abc\0def"
input0_data = np.array([tmp_str], dtype=object)

# Send inference request to the inference server. Get results for
# output tensor.
inputs = [
    client_util.InferInput("INPUT0", input0_data.shape,
                           np_to_triton_dtype(np.object_))
]
inputs[0].set_data_from_numpy(input0_data)

results = client.infer(FLAGS.model_name, inputs)

# We expect there to be 1 result (with batch-size 1). Compare the input
# and output tensor calculated by the model. They must be the same.
output0_data = results.as_numpy('OUTPUT0')

# The element type returned differs between the HTTP and GRPC clients:
# the former returns str and the latter bytes.
output0_data2 = np.array([
    output0_data[0] if type(output0_data[0]) == str else
    output0_data[0].decode('utf8')
],
                         dtype=object)

print(input0_data, "?=?", output0_data2)
assert np.equal(input0_data, output0_data2).all()
model_name = FLAGS.model
elements = 10

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose)

# Create the data for the two input tensors.
input_data = []
input_data.append(np.arange(start=1, stop=1 + elements, dtype=np.float32))
input_data.append(np.array([2] * elements, dtype=np.float32))

inputs = []
for i in range(len(input_data)):
    inputs.append(
        client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
                               np_to_triton_dtype(input_data[0].dtype)))
    inputs[i].set_data_from_numpy(input_data[i])

results = client.infer(model_name, inputs)

# We expect 1 result of size 10 with alternating 1 and 0.
output_data = results.as_numpy('OUTPUT__0')
if output_data is None:
    print("error: expected 'OUTPUT__0'")
    sys.exit(1)

for i in range(elements):
    print(
        str(i) + ": " + str(input_data[0][i]) + " % " +
        str(input_data[1][i]) + " = " + str(output_data[i]))
    if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
        print("error: incorrect value")
        sys.exit(1)
if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(FLAGS.protocol)) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient model_name = "param" # Create the inference context for the model. client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose) # Input tensor can be any size int32 vector... input_data = np.zeros(shape=1, dtype=np.int32) inputs = [client_util.InferInput( "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype))] inputs[0].set_data_from_numpy(input_data) results = client.infer(model_name, inputs) print(results) params = results.as_numpy("OUTPUT") if params is None: print("error: expected 'OUTPUT'") sys.exit(1) if params.size != 5: print("error: expected 5 output strings, got {}".format(params.size)) sys.exit(1)
model_name = FLAGS.model
shape = (3, 5)
dtype = np.float32

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose)

# Create the data for the two input tensors.
input_data = []
input_data.append(np.ones((3, 5), dtype=np.float32))
input_data.append(np.ones((3, 5), dtype=np.float32))

inputs = []
for i in range(len(input_data)):
    inputs.append(
        client_util.InferInput("input_{}".format(i + 1), shape,
                               np_to_triton_dtype(dtype)))
    inputs[i].set_data_from_numpy(input_data[i])

results = client.infer(model_name, inputs)

# We expect one output tensor that is the elementwise sum of the two inputs.
output_data = results.as_numpy('output')
if output_data is None:
    print("error: expected 'output'")
    sys.exit(1)

for i in range(3):
    for j in range(5):
        print(
            str(input_data[0][i][j]) + " + " + str(input_data[1][i][j]) +
            " = " + str(output_data[i][j]))
        if ((input_data[0][i][j] + input_data[1][i][j]) != output_data[i][j]):
            print("error: incorrect value")
            sys.exit(1)