import tritonclient.utils.shared_memory as shm
import tritonclient.utils.cuda_shared_memory as cudashm


def create_set_either_shm_region(shm_region_names, input_list, input_byte_size,
                                 output_byte_size, use_system_shared_memory,
                                 use_cuda_shared_memory):
    if use_cuda_shared_memory and use_system_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return []

    if use_cuda_shared_memory:
        shm_ip_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + "_data", input_byte_size, 0)
        shm_op_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + "_data", output_byte_size, 0)
        cudashm.set_shared_memory_region(shm_ip_handle, input_list)
    elif use_system_shared_memory:
        shm_ip_handle = shm.create_shared_memory_region(
            shm_region_names[0] + "_data", "/" + shm_region_names[0],
            input_byte_size)
        shm_op_handle = shm.create_shared_memory_region(
            shm_region_names[1] + "_data", "/" + shm_region_names[1],
            output_byte_size)
        shm.set_shared_memory_region(shm_ip_handle, input_list)

    return [shm_ip_handle, shm_op_handle]
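# A minimal usage sketch for the helper above (the region names, sizes,
# and client below are illustrative assumptions, not from the original
# tests): create the paired input/output regions, then register them
# with the server before binding them to request IOs.
import numpy as np
import tritonclient.http as tritonhttpclient

input_list = [np.arange(16, dtype=np.int32)]
input_byte_size = sum(i.nbytes for i in input_list)
output_byte_size = input_byte_size

handles = create_set_either_shm_region(
    ["region_ip", "region_op"], input_list, input_byte_size,
    output_byte_size, use_system_shared_memory=False,
    use_cuda_shared_memory=True)

if handles:
    shm_ip_handle, shm_op_handle = handles
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    triton_client.register_cuda_shared_memory(
        "region_ip_data", cudashm.get_raw_handle(shm_ip_handle), 0,
        input_byte_size)
    triton_client.register_cuda_shared_memory(
        "region_op_data", cudashm.get_raw_handle(shm_op_handle), 0,
        output_byte_size)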
def prepare(self, **kwargs):
    concurrency = 10

    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    print("Model metadata:", self.model_name, self.model_version)
    try:
        model_metadata = self.triton_client.get_model_metadata(
            model_name=self.model_name, model_version=self.model_version)
    except InferenceServerException as e:
        print("failed to retrieve the metadata: " + str(e))
        sys.exit(1)

    try:
        model_config = self.triton_client.get_model_config(
            model_name=self.model_name, model_version=self.model_version)
    except InferenceServerException as e:
        print("failed to retrieve the config: " + str(e))
        sys.exit(1)

    (self.max_batch_size, self.input_name, self.output_name, self.c, self.h,
     self.w, self.format, self.dtype,
     self.out_shapes) = parse_model_grpc(model_metadata, model_config.config)

    self.in_handle_name = f'{self.model_name}_data_{os.getpid()}'
    self.input_bytesize = 12 * self.w * self.h * self.max_batch_size
    self.in_handle = cudashm.create_shared_memory_region(
        self.in_handle_name, self.input_bytesize, 0)

    self.out_handle_name = f'{self.model_name}_data_out_{os.getpid()}'
    self.out_bytesize = 12 * 512 * self.max_batch_size
    self.out_handle = cudashm.create_shared_memory_region(
        self.out_handle_name, self.out_bytesize, 0)

    # Unregister any stale regions with these names before registering
    # the freshly created ones
    self.triton_client.unregister_cuda_shared_memory(self.in_handle_name)
    self.triton_client.unregister_cuda_shared_memory(self.out_handle_name)
    self.triton_client.register_cuda_shared_memory(
        self.in_handle_name, cudashm.get_raw_handle(self.in_handle), 0,
        self.input_bytesize)
    self.triton_client.register_cuda_shared_memory(
        self.out_handle_name, cudashm.get_raw_handle(self.out_handle), 0,
        self.out_bytesize)
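# A hedged sketch of a companion method (assumed, not part of the
# original class) showing how a later inference call could consume the
# regions set up in prepare(): write the batch into the input region,
# then bind both pre-registered CUDA regions to the request so no tensor
# bytes travel over the wire. Assumes grpcclient is tritonclient.grpc
# and the batch fits in input_bytesize.
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def infer_with_shm(self, batched_image_data):
    cudashm.set_shared_memory_region(self.in_handle, [batched_image_data])
    infer_input = grpcclient.InferInput(
        self.input_name, list(batched_image_data.shape),
        np_to_triton_dtype(batched_image_data.dtype))
    infer_input.set_shared_memory(self.in_handle_name, self.input_bytesize)
    infer_output = grpcclient.InferRequestedOutput(self.output_name)
    infer_output.set_shared_memory(self.out_handle_name, self.out_bytesize)
    return self.triton_client.infer(self.model_name, [infer_input],
                                    outputs=[infer_output])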
def test_http_out_of_shared_memory(self):
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    # Set up too small CUDA shared memory for outputs, expect query
    # returns default value
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
    shm_op0_handle = cudashm.create_shared_memory_region(
        "output0_data", 1, 0)
    shm_op1_handle = cudashm.create_shared_memory_region(
        "output1_data", 1, 0)
    triton_client.register_cuda_shared_memory(
        "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
    triton_client.register_cuda_shared_memory(
        "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
    outputs = []
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs[-1].set_shared_memory("output0_data", 1)
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
    outputs[-1].set_shared_memory("output1_data", 1)

    try:
        triton_client.infer(model_name="query",
                            inputs=inputs,
                            outputs=outputs)
        self.assertTrue(False, "expect error with query information")
    except InferenceServerException as ex:
        self.assertTrue("OUTPUT0 CPU 0" in ex.message())
        self.assertTrue("OUTPUT1 CPU 0" in ex.message())

    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    input0_byte_size = sum([i0.nbytes for i0 in input0_list])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    # Lazy shm imports...
    if use_system_shared_memory:
        import tritonclient.utils.shared_memory as shm
    if use_cuda_shared_memory:
        import tritonclient.utils.cuda_shared_memory as cudashm

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    if input0_list[0].dtype == np.object_:
        input0_byte_size = sum(
            [serialized_byte_size(i0) for i0 in input0_list])
    else:
        input0_byte_size = sum([i0.nbytes for i0 in input0_list])
    if input1_list[0].dtype == np.object_:
        input1_byte_size = sum(
            [serialized_byte_size(i1) for i1 in input1_list])
    else:
        input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    shm_op0_handle = None
    shm_op1_handle = None

    if use_system_shared_memory:
        shm_ip0_handle = shm.create_shared_memory_region(
            shm_region_names[0] + '_data', '/' + shm_region_names[0],
            input0_byte_size)
        shm_ip1_handle = shm.create_shared_memory_region(
            shm_region_names[1] + '_data', '/' + shm_region_names[1],
            input1_byte_size)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = shm.create_shared_memory_region(
                    shm_region_names[2] + '_data', '/' + shm_region_names[2],
                    output0_byte_size)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = shm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data',
                    '/' + shm_region_names[2 + i], output1_byte_size)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        shm.set_shared_memory_region(shm_ip0_handle, input0_list)
        shm.set_shared_memory_region(shm_ip1_handle, input1_list)

    if use_cuda_shared_memory:
        shm_ip0_handle = cudashm.create_shared_memory_region(
            shm_region_names[0] + '_data', input0_byte_size, 0)
        shm_ip1_handle = cudashm.create_shared_memory_region(
            shm_region_names[1] + '_data', input1_byte_size, 0)

        i = 0
        if "OUTPUT0" in outputs:
            if precreated_shm_regions is None:
                shm_op0_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2] + '_data', output0_byte_size, 0)
            else:
                shm_op0_handle = precreated_shm_regions[0]
            i += 1
        if "OUTPUT1" in outputs:
            if precreated_shm_regions is None:
                shm_op1_handle = cudashm.create_shared_memory_region(
                    shm_region_names[2 + i] + '_data', output1_byte_size, 0)
            else:
                shm_op1_handle = precreated_shm_regions[i]

        cudashm.set_shared_memory_region(shm_ip0_handle, input0_list)
        cudashm.set_shared_memory_region(shm_ip1_handle, input1_list)

    return shm_region_names, [
        shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle
    ]
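# Background for the np.object_ branches above, as a small sketch:
# Triton string (BYTES) tensors must be serialized to a length-prefixed
# byte layout before they can be copied into a shared memory region, so
# the region size is the serialized size rather than ndarray.nbytes
# (which for object arrays only counts per-element pointers). Assumes
# serialized_byte_size is the tritonclient.utils helper used above.
import numpy as np
from tritonclient.utils import serialize_byte_tensor, serialized_byte_size

strings = np.array([b"hello", b"shared memory"], dtype=object)
serialized = serialize_byte_tensor(strings)
region_byte_size = serialized_byte_size(serialized)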
def check_sequence(self, trial, model_name, input_dtype, correlation_id,
                   sequence_thresholds, values, expected_result, protocol,
                   batch_size=1, sequence_name="<unknown>", tensor_shape=(1,)):
    """Perform a sequence of inferences. 'values' holds a list of tuples,
    one for each inference, with format:

    (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms))
    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
        ("custom" not in trial) and ("onnx" not in trial) and
        ("libtorch" not in trial) and ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)

    # Can only send the request exactly once since it is a
    # sequence model with state, so can have only a single config.
    configs = []
    if protocol == "http":
        configs.append((f"{_tritonserver_ipaddr}:8000", "http", False))
    if protocol == "grpc":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", False))
    if protocol == "streaming":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", True))
    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")
    self.assertEqual(len(configs), 1)

    full_shape = tensor_shape if "nobatch" in trial else (
        batch_size,) + tensor_shape

    # Create and register shared memory output region in advance,
    # knowing that this function will not be called concurrently.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        output_byte_size = 512
        if _test_system_shared_memory:
            shm_op_handle = shm.create_shared_memory_region(
                "output_data", "/output", output_byte_size)
            self.triton_client_.register_system_shared_memory(
                "output_data", "/output", output_byte_size)
        elif _test_cuda_shared_memory:
            shm_op_handle = cudashm.create_shared_memory_region(
                "output_data", output_byte_size, 0)
            self.triton_client_.register_cuda_shared_memory(
                "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                output_byte_size)
        shm_ip_handles = []

    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient

        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)
        if config[2]:
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))

        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
            OUTPUT = "OUTPUT__0" if trial.startswith(
                "libtorch") else "OUTPUT"
            for flag_str, value, thresholds, delay_ms in values:
                if _test_valgrind or _test_jetson:
                    if delay_ms is not None:
                        delay_ms[0] = max(_valgrind_delay_ms, delay_ms[0])
                        delay_ms[1] = max(_valgrind_delay_ms, delay_ms[1])
                    else:
                        delay_ms = (_valgrind_delay_ms, _valgrind_delay_ms)

                if delay_ms is not None:
                    time.sleep(delay_ms[0] / 1000.0)

                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)

                # Construct request IOs
                inputs = []
                outputs = []
                inputs.append(
                    client_utils.InferInput(
                        INPUT, full_shape, np_to_triton_dtype(input_dtype)))
                outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                if input_dtype == np.object_:
                    in0 = np.full(full_shape, value, dtype=np.int32)
                    in0n = np.array(
                        [str(x) for x in in0.reshape(in0.size)],
                        dtype=object)
                    in0 = in0n.reshape(full_shape)
                else:
                    in0 = np.full(full_shape, value, dtype=input_dtype)

                # create input shared memory and copy input data values
                # into it
                if _test_system_shared_memory or _test_cuda_shared_memory:
                    if input_dtype == np.object_:
                        input_list_tmp = iu.serialize_byte_tensor_list(
                            [in0])
                        input_byte_size = sum([
                            serialized_byte_size(i0)
                            for i0 in input_list_tmp
                        ])
                    else:
                        input_list_tmp = [in0]
                        input_byte_size = sum(
                            [i0.nbytes for i0 in input_list_tmp])
                    ip_name = "ip{}".format(len(shm_ip_handles))
                    if _test_system_shared_memory:
                        shm_ip_handles.append(
                            shm.create_shared_memory_region(
                                ip_name, "/" + ip_name, input_byte_size))
                        shm.set_shared_memory_region(
                            shm_ip_handles[-1], input_list_tmp)
                        triton_client.register_system_shared_memory(
                            ip_name, "/" + ip_name, input_byte_size)
                    elif _test_cuda_shared_memory:
                        shm_ip_handles.append(
                            cudashm.create_shared_memory_region(
                                ip_name, input_byte_size, 0))
                        cudashm.set_shared_memory_region(
                            shm_ip_handles[-1], input_list_tmp)
                        triton_client.register_cuda_shared_memory(
                            ip_name,
                            cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                            input_byte_size)

                    inputs[0].set_shared_memory(ip_name, input_byte_size)
                    outputs[0].set_shared_memory("output_data",
                                                 output_byte_size)
                else:
                    inputs[0].set_data_from_numpy(in0)

                start_ms = int(round(time.time() * 1000))

                if config[2]:
                    triton_client.async_stream_infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                    (results, error) = user_data._completed_requests.get()
                    if error is not None:
                        raise error
                else:
                    results = triton_client.infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                end_ms = int(round(time.time() * 1000))

                # Get value of "OUTPUT"; for shared memory it must be
                # read back via the shared memory utils
                if (not _test_system_shared_memory) and (
                        not _test_cuda_shared_memory):
                    out = results.as_numpy(OUTPUT)
                else:
                    output = results.get_output(OUTPUT)
                    if config[1] == "http":
                        output_shape = output["shape"]
                    else:
                        output_shape = output.shape
                    output_type = input_dtype
                    if _test_system_shared_memory:
                        out = shm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                    else:
                        out = cudashm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                result = out[0] if "nobatch" in trial else out[0][0]
                print("{}: {}".format(sequence_name, result))

                if thresholds is not None:
                    lt_ms = thresholds[0]
                    gt_ms = thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) < lt_ms,
                            "expected less than " + str(lt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) > gt_ms,
                            "expected greater than " + str(gt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                if delay_ms is not None:
                    time.sleep(delay_ms[1] / 1000.0)

            seq_end_ms = int(round(time.time() * 1000))

            if input_dtype == np.object_:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    if _test_jetson:
                        lt_ms *= _jetson_slowdown_factor
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            self.add_deferred_exception(ex)

        if config[2]:
            triton_client.stop_stream()

    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        destroy_func = (shm.destroy_shared_memory_region
                        if _test_system_shared_memory else
                        cudashm.destroy_shared_memory_region)
        destroy_func(shm_op_handle)
        for shm_ip_handle in shm_ip_handles:
            destroy_func(shm_ip_handle)
def precreate_register_regions(self, value_list, dtype, i, batch_size=1,
                               tensor_shape=(1,)):
    if _test_system_shared_memory or _test_cuda_shared_memory:
        shm_region_handles = []
        for j, value in enumerate(value_list):
            # For strings we can't know the size of the output, so we
            # conservatively assume 64 bytes for each element of the
            # output
            if dtype == np.object_:
                output_byte_size = 4  # size of empty string
            else:
                output_byte_size = 0

            # create data
            input_list = list()
            for b in range(batch_size):
                if dtype == np.object_:
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                    in0n = np.array([
                        str(x).encode('utf-8')
                        for x in in0.reshape(in0.size)
                    ], dtype=object)
                    in0 = in0n.reshape(tensor_shape)
                    output_byte_size += 64 * in0.size
                else:
                    in0 = np.full(tensor_shape, value, dtype=dtype)
                    output_byte_size += np.dtype(dtype).itemsize * in0.size
                input_list.append(in0)

            if dtype == np.object_:
                input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                input_byte_size = sum(
                    [serialized_byte_size(i0) for i0 in input_list_tmp])
            else:
                input_list_tmp = input_list
                input_byte_size = sum(
                    [i0.nbytes for i0 in input_list_tmp])

            # create shared memory regions and copy data for input values
            ip_name = 'ip{}{}'.format(i, j)
            op_name = 'op{}{}_data'.format(i, j)
            if _test_system_shared_memory:
                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
            elif _test_cuda_shared_memory:
                shm_ip_handle = cudashm.create_shared_memory_region(
                    ip_name, input_byte_size, 0)
                shm_op_handle = cudashm.create_shared_memory_region(
                    op_name, output_byte_size, 0)
                cudashm.set_shared_memory_region(shm_ip_handle,
                                                 input_list_tmp)
                self.triton_client_.register_cuda_shared_memory(
                    ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                    input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
            shm_region_handles.append(
                (ip_name, input_byte_size, shm_ip_handle))
            shm_region_handles.append(
                (op_name, output_byte_size, shm_op_handle))
        return shm_region_handles
    else:
        return []
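# A sketch of how the (name, byte_size, handle) tuples returned above
# are typically consumed when building request IOs (assumed helper, not
# from the original file): entries alternate input, output in the order
# they were appended, so request k uses indices 2*k and 2*k + 1.
def bind_precreated_regions(client_utils, input_name, output_name,
                            shm_region_handles, k, shape, triton_dtype):
    ip_name, ip_byte_size, _ = shm_region_handles[2 * k]
    op_name, op_byte_size, _ = shm_region_handles[2 * k + 1]
    infer_input = client_utils.InferInput(input_name, shape, triton_dtype)
    infer_input.set_shared_memory(ip_name, ip_byte_size)
    infer_output = client_utils.InferRequestedOutput(output_name)
    infer_output.set_shared_memory(op_name, op_byte_size)
    return [infer_input], [infer_output]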
# We use a simple model that takes 2 input tensors of 16 integers
# each and returns 2 output tensors of 16 integers each. One
# output tensor is the element-wise sum of the inputs and one
# output is the element-wise difference.
model_name = "simple"
model_version = ""

# Create the data for the two input tensors. Initialize the first
# to unique integers and the second to all ones.
input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)

input_byte_size = input0_data.size * input0_data.itemsize
output_byte_size = input_byte_size

# Create Output0 and Output1 in Shared Memory and store shared memory handles
shm_op0_handle = cudashm.create_shared_memory_region(
    "output0_data", output_byte_size, 0)
shm_op1_handle = cudashm.create_shared_memory_region(
    "output1_data", output_byte_size, 0)

# Register Output0 and Output1 shared memory with Triton Server
triton_client.register_cuda_shared_memory(
    "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0,
    output_byte_size)
triton_client.register_cuda_shared_memory(
    "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0,
    output_byte_size)

# Create Input0 and Input1 in Shared Memory and store shared memory handles
shm_ip0_handle = cudashm.create_shared_memory_region(
    "input0_data", input_byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region(
    "input1_data", input_byte_size, 0)
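# Continuing the example, a minimal sketch of the remaining steps
# (assuming `triton_client` is the tritonclient.http client created
# earlier in the example): copy the input tensors into their regions,
# register them, run the inference with every IO bound to CUDA shared
# memory, and read the results straight out of the output regions.
import tritonclient.http as httpclient
from tritonclient.utils import triton_to_np_dtype

cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
cudashm.set_shared_memory_region(shm_ip1_handle, [input1_data])

# Register Input0 and Input1 shared memory with Triton Server
triton_client.register_cuda_shared_memory(
    "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0,
    input_byte_size)
triton_client.register_cuda_shared_memory(
    "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0,
    input_byte_size)

inputs = [
    httpclient.InferInput("INPUT0", [1, 16], "INT32"),
    httpclient.InferInput("INPUT1", [1, 16], "INT32"),
]
inputs[0].set_shared_memory("input0_data", input_byte_size)
inputs[1].set_shared_memory("input1_data", input_byte_size)

outputs = [
    httpclient.InferRequestedOutput("OUTPUT0", binary_data=True),
    httpclient.InferRequestedOutput("OUTPUT1", binary_data=True),
]
outputs[0].set_shared_memory("output0_data", output_byte_size)
outputs[1].set_shared_memory("output1_data", output_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read the outputs directly out of the CUDA shared memory regions
output0 = results.get_output("OUTPUT0")
output0_data = cudashm.get_contents_as_numpy(
    shm_op0_handle, triton_to_np_dtype(output0["datatype"]),
    output0["shape"])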
def test_buffer_attributes(self):
    model_name = 'bls'

    # Infer; `triton_clients` holds the client modules while `clients`
    # holds the corresponding connections
    clients = [
        httpclient.InferenceServerClient(url='localhost:8000'),
        grpcclient.InferenceServerClient(url='localhost:8001')
    ]
    triton_clients = [httpclient, grpcclient]
    for i, client in enumerate(clients):
        # To make sure no shared memory regions are registered with the
        # server.
        client.unregister_system_shared_memory()
        client.unregister_cuda_shared_memory()

        triton_client = triton_clients[i]
        inputs = []
        outputs = []
        inputs.append(
            triton_client.InferInput('INPUT0', [1, 1000], "INT32"))

        input0_data = np.arange(start=0, stop=1000, dtype=np.int32)
        input0_data = np.expand_dims(input0_data, axis=0)

        input_byte_size = input0_data.size * input0_data.itemsize
        output_byte_size = input_byte_size

        shm_ip0_handle = cudashm.create_shared_memory_region(
            "input0_data", input_byte_size, 0)
        shm_op0_handle = cudashm.create_shared_memory_region(
            "output0_data", output_byte_size, 0)

        client.register_cuda_shared_memory(
            "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0,
            input_byte_size)
        client.register_cuda_shared_memory(
            "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0,
            output_byte_size)

        cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
        inputs[0].set_shared_memory("input0_data", input_byte_size)

        if triton_client is grpcclient:
            outputs.append(triton_client.InferRequestedOutput('OUTPUT0'))
            outputs[0].set_shared_memory("output0_data", output_byte_size)
        else:
            outputs.append(
                triton_client.InferRequestedOutput('OUTPUT0',
                                                   binary_data=True))
            outputs[0].set_shared_memory("output0_data", output_byte_size)

        results = client.infer(model_name=model_name,
                               inputs=inputs,
                               outputs=outputs)

        output0 = results.get_output("OUTPUT0")
        self.assertIsNotNone(output0)
        if triton_client is grpcclient:
            output0_data = cudashm.get_contents_as_numpy(
                shm_op0_handle, triton_to_np_dtype(output0.datatype),
                output0.shape)
        else:
            output0_data = cudashm.get_contents_as_numpy(
                shm_op0_handle, triton_to_np_dtype(output0['datatype']),
                output0['shape'])
        self.assertTrue(np.all(output0_data == input0_data))
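# Note: the loop above registers fresh CUDA regions per client but never
# destroys the handles; a per-iteration cleanup along these lines
# (assumed, not in the original test) would avoid leaking device memory:
cudashm.destroy_shared_memory_region(shm_ip0_handle)
cudashm.destroy_shared_memory_region(shm_op0_handle)
client.unregister_cuda_shared_memory()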