def __call__(self, raw_img):
    """Run detection inference on a raw image and return NMS-filtered boxes."""

    def fetch_layer(response, layer_name):
        # Pull one output layer's contents out of its pre-registered shared
        # memory region; fail loudly if the response does not contain it.
        layer = response.get_output(layer_name)
        if layer is None:
            raise Exception(
                "{} layer data is missing in the response.".format(layer_name))
        return shm.get_contents_as_numpy(
            self.output_handles[layer_name],
            utils.triton_to_np_dtype(layer.datatype),
            self._prod(layer.shape))

    # Copy the preprocessed image into the registered input region, then infer.
    preprocessed = self._preprocess_image(raw_img)
    shm.set_shared_memory_region(self.input_handles["input_1"], [preprocessed])
    response = self.triton_client.infer(model_name=self.model_name,
                                        model_version=self.model_version,
                                        inputs=self.input_layers,
                                        outputs=self.output_layers)

    coverages = fetch_layer(response, "output_cov/Sigmoid")
    bboxes = fetch_layer(response, "output_bbox/BiasAdd")

    # Decode coverage/bbox tensors into candidate boxes, then suppress overlaps.
    return NMS.filter(self.postprocessor.start(bboxes, coverages))
def _configure_sever(self):
    """Create, seed and register the four 64-byte system shared memory
    regions (two inputs, two outputs) used by the tests.

    Returns the handles as [input0, input1, output0, output1].
    """
    region_specs = [("input0_data", "/input0_data"),
                    ("input1_data", "/input1_data"),
                    ("output0_data", "/output0_data"),
                    ("output1_data", "/output1_data")]
    byte_size = 64
    handles = [shm.create_shared_memory_region(name, key, byte_size)
               for name, key in region_specs]

    # Seed the two input regions with deterministic int32 data.
    shm.set_shared_memory_region(
        handles[0], [np.arange(start=0, stop=16, dtype=np.int32)])
    shm.set_shared_memory_region(
        handles[1], [np.ones(shape=16, dtype=np.int32)])

    client_cls = (httpclient.InferenceServerClient
                  if _protocol == "http" else grpcclient.InferenceServerClient)
    triton_client = client_cls(_url, verbose=True)
    for name, key in region_specs:
        triton_client.register_system_shared_memory(name, key, byte_size)
    return handles
def create_set_either_shm_region(shm_region_names, input_list, input_byte_size,
                                 output_byte_size, use_system_shared_memory,
                                 use_cuda_shared_memory):
    """Create one input and one output shared memory region (system or CUDA,
    per the flags) and copy *input_list* into the input region.

    Returns [input_handle, output_handle], or [] when neither flag is set.
    Raises ValueError when both flags are set.
    """
    if use_cuda_shared_memory and use_system_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return []

    input_region = shm_region_names[0]
    output_region = shm_region_names[1]
    if use_cuda_shared_memory:
        input_handle = cudashm.create_shared_memory_region(
            input_region + "_data", input_byte_size, 0)
        output_handle = cudashm.create_shared_memory_region(
            output_region + "_data", output_byte_size, 0)
        cudashm.set_shared_memory_region(input_handle, input_list)
    else:
        # System shared memory regions are additionally keyed by a "/..." path.
        input_handle = shm.create_shared_memory_region(
            input_region + "_data", "/" + input_region, input_byte_size)
        output_handle = shm.create_shared_memory_region(
            output_region + "_data", "/" + output_region, output_byte_size)
        shm.set_shared_memory_region(input_handle, input_list)
    return [input_handle, output_handle]
def precreate_register_regions(self, value_list, dtype, i, batch_size=1, tensor_shape=(1,)):
    """Pre-create and register one input and one output shared memory region
    (system or CUDA, depending on the module test flags) per value in
    `value_list`, filling each input region with `batch_size` tensors.

    Args:
        value_list: values used to fill the input tensors; one region pair
            is created per value.
        dtype: numpy dtype of the tensors (np.object means string data).
        i: index used to build unique region names ('ip{i}{j}', 'op{i}{j}_data').
        batch_size: number of tensor copies placed in each input region.
        tensor_shape: shape of each individual tensor.

    Returns:
        List of (region_name, byte_size, handle) tuples alternating
        input/output, or [] when neither shared memory flag is enabled.
    """
    if _test_system_shared_memory or _test_cuda_shared_memory:
        shm_region_handles = []
        for j, value in enumerate(value_list):
            # For string we can't know the size of the output
            # so we conservatively assume 64 bytes for each
            # element of the output
            if dtype == np.object:
                output_byte_size = 4  # size of empty string
            else:
                output_byte_size = 0
            # create data
            input_list = list()
            for b in range(batch_size):
                if dtype == np.object:
                    # Build values as int32 first, then convert each element
                    # to its string representation.
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                    in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                    dtype=object)
                    in0 = in0n.reshape(tensor_shape)
                    output_byte_size += 64 * in0.size
                else:
                    in0 = np.full(tensor_shape, value, dtype=dtype)
                    output_byte_size += np.dtype(dtype).itemsize * in0.size
                input_list.append(in0)
            # String tensors must be serialized before being copied into shm.
            input_list_tmp = iu.serialize_byte_tensor_list(input_list) if (
                dtype == np.object) else input_list
            input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])
            # create shared memory regions and copy data for input values
            # NOTE(review): the output region name carries a '_data' suffix
            # while the input one does not — presumably intentional; verify
            # against the callers that consume these names.
            ip_name = 'ip{}{}'.format(i, j)
            op_name = 'op{}{}_data'.format(i, j)
            if _test_system_shared_memory:
                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
            elif _test_cuda_shared_memory:
                shm_ip_handle = cudashm.create_shared_memory_region(
                    ip_name, input_byte_size, 0)
                shm_op_handle = cudashm.create_shared_memory_region(
                    op_name, output_byte_size, 0)
                cudashm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                self.triton_client_.register_cuda_shared_memory(
                    ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                    input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
            shm_region_handles.append((ip_name, input_byte_size, shm_ip_handle))
            shm_region_handles.append((op_name, output_byte_size, shm_op_handle))
        return shm_region_handles
    else:
        return []
def test_valid_create_set_register(self):
    """Happy path: create a system shm region, fill it, register it, and
    verify the server reports exactly one registered region."""
    client_cls = (httpclient.InferenceServerClient
                  if _protocol == "http" else grpcclient.InferenceServerClient)
    triton_client = client_cls(_url, verbose=True)

    handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8)
    shm.set_shared_memory_region(handle,
                                 [np.array([1, 2], dtype=np.float32)])
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)

    status = triton_client.get_system_shared_memory_status()
    # HTTP returns a plain list; GRPC wraps the regions in a message field.
    region_count = len(status) if _protocol == "http" else len(status.regions)
    self.assertTrue(region_count == 1)

    shm.destroy_shared_memory_region(handle)
"/output0_simple", output0_byte_size) triton_client.register_system_shared_memory("output1_data", "/output1_simple", output1_byte_size) # Create Input0 and Input1 in Shared Memory and store shared memory handles shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_simple", input0_byte_size) shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_simple", input1_byte_size) # Put input data values into shared memory shm.set_shared_memory_region(shm_ip0_handle, [input0_data_serialized]) shm.set_shared_memory_region(shm_ip1_handle, [input1_data_serialized]) # Register Input0 and Input1 shared memory with Triton Server triton_client.register_system_shared_memory("input0_data", "/input0_simple", input0_byte_size) triton_client.register_system_shared_memory("input1_data", "/input1_simple", input1_byte_size) # Set the parameters to use data from shared memory inputs = [] inputs.append(httpclient.InferInput('INPUT0', [1, 16], "BYTES")) inputs[-1].set_shared_memory("input0_data", input0_byte_size) inputs.append(httpclient.InferInput('INPUT1', [1, 16], "BYTES")) inputs[-1].set_shared_memory("input1_data", input1_byte_size)
def infer_shape_tensor(tester, pf, tensor_dtype, input_shape_values, dummy_input_shapes, use_http=True, use_grpc=True, use_streaming=True, shm_suffix="", use_system_shared_memory=False, priority=0, timeout_us=0, batch_size=1):
    """Test helper: run a shape-tensor model over every requested protocol
    and check that each OUTPUT{n} echoes the shape values fed through
    INPUT{n}, while DUMMY_OUTPUT{n}'s shape matches those values.

    Args:
        tester: unittest.TestCase-like object providing the assertions.
        pf: platform, must be "plan" or "plan_nobatch".
        tensor_dtype: numpy dtype of the dummy tensors.
        input_shape_values: per-io shape values sent as INT32 shape tensors.
        dummy_input_shapes: per-io shapes of the randomly-filled dummy inputs.
        use_http / use_grpc / use_streaming: which client configs to exercise.
        shm_suffix: suffix appended to shared memory region names.
        use_system_shared_memory: route INPUT*/OUTPUT* through system shm.
        priority, timeout_us: forwarded to the infer call.
        batch_size: scales the OUTPUT* shared memory region size.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    # Each config is (url, protocol, use_streaming).
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))
    io_cnt = len(input_shape_values)
    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()
    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)
        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            # String models get the stringified values of the random data.
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)
        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)
        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)
        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size))
            output_shm_handle_list.append(
                (shm.create_shared_memory_region(output_name + shm_suffix,
                                                 '/' + output_name + shm_suffix,
                                                 output_byte_size),
                 output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [in0,])
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)
        inputs = []
        outputs = []
        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)
            inputs.append(client_utils.InferInput(
                dummy_input_name, dummy_input_shapes[io_num],
                np_to_triton_dtype(tensor_dtype)))
            inputs.append(client_utils.InferInput(
                input_name, input_list[io_num].shape, "INT32"))
            outputs.append(client_utils.InferRequestedOutput(
                dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(
                output_name))
            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                # Register the pre-created regions with this config's client
                # and point the request IOs at them.
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)
        if config[2]:
            # Streaming: submit asynchronously and collect via the callback.
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name, inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name, inputs, outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)
        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]
            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory):
                out = results.as_numpy(output_name)
            else:
                # Shared memory outputs must be read back via the shm utils;
                # HTTP reports the shape as a dict entry, GRPC as an attribute.
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)
            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(
                    model_name, output_name, expected, out))
        # unregister shared memory region for next config
        # NOTE(review): input_name/output_name here hold the values from the
        # last loop iteration, so only the last io's regions are unregistered
        # when io_cnt > 1 — verify this is intended.
        if use_system_shared_memory:
            triton_client.unregister_system_shared_memory(
                input_name + shm_suffix)
            triton_client.unregister_system_shared_memory(
                output_name + shm_suffix)
    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
def check_sequence(self, trial, model_name, input_dtype, correlation_id, sequence_thresholds, values, expected_result, protocol, batch_size=1, sequence_name="<unknown>", tensor_shape=(1,)):
    """Perform a sequence of inferences against a stateful sequence model
    and check the final accumulated result and response-time thresholds.

    'values' holds one tuple per inference with format:
    (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms)).
    flag_str may contain "start"/"end" to set the sequence flags; the
    threshold and delay tuples may be None.
    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("netdef" not in trial) and ("custom" not in trial) and
            ("onnx" not in trial) and ("libtorch" not in trial) and
            ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)
    # Can only send the request exactly once since it is a
    # sequence model with state, so can have only a single config.
    configs = []
    if protocol == "http":
        configs.append(("localhost:8000", "http", False))
    if protocol == "grpc":
        configs.append(("localhost:8001", "grpc", False))
    if protocol == "streaming":
        configs.append(("localhost:8001", "grpc", True))
    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")
    self.assertEqual(len(configs), 1)
    # "nobatch" models take the raw tensor shape; otherwise prepend batch dim.
    full_shape = tensor_shape if "nobatch" in trial else (batch_size,) + tensor_shape
    # create and register shared memory output region in advance,
    # knowing that this function will not be called concurrently.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        output_byte_size = 512
        if _test_system_shared_memory:
            shm_op_handle = shm.create_shared_memory_region(
                "output_data", "/output", output_byte_size)
            self.triton_client_.register_system_shared_memory(
                "output_data", "/output", output_byte_size)
        elif _test_cuda_shared_memory:
            shm_op_handle = cudashm.create_shared_memory_region(
                "output_data", output_byte_size, 0)
            self.triton_client_.register_cuda_shared_memory(
                "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                output_byte_size)
    shm_ip_handles = []
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)
        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))
            # libtorch models use the "__0"-suffixed IO naming convention.
            INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
            OUTPUT = "OUTPUT__0" if trial.startswith("libtorch") else "OUTPUT"
            for flag_str, value, thresholds, delay_ms in values:
                if delay_ms is not None:
                    time.sleep(delay_ms[0] / 1000.0)
                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)
                # Construct request IOs
                inputs = []
                outputs = []
                inputs.append(client_utils.InferInput(
                    INPUT, full_shape, np_to_triton_dtype(input_dtype)))
                outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                if input_dtype == np.object:
                    # Build values as int32 first, then stringify per element.
                    in0 = np.full(full_shape, value, dtype=np.int32)
                    in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                    dtype=object)
                    in0 = in0n.reshape(full_shape)
                else:
                    in0 = np.full(full_shape, value, dtype=input_dtype)
                # create input shared memory and copy input data values into it
                if _test_system_shared_memory or _test_cuda_shared_memory:
                    input_list_tmp = iu.serialize_byte_tensor_list([in0]) if (
                        input_dtype == np.object) else [in0]
                    input_byte_size = sum(
                        [i0.nbytes for i0 in input_list_tmp])
                    # A fresh region per request, named by running count.
                    ip_name = "ip{}".format(len(shm_ip_handles))
                    if _test_system_shared_memory:
                        shm_ip_handles.append(shm.create_shared_memory_region(
                            ip_name, "/" + ip_name, input_byte_size))
                        shm.set_shared_memory_region(shm_ip_handles[-1],
                                                     input_list_tmp)
                        triton_client.register_system_shared_memory(
                            ip_name, "/" + ip_name, input_byte_size)
                    elif _test_cuda_shared_memory:
                        shm_ip_handles.append(
                            cudashm.create_shared_memory_region(
                                ip_name, input_byte_size, 0))
                        cudashm.set_shared_memory_region(shm_ip_handles[-1],
                                                         input_list_tmp)
                        triton_client.register_cuda_shared_memory(
                            ip_name,
                            cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                            input_byte_size)
                    inputs[0].set_shared_memory(ip_name, input_byte_size)
                    outputs[0].set_shared_memory("output_data",
                                                 output_byte_size)
                else:
                    inputs[0].set_data_from_numpy(in0)
                start_ms = int(round(time.time() * 1000))
                if config[2]:
                    triton_client.async_stream_infer(
                        model_name, inputs, outputs=outputs,
                        sequence_id=correlation_id, sequence_start=seq_start,
                        sequence_end=seq_end)
                    (results, error) = user_data._completed_requests.get()
                    if error is not None:
                        raise error
                else:
                    results = triton_client.infer(
                        model_name, inputs, outputs=outputs,
                        sequence_id=correlation_id, sequence_start=seq_start,
                        sequence_end=seq_end)
                end_ms = int(round(time.time() * 1000))
                # Get value of "OUTPUT", for shared memory, need to get it via
                # shared memory utils
                if (not _test_system_shared_memory) and (
                        not _test_cuda_shared_memory):
                    out = results.as_numpy(OUTPUT)
                else:
                    # HTTP reports shape as a dict entry, GRPC as an attribute.
                    output = results.get_output(OUTPUT)
                    if config[1] == "http":
                        output_shape = output["shape"]
                    else:
                        output_shape = output.shape
                    output_type = input_dtype
                    if _test_system_shared_memory:
                        out = shm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                    else:
                        out = cudashm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                result = out[0] if "nobatch" in trial else out[0][0]
                print("{}: {}".format(sequence_name, result))
                # Per-request response-time thresholds.
                if thresholds is not None:
                    lt_ms = thresholds[0]
                    gt_ms = thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) < lt_ms,
                            "expected less than " + str(lt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) > gt_ms,
                            "expected greater than " + str(gt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                if delay_ms is not None:
                    time.sleep(delay_ms[1] / 1000.0)
            seq_end_ms = int(round(time.time() * 1000))
            # The final request's result must equal the expected accumulation.
            if input_dtype == np.object:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)
            # Whole-sequence response-time thresholds.
            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    self.assertTrue(
                        (seq_end_ms - seq_start_ms) < lt_ms,
                        "sequence expected less than " + str(lt_ms) +
                        "ms response time, got " +
                        str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue(
                        (seq_end_ms - seq_start_ms) > gt_ms,
                        "sequence expected greater than " + str(gt_ms) +
                        "ms response time, got " +
                        str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            # Failures are deferred so cleanup below still runs.
            self.add_deferred_exception(ex)
        if config[2]:
            triton_client.stop_stream()
    # Unregister and destroy all regions created above.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        destroy_func = shm.destroy_shared_memory_region if _test_system_shared_memory else cudashm.destroy_shared_memory_region
        destroy_func(shm_op_handle)
        for shm_ip_handle in shm_ip_handles:
            destroy_func(shm_ip_handle)
def precreate_register_dynaseq_shape_tensor_regions(self, value_list, dtype, i, batch_size=1, tensor_shape=(1,)):
    """Pre-create and register the shared memory regions needed by the
    dynamic-sequence shape-tensor tests: for each (shape_value, value) pair
    in `value_list`, three input regions (data, shape, dummy) and three
    output regions (shape, data, resized).

    Args:
        value_list: list of (shape_value, value) tuples.
        dtype: numpy dtype of the dummy input tensor (np.object => strings).
        i: index used to build unique region names.
        batch_size: number of tensor copies placed in the data/dummy regions.
        tensor_shape: shape of each individual tensor.

    Returns:
        A list of (region_name, byte_size, handle) tuples, six per value
        pair, or [] when neither shared memory test flag is enabled.
    """
    if _test_system_shared_memory or _test_cuda_shared_memory:
        shm_region_handles = []
        for j, (shape_value, value) in enumerate(value_list):
            input_list = list()
            shape_input_list = list()
            dummy_input_list = list()
            for b in range(batch_size):
                if dtype == np.object:
                    # Build values as int32 first, then stringify per element.
                    # Bug fix: reshape by this tensor's own size — the
                    # original referenced `in0`, which is not yet defined on
                    # the first iteration (NameError); see the same pattern
                    # in precreate_register_regions.
                    dummy_in0 = np.full(tensor_shape, value, dtype=np.int32)
                    dummy_in0n = np.array(
                        [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                        dtype=object)
                    dummy_in0 = dummy_in0n.reshape(tensor_shape)
                else:
                    dummy_in0 = np.full(tensor_shape, value, dtype=dtype)
                dummy_input_list.append(dummy_in0)
                in0 = np.full(tensor_shape, value, dtype=np.int32)
                input_list.append(in0)
            # Only one shape tensor input per batch
            shape_input_list.append(
                np.full(tensor_shape, shape_value, dtype=np.int32))
            # String tensors must be serialized before being copied into shm.
            input_list_tmp = iu.serialize_byte_tensor_list(input_list) if (
                dtype == np.object) else input_list
            input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])
            shape_input_byte_size = sum(
                [i0.nbytes for i0 in shape_input_list])
            dummy_input_byte_size = sum(
                [i0.nbytes for i0 in dummy_input_list])
            shape_output_byte_size = shape_input_byte_size
            output_byte_size = np.dtype(np.int32).itemsize + 2
            resized_output_byte_size = 32 * shape_value
            # create shared memory regions and copy data for input values
            ip_name = 'ip{}{}'.format(i, j)
            shape_ip_name = 'shape_ip{}{}'.format(i, j)
            dummy_ip_name = 'dummy_ip{}{}'.format(i, j)
            shape_op_name = 'shape_op{}{}'.format(i, j)
            op_name = 'op{}{}'.format(i, j)
            resized_op_name = 'resized_op{}{}'.format(i, j)
            if _test_system_shared_memory:
                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_shape_ip_handle = shm.create_shared_memory_region(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                shm_dummy_ip_handle = shm.create_shared_memory_region(
                    dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
                shm_shape_op_handle = shm.create_shared_memory_region(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm_resized_op_handle = shm.create_shared_memory_region(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                shm.set_shared_memory_region(shm_shape_ip_handle,
                                             shape_input_list)
                shm.set_shared_memory_region(shm_dummy_ip_handle,
                                             dummy_input_list)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    dummy_ip_name, '/' + dummy_ip_name, dummy_input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)
            elif _test_cuda_shared_memory:
                shm_ip_handle = cudashm.create_shared_memory_region(
                    ip_name, input_byte_size, 0)
                shm_shape_ip_handle = cudashm.create_shared_memory_region(
                    shape_ip_name, shape_input_byte_size, 0)
                shm_dummy_ip_handle = cudashm.create_shared_memory_region(
                    dummy_ip_name, dummy_input_byte_size, 0)
                shm_shape_op_handle = cudashm.create_shared_memory_region(
                    shape_op_name, shape_output_byte_size, 0)
                shm_op_handle = cudashm.create_shared_memory_region(
                    op_name, output_byte_size, 0)
                shm_resized_op_handle = cudashm.create_shared_memory_region(
                    resized_op_name, resized_output_byte_size, 0)
                cudashm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                cudashm.set_shared_memory_region(shm_shape_ip_handle,
                                                 shape_input_list)
                cudashm.set_shared_memory_region(shm_dummy_ip_handle,
                                                 dummy_input_list)
                self.triton_client_.register_cuda_shared_memory(
                    ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                    input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    shape_ip_name, cudashm.get_raw_handle(shm_shape_ip_handle),
                    0, shape_input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    dummy_ip_name, cudashm.get_raw_handle(shm_dummy_ip_handle),
                    0, dummy_input_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    shape_op_name, cudashm.get_raw_handle(shm_shape_op_handle),
                    0, shape_output_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                    output_byte_size)
                self.triton_client_.register_cuda_shared_memory(
                    resized_op_name,
                    cudashm.get_raw_handle(shm_resized_op_handle), 0,
                    resized_output_byte_size)
            shm_region_handles.append(
                (ip_name, input_byte_size, shm_ip_handle))
            shm_region_handles.append(
                (shape_ip_name, shape_input_byte_size, shm_shape_ip_handle))
            shm_region_handles.append(
                (dummy_ip_name, dummy_input_byte_size, shm_dummy_ip_handle))
            shm_region_handles.append(
                (shape_op_name, shape_output_byte_size, shm_shape_op_handle))
            shm_region_handles.append(
                (op_name, output_byte_size, shm_op_handle))
            shm_region_handles.append(
                (resized_op_name, resized_output_byte_size,
                 shm_resized_op_handle))
        return shm_region_handles
    else:
        return []
def create_set_shm_regions(input0_list, input1_list, output0_byte_size,
                           output1_byte_size, outputs, shm_region_names,
                           precreated_shm_regions, use_system_shared_memory,
                           use_cuda_shared_memory):
    """Create (or reuse precreated) shared memory regions for two inputs and
    up to two outputs, and copy the input data into the input regions.

    Output regions are created only for names present in `outputs`
    ("OUTPUT0"/"OUTPUT1"); missing ones are returned as None. When
    `precreated_shm_regions` is given, its handles are reused instead of
    creating new output regions.

    Returns (region_names, [ip0, ip1, op0, op1]), or ([], []) when neither
    shared memory flag is set. Raises ValueError when both flags are set.
    """
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return [], []

    in0_nbytes = sum(t.nbytes for t in input0_list)
    in1_nbytes = sum(t.nbytes for t in input1_list)
    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    op0_handle = None
    op1_handle = None
    if use_system_shared_memory:
        def make_region(name_idx, nbytes):
            # System regions are keyed both by name and by a "/..." path.
            region = shm_region_names[name_idx]
            return shm.create_shared_memory_region(region + '_data',
                                                   '/' + region, nbytes)

        ip0_handle = make_region(0, in0_nbytes)
        ip1_handle = make_region(1, in1_nbytes)
        # next_idx tracks which region-name slot / precreated handle is next.
        next_idx = 0
        if "OUTPUT0" in outputs:
            op0_handle = (make_region(2, output0_byte_size)
                          if precreated_shm_regions is None
                          else precreated_shm_regions[0])
            next_idx += 1
        if "OUTPUT1" in outputs:
            op1_handle = (make_region(2 + next_idx, output1_byte_size)
                          if precreated_shm_regions is None
                          else precreated_shm_regions[next_idx])
        shm.set_shared_memory_region(ip0_handle, input0_list)
        shm.set_shared_memory_region(ip1_handle, input1_list)
    if use_cuda_shared_memory:
        def make_cuda_region(name_idx, nbytes):
            return cudashm.create_shared_memory_region(
                shm_region_names[name_idx] + '_data', nbytes, 0)

        ip0_handle = make_cuda_region(0, in0_nbytes)
        ip1_handle = make_cuda_region(1, in1_nbytes)
        next_idx = 0
        if "OUTPUT0" in outputs:
            op0_handle = (make_cuda_region(2, output0_byte_size)
                          if precreated_shm_regions is None
                          else precreated_shm_regions[0])
            next_idx += 1
        if "OUTPUT1" in outputs:
            op1_handle = (make_cuda_region(2 + next_idx, output1_byte_size)
                          if precreated_shm_regions is None
                          else precreated_shm_regions[next_idx])
        cudashm.set_shared_memory_region(ip0_handle, input0_list)
        cudashm.set_shared_memory_region(ip1_handle, input1_list)
    return shm_region_names, [ip0_handle, ip1_handle, op0_handle, op1_handle]