def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False):
    tester.assertTrue(use_http or use_grpc or use_streaming)

    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))

    # Outputs are the sum and difference of the inputs, so cap the input
    # values so that the outputs cannot overflow. This allows an exact
    # match. For float types use the int8, int16, int32 ranges for fp16,
    # fp32, fp64 respectively. When requesting class outputs the result
    # value/probability is returned as a float, so the fp32 range must be
    # used in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_list = list()
    input1_list = list()
    expected0_list = list()
    expected1_list = list()
    expected0_val_list = list()
    expected1_val_list = list()
    for b in range(batch_size):
        in0 = np.random.randint(low=val_min,
                                high=val_max,
                                size=tensor_shape,
                                dtype=rinput_dtype)
        in1 = np.random.randint(low=val_min,
                                high=val_max,
                                size=tensor_shape,
                                dtype=rinput_dtype)
        if input_dtype != np.object:
            in0 = in0.astype(input_dtype)
            in1 = in1.astype(input_dtype)

        if not swap:
            op0 = in0 + in1
            op1 = in0 - in1
        else:
            op0 = in0 - in1
            op1 = in0 + in1

        expected0_val_list.append(op0)
        expected1_val_list.append(op1)
        if output0_dtype == np.object:
            expected0_list.append(
                np.array([
                    unicode(str(x), encoding='utf-8')
                    for x in (op0.flatten())
                ],
                         dtype=object).reshape(op0.shape))
        else:
            expected0_list.append(op0.astype(output0_dtype))
        if output1_dtype == np.object:
            expected1_list.append(
                np.array([
                    unicode(str(x), encoding='utf-8')
                    for x in (op1.flatten())
                ],
                         dtype=object).reshape(op1.shape))
        else:
            expected1_list.append(op1.astype(output1_dtype))

        if input_dtype == np.object:
            in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
            in0 = in0n.reshape(in0.shape)
            in1n = np.array([str(x) for x in in1.reshape(in1.size)],
                            dtype=object)
            in1 = in1n.reshape(in1.shape)

        input0_list.append(in0)
        input1_list.append(in1)

    # Prepend the string size to the string input data
    if input_dtype == np.object:
        input0_list_tmp = _prepend_string_size(input0_list)
        input1_list_tmp = _prepend_string_size(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    if output0_dtype == np.object:
        expected0_list_tmp = _prepend_string_size(expected0_list)
    else:
        expected0_list_tmp = expected0_list

    if output1_dtype == np.object:
        expected1_list_tmp = _prepend_string_size(expected1_list)
    else:
        expected1_list_tmp = expected1_list

    # Create and register system/CUDA shared memory regions if needed
    shm_handles = su.create_register_set_shm_regions(
        input0_list_tmp, input1_list_tmp, expected0_list_tmp,
        expected1_list_tmp, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=correlation_id,
                           streaming=config[2],
                           verbose=True)

        expected0_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected0_val_list
        ]
        expected1_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected1_val_list
        ]

        output_req = {}
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"
        if pf == "libtorch" or pf == "libtorch_nobatch":
            OUTPUT0 = "OUTPUT__0"
            OUTPUT1 = "OUTPUT__1"
            INPUT0 = "INPUT__0"
            INPUT1 = "INPUT__1"

        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_handles) != 0:
                output_req[OUTPUT0] = (InferContext.ResultFormat.RAW,
                                       shm_handles[2])
            else:
                if output0_raw:
                    output_req[OUTPUT0] = InferContext.ResultFormat.RAW
                else:
                    output_req[OUTPUT0] = (InferContext.ResultFormat.CLASS,
                                           num_classes)
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_handles) != 0:
                output_req[OUTPUT1] = (InferContext.ResultFormat.RAW,
                                       shm_handles[2 + i])
            else:
                if output1_raw:
                    output_req[OUTPUT1] = InferContext.ResultFormat.RAW
                else:
                    output_req[OUTPUT1] = (InferContext.ResultFormat.CLASS,
                                           num_classes)

        if len(shm_handles) != 0:
            results = ctx.run(
                {
                    INPUT0: (shm_handles[0], tensor_shape),
                    INPUT1: (shm_handles[1], tensor_shape)
                }, output_req, batch_size)
        else:
            results = ctx.run({
                INPUT0: input0_list,
                INPUT1: input1_list
            }, output_req, batch_size)

        if not skip_request_id_check:
            global _seen_request_ids
            request_id = ctx.get_last_request_id()
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), len(outputs))
        for (result_name, result_val) in iteritems(results):
            for b in range(batch_size):
                if ((result_name == OUTPUT0 and output0_raw) or
                        (result_name == OUTPUT1 and output1_raw)):
                    if result_name == OUTPUT0:
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected0_list[b]),
                            "{}, {} expected: {}, got {}".format(
                                model_name, OUTPUT0, expected0_list[b],
                                result_val[b]))
                    elif result_name == OUTPUT1:
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected1_list[b]),
                            "{}, {} expected: {}, got {}".format(
                                model_name, OUTPUT1, expected1_list[b],
                                result_val[b]))
                    else:
                        tester.assertTrue(
                            False,
                            "unexpected raw result {}".format(result_name))
                else:
                    # num_classes values must be returned and must
                    # match the expected top values
                    class_list = result_val[b]
                    tester.assertEqual(len(class_list), num_classes)

                    expected0_flatten = expected0_list[b].flatten()
                    expected1_flatten = expected1_list[b].flatten()

                    for idx, ctuple in enumerate(class_list):
                        if result_name == OUTPUT0:
                            # Can't compare indices since different
                            # indices can hold the same value/prob, so
                            # compare that the value at each index
                            # equals the expected value. Labels can
                            # only be compared when the indices are
                            # equal.
                            tester.assertEqual(ctuple[1],
                                               expected0_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if ctuple[0] == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(ctuple[1],
                                               expected1_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/CUDA shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_handles, precreated_shm_regions,
                                      outputs, use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
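
# _range_repr_dtype is assumed by infer_exact above (and by the rewritten
# version that follows) but is not shown in this excerpt. A minimal sketch
# consistent with the overflow comment in the function body (use the
# int8/int16/int32 ranges for fp16/fp32/fp64, and an integer range for
# string tensors so np.iinfo applies) might look like this; the real helper
# may differ in detail.
def _range_repr_dtype(dtype):
    # Map a dtype to an integer dtype whose full range round-trips through
    # that type exactly, so sums/differences can be compared exactly.
    if dtype == np.float64:
        return np.int32  # fp64 represents any int32 exactly
    elif dtype == np.float32:
        return np.int16  # fp32 has a 24-bit significand; int16 is safe
    elif dtype == np.float16:
        return np.int8  # fp16 has an 11-bit significand; int8 is safe
    elif dtype == np.object:
        return np.int32  # string tensors are built from int32 values
    return dtype
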
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)

    # Each config is [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
                    (output0_dtype != np.float16) and \
                    (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # Outputs are the sum and difference of the inputs, so cap the input
    # values so that the outputs cannot overflow. This allows an exact
    # match. For float types use the int8, int16, int32 ranges for fp16,
    # fp32, fp64 respectively. When requesting class outputs the result
    # value/probability is returned as a float, so the fp32 range must be
    # used in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output0_array.flatten())
        ],
                                 dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output1_array.flatten())
        ],
                                 dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # Prepend the string size to the output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory
    # must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/CUDA shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size,
        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1,) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1,) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT0))
                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT1))
                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)
        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)

                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match the expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # Can't compare indices since different indices
                        # can hold the same value/prob, so check that
                        # the value at each index equals the expected
                        # value. Labels can only be compared when the
                        # indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(
                                chr(x) for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/CUDA shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
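
# A hedged usage sketch: how a unittest.TestCase might drive the newer
# infer_exact against an add/sub-style test model. The platform name,
# dtypes, shape, and batch size are illustrative only, and a Triton server
# with the matching model is assumed to be running on localhost. Note that
# for batch_size > 1 the code above expects tensor_shape to include the
# batch dimension. In practice a caller like this lives in a separate test
# module, not in this utility file.
import unittest


class ExampleInferExactTest(unittest.TestCase):
    # Hypothetical test case, not part of the original utility module.

    def test_add_sub_int32(self):
        # Exercises the HTTP (binary and JSON), GRPC, and streaming paths.
        # tensor_shape (8, 16) includes the batch dimension of 8.
        infer_exact(self,
                    "graphdef",
                    (8, 16),
                    batch_size=8,
                    input_dtype=np.int32,
                    output0_dtype=np.int32,
                    output1_dtype=np.int32)


if __name__ == '__main__':
    unittest.main()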