def test_http_out_of_shared_memory(self):
    """Infer with output CUDA shared-memory regions that are too small and
    verify that the error raised by the 'query' model reports the default
    output properties (CPU, device 0) for both outputs."""
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    # Set up too small CUDA shared memory for outputs, expect query
    # returns default value
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
    shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0)
    shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0)
    triton_client.register_cuda_shared_memory(
        "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
    triton_client.register_cuda_shared_memory(
        "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)

    outputs = []
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs[-1].set_shared_memory("output0_data", 1)
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
    outputs[-1].set_shared_memory("output1_data", 1)

    try:
        triton_client.infer(model_name="query", inputs=inputs, outputs=outputs)
        self.fail("expect error with query information")
    except InferenceServerException as ex:
        self.assertIn("OUTPUT0 CPU 0", ex.message())
        self.assertIn("OUTPUT1 CPU 0", ex.message())

    # Clean up the shared-memory regions and registrations.
    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
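# A sketch of the module-level imports the test above appears to assume
# (the aliases are an assumption; the original test file may import them
# differently):
import numpy as np
import tritonclient.http as tritonhttpclient
import tritonclient.utils.cuda_shared_memory as cudashm
from tritonclient.utils import InferenceServerException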
def _request_generator(cls, batched_image_data):
    """Yield the (inputs, outputs) pair for one batch of image data."""
    inputs = [
        httpclient.InferInput(cls.INPUT_NAME, batched_image_data.shape,
                              cls.DTYPE)
    ]
    inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = [
        httpclient.InferRequestedOutput(output_name, binary_data=True)
        for output_name in cls.OUTPUT_NAMES
    ]
    yield inputs, outputs
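# A minimal sketch of how _request_generator might be consumed, assuming it is
# a @classmethod on a class (called "Client" here only for illustration) that
# defines INPUT_NAME, DTYPE and OUTPUT_NAMES; the endpoint, the model name and
# the zero-filled batch below are placeholders, not values from the original
# code.
import numpy as np
import tritonclient.http as httpclient

triton_client = httpclient.InferenceServerClient("localhost:8000")
batched_image_data = np.zeros((1, 3, 224, 224), dtype=np.float32)

for inputs, outputs in Client._request_generator(batched_image_data):
    response = triton_client.infer("image_classifier", inputs, outputs=outputs)
    for name in Client.OUTPUT_NAMES:
        print(name, response.as_numpy(name).shape)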
import tritonclient.http as http


def run_inference(X,
                  X_shape=(1, 3, 224, 224),
                  X_dtype='FP32',
                  model_name='cub200_resnet34',
                  input_name=['INPUT__0'],
                  output_name='OUTPUT__0',
                  url='ecm-clearml-compute-gpu-002.westeurope.cloudapp.azure.com',
                  model_version='1',
                  port=8000,
                  VERBOSE=False):
    """Send one HTTP inference request to a Triton server and return the
    class probabilities along with the argmax class predictions."""
    url = url + ':' + str(port)
    triton_client = http.InferenceServerClient(url=url, verbose=VERBOSE)

    # Wrap the input array and request the output tensor by name.
    input0 = http.InferInput(input_name[0], X_shape, X_dtype)
    input0.set_data_from_numpy(X, binary_data=False)
    output = http.InferRequestedOutput(output_name, binary_data=False)

    response = triton_client.infer(model_name,
                                   model_version=model_version,
                                   inputs=[input0],
                                   outputs=[output])

    y_pred_proba = response.as_numpy(output_name)
    y_pred = y_pred_proba.argmax(1)
    return y_pred_proba, y_pred
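# A minimal usage sketch for run_inference. Assumptions: X is already
# preprocessed into the (1, 3, 224, 224) FP32 layout the defaults expect; the
# random array and the localhost endpoint are placeholders, not values from
# the original code.
import numpy as np

X = np.random.rand(1, 3, 224, 224).astype(np.float32)
y_pred_proba, y_pred = run_inference(X, url='localhost', port=8000)
print("probabilities shape:", y_pred_proba.shape, "predicted class:", y_pred)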