```python
import numpy as np
from PIL import Image
import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient


def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess the image into input data according to model requirements.
    # preprocess() is the example's helper that resizes/normalizes the image.
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    # Replicate the image to fill the requested batch size
    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=False)

    # Request the top FLAGS.classes classification results for the output
    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=False,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
```
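For context, a minimal sketch of how this generator might be driven. The `localhost` URLs and the flag values are assumptions for illustration; only `requestGenerator` comes from the example above.

```python
# Hypothetical driver loop; URLs and flag values are assumptions.
if FLAGS.protocol.lower() == "grpc":
    triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
else:
    triton_client = httpclient.InferenceServerClient(url="localhost:8000")

for inputs, outputs, model_name, model_version in requestGenerator(
        input_name, output_name, c, h, w, format, dtype, FLAGS):
    response = triton_client.infer(model_name,
                                   inputs,
                                   model_version=model_version,
                                   outputs=outputs)
    # Each returned element encodes one classification result.
    print(response.as_numpy(output_name))
```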
```python
def async_send(triton_client, values, batch_size, sequence_id, model_name,
               model_version):
    # The client's bidirectional stream must already be open; see the
    # start_stream() sketch below.
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            grpcclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the asynchronous sequence inference.
        triton_client.async_stream_infer(model_name=model_name,
                                         inputs=inputs,
                                         outputs=outputs,
                                         request_id='{}_{}'.format(
                                             sequence_id, count),
                                         sequence_id=sequence_id,
                                         sequence_start=(count == 1),
                                         sequence_end=(count == len(values)))
        count = count + 1
```
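`async_stream_infer` requires the stream to be open before any request is issued, and responses arrive through the stream callback. A sketch of that setup; the `UserData` and `callback` names are illustrative, not part of the client API, and `simple_sequence` is an assumed model name.

```python
from functools import partial
import queue

import tritonclient.grpc as grpcclient


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    # The stream callback receives either a result or an error per request.
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)


user_data = UserData()
triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
triton_client.start_stream(callback=partial(callback, user_data))
async_send(triton_client, [11, 7, 5, 3, 2], 1, 1000, "simple_sequence", "")
triton_client.stop_stream()

# Drain the completed responses collected by the callback.
while not user_data._completed_requests.empty():
    item = user_data._completed_requests.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy('OUTPUT'))
```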
```python
def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess the image into input data according to model requirements
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    # Replicate the image to fill the requested batch size
    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    inputs.append(
        grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
    inputs[0].set_data_from_numpy(batched_image_data)

    # Request the top-2 classification results for the output
    outputs = []
    outputs.append(
        grpcclient.InferRequestedOutput(output_name, class_count=2))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
```
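When classification is requested via `class_count`, the output tensor holds serialized strings rather than raw scores, following the `<score>:<class index>[:<class label>]` layout used by the standard image-client examples (the label is present only when the model provides a labels file). A small decoding sketch; `print_top_classes` is a hypothetical helper, not part of the client API.

```python
def print_top_classes(response, output_name):
    # Each element is a BYTES tensor entry such as b"0.75:3:CAT".
    for entry in response.as_numpy(output_name).flatten():
        if isinstance(entry, bytes):
            entry = entry.decode("utf-8")
        fields = entry.split(':')
        score, class_index = fields[0], fields[1]
        label = fields[2] if len(fields) > 2 else ""
        print("score={} index={} label={}".format(score, class_index, label))
```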
```python
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            grpcclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
```
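A short usage sketch for the synchronous variant. The `simple_sequence` model name, the server URL, and the input values are assumptions for illustration; the printed values depend on that model's behavior.

```python
import tritonclient.grpc as grpcclient

triton_client = grpcclient.InferenceServerClient(url="localhost:8001")

result_list = []
sync_send(triton_client,
          result_list,
          values=[11, 7, 5, 3, 2],
          batch_size=1,
          sequence_id=1000,
          model_name="simple_sequence",
          model_version="")
for result in result_list:
    print(result)
```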
```python
# Register the region holding the second input with Triton
triton_client.register_cuda_shared_memory(
    "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0, input_byte_size)

# Point the inputs at the data already resident in shared memory
inputs = []
inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "INT32"))
inputs[-1].set_shared_memory("input0_data", input_byte_size)

inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "INT32"))
inputs[-1].set_shared_memory("input1_data", input_byte_size)

outputs = []
outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
# outputs[-1].set_shared_memory("output0_data", output_byte_size)

outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
# outputs[-1].set_shared_memory("output1_data", output_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# TODO: Currently, this example doesn't use shared memory for the outputs.
# This is done to effectively validate the results. The
# tritongrpcclient.cuda_shared_memory module will be enhanced to read
# data from a specified shared memory handle, data_type and shape.
```
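The snippet above assumes the CUDA shared-memory regions already exist and hold the input data. A sketch of that setup using the client's `cuda_shared_memory` utilities; the region names mirror the snippet, while the input values and server URL are assumptions.

```python
import numpy as np
import tritonclient.grpc as grpcclient
import tritonclient.utils.cuda_shared_memory as cudashm

triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
# Start from a clean state in case earlier runs left regions registered.
triton_client.unregister_cuda_shared_memory()

input0_data = np.arange(16, dtype=np.int32).reshape([1, 16])
input1_data = np.ones([1, 16], dtype=np.int32)
input_byte_size = input0_data.nbytes

# Create the regions on GPU 0 and copy the input tensors into them.
shm_ip0_handle = cudashm.create_shared_memory_region("input0_data",
                                                     input_byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region("input1_data",
                                                     input_byte_size, 0)
cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
cudashm.set_shared_memory_region(shm_ip1_handle, [input1_data])

# Register the first input region; the snippet above registers the second.
triton_client.register_cuda_shared_memory(
    "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0, input_byte_size)
```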