Example #1
    def test_batch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 has a batching version with max batch size of 8.
        # The request shape includes the batch size dimension, so inference
        # should succeed.
        tensor_shape = (1, input_size)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000', verbose=True)
                inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001', verbose=True)
                inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs)
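These snippets assume the standard Triton Python client packages imported under the aliases used above; a minimal import sketch follows (the get_model_name stand-in for the tu test utility is hypothetical, it simply reproduces the "<framework>_<dtype>_<dtype>_<dtype>" naming seen in the comments):

# Minimal setup assumed by the examples; aliases mirror the snippet names.
import numpy as np
import tritonclient.http as tritonhttpclient
import tritonclient.grpc as tritongrpcclient
from tritonclient.utils import InferenceServerException, triton_to_np_dtype

# Hypothetical stand-in for tu.get_model_name(); the test models are named
# like "graphdef_int32_int8_int8".
def get_model_name(pf, input_dtype, output0_dtype, output1_dtype):
    dtype_name = lambda dt: np.dtype(dt).name
    return "{}_{}_{}_{}".format(pf, dtype_name(input_dtype),
                                dtype_name(output0_dtype),
                                dtype_name(output1_dtype))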
Example #2
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url,
                                                             verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url,
                                                             verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        # If a raw numpy array was passed instead of a shared-memory handle,
        # send INPUT1 inline rather than from shared memory.
        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input1_data)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all())
        except Exception as ex:
            error_msg.append(str(ex))
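_basic_inference assumes four 64-byte system shared-memory regions named "input0_data", "input1_data", "output0_data", and "output1_data" were created and registered beforehand. A rough setup sketch, assuming the tritonclient.utils.shared_memory helper module (the "/..." shm keys are arbitrary assumptions):

# Hedged sketch: create, fill, and register the system shared-memory regions
# that _basic_inference references by name (16 x INT32 = 64 bytes each).
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

triton_client = httpclient.InferenceServerClient("localhost:8000")
triton_client.unregister_system_shared_memory()

shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_data", 64)
shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_data", 64)
shm_op0_handle = shm.create_shared_memory_region("output0_data", "/output0_data", 64)
shm_op1_handle = shm.create_shared_memory_region("output1_data", "/output1_data", 64)

# Copy the input tensors into the input regions.
shm.set_shared_memory_region(shm_ip0_handle, [np.arange(16, dtype=np.int32)])
shm.set_shared_memory_region(shm_ip1_handle, [np.ones(16, dtype=np.int32)])

# Register every region with the server so requests can refer to it by name.
triton_client.register_system_shared_memory("input0_data", "/input0_data", 64)
triton_client.register_system_shared_memory("input1_data", "/input1_data", 64)
triton_client.register_system_shared_memory("output0_data", "/output0_data", 64)
triton_client.register_system_shared_memory("output1_data", "/output1_data", 64)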
Example #3
def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess image into input data according to model requirements
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=False)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                            binary_data=False,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
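A hedged usage sketch for the generator above; the client URL, input/output names, and image dimensions would normally come from the model metadata and the surrounding argument parser:

# Hedged usage sketch: create a client for the chosen protocol and run one
# inference per request yielded by requestGenerator().
if FLAGS.protocol.lower() == "grpc":
    triton_client = tritongrpcclient.InferenceServerClient(url=FLAGS.url)
else:
    triton_client = tritonhttpclient.InferenceServerClient(url=FLAGS.url)

for inputs, outputs, model_name, model_version in requestGenerator(
        input_name, output_name, c, h, w, format, dtype, FLAGS):
    results = triton_client.infer(model_name,
                                  inputs,
                                  model_version=model_version,
                                  outputs=outputs)
    print(results.as_numpy(output_name))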
Example #4
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            tritongrpcclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
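A hedged usage sketch for sync_send; the URL, model name, and values are arbitrary assumptions, and sequence_id must be non-zero so the server routes every request in the loop to the same sequence slot:

# Hedged usage sketch for the sequence helper above.
triton_client = tritongrpcclient.InferenceServerClient(url="localhost:8001")
result_list = []
sync_send(triton_client, result_list, values=[1, 2, 3, 4], batch_size=1,
          sequence_id=1000, model_name="simple_sequence", model_version="")
print([r.flatten()[0] for r in result_list])

Example #5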
def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=True,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
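Example #6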
    def detector(self, frames):
        infer_inputs = [
            triton.InferInput('input_1', (len(frames), 3, *self.resize[::-1]),
                              "FP32")
        ]
        frames = np.array(frames, dtype=np.float32)
        frames = np.transpose(frames, (0, 3, 1, 2))
        infer_inputs[0].set_data_from_numpy(frames)
        result = self.triton_client.infer('retinanet', infer_inputs)
        scores = result.as_numpy('scores').reshape((-1, 100))
        boxes = result.as_numpy('boxes').reshape((-1, 100, 4))
        classes = result.as_numpy('classes').reshape((-1, 100))

        # Calculate embeddings for all the detected subjects
        embs = []
        scores_filtered = []
        boxes_filters = []
        for i in range(len(frames)):
            mask = (scores[i] > 0.4) & (
                classes[i] == 0)  # only care about 'person' with score > 0.4
            scores_i = scores[i, mask]
            boxes_i = boxes[i, mask]

            scores_i, boxes_i = self.bbox_filter(scores_i, boxes_i)

            img = frames[i].astype(np.uint8)  # (3, 800, 1280)
            embs_i = []
            boxes_i = boxes_i.astype(int)
            for j in range(len(boxes_i)):
                imp = img[:, boxes_i[j, 1]:boxes_i[j, 3],
                          boxes_i[j, 0]:boxes_i[j, 2]]
                imp = np.transpose(imp, (1, 2, 0))
                imp = Image.fromarray(imp)
                data = [
                    np.asarray(transforms.Resize(size=(256, 128))(imp)).astype(
                        np.float32)
                ]

                inputs = []
                inputs.append(
                    tritongrpcclient.InferInput('image',
                                                [len(data), 256, 128, 3],
                                                "FP32"))
                # Initialize the data
                inputs[0].set_data_from_numpy(np.asarray(data))
                outputs = []
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('features'))
                results = self.triton_client.infer('osnet_ensemble',
                                                   inputs,
                                                   outputs=outputs)
                emb = np.squeeze(results.as_numpy('features'))
                embs_i.append(emb / np.linalg.norm(emb))
            embs.append(embs_i)
            scores_filtered.append(scores_i)
            boxes_filters.append(boxes_i)

        return np.asarray(scores_filtered), np.asarray(
            boxes_filters), np.asarray(embs)
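The detector above relies on a bbox_filter helper that is not shown; a minimal hypothetical placeholder (the real filtering criteria are unknown) could simply drop degenerate boxes:

# Hypothetical stand-in for the bbox_filter helper used by detector();
# it keeps only boxes with positive width and height.
def bbox_filter(self, scores, boxes):
    keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
    return scores[keep], boxes[keep]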
Example #7
    def setUp(self):
        self.trials_ = [("repeat_int32", None), ("simple_repeat", None),
                        ("sequence_repeat", None),
                        ("repeat_square", self._nested_validate),
                        ("nested_square", self._nested_validate)]
        self.model_name_ = "repeat_int32"

        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('IN', [1], "INT32"))
        self.inputs_.append(grpcclient.InferInput('DELAY', [1], "UINT32"))
        self.inputs_.append(grpcclient.InferInput('WAIT', [1], "UINT32"))

        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
        self.outputs_.append(grpcclient.InferRequestedOutput('IDX'))
        # Some trials only expect a subset of outputs
        self.requested_outputs_ = self.outputs_
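The repeat_int32 model prepared here is a decoupled model (it can return any number of responses per request), so these inputs and outputs are normally exercised through the gRPC streaming API. A hedged sketch of one streamed request in the same test-fixture context (the input values and queue plumbing are assumptions):

# Hedged sketch: send one request to the decoupled model prepared in setUp().
import queue
from functools import partial

response_queue = queue.Queue()

def stream_callback(q, result, error):
    # Each (possibly partial) response or error lands on the queue.
    q.put((result, error))

self.inputs_[0].set_data_from_numpy(np.array([4], dtype=np.int32))     # IN
self.inputs_[1].set_data_from_numpy(np.array([100], dtype=np.uint32))  # DELAY
self.inputs_[2].set_data_from_numpy(np.array([500], dtype=np.uint32))  # WAIT

triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
triton_client.start_stream(callback=partial(stream_callback, response_queue))
triton_client.async_stream_infer(model_name=self.model_name_,
                                 inputs=self.inputs_,
                                 outputs=self.requested_outputs_)
# Block until the first streamed response arrives, then close the stream.
result, error = response_queue.get()
triton_client.stop_stream()

Example #8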
    def _initialize_model(self):
        input_cfg = self.model_config['config']['input']
        output_cfg = self.model_config['config']['output']

        input_names = [i['name'] for i in input_cfg]
        output_names = [o['name'] for o in output_cfg]
        print('Input layers: ', input_names)
        print('Output layers: ', output_names)

        input_dims = [[int(dim) for dim in input_cfg[i]['dims']]
                      for i in range(len(input_cfg))]
        output_dims = [[int(dim) for dim in output_cfg[i]['dims']]
                       for i in range(len(output_cfg))]
        self.input_shape = input_dims[0]
        self.output_dims = output_dims

        if self.triton_cfg['model']['precision'] == "FP32":
            mult = 4
        elif self.triton_cfg['model']['precision'] == "FP16":
            mult = 2  # TODO: Fix this
        elif self.triton_cfg['model']['precision'] == "INT8":
            mult = 1  # TODO: Fix this
        else:
            print("unsupported precision in config file: " +
                  str(self.triton_cfg['model']['precision']))
            sys.exit()

        input_byte_sizes_list = [
            self._prod(dims) * mult for dims in input_dims
        ]
        output_byte_sizes_list = [
            self._prod(dims) * mult for dims in output_dims
        ]

        for i in range(len(input_cfg)):
            shm_region_name = self.model_name + "_input" + str(i)
            self._register_system_shm_regions(shm_region_name,
                                              self.input_handles,
                                              input_byte_sizes_list[i],
                                              input_names[i])
            self.input_layers.append(
                tritongrpcclient.InferInput(
                    input_names[i],
                    [1, input_dims[i][0], input_dims[i][1], input_dims[i][2]],
                    "FP32"))
            self.input_layers[-1].set_shared_memory(shm_region_name,
                                                    input_byte_sizes_list[i])

        for i in range(len(output_cfg)):
            shm_region_name = self.model_name + "_output" + str(i)
            self._register_system_shm_regions(shm_region_name,
                                              self.output_handles,
                                              output_byte_sizes_list[i],
                                              output_names[i])
            self.output_layers.append(
                tritongrpcclient.InferRequestedOutput(output_names[i]))
            self.output_layers[-1].set_shared_memory(shm_region_name,
                                                     output_byte_sizes_list[i])
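_initialize_model relies on _prod and _register_system_shm_regions helpers that are not shown; _prod presumably just multiplies the dimensions together to get the element count. A minimal assumed version:

# Hypothetical helper assumed by _initialize_model: product of all dims,
# combined with the per-element byte width to size each shm region.
def _prod(self, dims):
    size = 1
    for d in dims:
        size *= int(d)
    return size

Example #9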
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    print("Loading images")

    image_data, labels = load_images(FLAGS.img_dir if FLAGS.img_dir is not None else FLAGS.img)
    image_data = array_from_list(image_data)

    print("Images loaded, inferring")

    # Infer
    outputs = []
    input_name = "INPUT"
    output_name = "OUTPUT"
    input_shape = list(image_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    img_idx = 0
    for batch in batcher(image_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [tritongrpcclient.InferInput(input_name, input_shape, "UINT8")]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        maxs = np.argmax(output0_data, axis=1)
        for i in range(len(maxs)):
            print("Sample ", i, " - label: ", maxs[i], " ~ ", output0_data[i, maxs[i]])
            if maxs[i] != labels[img_idx]:
                sys.exit(1)
            else:
                print("pass")
            img_idx += 1

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
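Example #10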
    def setUp(self):
        self.model_name_ = "repeat_int32"

        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('IN', [1, 1], "INT32"))
        self.inputs_.append(grpcclient.InferInput('DELAY', [1, 1], "UINT32"))
        self.inputs_.append(grpcclient.InferInput('WAIT', [1, 1], "UINT32"))

        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
Example #11
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    input_data = [
        randint(0, 255, size=randint(100), dtype='uint8')
        for _ in range(randint(100) * FLAGS.batch_size)
    ]
    input_data = array_from_list(input_data)

    # Infer
    outputs = []
    input_name = "DALI_INPUT_0"
    output_name = "DALI_OUTPUT_0"
    input_shape = list(input_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    for batch in batcher(input_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [
            tritongrpcclient.InferInput(input_name, input_shape, "UINT8")
        ]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        if not math.isclose(np.mean(output0_data), np.mean(batch)):
            print("Pre/post average does not match")
            sys.exit(1)
        else:
            print("pass")

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
Example #12
    def test_nobatch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 has a batching version with max batch size of 8.
        # The server should return an error if the batch size is not included in the
        # input shapes.
        tensor_shape = (input_size,)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000', verbose=True)
                inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001', verbose=True)
                inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            try:
                results = triton_client.infer(model_name,
                                  inputs,
                                  outputs=outputs)
                self.assertTrue(False, "expected failure with no batch request for batching model")
            except InferenceServerException as ex:
                pass
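Example #13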
    def _prepare_request(self, protocol):
        if (protocol == "grpc"):
            self.inputs_ = []
            self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1],
                                                      "INT32"))
            self.outputs_ = []
            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        else:
            self.inputs_ = []
            self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1],
                                                      "INT32"))
            self.outputs_ = []
            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

        self.inputs_[0].set_data_from_numpy(self.input0_data_)
Example #14
def request_eval(hit_data, row_splits, triton_client, model_name):
    
    np_rs_type = 'int64'
    tr_rs_type = 'INT64'
    
    inputs = []
    outputs = []
    
    
    #print(hit_data.shape)
    #print(row_splits.shape)
    
    inputs.append(tritongrpcclient.InferInput('input_1', hit_data.shape, 'FP32'))
    inputs.append(tritongrpcclient.InferInput('input_2', row_splits.shape, tr_rs_type)) #INT64
    
    inputs[0].set_data_from_numpy(hit_data)
    inputs[1].set_data_from_numpy(row_splits)
    
    outputs.append(tritongrpcclient.InferRequestedOutput('output'))
    outputs.append(tritongrpcclient.InferRequestedOutput('output_1'))
    #outputs.append(tritongrpcclient.InferRequestedOutput('predicted_final_condensates'))
    #outputs.append(tritongrpcclient.InferRequestedOutput('output_row_splits'))
    # predicted_final_1 doesn't matter
    
    results = triton_client.infer(
        model_name=model_name,
        inputs=inputs,
        outputs=outputs
        )
    
    condensates = results.as_numpy('output')
    #condensates = results.as_numpy('predicted_final_condensates')
    #rs = results.as_numpy('output_row_splits')
    
    #print('output',condensates,condensates.shape)
    return condensates
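A hedged usage sketch for request_eval; the hit-feature width, hit count, URL, and model name are arbitrary assumptions, and the row splits follow the usual [0, n_hits] layout for a single example:

# Hedged usage sketch: one event of 128 hits with 9 features per hit.
triton_client = tritongrpcclient.InferenceServerClient(url="localhost:8001")
hit_data = np.random.rand(128, 9).astype(np.float32)
row_splits = np.array([[0], [128]], dtype=np.int64)  # shape must match the model
condensates = request_eval(hit_data, row_splits, triton_client, "hgcal_model")
print(condensates.shape)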
Example #15
def main(_):
    """
    Ask a question about a given context using a model served by Triton.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """
    os.environ[
        "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"  # causes memory fragmentation for BERT, leading to OOM

    tf.compat.v1.logging.info("***** Configuration *****")
    for key in FLAGS.__flags.keys():
        tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
    tf.compat.v1.logging.info("**************************")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the Data
    if FLAGS.question and FLAGS.context:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]
        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    elif FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    else:
        raise ValueError(
            "Either predict_file or question+context needs to be defined")

    # Get Eval Features = Preprocessing
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = False
    model_name = FLAGS.triton_model_name
    model_version = str(FLAGS.triton_model_version)
    batch_size = FLAGS.predict_batch_size

    triton_client = tritongrpcclient.InferenceServerClient(url, verbose)
    model_metadata = triton_client.get_model_metadata(
        model_name=model_name, model_version=model_version)
    model_config = triton_client.get_model_config(model_name=model_name,
                                                  model_version=model_version)

    user_data = UserData()

    max_outstanding = 20
    # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):

        if (outstanding == 0 or do_wait is False):
            return outstanding

        # Wait for deferred items from callback functions
        (result, error, idx, start_time,
         inputs) = user_data._completed_requests.get()

        if (result is None):
            return outstanding

        stop = time.time()

        if (error is not None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outstanding -= 1

        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])
        if FLAGS.trt_engine:
            cls_squad_logits = result.as_numpy("cls_squad_logits")
            try:  #when batch size > 1
                start_logits_results = np.array(
                    cls_squad_logits.squeeze()[:, :, 0])
                end_logits_results = np.array(cls_squad_logits.squeeze()[:, :,
                                                                         1])
            except:
                start_logits_results = np.expand_dims(np.array(
                    cls_squad_logits.squeeze()[:, 0]),
                                                      axis=0)
                end_logits_results = np.expand_dims(np.array(
                    cls_squad_logits.squeeze()[:, 1]),
                                                    axis=0)
        else:
            start_logits_results = result.as_numpy("start_logits")
            end_logits_results = result.as_numpy("end_logits")
        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in start_logits_results[i].flat]
            end_logits = [float(x) for x in end_logits_results[i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)
        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()
    idx = 0
    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        if not FLAGS.trt_engine:
            label_ids_data = np.stack(inputs_dict[label_id_key])
        input_ids_data = np.stack(inputs_dict['input_ids'])
        input_mask_data = np.stack(inputs_dict['input_mask'])
        segment_ids_data = np.stack(inputs_dict['segment_ids'])

        inputs = []
        inputs.append(
            tritongrpcclient.InferInput('input_ids', input_ids_data.shape,
                                        "INT32"))
        inputs[0].set_data_from_numpy(input_ids_data)
        inputs.append(
            tritongrpcclient.InferInput('input_mask', input_mask_data.shape,
                                        "INT32"))
        inputs[1].set_data_from_numpy(input_mask_data)
        inputs.append(
            tritongrpcclient.InferInput('segment_ids', segment_ids_data.shape,
                                        "INT32"))
        inputs[2].set_data_from_numpy(segment_ids_data)
        if not FLAGS.trt_engine:
            inputs.append(
                tritongrpcclient.InferInput(label_id_key, label_ids_data.shape,
                                            "INT32"))
            inputs[3].set_data_from_numpy(label_ids_data)

        outputs = []
        if FLAGS.trt_engine:
            outputs.append(
                tritongrpcclient.InferRequestedOutput('cls_squad_logits'))
        else:
            outputs.append(
                tritongrpcclient.InferRequestedOutput('start_logits'))
            outputs.append(tritongrpcclient.InferRequestedOutput('end_logits'))

        start_time = time.time()
        triton_client.async_infer(model_name,
                                  inputs,
                                  partial(completion_callback, user_data, idx,
                                          start_time, inputs_dict),
                                  request_id=str(idx),
                                  model_version=model_version,
                                  outputs=outputs)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding,
                                          outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while (outstanding > 0):
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write results to json file
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
        print("Latency Average (ms)  =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print(
            "Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
            (FLAGS.context, FLAGS.question, all_predictions[0]))
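Example #16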
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=True,
                                                  class_count=FLAGS.classes))

    # Send request
    result = triton_client.infer(model_name, inputs, outputs=outputs)

    postprocess(result, output_name, input_filenames, batch_size)

    print("PASS")
        "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0,
        input_byte_size)
    triton_client.register_cuda_shared_memory(
        "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0,
        input_byte_size)

    # Set the parameters to use data from shared memory
    inputs = []
    inputs.append(tritongrpcclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input0_data", input_byte_size)

    inputs.append(tritongrpcclient.InferInput('INPUT1', [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input1_data", input_byte_size)

    outputs = []
    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", output_byte_size)

    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = cudashm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
            output0.shape)
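The fragment above assumes the CUDA shared-memory handles were created and filled earlier; a rough setup sketch, assuming the tritonclient.utils.cuda_shared_memory helper (imported as cudashm), GPU 0, and 16 INT32 elements per tensor:

# Hedged sketch: allocate CUDA shared-memory regions, copy the inputs into
# them, and register the output regions (the input registrations appear at
# the top of the fragment above).
import numpy as np
import tritonclient.grpc as tritongrpcclient
import tritonclient.utils.cuda_shared_memory as cudashm

triton_client = tritongrpcclient.InferenceServerClient(url="localhost:8001")

input0_data = np.arange(16, dtype=np.int32)
input1_data = np.ones(16, dtype=np.int32)
input_byte_size = input0_data.nbytes
output_byte_size = input_byte_size

shm_ip0_handle = cudashm.create_shared_memory_region("input0_data", input_byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region("input1_data", input_byte_size, 0)
shm_op0_handle = cudashm.create_shared_memory_region("output0_data", output_byte_size, 0)
shm_op1_handle = cudashm.create_shared_memory_region("output1_data", output_byte_size, 0)

cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
cudashm.set_shared_memory_region(shm_ip1_handle, [input1_data])

triton_client.register_cuda_shared_memory(
    "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, output_byte_size)
triton_client.register_cuda_shared_memory(
    "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, output_byte_size)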
Example #18
    # Infer
    inputs = []
    outputs = []
    # the built engine with input NCHW
    inputs.append(tritongrpcclient.InferInput("data", [1, 3, 608, 608],
                                              "FP32"))

    # Initialize the data
    image_obj = Image("image_id", raw_image_path=FLAGS.img)
    ori_w, ori_h = image_obj.pil_image_obj.size
    image_frame, scale_ratio = preprocess(image_obj.pil_image_obj,
                                          input_image_shape=(608, 608))
    inputs[0].set_data_from_numpy(image_frame)

    outputs.append(tritongrpcclient.InferRequestedOutput("prob"))

    # Test with outputs
    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs,
                                  headers={"test": "1"})

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    print(statistics)

    # Get the output arrays from the results
    output0_data = results.as_numpy("prob")
    n_bbox = int(output0_data[0, 0, 0, 0])
    bbox_matrix = output0_data[0, 1:(n_bbox * 7 + 1), 0, 0].reshape(-1, 7)
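Example #19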
def infer_exact(tester, pf, tensor_shape, batch_size,
                input_dtype, output0_dtype, output1_dtype,
                output0_raw=True, output1_raw=True,
                model_version=None, swap=False,
                outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True,
                use_http_json_tensors=True, skip_request_id_check=False, use_streaming=True,
                correlation_id=0, shm_region_names=None, precreated_shm_regions=None,
                use_system_shared_memory=False, use_cuda_shared_memory=False,
                priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_http_json_tensors or use_grpc or use_streaming)
    configs = []
    if use_http:
            configs.append(("localhost:8000", "http", False, True))
    if output0_raw == output1_raw:
        # Float16 not supported for Input and Output via JSON
        if use_http_json_tensors and (input_dtype != np.float16) and \
            (output0_dtype != np.float16) and (output1_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(np.iinfo(rinput_dtype).min,
                  np.iinfo(routput0_dtype).min,
                  np.iinfo(routput1_dtype).min) / 2
    val_max = min(np.iinfo(rinput_dtype).max,
                  np.iinfo(routput0_dtype).max,
                  np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([unicode(str(x), encoding='utf-8')
                                  for x in (output0_array.flatten())], dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([unicode(str(x), encoding='utf-8')
                                  for x in (output1_array.flatten())], dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array([str(x)
                         for x in input0_array.reshape(input0_array.size)], dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array([str(x)
                         for x in input1_array.reshape(input1_array.size)], dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    OUTPUT0 = "OUTPUT0"
    OUTPUT1 = "OUTPUT1"
    INPUT0 = "INPUT0"
    INPUT1 = "INPUT1"
    if pf == "libtorch" or pf == "libtorch_nobatch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(input0_list_tmp, input1_list_tmp, output0_byte_size,
                                                        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
                                                        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(
            pf, input_dtype, output0_dtype, output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(
                config[0], verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(
                config[0], verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(httpclient.InferInput(
                INPUT0, tensor_shape, np_to_triton_dtype(input_dtype)))
            inputs.append(httpclient.InferInput(
                INPUT1, tensor_shape, np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(grpcclient.InferInput(
                INPUT0, tensor_shape, np_to_triton_dtype(input_dtype)))
            inputs.append(grpcclient.InferInput(
                INPUT1, tensor_shape, np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(
                    input0_array, binary_data=config[3])
                inputs[1].set_data_from_numpy(
                    input1_array, binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions, precreated_shm_regions, shm_handles,
                                input0_byte_size, input1_byte_size, output0_byte_size, output1_byte_size,
                                use_system_shared_memory, use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output0_array.reshape((1,) + tensor_shape)]
            expected1_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output1_array.reshape((1,) + tensor_shape)]
        else:
            expected0_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output0_array.reshape(tensor_shape)]
            expected1_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output1_array.reshape(tensor_shape)]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(httpclient.InferRequestedOutput(
                        OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(
                    shm_regions[2]+'_data', output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3], class_count=num_classes))
                    else:
                        output_req.append(grpcclient.InferRequestedOutput(
                            OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(httpclient.InferRequestedOutput(
                        OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(
                    shm_regions[2+i]+'_data', output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3], class_count=num_classes))
                    else:
                        output_req.append(grpcclient.InferRequestedOutput(
                            OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(np.array_equal(output_data, output0_array),
                                      "{}, {} expected: {}, got {}".format(
                        model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(np.array_equal(output_data, output1_array),
                                      "{}, {} expected: {}, got {}".format(
                        model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                      class_list = results.as_numpy(result_name)
                    else:
                      class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                         for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval, expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(ctuple[2], 'label{}'.format(
                                    expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval, expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles, precreated_shm_regions, outputs,
                                      use_system_shared_memory, use_cuda_shared_memory)

    return results
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes, output_shapes,
               model_version=None, use_http=True, use_grpc=True,
               use_http_json_tensors=True, use_streaming=True, shm_region_name_prefix=None,
               use_system_shared_memory=False, use_cuda_shared_memory=False,
               priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_grpc or use_http_json_tensors or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if use_http_json_tensors and (tensor_dtype != np.float16):
        configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != bool:
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape, dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            # In Python 3 the strings are already unicode, so str() is enough.
            expected_array = np.array([str(x) for x in input_array.flatten()],
                                      dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region([shm_region_name_prefix[0]+str(io_num),
                                                        shm_region_name_prefix[1]+str(io_num)],
                                                        input_list_tmp, input_byte_size, output_byte_size,
                                                        use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(
                config[0], verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(
                config[0], verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(httpclient.InferInput(
                    input_name, input_data.shape, np_to_triton_dtype(tensor_dtype)))
                output_req.append(httpclient.InferRequestedOutput(
                    output_name, binary_data=config[3]))
            else:
                inputs.append(grpcclient.InferInput(
                    input_name, input_data.shape, np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data, binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size, output_byte_size,
                    use_system_shared_memory, use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority, timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority, timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if pf == "libtorch" or pf == "libtorch_nobatch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if output_data.dtype == object and not config[3]:
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(np.array_equal(output_data, expected),
                                "{}, {}, expected: {}, got {}".format(
                                    model_name, result_name, expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0]+str(io_num)+'_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1]+str(io_num)+'_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0]+str(io_num)+'_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1]+str(io_num)+'_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
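
# A minimal usage sketch for infer_zero above. The platform name, dtype and
# shapes are assumptions for illustration; the real tests derive them from the
# model under test. The zero-N models echo each INPUTn tensor back on OUTPUTn,
# which is why expected_dict is built from copies of the input arrays.
import numpy as np

def example_infer_zero_call(tester):
    infer_zero(tester, "graphdef", batch_size=1, tensor_dtype=np.int32,
               input_shapes=[(1, 16)], output_shapes=[(1, 16)],
               use_system_shared_memory=False, use_cuda_shared_memory=False)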
Example #21
0
        print("channel creation failed: " + str(e))
        sys.exit(1)

    with open(args.label_file) as f:
        labels_dict = {idx: line.strip() for idx, line in enumerate(f)}

    inputs = []
    outputs = []
    input_name = "INPUT"
    output_name = "OUTPUT"
    # load_image() is not defined in this snippet; see the sketch after this example.
    image_data = load_image(args.image)
    image_data = np.expand_dims(image_data, axis=0)

    inputs.append(
        tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8"))
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    inputs[0].set_data_from_numpy(image_data)
    start_time = time.time()
    # Test with outputs
    results = triton_client.infer(model_name=args.model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    latency = time.time() - start_time

    output0_data = results.as_numpy(output_name)

    maxs = np.argmax(output0_data, axis=1)

    print("{}ms class: {}".format(latency, labels_dict[maxs[0]]))
Example #22
0
            # build_edge_index() is not defined in this snippet; a sketch
            # follows this example.
            edge_index = build_edge_index(x.shape[0], data['Ri_rows'],
                                          data['Ri_cols'], data['Ro_rows'],
                                          data['Ro_cols'])
            print(x.shape, edge_index.shape)

        nnodes = x.shape[0]
        nedges = edge_index.shape[1]

        inputs.append(tritongrpcclient.InferInput('x__0', [nnodes, 5], 'FP32'))
        inputs.append(
            tritongrpcclient.InferInput('edge_index__1', [2, nedges], "INT64"))

        inputs[0].set_data_from_numpy(x)
        inputs[1].set_data_from_numpy(edge_index)

        outputs.append(tritongrpcclient.InferRequestedOutput('output__0'))

        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)
        output0_data = results.as_numpy('output__0')
        print(output0_data)
        del output0_data

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    print(statistics)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
    print('PASS: infer')
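
# build_edge_index() is not shown in the snippet above. A minimal sketch,
# assuming Ri_*/Ro_* are the COO coordinates of the receiving/sending
# incidence matrices (node index in *_rows, edge index in *_cols), a common
# encoding for graph-network tracking data; the real helper may differ.
import numpy as np

def build_edge_index(num_nodes, Ri_rows, Ri_cols, Ro_rows, Ro_cols):
    # Row 0 holds the sender node of each edge and row 1 the receiver,
    # ordered by edge id, producing the [2, num_edges] INT64 tensor that
    # the 'edge_index__1' input above expects.
    num_edges = len(Ri_cols)
    edge_index = np.zeros((2, num_edges), dtype=np.int64)
    edge_index[0, np.asarray(Ro_cols)] = np.asarray(Ro_rows)  # senders
    edge_index[1, np.asarray(Ri_cols)] = np.asarray(Ri_rows)  # receivers
    if num_edges:
        assert int(edge_index.max()) < num_nodes  # sanity-check node indices
    return edge_index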