def create_sequence_ensemble_modelconfig(base_model, models_dir, max_batch,
                                         model_version, shape, dtype):
    # No validation as long as the base model supports the type and shape
    model_dtype = np_to_model_dtype(dtype)
    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        # Use a different model name for the non-batching variant
        ensemble_model_name = "{}_{}{}".format(
            ensemble_type, base_model, "_nobatch" if max_batch == 0 else "")
        model_name = tu.get_sequence_model_name(ensemble_model_name, dtype)
        base_model_name = tu.get_sequence_model_name(
            "{}{}".format(base_model, "_nobatch" if max_batch == 0 else ""),
            dtype)

        ensemble_schedule = SequenceEnsembleSchedule(
            ensemble_type).get_schedule(base_model_name, shape, model_dtype)

        config_dir = models_dir + "/" + model_name
        config = create_general_modelconfig(model_name, "ensemble", max_batch,
                                            [dtype], [shape], [None], [dtype],
                                            [shape], [None], [None])
        config += ensemble_schedule

        try:
            os.makedirs(config_dir)
        except OSError:
            pass  # ignore existing dir

        with open(config_dir + "/config.pbtxt", "w") as cfile:
            cfile.write(config)
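# For reference, a minimal sketch of the name composition above. The
# ensemble type and base model used here ("fan", "onnx") are illustrative
# placeholders, not necessarily members of BASIC_ENSEMBLE_TYPES:
#
#   "{}_{}{}".format("fan", "onnx", "_nobatch")  ->  "fan_onnx_nobatch"
#   "{}_{}{}".format("fan", "onnx", "")          ->  "fan_onnx"
#
# tu.get_sequence_model_name() then maps that name to the final model
# directory created under models_dir.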
def run(self,
        client_metadata,
        len_mean=SEQUENCE_LENGTH_MEAN,
        len_stddev=SEQUENCE_LENGTH_STDEV):
    trial = self.get_trial()
    dtype = self.get_datatype(trial)
    model_name = tu.get_sequence_model_name(trial, dtype)
    if not self.check_constraints(model_name, client_metadata[1]):
        return None

    # Mark that this correlation ID of the model is not used for a
    # no-end sequence
    if model_name not in self.sequence_constraints_:
        self.sequence_constraints_[model_name] = {}
    self.sequence_constraints_[model_name][client_metadata[1]] = False

    # Create two variable-length sequences with "start" and "end"
    # flags, where both sequences use the same correlation ID and are
    # sent back-to-back.
    seqlen = [
        max(1, int(self.rng_.normal(len_mean, len_stddev))),
        max(1, int(self.rng_.normal(len_mean, len_stddev)))
    ]
    print("{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format(
        self.name_, client_metadata[1], seqlen[0], seqlen[1]),
          file=self.out_stream_)

    values = [
        self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype),
        self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype)
    ]

    for p in [0, 1]:
        steps = []
        expected_result = 0

        for idx in range(seqlen[p]):
            flags = ""
            if idx == 0:
                flags += ",start"
            if idx == (seqlen[p] - 1):
                flags += ",end"

            val = values[p][idx]
            delay_ms = None
            expected_result += val
            expected_result = self.get_expected_result(
                expected_result, val, trial, flags)

            # (flag_str, value, expected_result, delay_ms)
            steps.append((flags, val, expected_result, delay_ms))

    return self.check_sequence_async(client_metadata,
                                     trial,
                                     model_name,
                                     dtype,
                                     steps,
                                     sequence_name=self.name_)
def run(self,
        client_metadata,
        len_mean=SEQUENCE_LENGTH_MEAN,
        len_stddev=SEQUENCE_LENGTH_STDEV):
    trial = self.get_trial()
    dtype = self.get_datatype(trial)
    model_name = tu.get_sequence_model_name(trial, dtype)
    if not self.check_constraints(model_name, client_metadata[1]):
        return None

    # Mark that this correlation ID of the model is used for a no-end
    # sequence
    if model_name not in self.sequence_constraints_:
        self.sequence_constraints_[model_name] = {}
    self.sequence_constraints_[model_name][client_metadata[1]] = True

    # Create a variable-length sequence with a "start" flag but that
    # never ends. The sequence should be aborted by the server and its
    # slot reused for another sequence.
    seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev)))
    print("{} {}: no-end seqlen = {}".format(self.name_,
                                             client_metadata[1], seqlen),
          file=self.out_stream_)

    values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)

    steps = []
    expected_result = 0
    for idx in range(seqlen):
        flags = ""
        if idx == 0:
            flags = "start"

        val = values[idx]
        delay_ms = None
        expected_result += val
        expected_result = self.get_expected_result(expected_result, val,
                                                   trial, flags)

        # (flag_str, value, expected_result, delay_ms)
        steps.append((flags, val, expected_result, delay_ms))

    return self.check_sequence_async(client_metadata,
                                     trial,
                                     model_name,
                                     dtype,
                                     steps,
                                     sequence_name=self.name_)
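# Context for the no-end scenario above: the server-side abort relies on
# the sequence batcher's idle timeout (the max_sequence_idle_microseconds
# setting in the model's sequence_batching config). Once this correlation
# ID stops sending requests without ever sending an END flag, the
# scheduler times the sequence out and frees its slot for another
# sequence.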
def create_sequence_ensemble_modelfile(base_model, models_dir, max_batch,
                                       model_version, shape, dtype):
    # No actual model file in ensemble model, so only create the
    # version directory
    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        # Use a different model name for the non-batching variant
        ensemble_model_name = "{}_{}{}".format(
            ensemble_type, base_model, "_nobatch" if max_batch == 0 else "")
        model_name = tu.get_sequence_model_name(ensemble_model_name, dtype)

        model_version_dir = models_dir + "/" + model_name + "/" + str(
            model_version)

        try:
            os.makedirs(model_version_dir)
        except OSError:
            pass  # ignore existing dir
def run(self, client_metadata):
    trial = self.get_trial()
    dtype = self.get_datatype(trial)
    model_name = tu.get_sequence_model_name(trial, dtype)
    if not self.check_constraints(model_name, client_metadata[1]):
        return None

    # Mark that this correlation ID of the model is not used for a
    # no-end sequence
    if model_name not in self.sequence_constraints_:
        self.sequence_constraints_[model_name] = {}
    self.sequence_constraints_[model_name][client_metadata[1]] = False

    # Create a sequence without a "start" flag. The sequence should get
    # an error from the server.
    seqlen = 1
    print("{} {}: no-start seqlen = {}".format(self.name_,
                                               client_metadata[1], seqlen),
          file=self.out_stream_)

    values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)

    steps = []
    for idx in range(seqlen):
        flags = None
        val = values[idx]
        delay_ms = None

        # (flag_str, value, expected_result, delay_ms)
        steps.append((flags, val, None, delay_ms))

    try:
        self.check_sequence_async(client_metadata, trial, model_name,
                                  dtype, steps)
        # Reaching this point means the no-start sequence was sent with
        # a correlation ID that was used for a no-end sequence, which
        # means the constraints check above is inaccurate
        assert False, "expected inference failure from missing START flag"
    except AssertionError:
        # Don't swallow the assertion above
        raise
    except Exception as ex:
        if "must specify the START flag" not in ex.message():
            raise

    # Getting the "must specify the START flag" error is the success
    # case
    return seqlen
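# For context, a minimal sketch of what a check_constraints() helper
# consistent with the bookkeeping above could look like. This is a
# hypothetical illustration, not the actual helper: it refuses to reuse
# a correlation ID that an earlier scenario marked as a no-end sequence,
# since the server may still be holding that sequence's slot.
def _example_check_constraints(self, model_name, correlation_id):
    # True in sequence_constraints_ means "used for a no-end sequence"
    used_for_no_end = self.sequence_constraints_.get(model_name, {})
    return not used_for_no_end.get(correlation_id, False)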
# Initialize the random seed. For reproducibility each thread
# maintains its own RNG which is initialized based on this seed.
randseed = 0
if FLAGS.random_seed is not None:
    randseed = FLAGS.random_seed
else:
    randseed = int(time.time())
np.random.seed(randseed)

print("random seed = {}".format(randseed))
print("concurrency = {}".format(FLAGS.concurrency))
print("iterations = {}".format(FLAGS.iterations))

trial = "custom"
dtype = get_datatype(trial)
model_name = tu.get_sequence_model_name(trial, dtype)

threads = []
for idx in range(FLAGS.concurrency):
    thread_name = "thread_{}".format(idx)

    # Create the seed for the thread. Since these are created in
    # reproducible order off of the initial seed we will get
    # reproducible results when given the same seed.
    seed = np.random.randint(2**32)

    # Each thread is reserved a block of correlation IDs of size
    # CORRELATION_ID_BLOCK_SIZE
    correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE)

    threads.append(
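# A quick illustration of the correlation ID partitioning above, assuming
# CORRELATION_ID_BLOCK_SIZE is 1024 * 1024 (the constant is defined
# elsewhere in this file, so the value here is an assumption):
#
#   thread 0 -> correlation_id_base = 1
#   thread 1 -> correlation_id_base = 1 + 1048576 = 1048577
#   thread 2 -> correlation_id_base = 1 + 2097152 = 2097153
#
# Each thread draws correlation IDs only from its own block, so no two
# threads can ever collide on a correlation ID.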
def test_sequence_different_shape_values(self):
    # The test model instances together are configured with
    # total-batch-size 4. Send four equal-length sequences where two
    # of them (correlation IDs 1001 and 1004) use the same shape value
    # and the other two use distinct shape values. Make sure the two
    # sequences with the same shape value batch together but the other
    # two sequences do not.
    self.clear_deferred_exceptions()
    dtype = np.float32
    precreated_shm0_handles = self.precreate_register_shape_tensor_regions(
        ((1, 1), (1, 2), (1, 3)), dtype, 0)
    precreated_shm1_handles = self.precreate_register_shape_tensor_regions(
        ((32, 11), (32, 12), (32, 13)), dtype, 1)
    precreated_shm2_handles = self.precreate_register_shape_tensor_regions(
        ((16, 111), (16, 112), (16, 113)), dtype, 2)
    precreated_shm3_handles = self.precreate_register_shape_tensor_regions(
        ((1, 1111), (1, 1112), (1, 1113)), dtype, 3)
    try:
        model_name = tu.get_sequence_model_name("plan", dtype)

        self.check_setup(model_name)

        # Need scheduler to wait for queue to contain all
        # inferences for both sequences.
        self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ)
        self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]),
                         12)
        self.assertTrue(
            "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ)
        self.assertEqual(
            int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0)

        threads = []
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1001,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 1, 1, None), (None, 1, 2, None),
                     ("end", 1, 3, None)),
                    self.get_expected_result(6, 3, "end"),
                    precreated_shm0_handles),
                kwargs={
                    'sequence_name': "{}".format(self._testMethodName)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1002,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 32, 11, None), (None, 32, 12, None),
                     ("end", 32, 13, None)),
                    self.get_expected_result(36, 13, "end"),
                    precreated_shm1_handles),
                kwargs={
                    'sequence_name': "{}".format(self._testMethodName)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1003,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 16, 111, None), (None, 16, 112, None),
                     ("end", 16, 113, None)),
                    self.get_expected_result(336, 113, "end"),
                    precreated_shm2_handles),
                kwargs={
                    'sequence_name': "{}".format(self._testMethodName)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1004,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 1, 1111, None), (None, 1, 1112, None),
                     ("end", 1, 1113, None)),
                    self.get_expected_result(3336, 1113, "end"),
                    precreated_shm3_handles),
                kwargs={
                    'sequence_name': "{}".format(self._testMethodName)
                }))

        for t in threads:
            t.start()
            time.sleep(1)
        for t in threads:
            t.join()
        self.check_deferred_exception()
        self.check_status(model_name, {4: 3, 3: 6}, 9, 12)
    except Exception as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
    finally:
        if TEST_SYSTEM_SHARED_MEMORY:
            self.cleanup_shm_regions(precreated_shm0_handles)
            self.cleanup_shm_regions(precreated_shm1_handles)
            self.cleanup_shm_regions(precreated_shm2_handles)
            self.cleanup_shm_regions(precreated_shm3_handles)
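# The expected results passed above are just the running sums of each
# sequence's input values: 1 + 2 + 3 = 6, 11 + 12 + 13 = 36,
# 111 + 112 + 113 = 336, and 1111 + 1112 + 1113 = 3336. The final value
# and the "end" flag are forwarded so that get_expected_result() can,
# presumably, adjust the sum for backends that cannot implement the full
# accumulator.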
def __init__(self,
             name,
             rng,
             sequence_trials,
             identity_trials,
             queue_latency_range_us=(10000, 100000),
             sequence_id_range=None,
             verbose=False,
             out_stream=sys.stdout):
    super().__init__(name, [], verbose, out_stream)
    self.rng_ = rng
    self.sequence_id_range_ = sequence_id_range

    # List of tuples
    # (model_name, max_concurrency, batch_size, list(more PA options),
    #  real_data_file)
    self.options_ = []

    # Add models without output validation
    self.options_.append(
        PerfAnalyzerScenario.ModelOption("resnet_v1_50_graphdef_def", 32,
                                         (1, 4, 1), queue_latency_range_us))
    for trial in sequence_trials:
        dtype = self.get_datatype(trial)
        # Skip string sequence models for now; it is hard for PA to
        # generate valid input for them
        if dtype == np.dtype(object):
            continue
        model_name = tu.get_sequence_model_name(trial, dtype)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us))
    for trial in identity_trials:
        dtype = np.float32
        model_name = tu.get_zero_model_name(trial, 1, dtype)
        if "libtorch" in trial:
            input_shapes = [("INPUT__0", "16")]
        else:
            input_shapes = [("INPUT0", "16")]
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_shapes))

    # Add output-validation versions of the models. Skip resnet as its
    # output data has variation which makes exact matching hard
    for trial in sequence_trials:
        dtype = self.get_datatype(trial)
        model_name = tu.get_sequence_model_name(trial, dtype)
        data_file = os.path.join("validation_data",
                                 "{}.json".format(model_name))
        self.generate_sequence_data(trial, dtype, data_file)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_file=data_file))
    for trial in identity_trials:
        dtype = np.float32
        model_name = tu.get_zero_model_name(trial, 1, dtype)
        data_file = os.path.join("validation_data",
                                 "{}.json".format(model_name))
        self.generate_identity_data(trial, dtype, data_file)
        self.options_.append(
            PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
                                             queue_latency_range_us,
                                             input_file=data_file))
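# For reference, a rough sketch of the kind of JSON file that
# generate_sequence_data() might write for perf_analyzer. The layout is
# an assumption based on perf_analyzer's documented --input-data format,
# where each inner array is one sequence and each element is one step;
# the input name "INPUT" is illustrative and would differ for backends
# such as libtorch ("INPUT__0"):
#
#   {
#     "data": [
#       [
#         {"INPUT": [1]},
#         {"INPUT": [2]},
#         {"INPUT": [3]}
#       ]
#     ]
#   }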
def test_sequence_different_shape_values(self):
    # The test model instances together are configured with
    # total-batch-size 4. Send four equal-length sequences with
    # different shape values in parallel. Because the sequence batcher
    # currently doesn't support ragged batches, the requests will
    # still get batched together but the result will be incorrect.
    self.clear_deferred_exceptions()
    dtype = np.float32
    precreated_shm0_handles = self.precreate_register_shape_tensor_regions(
        ((1, 1), (1, 2), (1, 3)), dtype, 0)
    precreated_shm1_handles = self.precreate_register_shape_tensor_regions(
        ((32, 11), (32, 12), (32, 13)), dtype, 1)
    precreated_shm2_handles = self.precreate_register_shape_tensor_regions(
        ((16, 111), (16, 112), (16, 113)), dtype, 2)
    precreated_shm3_handles = self.precreate_register_shape_tensor_regions(
        ((1, 1111), (1, 1112), (1, 1113)), dtype, 3)
    try:
        model_name = tu.get_sequence_model_name("plan", dtype)
        protocol = "streaming"

        self.check_setup(model_name)

        # Need scheduler to wait for queue to contain all
        # inferences for both sequences.
        self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
        self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 12)
        self.assertTrue("TRTSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ)
        self.assertEqual(
            int(os.environ["TRTSERVER_BACKLOG_DELAY_SCHEDULER"]), 0)

        threads = []
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1001,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 1, 1, None), (None, 1, 2, None),
                     ("end", 1, 3, None)),
                    self.get_expected_result(6, 3, "end"),
                    protocol,
                    precreated_shm0_handles),
                kwargs={
                    'sequence_name':
                        "{}_{}".format(self._testMethodName, protocol)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1002,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 32, 11, None), (None, 32, 12, None),
                     ("end", 32, 13, None)),
                    self.get_expected_result(36, 13, "end"),
                    protocol,
                    precreated_shm1_handles),
                kwargs={
                    'sequence_name':
                        "{}_{}".format(self._testMethodName, protocol)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1003,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 16, 111, None), (None, 16, 112, None),
                     ("end", 16, 113, None)),
                    self.get_expected_result(336, 113, "end"),
                    protocol,
                    precreated_shm2_handles),
                kwargs={
                    'sequence_name':
                        "{}_{}".format(self._testMethodName, protocol)
                }))
        threads.append(
            threading.Thread(
                target=self.check_sequence_shape_tensor_io,
                args=(
                    model_name,
                    dtype,
                    1004,
                    (None, None),
                    # (flag_str, shape_value, value, pre_delay_ms)
                    (("start", 1, 1111, None), (None, 1, 1112, None),
                     ("end", 1, 1113, None)),
                    self.get_expected_result(3336, 1113, "end"),
                    protocol,
                    precreated_shm3_handles),
                kwargs={
                    'sequence_name':
                        "{}_{}".format(self._testMethodName, protocol)
                }))

        for t in threads:
            t.start()
        for t in threads:
            t.join()
        self.check_failure()
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
    finally:
        if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY:
            self.cleanup_shm_regions(precreated_shm0_handles)
            self.cleanup_shm_regions(precreated_shm1_handles)
            self.cleanup_shm_regions(precreated_shm2_handles)
            self.cleanup_shm_regions(precreated_shm3_handles)