def test_cpu_stats(self):
    """Verify cpu_stats() reaches the mocked backend for both local and
    docker servers.

    Relies on self.server_local_mock / self.server_docker_mock installed
    in setUp.
    """
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH
    gpus = ['all']

    # Test local server cpu_stats
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    self.server.start()
    _, _ = self.server.cpu_stats()
    self.server_local_mock.assert_cpu_stats_called()
    self.server.stop()

    # Test docker server cpu stats
    self.server = TritonServerFactory.create_server_docker(
        image=TRITON_IMAGE, config=server_config, gpus=gpus)
    self.server.start()

    # The following needs to be called as it resets exec_run return value
    # (ordering matters: cpu_stats() below depends on that reset).
    self.server_docker_mock.assert_server_process_start_called_with(
        TRITON_DOCKER_BIN_PATH + ' ' + server_config.to_cli_string(),
        MODEL_REPOSITORY_PATH, TRITON_IMAGE, 8000, 8001, 8002)
    _, _ = self.server.cpu_stats()
    self.server_docker_mock.assert_cpu_stats_called()
    self.server.stop()
def test_server_config(self):
    """Exercise TritonServerConfig get/set, rejection of unknown keys,
    and round-tripping through to_cli_string()."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH

    # A supported but unset argument must read back as None
    self.assertIsNone(config[CONFIG_TEST_ARG],
                      msg="Server config had unexpected initial"
                      f"value for {CONFIG_TEST_ARG}")

    # Setting then reading the same argument round-trips
    config[CONFIG_TEST_ARG] = True
    self.assertTrue(config[CONFIG_TEST_ARG],
                    msg=f"{CONFIG_TEST_ARG} was not set")

    # Unsupported arguments are rejected with an exception
    with self.assertRaises(TritonModelAnalyzerException,
                           msg="Expected exception on trying to set"
                           "unsupported argument in Triton server"
                           "config"):
        config['dummy'] = 1

    # Clear the test argument before generating the CLI string
    config[CONFIG_TEST_ARG] = None

    # Populate a handful of known args, then validate the CLI string
    for key, val in CLI_TO_STRING_TEST_ARGS.items():
        config[key] = val

    for token in config.to_cli_string().split():
        # Each token has the shape --<arg>=<value>
        name, raw = token.split('=')
        name = name[2:]

        # Every parsed arg must come from the test dict
        self.assertIn(name, CLI_TO_STRING_TEST_ARGS,
                      msg=f"CLI string contained unknown argument: {name}")

        # Value must match after converting back to the original type
        expected = CLI_TO_STRING_TEST_ARGS[name]
        self.assertEqual(
            expected,
            type(expected)(raw),
            msg=f"CLI string contained unknown value: {raw}")
def test_triton_server_ssl_options(self):
    """SSL-related gRPC flags must survive update_config() and appear,
    in order, in to_cli_string()."""
    ssl_flags = {
        'grpc-use-ssl': '1',
        'grpc-use-ssl-mutual': '1',
        'grpc-server-cert': 'a',
        'grpc-server-key': 'b',
        'grpc-root-cert': 'c',
    }
    config = TritonServerConfig()
    config.update_config(ssl_flags)

    # Build the expected string from the same insertion-ordered dict
    expected = ' '.join(f"--{name}={value}"
                        for name, value in ssl_flags.items())
    self.assertEqual(config.to_cli_string(), expected)
def _test_create_server(self, gpus):
    """Both factory paths must succeed with a model repository set and
    assert when it is missing."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH

    # Both environments construct successfully with a valid repository
    self.server = TritonServerFactory.create_server_docker(
        image=TRITON_IMAGE, config=config, gpus=gpus)
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=config, gpus=gpus)

    # Without a model repository, creation must raise AssertionError
    config['model-repository'] = None
    failure_msg = ("Expected AssertionError for trying to create"
                   "server without specifying model repository.")
    with self.assertRaises(AssertionError, msg=failure_msg):
        self.server = TritonServerFactory.create_server_docker(
            image=TRITON_IMAGE, config=config, gpus=gpus)
    with self.assertRaises(AssertionError, msg=failure_msg):
        self.server = TritonServerFactory.create_server_local(
            path=TRITON_LOCAL_BIN_PATH, config=config, gpus=gpus)
def start_stop_docker_args(self): device_requests, gpu_uuids = self._find_correct_gpu_settings( self._sys_gpus) # Create a TritonServerConfig server_config = TritonServerConfig() server_config['model-repository'] = MODEL_REPOSITORY_PATH # Create mounts and labels mounts = [ '/host/path:/dest/path:ro', '/another/host/path:/some/dest/path:rw' ] labels = {'RUNNER_ID': 'TEST_RUNNER_ID'} environment = {'VARIABLE': 'VALUE'} # Create server in docker, start , wait, and stop self.server = TritonServerFactory.create_server_docker( image=TRITON_IMAGE, config=server_config, gpus=self._sys_gpus, mounts=mounts, labels=labels) # Start server check that mocked api is called self.server.start(env=environment) self.server_docker_mock.assert_server_process_start_called_with( f"{TRITON_DOCKER_BIN_PATH} {server_config.to_cli_string()}", MODEL_REPOSITORY_PATH, TRITON_IMAGE, device_requests, gpu_uuids, 8000, 8001, 8002, mounts, labels) # Stop container and check api calls self.server.stop() self.server_docker_mock.assert_server_process_terminate_called()
def test_monitor_disable(self):
    """With an empty metrics list the CPU monitor must never query the
    server's cpu stats."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH
    devices = [
        GPUDevice('TEST_DEVICE_NAME', 0, "TEST_PCI_BUS_ID", "TEST_UUID")
    ]

    triton = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=config, gpus=devices)
    triton.start()

    # Record for a short window with no metrics requested
    # (frequency=1s, window=2s)
    monitor = CPUMonitor(triton, 1, [])
    monitor.start_recording_metrics()
    time.sleep(2)
    monitor.stop_recording_metrics()

    # Nothing was requested, so cpu_stats must not have been called
    self.server_local_mock.assert_cpu_stats_not_called()

    monitor.destroy()
    triton.stop()
def test_measurement_interval_increase(self):
    """When perf_analyzer repeatedly fails to stabilize in time_windows
    mode, it should retry up to max_retries (10) times.
    """
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus)
    # NOTE(review): perf_analyzer_config is built here but never passed to
    # PerfAnalyzer (which receives self.config) — confirm whether it should
    # be wired in or removed.
    perf_analyzer_config = PerfAnalyzerConfig()
    perf_analyzer_config['model-name'] = TEST_MODEL_NAME
    perf_analyzer_config['concurrency-range'] = TEST_CONCURRENCY_RANGE
    perf_analyzer_config['measurement-mode'] = 'time_windows'
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 max_retries=10,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()

    # Test failure to stabilize for measurement windows
    self.client.wait_for_server_ready(num_retries=1)
    test_stabilize_output = "Please use a larger time window"
    self.perf_mock.set_perf_analyzer_result_string(test_stabilize_output)
    self.perf_mock.set_perf_analyzer_return_code(1)
    perf_metrics = [PerfThroughput, PerfLatencyP99]
    perf_analyzer.run(perf_metrics)
    # One popen read per retry attempt
    self.assertEqual(
        self.perf_mock.get_perf_analyzer_popen_read_call_count(), 10)
def test_measurement_request_count_increase(self):
    """When perf_analyzer repeatedly fails to stabilize in count mode,
    it should retry up to max_retries (10) times."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH

    # Bring up a local server, an analyzer with 10 retries, and a client
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=config, gpus=self.gpus)
    analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                            config=self.config,
                            max_retries=10,
                            timeout=100,
                            max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # Force every attempt to fail to stabilize
    stabilize_failure = "Please use a larger time window"
    self.perf_mock.set_perf_analyzer_result_string(stabilize_failure)
    self.perf_mock.set_perf_analyzer_return_code(1)
    analyzer.run([PerfThroughput, PerfLatencyP99])

    # One popen read per retry attempt
    self.assertEqual(
        self.perf_mock.get_perf_analyzer_popen_read_call_count(), 10)
def _test_cpu_stats(self, gpus): device_requests = [device.device_id() for device in gpus] # Create a TritonServerConfig server_config = TritonServerConfig() server_config['model-repository'] = MODEL_REPOSITORY_PATH # Test local server cpu_stats self.server = TritonServerFactory.create_server_local( path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=gpus) self.server.start() _, _ = self.server.cpu_stats() self.server_local_mock.assert_cpu_stats_called() self.server.stop() # Test docker server cpu stats self.server = TritonServerFactory.create_server_docker( image=TRITON_IMAGE, config=server_config, gpus=gpus) self.server.start() # The following needs to be called as it resets exec_run return value self.server_docker_mock.assert_server_process_start_called_with( f'{TRITON_DOCKER_BIN_PATH} {server_config.to_cli_string()}', MODEL_REPOSITORY_PATH, TRITON_IMAGE, device_requests, gpus, 8000, 8001, 8002) _, _ = self.server.cpu_stats() self.server_docker_mock.assert_cpu_stats_called() self.server.stop()
def test_run(self):
    """End-to-end PerfAnalyzer.run() test against the mocked binary:
    command construction, latency/throughput output parsing, and the
    exception path.
    """
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH, config=self.config)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # Run perf analyzer with dummy metrics to check command parsing
    # (builtin `id` stands in for a metric class here)
    perf_metrics = [id]
    test_latency_output = "Avg latency: 5000 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_analyzer.run(perf_metrics)
    self.perf_mock.assert_perf_analyzer_run_as(
        [PERF_BIN_PATH, '-m', TEST_MODEL_NAME])

    # Test latency parsing
    test_latency_output = "Avg latency: 5000 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_metrics = [PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5000)

    # Test throughput parsing
    test_throughput_output = "Throughput: 46.8 infer/sec\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    perf_metrics = [PerfThroughput]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test parsing for both
    test_both_output = "Throughput: 0.001 infer/sec\nAvg latency: 3.6 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    perf_metrics = [PerfThroughput, PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0].value(), 0.001)
    self.assertEqual(records[1].value(), 3.6)

    # Test exception handling
    with self.assertRaisesRegex(
            expected_exception=TritonModelAnalyzerException,
            expected_regex="Running perf_analyzer with",
            msg="Expected TritonModelAnalyzerException"):
        self.perf_mock.raise_exception_on_run()
        perf_analyzer.run(perf_metrics)
    self.server.stop()
def test_run(self):
    """End-to-end PerfAnalyzer.run() test against the mocked binary:
    command construction (including --measurement-interval), output
    parsing with us->ms latency conversion, and the failure return path.
    """
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # Run perf analyzer with dummy metrics to check command parsing
    # (builtin `id` stands in for a metric class here)
    perf_metrics = [id]
    test_latency_output = "p99 latency: 5000 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_analyzer.run(perf_metrics)
    self.perf_mock.assert_perf_analyzer_run_as([
        PERF_BIN_PATH, '-m', TEST_MODEL_NAME, '--measurement-interval',
        str(self.config['measurement-interval'])
    ])

    # Test latency parsing: 5000 us reported -> 5 (ms) record
    test_latency_output = "p99 latency: 5000 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_metrics = [PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5)

    # Test throughput parsing
    test_throughput_output = "Throughput: 46.8 infer/sec\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    perf_metrics = [PerfThroughput]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test parsing for both: 3600 us -> 3.6 (ms)
    test_both_output = "Throughput: 0.001 infer/sec\np99 latency: 3600 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    perf_metrics = [PerfThroughput, PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0].value(), 0.001)
    self.assertEqual(records[1].value(), 3.6)

    # Test exception handling: run() returns a truthy (nonzero) status
    # when perf_analyzer exits with a failure return code.
    # Fix: the original passed a stray `1` as assertTrue's msg parameter,
    # which silently did nothing; the truthiness check is the real assert.
    self.perf_mock.set_perf_analyzer_return_code(1)
    self.assertTrue(perf_analyzer.run(perf_metrics))
    self.server.stop()
def test_start_wait_stop_gpus(self):
    """Start/wait/stop lifecycle for docker and local servers, including
    the wait_for_ready failure path driven by the mocked status code.
    """
    # Create a TritonServerConfig
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # Create server in docker, start , wait, and stop
    self.server = TritonServerFactory.create_server_docker(
        model_path=MODEL_LOCAL_PATH,
        image=TRITON_IMAGE,
        config=server_config)

    # Set mock status_code to error, and generate exception
    self._mock_server_wait_for_ready(assert_raises=True)

    # Start server check that mocked api is called
    self.server.start()
    self.server_docker_mock.assert_server_process_start_called_with(
        TRITON_DOCKER_BIN_PATH + ' ' + server_config.to_cli_string(),
        MODEL_LOCAL_PATH, MODEL_REPOSITORY_PATH, TRITON_IMAGE, 8000, 8001,
        8002)

    # Mock status code for connected server then stop
    self._mock_server_wait_for_ready(assert_raises=False)

    # Stop container and check api calls
    self.server.stop()
    self.server_docker_mock.assert_server_process_terminate_called()

    # Create local server which runs triton as a subprocess
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    self._mock_server_wait_for_ready(assert_raises=True)

    # Check that API functions are called
    self.server.start()
    self.server_local_mock.assert_server_process_start_called_with(cmd=[
        TRITON_LOCAL_BIN_PATH, '--model-repository', MODEL_REPOSITORY_PATH
    ])
    self._mock_server_wait_for_ready(assert_raises=False)
    self.server.stop()
    self.server_local_mock.assert_server_process_terminate_called()
def test_start_stop_gpus(self):
    """Start/stop lifecycle for docker and local servers, including the
    container-run exception path injected via the docker mock.
    """
    # Create a TritonServerConfig
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH
    gpus = ['all']

    # Create server in docker, start , wait, and stop
    self.server = TritonServerFactory.create_server_docker(
        image=TRITON_IMAGE, config=server_config, gpus=gpus)

    # Start server check that mocked api is called
    self.server.start()
    self.server_docker_mock.assert_server_process_start_called_with(
        TRITON_DOCKER_BIN_PATH + ' ' + server_config.to_cli_string(),
        MODEL_REPOSITORY_PATH, TRITON_IMAGE, 8000, 8001, 8002)

    # Inject a container-run failure and expect the wrapped exception
    self.server_docker_mock.raise_exception_on_container_run()
    with self.assertRaises(TritonModelAnalyzerException):
        self.server.start()
    self.server_docker_mock.stop_raise_exception_on_container_run()

    # Stop container and check api calls
    self.server.stop()
    self.server_docker_mock.assert_server_process_terminate_called()

    # Create local server which runs triton as a subprocess
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)

    # Check that API functions are called
    self.server.start()
    self.server_local_mock.assert_server_process_start_called_with(cmd=[
        TRITON_LOCAL_BIN_PATH, '--model-repository', MODEL_REPOSITORY_PATH
    ])
    self.server.stop()
    self.server_local_mock.assert_server_process_terminate_called()
def setUp(self):
    """Install the docker/client method mocks and construct (but do not
    start) an explicit-mode docker Triton server."""
    # Install the method mocks before anything touches docker or the client
    self.mock_server_docker = MockServerDockerMethods()
    self.tritonclient_mock = MockTritonClientMethods()

    # Server config with explicit model control
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH
    config['model-control-mode'] = 'explicit'
    self.server_config = config

    # Pin the visible GPU for the test run
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # Server under test (tests start it themselves)
    self.server = TritonServerFactory.create_server_docker(
        model_path=MODEL_LOCAL_PATH,
        image=TRITON_IMAGE,
        config=self.server_config)
def _test_get_logs(self, gpus):
    """After start/stop, logs() must return the mock log for both the
    docker and the local server."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH

    # Same start/stop/logs sequence for each server flavor, paired with
    # the mock that should observe the terminate call.
    flavors = (
        (lambda: TritonServerFactory.create_server_docker(
            image=TRITON_IMAGE, config=config, gpus=gpus),
         self.server_docker_mock),
        (lambda: TritonServerFactory.create_server_local(
            path=TRITON_LOCAL_BIN_PATH, config=config, gpus=gpus),
         self.server_local_mock),
    )
    for make_server, process_mock in flavors:
        self.server = make_server()
        self.server.start()
        self.server.stop()
        process_mock.assert_server_process_terminate_called()
        self.assertEqual(self.server.logs(), "Triton Server Test Log")
def test_run(self, requests_mock):
    """PerfAnalyzer.run() command construction and output parsing using
    a patched `requests` module for the readiness probe.
    """
    # Now create a server config
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_client = PerfAnalyzer(path=PERF_BIN_PATH, config=self.config)
    self.server.start()
    # Fake an HTTP 200 so wait_for_ready succeeds immediately
    requests_mock.get.return_value.status_code = 200
    self.server.wait_for_ready(num_retries=1)

    # Run perf analyzer
    throughput_record, latency_record = perf_client.run()
    self.perf_mock.assert_perf_analyzer_run_as(
        [PERF_BIN_PATH, '-m', TEST_MODEL_NAME])
    self.server.stop()

    # Test latency parsing
    test_latency_output = "Avg latency: 5000 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    _, latency_record = perf_client.run()
    self.assertEqual(latency_record.value(), 5000)

    # Test throughput parsing
    # NOTE(review): the throughput mock strings say "ms"; the parser
    # apparently keys only on the "Throughput:" prefix — confirm.
    test_throughput_output = "Throughput: 46.8 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    throughput_record, _ = perf_client.run()
    self.assertEqual(throughput_record.value(), 46.8)

    # Test parsing for both
    test_both_output = "Throughput: 0.001 ms\nAvg latency: 3.6 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    throughput_record, latency_record = perf_client.run()
    self.assertEqual(throughput_record.value(), 0.001)
    self.assertEqual(latency_record.value(), 3.6)
def test_record_cpu_memory(self):
    """CPU RAM metrics are recorded with the expected shapes, and
    stop_recording_metrics() is one-shot."""
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH
    devices = [
        GPUDevice('TEST_DEVICE_NAME', 0, "TEST_PCI_BUS_ID", "TEST_UUID")
    ]
    metrics = [CPUAvailableRAM, CPUUsedRAM]

    triton = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=config, gpus=devices)
    triton.start()

    # Sample CPU memory for a short window (frequency=1s, window=2s)
    monitor = CPUMonitor(triton, 1, metrics)
    monitor.start_recording_metrics()
    time.sleep(2)
    records = monitor.stop_recording_metrics()

    # The monitor must have queried the server's cpu stats
    self.server_local_mock.assert_cpu_stats_called()

    # Every record carries a float value and an int timestamp
    for rec in records:
        self.assertIsInstance(rec.value(), float)
        self.assertIsInstance(rec.timestamp(), int)

    # One record per metric per sample: non-empty and divisible count
    self.assertTrue(len(records) % len(metrics) == 0)
    self.assertTrue(len(records) > 0)

    # Stopping a second time must raise
    with self.assertRaises(TritonModelAnalyzerException):
        monitor.stop_recording_metrics()

    monitor.destroy()
    triton.stop()
def setUp(self):
    """Start the docker/client method mocks and construct (but do not
    start) an explicit-mode docker Triton server on one test GPU."""
    # GPUs visible to the server under test
    devices = [
        GPUDevice('TEST_DEVICE_NAME', 0, "TEST_PCI_BUS_ID", "TEST_UUID")
    ]

    # Install and start the method mocks
    self.server_docker_mock = MockServerDockerMethods()
    self.tritonclient_mock = MockTritonClientMethods()
    self.server_docker_mock.start()
    self.tritonclient_mock.start()

    # Server config with explicit model control
    config = TritonServerConfig()
    config['model-repository'] = MODEL_REPOSITORY_PATH
    config['model-control-mode'] = 'explicit'
    self.server_config = config

    # Pin the visible GPU for the test run
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # Server under test (tests start it themselves)
    self.server = TritonServerFactory.create_server_docker(
        image=TRITON_IMAGE, config=self.server_config, gpus=devices)
def test_run(self):
    """PerfAnalyzer.run() parsing of the CSV output file for each metric
    type, plus the empty-output and failure-return-code paths.

    The CSV file read is mocked with mock_open; os.remove is patched so
    no real file is touched.
    """
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 max_retries=10,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # One CSV row: latencies are in us (4700 us -> 4.7 ms records),
    # server/client timings in us as well (314 us -> 0.314 ms).
    pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait\n1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314"""

    # Test avg latency parsing
    perf_metrics = [PerfLatencyAvg]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5)

    # Test p90 latency parsing
    perf_metrics = [PerfLatencyP90]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.7)

    # Test p95 latency parsing
    perf_metrics = [PerfLatencyP95]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.8)

    # Test p99 latency parsing
    perf_metrics = [PerfLatencyP99]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.9)

    # Test throughput parsing
    perf_metrics = [PerfThroughput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test client response wait
    perf_metrics = [PerfClientResponseWait]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.314)

    # Test server queue
    perf_metrics = [PerfServerQueue]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.018)

    # Test server compute infer
    perf_metrics = [PerfServerComputeInfer]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.065)

    # Test server compute input
    perf_metrics = [PerfServerComputeInput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.034)

    # Test server compute output
    perf_metrics = [PerfServerComputeOutput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.016)

    # # Test parsing for subset
    perf_metrics = [
        PerfThroughput, PerfLatencyAvg, PerfLatencyP90, PerfLatencyP95,
        PerfLatencyP99
    ]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 5)

    # Test no exceptions are raised when nothing can be parsed
    # NOTE(review): pa_csv_empty is defined but the patch below still
    # feeds pa_csv_mock — likely read_data=pa_csv_empty was intended;
    # confirm before changing.
    pa_csv_empty = ""
    perf_metrics = [
        PerfThroughput, PerfClientSendRecv, PerfClientResponseWait,
        PerfServerQueue, PerfServerComputeInfer, PerfServerComputeInput,
        PerfServerComputeOutput
    ]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        self.assertFalse(perf_analyzer.run(perf_metrics))

    # Test exception handling
    self.perf_mock.set_perf_analyzer_return_code(1)
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        self.assertTrue(perf_analyzer.run(perf_metrics))
    self.server.stop()
def _add_profile_models_configs(self):
    """
    Adds configs specific to model specifications

    Registers the schemas for perf_analyzer/triton server flags and
    environment, objectives, constraints, profile_models (object, mixed
    list, or string-list forms), batch_sizes, concurrency, and the
    reload_model_disable flag.
    """
    # One string-typed entry per key the Triton server accepts
    triton_server_flags_scheme = ConfigObject(schema={
        k: ConfigPrimitive(str)
        for k in TritonServerConfig.allowed_keys()
    })

    # Additive perf_analyzer keys may be given once or as a list
    perf_analyzer_additive_keys = {
        k: None
        for k in PerfAnalyzerConfig.additive_keys()
    }
    perf_analyzer_flags_scheme = ConfigObject(
        schema={
            k: ((ConfigUnion([ConfigPrimitive(
                type_=str), ConfigListString()])) if (
                    k in perf_analyzer_additive_keys) else ConfigPrimitive(
                        type_=str))
            for k in PerfAnalyzerConfig.allowed_keys()
        })

    # Arbitrary NAME=VALUE environment variables ('*' matches any key)
    triton_server_environment_scheme = ConfigObject(
        schema={'*': ConfigPrimitive(str)})

    self._add_config(
        ConfigField(
            'perf_analyzer_flags',
            field_type=perf_analyzer_flags_scheme,
            description=
            'Allows custom configuration of the perf analyzer instances used by model analyzer.'
        ))
    self._add_config(
        ConfigField(
            'triton_server_flags',
            field_type=triton_server_flags_scheme,
            description=
            'Allows custom configuration of the triton instances used by model analyzer.'
        ))
    self._add_config(
        ConfigField(
            'triton_server_environment',
            field_type=triton_server_environment_scheme,
            description=
            'Allows setting environment variables for tritonserver server instances launched by Model Analyzer'
        ))

    # Objectives: either a {record_type: weight} mapping or a plain list
    # of record-type names (mapped through objective_list_output_mapper)
    objectives_scheme = ConfigUnion([
        ConfigObject(
            schema={
                tag: ConfigPrimitive(type_=int)
                for tag in RecordType.get_all_record_types().keys()
            }),
        ConfigListString(output_mapper=objective_list_output_mapper)
    ])

    # Constraints: throughput has a floor, everything else a ceiling
    constraints_scheme = ConfigObject(
        schema={
            'perf_throughput':
                ConfigObject(schema={
                    'min': ConfigPrimitive(int),
                }),
            'perf_latency_avg':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
            'perf_latency_p90':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
            'perf_latency_p95':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
            'perf_latency_p99':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
            'perf_latency':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
            'gpu_used_memory':
                ConfigObject(schema={
                    'max': ConfigPrimitive(int),
                }),
        })
    self._add_config(
        ConfigField(
            'objectives',
            field_type=objectives_scheme,
            default_value=DEFAULT_OFFLINE_OBJECTIVES,
            description=
            'Model Analyzer uses the objectives described here to find the best configuration for each model.'
        ))
    self._add_config(
        ConfigField(
            'constraints',
            field_type=constraints_scheme,
            description=
            'Constraints on the objectives specified in the "objectives" field of the config.'
        ))

    model_config_fields = self._get_model_config_fields()
    # Per-model profile spec: model name maps to its own parameters,
    # objectives, constraints, and flag/environment overrides
    profile_model_scheme = ConfigObject(
        required=True,
        schema={
            # Any key is allowed, but the keys must follow the pattern
            # below
            '*':
                ConfigObject(
                    schema={
                        'cpu_only':
                            ConfigPrimitive(bool),
                        'parameters':
                            ConfigObject(
                                schema={
                                    'batch_sizes':
                                        ConfigListNumeric(type_=int),
                                    'concurrency':
                                        ConfigListNumeric(type_=int)
                                }),
                        'objectives':
                            objectives_scheme,
                        'constraints':
                            constraints_scheme,
                        'model_config_parameters':
                            model_config_fields,
                        'perf_analyzer_flags':
                            perf_analyzer_flags_scheme,
                        'triton_server_flags':
                            triton_server_flags_scheme,
                        'triton_server_environment':
                            triton_server_environment_scheme
                    })
        },
        output_mapper=ConfigModelProfileSpec.
        model_object_to_config_model_profile_spec)
    # profile_models accepts: a spec object, a mixed list of spec objects
    # and bare model-name strings, or a plain string list — each form has
    # its own output mapper to normalize into profile specs.
    self._add_config(
        ConfigField(
            'profile_models',
            flags=['--profile-models'],
            field_type=ConfigUnion([
                profile_model_scheme,
                ConfigListGeneric(ConfigUnion([
                    profile_model_scheme,
                    ConfigPrimitive(str,
                                    output_mapper=ConfigModelProfileSpec.
                                    model_str_to_config_model_profile_spec)
                ]),
                                  required=True,
                                  output_mapper=ConfigModelProfileSpec.
                                  model_mixed_to_config_model_profile_spec),
                ConfigListString(output_mapper=ConfigModelProfileSpec.
                                 model_list_to_config_model_profile_spec),
            ],
                                   required=True),
            description='List of the models to be profiled'))
    self._add_config(
        ConfigField(
            'batch_sizes',
            flags=['-b', '--batch-sizes'],
            field_type=ConfigListNumeric(int),
            default_value=DEFAULT_BATCH_SIZES,
            description=
            'Comma-delimited list of batch sizes to use for the profiling'))
    self._add_config(
        ConfigField(
            'concurrency',
            flags=['-c', '--concurrency'],
            field_type=ConfigListNumeric(int),
            description=
            "Comma-delimited list of concurrency values or ranges <start:end:step>"
            " to be used during profiling"))
    self._add_config(
        ConfigField(
            'reload_model_disable',
            field_type=ConfigPrimitive(bool),
            parser_args={'action': 'store_true'},
            default_value=False,
            flags=['--reload-model-disable'],
            description='Flag to indicate whether or not to disable model '
            'loading and unloading in remote mode.'))