def test_measurement_request_count_increase(self):
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 max_retries=10,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()

    # Test the timeout for count mode
    self.client.wait_for_server_ready(num_retries=1)
    test_both_output = "Please use a larger time window"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    self.perf_mock.set_perf_analyzer_return_code(1)
    perf_metrics = [PerfThroughput, PerfLatencyP99]
    perf_analyzer.run(perf_metrics)
    self.assertEqual(
        self.perf_mock.get_perf_analyzer_popen_read_call_count(), 10)
def test_measurement_interval_increase(self):
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus)
    perf_analyzer_config = PerfAnalyzerConfig()
    perf_analyzer_config['model-name'] = TEST_MODEL_NAME
    perf_analyzer_config['concurrency-range'] = TEST_CONCURRENCY_RANGE
    perf_analyzer_config['measurement-mode'] = 'time_windows'
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=perf_analyzer_config,
                                 max_retries=10,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()

    # Test failure to stabilize for measurement windows
    self.client.wait_for_server_ready(num_retries=1)
    test_stabilize_output = "Please use a larger time window"
    self.perf_mock.set_perf_analyzer_result_string(test_stabilize_output)
    self.perf_mock.set_perf_analyzer_return_code(1)
    perf_metrics = [PerfThroughput, PerfLatencyP99]
    perf_analyzer.run(perf_metrics)
    self.assertEqual(
        self.perf_mock.get_perf_analyzer_popen_read_call_count(), 10)
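# A minimal sketch of the retry pattern the two tests above exercise. This is
# hypothetical, not PerfAnalyzer's actual implementation: when perf_analyzer
# fails and complains that the measurement window is too small, the runner
# re-launches it with a larger --measurement-request-count (count mode) or
# --measurement-interval (time_windows mode), up to max_retries attempts.
# The launch callable and its (return_code, output) contract are assumptions.
def _run_with_auto_adjust(launch, config, max_retries=10):
    """launch(config) -> (return_code, output); illustrative only."""
    for _ in range(max_retries):
        return_code, output = launch(config)
        if return_code == 0:
            return 0
        if "Please use a larger time window" in output:
            if config.get('measurement-mode') == 'count_windows':
                # Double the request count and retry
                config['measurement-request-count'] = \
                    2 * int(config.get('measurement-request-count', 50))
            else:
                # Double the time window and retry
                config['measurement-interval'] = \
                    2 * int(config.get('measurement-interval', 5000))
        else:
            break
    return 1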
def _get_perf_analyzer_metrics(self, perf_config, perf_output_writer=None):
    """
    Gets the aggregated metrics from the perf_analyzer

    Parameters
    ----------
    perf_config : dict
        The keys are arguments to perf_analyzer.
        The values are their values.
    perf_output_writer : OutputWriter
        Writer that writes the output from perf_analyzer to the output
        stream/file. If None, the output is not written.

    Raises
    ------
    TritonModelAnalyzerException
    """

    try:
        perf_analyzer = PerfAnalyzer(
            path=self._perf_analyzer_path,
            config=perf_config,
            timeout=self._config.perf_analyzer_timeout,
            max_cpu_util=self._config.perf_analyzer_cpu_util)
        status = perf_analyzer.run(self._perf_metrics)

        # PerfAnalyzer run was not successful
        if status == 1:
            return 1
    except FileNotFoundError as e:
        raise TritonModelAnalyzerException(
            f"perf_analyzer binary not found : {e}")

    if perf_output_writer:
        perf_output_writer.write(perf_analyzer.output() + '\n')

    perf_records = perf_analyzer.get_records()
    perf_record_aggregator = RecordAggregator()
    perf_record_aggregator.insert_all(perf_records)

    return perf_record_aggregator.aggregate()
def test_run(self):
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH, config=self.config)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # Run perf analyzer with dummy tags to check command parsing
    perf_tags = [id]
    _ = perf_analyzer.run(perf_tags)
    self.perf_mock.assert_perf_analyzer_run_as(
        [PERF_BIN_PATH, '-m', TEST_MODEL_NAME])

    # Test latency parsing
    test_latency_output = "Avg latency: 5000 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_tags = [PerfLatency]
    records = perf_analyzer.run(perf_tags)
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5000)

    # Test throughput parsing
    test_throughput_output = "Throughput: 46.8 infer/sec\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    perf_tags = [PerfThroughput]
    records = perf_analyzer.run(perf_tags)
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test parsing for both
    test_both_output = "Throughput: 0.001 infer/sec\nAvg latency: 3.6 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    perf_tags = [PerfLatency, PerfThroughput]
    records = perf_analyzer.run(perf_tags)
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0].value(), 3.6)
    self.assertEqual(records[1].value(), 0.001)

    # Test exception handling
    with self.assertRaisesRegex(
            expected_exception=TritonModelAnalyzerException,
            expected_regex="Running perf_analyzer with",
            msg="Expected TritonModelAnalyzerException"):
        self.perf_mock.raise_exception_on_run()
        _ = perf_analyzer.run(perf_tags)

    self.server.stop()
def _get_perf_analyzer_metrics(self,
                               perf_config,
                               perf_output_writer=None,
                               perf_analyzer_env=None):
    """
    Gets the aggregated metrics from the perf_analyzer

    Parameters
    ----------
    perf_config : dict
        The keys are arguments to perf_analyzer.
        The values are their values.
    perf_output_writer : OutputWriter
        Writer that writes the output from perf_analyzer to the output
        stream/file. If None, the output is not written.
    perf_analyzer_env : dict
        A dict of name:value pairs for the environment variables with
        which perf_analyzer should be run.

    Raises
    ------
    TritonModelAnalyzerException
    """

    perf_analyzer = PerfAnalyzer(
        path=self._config.perf_analyzer_path,
        config=perf_config,
        max_retries=self._config.perf_analyzer_max_auto_adjusts,
        timeout=self._config.perf_analyzer_timeout,
        max_cpu_util=self._config.perf_analyzer_cpu_util)

    # If running with C_API, need to set CUDA_VISIBLE_DEVICES here
    if self._config.triton_launch_mode == 'c_api':
        perf_analyzer_env['CUDA_VISIBLE_DEVICES'] = ','.join(
            [gpu.device_uuid() for gpu in self._gpus])

    status = perf_analyzer.run(self._perf_metrics, env=perf_analyzer_env)

    if perf_output_writer:
        perf_output_writer.write(
            '============== Perf Analyzer Launched ==============\n'
            f'Command: perf_analyzer {perf_config.to_cli_string()} \n\n',
            append=True)
        if perf_analyzer.output():
            perf_output_writer.write(perf_analyzer.output() + '\n',
                                     append=True)

    # PerfAnalyzer run was not successful
    if status == 1:
        return 1

    perf_records = perf_analyzer.get_records()
    perf_record_aggregator = RecordAggregator()
    perf_record_aggregator.insert_all(perf_records)

    return perf_record_aggregator.aggregate()
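# A rough, stand-alone sketch of what the aggregation step above does
# conceptually (RecordAggregator's real behavior may differ): group the
# parsed records by their type and reduce each group to a single headline
# value, here an average. This helper is illustrative only and is not part
# of model_analyzer.
from collections import defaultdict

def aggregate_records(records):
    """records: objects exposing .value(); grouped by their class."""
    grouped = defaultdict(list)
    for record in records:
        grouped[type(record)].append(record.value())
    # One averaged value per record type, keyed by the record class
    return {
        record_type: sum(values) / len(values)
        for record_type, values in grouped.items()
    }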
def test_run(self, requests_mock):
    # Now create a server config
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_client = PerfAnalyzer(path=PERF_BIN_PATH, config=self.config)
    self.server.start()
    requests_mock.get.return_value.status_code = 200
    self.server.wait_for_ready(num_retries=1)

    # Run perf analyzer
    throughput_record, latency_record = perf_client.run()
    self.perf_mock.assert_perf_analyzer_run_as(
        [PERF_BIN_PATH, '-m', TEST_MODEL_NAME])
    self.server.stop()

    # Test latency parsing
    test_latency_output = "Avg latency: 5000 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    _, latency_record = perf_client.run()
    self.assertEqual(latency_record.value(), 5000)

    # Test throughput parsing
    test_throughput_output = "Throughput: 46.8 infer/sec\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    throughput_record, _ = perf_client.run()
    self.assertEqual(throughput_record.value(), 46.8)

    # Test parsing for both
    test_both_output = "Throughput: 0.001 infer/sec\nAvg latency: 3.6 ms\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    throughput_record, latency_record = perf_client.run()
    self.assertEqual(throughput_record.value(), 0.001)
    self.assertEqual(latency_record.value(), 3.6)
def test_run(self):
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    # Run perf analyzer with dummy metrics to check command parsing
    perf_metrics = [id]
    test_latency_output = "p99 latency: 5000 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_analyzer.run(perf_metrics)
    self.perf_mock.assert_perf_analyzer_run_as([
        PERF_BIN_PATH, '-m', TEST_MODEL_NAME, '--measurement-interval',
        str(self.config['measurement-interval'])
    ])

    # Test latency parsing
    test_latency_output = "p99 latency: 5000 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_latency_output)
    perf_metrics = [PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5)

    # Test throughput parsing
    test_throughput_output = "Throughput: 46.8 infer/sec\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_throughput_output)
    perf_metrics = [PerfThroughput]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test parsing for both
    test_both_output = "Throughput: 0.001 infer/sec\np99 latency: 3600 us\n\n\n\n"
    self.perf_mock.set_perf_analyzer_result_string(test_both_output)
    perf_metrics = [PerfThroughput, PerfLatency]
    perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0].value(), 0.001)
    self.assertEqual(records[1].value(), 3.6)

    # Test that a failing perf_analyzer return code is propagated
    self.perf_mock.set_perf_analyzer_return_code(1)
    self.assertEqual(perf_analyzer.run(perf_metrics), 1)

    self.server.stop()
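# For context, a bare-bones version of the mocking these tests rely on:
# patch subprocess.Popen as seen by the perf_analyzer module so that its
# output is a canned string and its return code is controllable. The patch
# target and helper name are assumptions for illustration; the repo's
# MockPerfAnalyzerMethods fixture is more complete.
from unittest.mock import MagicMock, patch

def make_popen_patch(result_string, return_code=0):
    """Return a patcher whose mocked process yields result_string."""
    mock_popen = MagicMock()
    # communicate() returns (stdout, stderr); perf output goes to stdout
    mock_popen.return_value.communicate.return_value = (result_string, None)
    mock_popen.return_value.returncode = return_code
    return patch('model_analyzer.perf_analyzer.perf_analyzer.Popen',
                 mock_popen)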
def test_run(self):
    server_config = TritonServerConfig()
    server_config['model-repository'] = MODEL_REPOSITORY_PATH

    # Create server, client, PerfAnalyzer, and wait for server ready
    self.server = TritonServerFactory.create_server_local(
        path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus)
    perf_analyzer = PerfAnalyzer(path=PERF_BIN_PATH,
                                 config=self.config,
                                 max_retries=10,
                                 timeout=100,
                                 max_cpu_util=50)
    self.client = TritonClientFactory.create_grpc_client(
        server_url=TEST_GRPC_URL)
    self.server.start()
    self.client.wait_for_server_ready(num_retries=1)

    pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait\n1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314"""

    # Test avg latency parsing
    perf_metrics = [PerfLatencyAvg]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 5)

    # Test p90 latency parsing
    perf_metrics = [PerfLatencyP90]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.7)

    # Test p95 latency parsing
    perf_metrics = [PerfLatencyP95]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.8)

    # Test p99 latency parsing
    perf_metrics = [PerfLatencyP99]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 4.9)

    # Test throughput parsing
    perf_metrics = [PerfThroughput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 46.8)

    # Test client response wait
    perf_metrics = [PerfClientResponseWait]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.314)

    # Test server queue
    perf_metrics = [PerfServerQueue]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.018)

    # Test server compute infer
    perf_metrics = [PerfServerComputeInfer]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.065)

    # Test server compute input
    perf_metrics = [PerfServerComputeInput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.034)

    # Test server compute output
    perf_metrics = [PerfServerComputeOutput]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0].value(), 0.016)

    # Test parsing for subset
    perf_metrics = [
        PerfThroughput, PerfLatencyAvg, PerfLatencyP90, PerfLatencyP95,
        PerfLatencyP99
    ]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        perf_analyzer.run(perf_metrics)
    records = perf_analyzer.get_records()
    self.assertEqual(len(records), 5)

    # Test no exceptions are raised when nothing can be parsed
    pa_csv_empty = ""
    perf_metrics = [
        PerfThroughput, PerfClientSendRecv, PerfClientResponseWait,
        PerfServerQueue, PerfServerComputeInfer, PerfServerComputeInput,
        PerfServerComputeOutput
    ]
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_empty)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        self.assertFalse(perf_analyzer.run(perf_metrics))

    # Test that a failing perf_analyzer return code is propagated
    self.perf_mock.set_perf_analyzer_return_code(1)
    with patch('model_analyzer.perf_analyzer.perf_analyzer.open',
               mock_open(read_data=pa_csv_mock)), patch(
                   'model_analyzer.perf_analyzer.perf_analyzer.os.remove'):
        self.assertTrue(perf_analyzer.run(perf_metrics))

    self.server.stop()
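# The assertions above follow directly from the mocked CSV row: perf_analyzer
# reports latency columns in microseconds, and the tests expect values
# converted to milliseconds (5000 us -> 5, 314 us -> 0.314). A minimal
# stand-alone parse of the same row using only the stdlib csv module (this is
# not model_analyzer's actual parser):
import csv
import io

PA_CSV_MOCK = (
    "Concurrency,Inferences/Second,Client Send,"
    "Network+Server Send/Recv,Server Queue,Server Compute Input,"
    "Server Compute Infer,Server Compute Output,Client Recv,"
    "p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,"
    "request/response,response wait\n"
    "1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314")

row = next(csv.DictReader(io.StringIO(PA_CSV_MOCK)))
throughput = float(row['Inferences/Second'])            # 46.8 infer/sec
avg_latency_ms = float(row['Avg latency']) / 1000       # 5000 us -> 5.0 ms
response_wait_ms = float(row['response wait']) / 1000   # 314 us -> 0.314 ms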