def test_add_result():
    """Test interface BenchmarkResult.add_result().

    Two values are added under the same metric name and are expected to be
    readable back, by index, in the order they were added.
    """
    bench_result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)

    for value in (300, 200):
        bench_result.add_result('metric1', value)

    recorded = bench_result.result['metric1']
    assert recorded[0] == 300
    assert recorded[1] == 200
def _preprocess(self):
    """Preprocess/preparation operations before the benchmarking.

    Registers the parser arguments, parses them into self._args and creates
    self._result. The result carries INVALID_ARGUMENT when parsing fails and
    INVALID_BENCHMARK_TYPE when self._benchmark_type is not a BenchmarkType.

    Return:
        True if _preprocess() succeed.
    """
    self.add_parser_arguments()
    parsed_ok, self._args, _unknown = self.parse_args()

    # Guard 1: argument parsing failed, so there is no run_count to read.
    if not parsed_ok:
        self._result = BenchmarkResult(
            self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT
        )
        return False

    self._result = BenchmarkResult(
        self._name,
        self._benchmark_type,
        ReturnCode.SUCCESS,
        run_count=self._args.run_count,
    )

    # Guard 2: the benchmark type must be a BenchmarkType member.
    if isinstance(self._benchmark_type, BenchmarkType):
        return True

    logger.error(
        'Invalid benchmark type - benchmark: {}, type: {}'.format(
            self._name, type(self._benchmark_type)
        )
    )
    self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE)
    return False
def test_set_timestamp():
    """Test interface BenchmarkResult.set_timestamp().

    The start/end strings handed to set_timestamp() are expected to come back
    unchanged from the start_time and end_time attributes.
    """
    begin, finish = '2021-02-03 16:59:49', '2021-02-03 17:00:08'

    bench_result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    bench_result.set_timestamp(begin, finish)

    assert bench_result.start_time == begin
    assert bench_result.end_time == finish
def test_fambench():
    """Test FAMBench benchmarks.

    Checks the docker settings of the registered 'fambench' benchmark for the
    CUDA platform, then feeds it a canned raw output and verifies the metrics
    extracted by _process_raw_result().
    """
    benchmark_name = 'fambench'
    benchmark_class, _predefine_params = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CUDA
    )
    assert benchmark_class

    benchmark = benchmark_class(benchmark_name)

    # Static docker configuration of the benchmark.
    assert benchmark._benchmark_type == BenchmarkType.DOCKER
    assert benchmark._image_uri == 'superbench/benchmark:cuda11.1.1-fambench'
    assert benchmark._container_name == 'fambench-benchmarks'
    assert benchmark._entrypoint == '/workspace/FAMBench/benchmarks/run_all_benchmarks.sh'
    assert benchmark._cmd is None

    # The result object and args are injected by hand instead of calling run().
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    # NOTE(review): this fixture looks like a table (one header row plus four
    # records) whose line breaks were flattened to spaces - confirm against the
    # parser whether it needs one record per line.
    raw_output = """ benchmark implementation mode config score units batch_latency_95_sec DLRM OOTB eval tiny 152.800399 ex/s 0.515052 DLRM OOTB train tiny 35.483686 ex/s None DLRM UBENCH train linear_[(2,2,2,2,2)] 3.679281e-07 TF/s None XLMR OOTB eval default-config 1.015586 ex/s 16.463461 """
    assert benchmark._process_raw_result(0, raw_output)

    # Metric name -> expected first recorded value.
    expected_scores = {
        'dlrm_ootb_eval_tiny_ex_s': 152.800399,
        'dlrm_ootb_train_tiny_ex_s': 35.483686,
        'dlrm_ubench_train_linear_[(2,2,2,2,2)]_tf_s': 3.679281e-07,
        'xlmr_ootb_eval_default_config_ex_s': 1.015586,
    }
    for metric, score in expected_scores.items():
        assert benchmark.result[metric][0] == score
def test_tensorrt_inference_result_parsing(self, test_raw_log):
    """Test tensorrt-inference benchmark result parsing.

    Args:
        test_raw_log (str): raw benchmark output handed to _process_raw_result().
    """
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        self.benchmark_name, Platform.CUDA
    )
    benchmark = benchmark_cls(self.benchmark_name, parameters='')
    # Args and result are injected by hand instead of going through run().
    benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
    benchmark._result = BenchmarkResult(
        self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1
    )

    def first_value(metric):
        """Return the first recorded value of the given metric."""
        return benchmark.result[metric][0]

    # Positive case - valid raw output
    self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
    self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
    self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
    for tag in ('mean', '99'):
        self.assertEqual(0.5, first_value(f'model_0_gpu_time_{tag}'))
        self.assertEqual(0.6, first_value(f'model_0_host_time_{tag}'))
        self.assertEqual(1.0, first_value(f'model_0_end_to_end_time_{tag}'))

    # Negative case - invalid raw output
    self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
def test_set_return_code():
    """Test interface BenchmarkResult.set_return_code().

    After construction and after every set_return_code() call, both the
    return_code attribute and the 'return_code' entry of the result dict are
    expected to reflect the latest code.
    """
    bench_result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)

    def check(expected_code):
        """Assert that both views of the return code match expected_code."""
        assert bench_result.return_code == expected_code
        assert bench_result.result['return_code'] == [expected_code.value]

    # Code passed to the constructor.
    check(ReturnCode.SUCCESS)

    # Codes applied afterwards, one after another.
    for code in (ReturnCode.INVALID_ARGUMENT, ReturnCode.INVALID_BENCHMARK_RESULT):
        bench_result.set_return_code(code)
        check(code)
def test_serialize_deserialize():
    """Test that BenchmarkResult.to_string() yields the expected serialized text."""
    bench_result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=2)

    # Summarized results: metric1 gets two values (MAX), metric2 one value (AVG).
    for value in (300, 200):
        bench_result.add_result('metric1', value, ReduceType.MAX)
    bench_result.add_result('metric2', 100, ReduceType.AVG)

    # Raw data kept in the result object (log_raw_data=False).
    for chunk in ([1, 2, 3], [4, 5, 6], [7, 8, 9]):
        bench_result.add_raw_data('metric1', chunk, False)

    bench_result.set_timestamp('2021-02-03 16:59:49', '2021-02-03 17:00:08')
    bench_result.set_benchmark_type(BenchmarkType.MICRO)

    expected_pieces = [
        '{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, ',
        '"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", ',
        '"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, ',
        '"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, ',
        '"reduce_op": {"return_code": null, "metric1": "max", "metric2": "avg"}}',
    ]
    assert bench_result.to_string() == ''.join(expected_pieces)
def test_set_benchmark_type():
    """Test interface BenchmarkResult.set_benchmark_type()."""
    micro = BenchmarkType.MICRO
    bench_result = BenchmarkResult('micro', micro, ReturnCode.SUCCESS)

    # NOTE(review): the type set here equals the one given to the constructor,
    # so this would presumably also pass if the setter did nothing - consider
    # setting a different BenchmarkType.
    bench_result.set_benchmark_type(micro)

    assert bench_result.type == micro
def test_add_raw_data():
    """Test interface BenchmarkResult.add_raw_data().

    Covers three cases: string raw data and numeric-list raw data kept in the
    result object (log_raw_data=False), and raw data logged to 'rawdata.log'
    in the current working directory (log_raw_data=True).
    """
    # String raw data stored in the result object.
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    result.add_raw_data('metric1', 'raw log 1', False)
    result.add_raw_data('metric1', 'raw log 2', False)
    assert result.raw_data['metric1'][0] == 'raw log 1'
    assert result.raw_data['metric1'][1] == 'raw log 2'
    assert result.type == BenchmarkType.MICRO
    assert result.return_code == ReturnCode.SUCCESS

    # Numeric raw data stored in the result object.
    result = BenchmarkResult('model', BenchmarkType.MODEL, ReturnCode.SUCCESS)
    result.add_raw_data('metric1', [1, 2, 3], False)
    result.add_raw_data('metric1', [4, 5, 6], False)
    assert result.raw_data['metric1'][0] == [1, 2, 3]
    assert result.raw_data['metric1'][1] == [4, 5, 6]
    assert result.type == BenchmarkType.MODEL
    assert result.return_code == ReturnCode.SUCCESS

    # Test log_raw_data = True.
    raw_data_file = os.path.join(os.getcwd(), 'rawdata.log')
    try:
        result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
        result.add_raw_data('metric1', 'raw log 1', True)
        result.add_raw_data('metric1', 'raw log 2', True)
        assert result.type == BenchmarkType.MICRO
        assert result.return_code == ReturnCode.SUCCESS
        assert os.path.isfile(raw_data_file)
    finally:
        # Clean up even when an assertion above fails, so a failing run does
        # not leave the log file behind in the working directory.
        if os.path.isfile(raw_data_file):
            os.remove(raw_data_file)
class Benchmark(ABC):
    """The base class of all benchmarks.

    Subclasses implement _benchmark(). run() drives the whole life cycle:
    _preprocess(), then _benchmark() once per configured run, then the result
    format check, and finally _postprocess().
    """

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters, separated by single spaces.
                None is treated as "no parameters".
        """
        self._name = name
        # filter(None, ...) drops the empty strings produced by repeated spaces.
        self._argv = list(filter(None, parameters.split(' '))) if parameters is not None else list()
        self._benchmark_type = None
        self._parser = argparse.ArgumentParser(
            add_help=False,
            usage=argparse.SUPPRESS,
            allow_abbrev=False,
            formatter_class=SortedMetavarTypeHelpFormatter,
        )
        self._args = None
        self._curr_run_index = 0
        self._result = None

    def add_parser_arguments(self):
        """Add the specified arguments."""
        self._parser.add_argument(
            '--run_count',
            type=int,
            default=1,
            required=False,
            help='The run count of benchmark.',
        )
        self._parser.add_argument(
            '--duration',
            type=int,
            default=0,
            required=False,
            help='The elapsed time of benchmark in seconds.',
        )
        self._parser.add_argument(
            '--log_raw_data',
            action='store_true',
            default=False,
            help='Log raw data into file instead of saving it into result object.',
        )

    def get_configurable_settings(self):
        """Get all the configurable settings.

        Return:
            All configurable settings in raw string.
        """
        return self._parser.format_help().strip()

    def parse_args(self, ignore_invalid=False):
        """Parse the arguments.

        Args:
            ignore_invalid (bool): if True, a parsing failure is only logged at
                info level and reported as success with args set to None.

        Return:
            ret (bool): whether parse succeed or not.
            args (argparse.Namespace): parsed arguments.
            unknown (list): unknown arguments.
        """
        try:
            args, unknown = self._parser.parse_known_args(self._argv)
        except BaseException as e:
            # BaseException is caught on purpose: argparse reports invalid
            # arguments by exiting, i.e. by raising SystemExit.
            if ignore_invalid:
                logger.info('Missing or invliad parameters, will ignore the error and skip the args checking.')
                return True, None, []
            else:
                logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
                return False, None, []

        ret = True
        if len(unknown) > 0:
            logger.error(
                'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
            )
            ret = False

        return ret, args, unknown

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Creates self._result. It carries INVALID_ARGUMENT when argument parsing
        fails and INVALID_BENCHMARK_TYPE when self._benchmark_type is not a
        BenchmarkType.

        Return:
            True if _preprocess() succeed.
        """
        self.add_parser_arguments()
        ret, self._args, unknown = self.parse_args()

        if not ret:
            self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT)
            return False

        self._result = BenchmarkResult(
            self._name, self._benchmark_type, ReturnCode.SUCCESS, run_count=self._args.run_count
        )

        if not isinstance(self._benchmark_type, BenchmarkType):
            logger.error(
                'Invalid benchmark type - benchmark: {}, type: {}'.format(self._name, type(self._benchmark_type))
            )
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE)
            return False

        return True

    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeed.
        """
        return True

    @abstractmethod
    def _benchmark(self):
        """Implementation for benchmarking."""
        pass

    def run(self):
        """Function to launch the benchmarking.

        Any exception raised while preprocessing, benchmarking or checking the
        result is logged, recorded as RUNTIME_EXCEPTION_ERROR in the result and
        reported as failure. _postprocess() always runs.

        Return:
            True if run benchmark successfully.
        """
        ret = True
        try:
            ret &= self._preprocess()
            if ret:
                self._start_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                # The loop variable is an attribute, so the current run index
                # stays visible on the instance while _benchmark() executes.
                for self._curr_run_index in range(self._args.run_count):
                    ret &= self._benchmark()
                self._end_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                self._result.set_timestamp(self._start_time, self._end_time)

                if ret:
                    ret &= self.__check_result_format()
        except BaseException as e:
            # A run that raised must not be reported as successful.
            ret = False
            if self._result is None:
                # The exception happened before _preprocess() created the
                # result object; create one so the error code is not lost and
                # the handler itself does not fail on None.
                self._result = BenchmarkResult(
                    self._name, self._benchmark_type, ReturnCode.RUNTIME_EXCEPTION_ERROR
                )
            else:
                self._result.set_return_code(ReturnCode.RUNTIME_EXCEPTION_ERROR)
            logger.error('Run benchmark failed - benchmark: {}, message: {}'.format(self._name, str(e)))
        finally:
            ret &= self._postprocess()

        return ret

    def __check_result_format(self):
        """Check the validation of result object.

        Sets INVALID_BENCHMARK_RESULT on the result when any check fails.

        Return:
            True if the result is valid.
        """
        if (not self.__check_result_type()) or (not self.__check_summarized_result()) or (not self.__check_raw_data()):
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
            return False

        return True

    def __check_result_type(self):
        """Check the type of result object.

        Return:
            True if the result is instance of BenchmarkResult.
        """
        if not isinstance(self._result, BenchmarkResult):
            logger.error(
                'Invalid benchmark result type - benchmark: {}, type: {}'.format(self._name, type(self._result))
            )
            return False

        return True

    def __is_list_type(self, data, t):
        """Check whether data is a list whose items are all instances of t.

        Args:
            data: object to check.
            t (type): expected item type.

        Return:
            True if data is a list (possibly empty) of t items.
        """
        return isinstance(data, list) and all(isinstance(item, t) for item in data)

    def __is_list_list_type(self, data, t):
        """Check whether data is a list of lists whose values are all instances of t.

        Args:
            data: object to check.
            t (type): expected type of the innermost values.

        Return:
            True if data is a list of lists of t values.
        """
        return self.__is_list_type(data, list) and all(isinstance(value, t) for item in data for value in item)

    def __check_summarized_result(self):
        """Check the validation of summary result.

        Return:
            True if the summary result is instance of List[Number].
        """
        for metric in self._result.result:
            if not self.__is_list_type(self._result.result[metric], numbers.Number):
                logger.error(
                    'Invalid summarized result - benchmark: {}, metric: {}, result: {}.'.format(
                        self._name, metric, self._result.result[metric]
                    )
                )
                return False

        return True

    def __check_raw_data(self):
        """Check the validation of raw data.

        Raw data of any other benchmark type is accepted without checking.

        Return:
            True if the raw data is:
              instance of List[List[Number]] for BenchmarkType.MODEL.
              instance of List[str] for BenchmarkType.DOCKER.
              instance of List[List[Number]] or List[str] for BenchmarkType.MICRO.
        """
        for metric in self._result.raw_data:
            is_valid = True
            if self._benchmark_type == BenchmarkType.MODEL:
                is_valid = self.__is_list_list_type(self._result.raw_data[metric], numbers.Number)
            elif self._benchmark_type == BenchmarkType.DOCKER:
                is_valid = self.__is_list_type(self._result.raw_data[metric], str)
            elif self._benchmark_type == BenchmarkType.MICRO:
                is_valid = self.__is_list_type(self._result.raw_data[metric], str) or self.__is_list_list_type(
                    self._result.raw_data[metric], numbers.Number
                )
            if not is_valid:
                logger.error(
                    'Invalid raw data type - benchmark: {}, metric: {}, raw data: {}.'.format(
                        self._name, metric, self._result.raw_data[metric]
                    )
                )
                return False

        return True

    def _process_percentile_result(self, metric, result, reduce_type=None):
        """Function to process the percentile results.

        Adds one summarized result per percentile (50, 90, 95, 99, 99.9) under
        the key '<metric>_<percentile>'. Nothing is added for an empty result.

        Args:
            metric (str): metric name which is the key.
            result (List[numbers.Number]): numerical result.
            reduce_type (ReduceType): The type of reduce function.
        """
        if len(result) > 0:
            percentile_list = ['50', '90', '95', '99', '99.9']
            for percentile in percentile_list:
                try:
                    # 'method' is the current keyword name for the estimation
                    # method; 'interpolation' is deprecated since NumPy 1.22.
                    value = np.percentile(result, float(percentile), method='nearest')
                except TypeError:
                    # Older NumPy releases only know the previous keyword name.
                    value = np.percentile(result, float(percentile), interpolation='nearest')
                self._result.add_result('{}_{}'.format(metric, percentile), value, reduce_type)

    def print_env_info(self):
        """Print environments or dependencies information."""
        # TODO: will implement it when add real benchmarks in the future.
        pass

    # The properties below delegate to self._result, which is None until
    # _preprocess() (or the error path of run()) has created it.

    @property
    def name(self):
        """Decoration function to access benchmark name."""
        return self._result.name

    @property
    def type(self):
        """Decoration function to access benchmark type."""
        return self._result.type

    @property
    def run_count(self):
        """Decoration function to access benchmark run_count."""
        return self._result.run_count

    @property
    def return_code(self):
        """Decoration function to access benchmark return_code."""
        return self._result.return_code

    @property
    def start_time(self):
        """Decoration function to access benchmark start_time."""
        return self._result.start_time

    @property
    def end_time(self):
        """Decoration function to access benchmark end_time."""
        return self._result.end_time

    @property
    def raw_data(self):
        """Decoration function to access benchmark raw_data."""
        return self._result.raw_data

    @property
    def result(self):
        """Decoration function to access benchmark result."""
        return self._result.result

    @property
    def serialized_result(self):
        """Decoration function to access the benchmark result serialized by to_string()."""
        return self._result.to_string()

    @property
    def default_metric_count(self):
        """Decoration function to get the count of default metrics."""
        return self._result.default_metric_count
def test_rocm_onnxruntime_performance():
    """Test onnxruntime model benchmark.

    Checks the docker settings of the registered 'onnxruntime-ort-models'
    benchmark for the ROCm platform, then feeds it a canned raw output and
    verifies the throughput metrics extracted by _process_raw_result().
    """
    benchmark_name = 'onnxruntime-ort-models'
    benchmark_class, _predefine_params = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.ROCM
    )
    assert benchmark_class

    benchmark = benchmark_class(benchmark_name)

    # Static docker configuration of the benchmark.
    assert benchmark._benchmark_type == BenchmarkType.DOCKER
    assert benchmark._image_uri == 'superbench/benchmark:rocm4.3.1-onnxruntime1.9.0'
    assert benchmark._container_name == 'rocm-onnxruntime-model-benchmarks'
    assert benchmark._entrypoint == '/stage/onnxruntime-training-examples/huggingface/azureml/run_benchmark.sh'
    assert benchmark._cmd is None

    # The result object and args are injected by hand instead of calling run().
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    # NOTE(review): this fixture looks like multi-line tool output (a
    # '__superbench__ begin ...' line followed by a '"samples_per_second"'
    # line per model) whose line breaks were flattened to spaces - confirm
    # against the parser whether it needs the original line structure.
    raw_output = """ __superbench__ begin bert-large-uncased ngpu=1 "samples_per_second": 21.829 __superbench__ begin bert-large-uncased ngpu=8 "samples_per_second": 147.181 __superbench__ begin distilbert-base-uncased ngpu=1 "samples_per_second": 126.827 __superbench__ begin distilbert-base-uncased ngpu=8 "samples_per_second": 966.796 __superbench__ begin gpt2 ngpu=1 "samples_per_second": 20.46 __superbench__ begin gpt2 ngpu=8 "samples_per_second": 151.089 __superbench__ begin facebook/bart-large ngpu=1 "samples_per_second": 66.171 __superbench__ begin facebook/bart-large ngpu=8 "samples_per_second": 370.343 __superbench__ begin roberta-large ngpu=1 "samples_per_second": 37.103 __superbench__ begin roberta-large ngpu=8 "samples_per_second": 274.455 """
    assert benchmark._process_raw_result(0, raw_output)

    # Metric name -> expected first recorded value.
    expected_throughput = {
        'bert_large_uncased_ngpu_1_throughput': 21.829,
        'bert_large_uncased_ngpu_8_throughput': 147.181,
        'distilbert_base_uncased_ngpu_1_throughput': 126.827,
        'distilbert_base_uncased_ngpu_8_throughput': 966.796,
        'gpt2_ngpu_1_throughput': 20.46,
        'gpt2_ngpu_8_throughput': 151.089,
        'facebook_bart_large_ngpu_1_throughput': 66.171,
        'facebook_bart_large_ngpu_8_throughput': 370.343,
        'roberta_large_ngpu_1_throughput': 37.103,
        'roberta_large_ngpu_8_throughput': 274.455,
    }
    for metric, throughput in expected_throughput.items():
        assert benchmark.result[metric][0] == throughput