Example #1
def test_add_result():
    """Test interface BenchmarkResult.add_result()."""
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    result.add_result('metric1', 300)
    result.add_result('metric1', 200)
    assert (result.result['metric1'][0] == 300)
    assert (result.result['metric1'][1] == 200)
Example #2
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        self.add_parser_arguments()
        ret, self._args, unknown = self.parse_args()

        if not ret:
            self._result = BenchmarkResult(self._name, self._benchmark_type,
                                           ReturnCode.INVALID_ARGUMENT)
            return False

        self._result = BenchmarkResult(self._name,
                                       self._benchmark_type,
                                       ReturnCode.SUCCESS,
                                       run_count=self._args.run_count)

        if not isinstance(self._benchmark_type, BenchmarkType):
            logger.error(
                'Invalid benchmark type - benchmark: {}, type: {}'.format(
                    self._name, type(self._benchmark_type)))
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE)
            return False

        return True
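
The invalid-argument branch above can be exercised end to end with a tiny subclass. The following is a minimal sketch, not project code: FakeMicroBenchmark is hypothetical and the import paths are assumptions based on the usual superbench layout.

# Minimal sketch (hypothetical): FakeMicroBenchmark exists only to drive _preprocess();
# the import paths are assumed from the usual superbench layout and may need adjusting.
from superbench.benchmarks import BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark


class FakeMicroBenchmark(Benchmark):
    """Hypothetical subclass that only sets the benchmark type and a no-op _benchmark()."""
    def __init__(self, name, parameters=''):
        super().__init__(name, parameters)
        self._benchmark_type = BenchmarkType.MICRO

    def _benchmark(self):
        return True


# An unknown flag makes parse_args() return ret=False, so _preprocess() stores a
# BenchmarkResult with ReturnCode.INVALID_ARGUMENT and returns False.
benchmark = FakeMicroBenchmark('fake-micro', parameters='--no_such_flag 1')
assert benchmark._preprocess() is False
assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT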
Example #3
def test_set_timestamp():
    """Test interface BenchmarkResult.set_timestamp()."""
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    start_time = '2021-02-03 16:59:49'
    end_time = '2021-02-03 17:00:08'
    result.set_timestamp(start_time, end_time)
    assert (result.start_time == start_time)
    assert (result.end_time == end_time)
Example #4
def test_fambench():
    """Test FAMBench benchmarks."""
    benchmark_name = 'fambench'
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri == 'superbench/benchmark:cuda11.1.1-fambench')
    assert (benchmark._container_name == 'fambench-benchmarks')
    assert (benchmark._entrypoint == '/workspace/FAMBench/benchmarks/run_all_benchmarks.sh')
    assert (benchmark._cmd is None)
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    raw_output = """
benchmark implementation mode config score units batch_latency_95_sec
DLRM OOTB eval tiny 152.800399 ex/s 0.515052
DLRM OOTB train tiny 35.483686 ex/s None
DLRM UBENCH train linear_[(2,2,2,2,2)] 3.679281e-07 TF/s None
XLMR OOTB eval default-config 1.015586 ex/s 16.463461
"""
    assert (benchmark._process_raw_result(0, raw_output))
    assert (benchmark.result['dlrm_ootb_eval_tiny_ex_s'][0] == 152.800399)
    assert (benchmark.result['dlrm_ootb_train_tiny_ex_s'][0] == 35.483686)
    assert (benchmark.result['dlrm_ubench_train_linear_[(2,2,2,2,2)]_tf_s'][0] == 3.679281e-07)
    assert (benchmark.result['xlmr_ootb_eval_default_config_ex_s'][0] == 1.015586)
Example #5
    def test_tensorrt_inference_result_parsing(self, test_raw_log):
        """Test tensorrt-inference benchmark result parsing."""
        (benchmark_cls,
         _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(
            pytorch_models=['model_0', 'model_1'], log_raw_data=False)
        benchmark._result = BenchmarkResult(self.benchmark_name,
                                            BenchmarkType.MICRO,
                                            ReturnCode.SUCCESS,
                                            run_count=1)

        # Positive case - valid raw output
        self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

        self.assertEqual(6 + benchmark.default_metric_count,
                         len(benchmark.result))
        for tag in ['mean', '99']:
            self.assertEqual(0.5,
                             benchmark.result[f'model_0_gpu_time_{tag}'][0])
            self.assertEqual(0.6,
                             benchmark.result[f'model_0_host_time_{tag}'][0])
            self.assertEqual(
                1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])

        # Negative case - invalid raw output
        self.assertFalse(benchmark._process_raw_result(1,
                                                       'Invalid raw output'))
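
The tests above reach the registry's private selector through Python name mangling (`BenchmarkRegistry._BenchmarkRegistry__select_benchmark`). Below is a generic illustration of that mechanism only; the Registry class is hypothetical and unrelated to the project's actual registry.

# Generic name-mangling illustration (hypothetical Registry class, not SuperBench code):
# a double-underscore attribute __select defined inside the class body is stored as
# _Registry__select, which is the name callers outside the class must use.
class Registry:
    @classmethod
    def __select(cls, name):
        return 'selected ' + name


assert Registry._Registry__select('fambench') == 'selected fambench'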
Example #6
def test_set_return_code():
    """Test interface BenchmarkResult.set_return_code()."""
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    assert (result.return_code == ReturnCode.SUCCESS)
    assert (result.result['return_code'] == [ReturnCode.SUCCESS.value])
    result.set_return_code(ReturnCode.INVALID_ARGUMENT)
    assert (result.return_code == ReturnCode.INVALID_ARGUMENT)
    assert (result.result['return_code'] == [ReturnCode.INVALID_ARGUMENT.value])
    result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
    assert (result.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
    assert (result.result['return_code'] == [ReturnCode.INVALID_BENCHMARK_RESULT.value])
Example #7
def test_serialize_deserialize():
    """Test serialization/deserialization and compare the results."""
    # Result with one metric.
    result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=2)
    result.add_result('metric1', 300, ReduceType.MAX)
    result.add_result('metric1', 200, ReduceType.MAX)
    result.add_result('metric2', 100, ReduceType.AVG)
    result.add_raw_data('metric1', [1, 2, 3], False)
    result.add_raw_data('metric1', [4, 5, 6], False)
    result.add_raw_data('metric1', [7, 8, 9], False)
    start_time = '2021-02-03 16:59:49'
    end_time = '2021-02-03 17:00:08'
    result.set_timestamp(start_time, end_time)
    result.set_benchmark_type(BenchmarkType.MICRO)

    expected = (
        '{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
        '"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
        '"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
        '"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, '
        '"reduce_op": {"return_code": null, "metric1": "max", "metric2": "avg"}}'
    )
    assert (result.to_string() == expected)
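
Since to_string() emits plain JSON, the serialized result can be read back with the standard library alone. The sketch below continues from the result object above and does not assume any project-level deserializer exists.

import json

# Sketch only: parse the serialized string back into a plain dict and spot-check fields.
data = json.loads(result.to_string())
assert data['name'] == 'pytorch-bert-base1'
assert data['run_count'] == 2
assert data['result']['metric1'] == [300, 200]
assert data['reduce_op']['metric2'] == 'avg'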
Example #8
def test_set_benchmark_type():
    """Test interface BenchmarkResult.set_benchmark_type()."""
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    result.set_benchmark_type(BenchmarkType.MICRO)
    assert (result.type == BenchmarkType.MICRO)
Example #9
def test_add_raw_data():
    """Test interface BenchmarkResult.add_raw_data()."""
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    result.add_raw_data('metric1', 'raw log 1', False)
    result.add_raw_data('metric1', 'raw log 2', False)
    assert (result.raw_data['metric1'][0] == 'raw log 1')
    assert (result.raw_data['metric1'][1] == 'raw log 2')
    assert (result.type == BenchmarkType.MICRO)
    assert (result.return_code == ReturnCode.SUCCESS)

    result = BenchmarkResult('model', BenchmarkType.MODEL, ReturnCode.SUCCESS)
    result.add_raw_data('metric1', [1, 2, 3], False)
    result.add_raw_data('metric1', [4, 5, 6], False)
    assert (result.raw_data['metric1'][0] == [1, 2, 3])
    assert (result.raw_data['metric1'][1] == [4, 5, 6])
    assert (result.type == BenchmarkType.MODEL)
    assert (result.return_code == ReturnCode.SUCCESS)

    # Test log_raw_data = True.
    result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
    result.add_raw_data('metric1', 'raw log 1', True)
    result.add_raw_data('metric1', 'raw log 2', True)
    assert (result.type == BenchmarkType.MICRO)
    assert (result.return_code == ReturnCode.SUCCESS)
    raw_data_file = os.path.join(os.getcwd(), 'rawdata.log')
    assert (os.path.isfile(raw_data_file))
    os.remove(raw_data_file)
Example #10
class Benchmark(ABC):
    """The base class of all benchmarks."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        self._name = name
        self._argv = list(filter(
            None,
            parameters.split(' '))) if parameters is not None else list()
        self._benchmark_type = None
        self._parser = argparse.ArgumentParser(
            add_help=False,
            usage=argparse.SUPPRESS,
            allow_abbrev=False,
            formatter_class=SortedMetavarTypeHelpFormatter,
        )
        self._args = None
        self._curr_run_index = 0
        self._result = None

    def add_parser_arguments(self):
        """Add the specified arguments."""
        self._parser.add_argument(
            '--run_count',
            type=int,
            default=1,
            required=False,
            help='The run count of benchmark.',
        )
        self._parser.add_argument(
            '--duration',
            type=int,
            default=0,
            required=False,
            help='The elapsed time of benchmark in seconds.',
        )
        self._parser.add_argument(
            '--log_raw_data',
            action='store_true',
            default=False,
            help=
            'Log raw data into file instead of saving it into result object.',
        )

    def get_configurable_settings(self):
        """Get all the configurable settings.

        Return:
            All configurable settings in raw string.
        """
        return self._parser.format_help().strip()

    def parse_args(self, ignore_invalid=False):
        """Parse the arguments.

        Return:
            ret (bool): whether parsing succeeded or not.
            args (argparse.Namespace): parsed arguments.
            unknown (list): unknown arguments.
        """
        try:
            args, unknown = self._parser.parse_known_args(self._argv)
        except BaseException as e:
            if ignore_invalid:
                logger.info(
                    'Missing or invalid parameters, will ignore the error and skip the args checking.'
                )
                return True, None, []
            else:
                logger.error(
                    'Invalid argument - benchmark: {}, message: {}.'.format(
                        self._name, str(e)))
                return False, None, []

        ret = True
        if len(unknown) > 0:
            logger.error(
                'Unknown arguments - benchmark: {}, unknown arguments: {}'.
                format(self._name, ' '.join(unknown)))
            ret = False

        return ret, args, unknown

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        self.add_parser_arguments()
        ret, self._args, unknown = self.parse_args()

        if not ret:
            self._result = BenchmarkResult(self._name, self._benchmark_type,
                                           ReturnCode.INVALID_ARGUMENT)
            return False

        self._result = BenchmarkResult(self._name,
                                       self._benchmark_type,
                                       ReturnCode.SUCCESS,
                                       run_count=self._args.run_count)

        if not isinstance(self._benchmark_type, BenchmarkType):
            logger.error(
                'Invalid benchmark type - benchmark: {}, type: {}'.format(
                    self._name, type(self._benchmark_type)))
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE)
            return False

        return True

    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        return True

    @abstractmethod
    def _benchmark(self):
        """Implementation for benchmarking."""
        pass

    def run(self):
        """Function to launch the benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        ret = True
        try:
            ret &= self._preprocess()
            if ret:
                self._start_time = datetime.utcnow().strftime(
                    '%Y-%m-%d %H:%M:%S')
                for self._curr_run_index in range(self._args.run_count):
                    ret &= self._benchmark()
                self._end_time = datetime.utcnow().strftime(
                    '%Y-%m-%d %H:%M:%S')
                self._result.set_timestamp(self._start_time, self._end_time)

                if ret:
                    ret &= self.__check_result_format()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.RUNTIME_EXCEPTION_ERROR)
            logger.error(
                'Run benchmark failed - benchmark: {}, message: {}'.format(
                    self._name, str(e)))
        finally:
            ret &= self._postprocess()

        return ret

    def __check_result_format(self):
        """Check the validation of result object.

        Return:
            True if the result is valid.
        """
        if (not self.__check_result_type()) or (
                not self.__check_summarized_result()) or (
                    not self.__check_raw_data()):
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
            return False

        return True

    def __check_result_type(self):
        """Check the type of result object.

        Return:
            True if the result is instance of BenchmarkResult.
        """
        if not isinstance(self._result, BenchmarkResult):
            logger.error(
                'Invalid benchmark result type - benchmark: {}, type: {}'.
                format(self._name, type(self._result)))
            return False

        return True

    def __is_list_type(self, data, t):
        """Check whether data is a list whose items are all of type t."""
        if isinstance(data, list) and all(
                isinstance(item, t) for item in data):
            return True
        return False

    def __is_list_list_type(self, data, t):
        """Check whether data is a list of lists whose inner items are all of type t."""
        if (self.__is_list_type(data, list) and all(
                isinstance(value, t) for item in data for value in item)):
            return True
        return False

    def __check_summarized_result(self):
        """Check the validation of summary result.

        Return:
            True if the summary result is instance of List[Number].
        """
        for metric in self._result.result:
            if not self.__is_list_type(self._result.result[metric],
                                       numbers.Number):
                logger.error(
                    'Invalid summarized result - benchmark: {}, metric: {}, result: {}.'
                    .format(self._name, metric, self._result.result[metric]))
                return False

        return True

    def __check_raw_data(self):
        """Check the validation of raw data.

        Return:
            True if the raw data is:
              instance of List[List[Number]] for BenchmarkType.MODEL.
              instance of List[str] for BenchmarkType.DOCKER.
              instance of List[List[Number]] or List[str] for BenchmarkType.MICRO.
        """
        for metric in self._result.raw_data:
            is_valid = True
            if self._benchmark_type == BenchmarkType.MODEL:
                is_valid = self.__is_list_list_type(
                    self._result.raw_data[metric], numbers.Number)
            elif self._benchmark_type == BenchmarkType.DOCKER:
                is_valid = self.__is_list_type(self._result.raw_data[metric],
                                               str)
            elif self._benchmark_type == BenchmarkType.MICRO:
                is_valid = self.__is_list_type(
                    self._result.raw_data[metric],
                    str) or self.__is_list_list_type(
                        self._result.raw_data[metric], numbers.Number)
            if not is_valid:
                logger.error(
                    'Invalid raw data type - benchmark: {}, metric: {}, raw data: {}.'
                    .format(self._name, metric, self._result.raw_data[metric]))
                return False

        return True

    def _process_percentile_result(self, metric, result, reduce_type=None):
        """Function to process the percentile results.

        Args:
            metric (str): metric name which is the key.
            result (List[numbers.Number]): numerical result.
            reduce_type (ReduceType): The type of reduce function.
        """
        if len(result) > 0:
            percentile_list = ['50', '90', '95', '99', '99.9']
            for percentile in percentile_list:
                self._result.add_result(
                    '{}_{}'.format(metric, percentile),
                    np.percentile(result,
                                  float(percentile),
                                  interpolation='nearest'), reduce_type)

    def print_env_info(self):
        """Print environments or dependencies information."""
        # TODO: will implement it when add real benchmarks in the future.
        pass

    @property
    def name(self):
        """Decoration function to access benchmark name."""
        return self._result.name

    @property
    def type(self):
        """Decoration function to access benchmark type."""
        return self._result.type

    @property
    def run_count(self):
        """Decoration function to access benchmark run_count."""
        return self._result.run_count

    @property
    def return_code(self):
        """Decoration function to access benchmark return_code."""
        return self._result.return_code

    @property
    def start_time(self):
        """Decoration function to access benchmark start_time."""
        return self._result.start_time

    @property
    def end_time(self):
        """Decoration function to access benchmark end_time."""
        return self._result.end_time

    @property
    def raw_data(self):
        """Decoration function to access benchmark raw_data."""
        return self._result.raw_data

    @property
    def result(self):
        """Decoration function to access benchmark result."""
        return self._result.result

    @property
    def serialized_result(self):
        """Decoration function to access benchmark result."""
        return self._result.to_string()

    @property
    def default_metric_count(self):
        """Decoration function to get the count of default metrics."""
        return self._result.default_metric_count
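
To show how the pieces of the base class fit together, here is a minimal end-to-end sketch: a hypothetical DummyLatencyBenchmark sets _benchmark_type in its constructor, implements _benchmark() to record raw data, a summarized metric, and percentiles, and is driven through run(). The class name, metric names, and import paths are assumptions, not part of the project.

# Minimal end-to-end sketch (hypothetical subclass and metric names; import paths
# assumed from the usual superbench layout and may need adjusting).
from superbench.benchmarks import BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark


class DummyLatencyBenchmark(Benchmark):
    """Hypothetical micro-benchmark used only to illustrate the run() flow."""
    def __init__(self, name, parameters=''):
        super().__init__(name, parameters)
        self._benchmark_type = BenchmarkType.MICRO

    def _benchmark(self):
        # One invocation per run index; the latency samples here are fabricated.
        latencies = [0.5, 0.6, 0.7]
        self._result.add_raw_data('latency', latencies, self._args.log_raw_data)
        self._result.add_result('latency_avg', sum(latencies) / len(latencies))
        # Reuse the base-class helper to emit latency_50/90/95/99/99.9 metrics.
        self._process_percentile_result('latency', latencies)
        return True


benchmark = DummyLatencyBenchmark('dummy-latency', parameters='--run_count 2')
assert benchmark.run()
assert benchmark.return_code == ReturnCode.SUCCESS
assert len(benchmark.result['latency_avg']) == 2    # one entry per run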
Example #11
def test_rocm_onnxruntime_performance():
    """Test onnxruntime model benchmark."""
    benchmark_name = 'onnxruntime-ort-models'
    (benchmark_class, predefine_params
     ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
         benchmark_name, Platform.ROCM)
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri ==
            'superbench/benchmark:rocm4.3.1-onnxruntime1.9.0')
    assert (benchmark._container_name == 'rocm-onnxruntime-model-benchmarks')
    assert (
        benchmark._entrypoint ==
        '/stage/onnxruntime-training-examples/huggingface/azureml/run_benchmark.sh'
    )
    assert (benchmark._cmd is None)
    benchmark._result = BenchmarkResult(benchmark._name,
                                        benchmark._benchmark_type,
                                        ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    raw_output = """
__superbench__ begin bert-large-uncased ngpu=1
    "samples_per_second": 21.829
__superbench__ begin bert-large-uncased ngpu=8
    "samples_per_second": 147.181
__superbench__ begin distilbert-base-uncased ngpu=1
    "samples_per_second": 126.827
__superbench__ begin distilbert-base-uncased ngpu=8
    "samples_per_second": 966.796
__superbench__ begin gpt2 ngpu=1
    "samples_per_second": 20.46
__superbench__ begin gpt2 ngpu=8
    "samples_per_second": 151.089
__superbench__ begin facebook/bart-large ngpu=1
    "samples_per_second": 66.171
__superbench__ begin facebook/bart-large ngpu=8
    "samples_per_second": 370.343
__superbench__ begin roberta-large ngpu=1
    "samples_per_second": 37.103
__superbench__ begin roberta-large ngpu=8
    "samples_per_second": 274.455
"""
    assert (benchmark._process_raw_result(0, raw_output))
    assert (
        benchmark.result['bert_large_uncased_ngpu_1_throughput'][0] == 21.829)
    assert (
        benchmark.result['bert_large_uncased_ngpu_8_throughput'][0] == 147.181)
    assert (benchmark.result['distilbert_base_uncased_ngpu_1_throughput'][0] ==
            126.827)
    assert (benchmark.result['distilbert_base_uncased_ngpu_8_throughput'][0] ==
            966.796)
    assert (benchmark.result['gpt2_ngpu_1_throughput'][0] == 20.46)
    assert (benchmark.result['gpt2_ngpu_8_throughput'][0] == 151.089)
    assert (
        benchmark.result['facebook_bart_large_ngpu_1_throughput'][0] == 66.171)
    assert (benchmark.result['facebook_bart_large_ngpu_8_throughput'][0] ==
            370.343)
    assert (benchmark.result['roberta_large_ngpu_1_throughput'][0] == 37.103)
    assert (benchmark.result['roberta_large_ngpu_8_throughput'][0] == 274.455)