Example #1
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        # Format the arguments
        self._args.operation = self._args.operation.lower()

        # Check the arguments and generate the commands
        op = self._args.operation
        if op not in self.__operations:
            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
            logger.error(
                'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'
                .format(self._name, op,
                        ' '.join(list(self.__operations.keys()))))
            return False
        else:
            self._bin_name = self.__operations[op]
            if not self._set_binary_path():
                return False

            command = os.path.join(self._args.bin_dir, self._bin_name)
            command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
                self._args.minbytes, self._args.maxbytes,
                str(self._args.stepfactor), str(self._args.ngpus),
                str(self._args.check), str(self._args.iters),
                str(self._args.warmup_iters))
            self._commands.append(command)

        return True

    def _process_raw_result(self, idx, raw_output):
        """Function to process raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            idx (int): the index corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        host = self.__hosts[idx]
        self._result.add_raw_data('raw_output_' + host, raw_output,
                                  self._args.log_raw_data)

        try:
            # If a socket error or exception happens during TCPing, record the result values as failed
            suc = 0
            fail = self._args.count
            minimum = 0.00
            maximum = 0.00
            average = 0.00
            rate = 0
            # Parse and add result from table-like output of TCPing
            if 'failure' not in raw_output:
                raw_output = raw_output.splitlines()
                labels = None
                for line in raw_output:
                    # Get the line of the table labels
                    if 'Host' in line:
                        labels = line.split('|')
                        labels = [label.strip() for label in labels]
                    if host in line:
                        res = line.split('|')
                        res = [result.strip() for result in res]
                        suc = int(res[labels.index('Successed')])
                        fail = int(res[labels.index('Failed')])
                        rate = float(
                            res[labels.index('Success Rate')].strip('%'))
                        minimum = float(
                            res[labels.index('Minimum')].strip('ms'))
                        maximum = float(
                            res[labels.index('Maximum')].strip('ms'))
                        average = float(
                            res[labels.index('Average')].strip('ms'))
            self._result.add_result(host + '_successed_count', suc)
            self._result.add_result(host + '_failed_count', fail)
            self._result.add_result(host + '_success_rate', rate)
            self._result.add_result(host + '_time_min', minimum)
            self._result.add_result(host + '_time_max', maximum)
            self._result.add_result(host + '_time_avg', average)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, address: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, host, raw_output,
                        str(e)))
            return False

        return True
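
A minimal sketch, not from the source, of the table-like TCPing output that _process_raw_result() above expects; the column layout, host address, and numbers are assumptions inferred from the parsing logic.

sample_host = '10.0.0.2'
sample_output = (
    '|    Host    | Successed | Failed | Success Rate | Minimum | Maximum | Average |\n'
    '|  10.0.0.2  |    10     |   0    |    100.0%    | 0.23ms  | 1.31ms  | 0.54ms  |'
)

labels = None
for line in sample_output.splitlines():
    if 'Host' in line:
        labels = [label.strip() for label in line.split('|')]
    if sample_host in line:
        res = [item.strip() for item in line.split('|')]
        print(int(res[labels.index('Successed')]))                   # 10
        print(float(res[labels.index('Success Rate')].strip('%')))   # 100.0
        print(float(res[labels.index('Average')].strip('ms')))       # 0.54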

Example #3
    def _benchmark(self):
        """Implementation for benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        for cmd_idx in range(len(self._commands)):
            logger.info(
                'Execute command - round: {}, benchmark: {}, command: {}.'.
                format(self._curr_run_index, self._name,
                       self._commands[cmd_idx]))
            output = run_command(self._commands[cmd_idx])
            if output.returncode != 0:
                self._result.set_return_code(
                    ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE)
                logger.error(
                    'DockerBenchmark execution failed - round: {}, benchmark: {}, error message: {}.'
                    .format(self._curr_run_index, self._name, output.stdout))
                return False
            else:
                if not self._process_raw_result(cmd_idx, output.stdout):
                    self._result.set_return_code(
                        ReturnCode.DOCKERBENCHMARK_RESULT_PARSING_FAILURE)
                    return False

        return True

    def _parse_rules(self, rules):
        """Parse the rules for result summary.

        Args:
            rules (dict): rules from rule yaml file

        Returns:
            bool: True if all rules are parsed successfully, otherwise False.
        """
        try:
            if not rules:
                logger.error('ResultSummary: get rules failed')
                return False
            self._sb_rules = {}
            self._enable_metrics = set()
            benchmark_rules = rules['superbench']['rules']
            for rule in benchmark_rules:
                benchmark_rules[rule] = self._check_rules(
                    benchmark_rules[rule], rule)
                self._sb_rules[rule] = {}
                self._sb_rules[rule]['name'] = rule
                self._sb_rules[rule]['categories'] = benchmark_rules[rule][
                    'categories']
                self._sb_rules[rule]['metrics'] = {}
                self._sb_rules[rule]['statistics'] = benchmark_rules[rule][
                    'statistics']
                self._sb_rules[rule]['aggregate'] = benchmark_rules[rule][
                    'aggregate'] if 'aggregate' in benchmark_rules[
                        rule] else False
                super()._get_metrics(rule, benchmark_rules)
            return True
        except Exception as e:
            logger.error('ResultSummary: parse rules failed - {}'.format(
                str(e)))
            return False
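
A hedged sketch of the rules structure _parse_rules() above reads, reconstructed from the keys it accesses; the rule name and values below are illustrative only.

example_rules = {
    'superbench': {
        'rules': {
            'example-rule': {                      # hypothetical rule name
                'categories': 'KernelLaunch',      # category recorded in the summary
                'statistics': ['mean', 'p90'],     # statistics to compute per metric
                'aggregate': True,                 # optional, treated as False when absent
                'metrics': ['kernel-launch/.*'],   # metric patterns resolved by _get_metrics()
            },
        },
    },
}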

Example #5
    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            if self._args.distributed_impl == DistributedImpl.DDP:
                torch.distributed.barrier()
                torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error(
                'Post process failed - model: {}, distributed implementation: {}, message: {}.'
                .format(self._name, self._args.distributed_impl, str(e)))
            return False

        if self._gpu_available:
            torch.cuda.synchronize()
        del self._target
        del self._optimizer
        del self._model
        if self._gpu_available:
            torch.cuda.empty_cache()

        return True

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        # Format the arguments
        self._args.mem_type = [p.lower() for p in self._args.mem_type]

        # Check the arguments and generate the commands
        if self._args.memory not in self._memory:
            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
            logger.error(
                'Unsupported memory of bandwidth test - benchmark: {}, memory: {}, expected: {}.'
                .format(self._name, self._args.memory, ' '.join(self._memory)))
            return False
        for mem_type in self._args.mem_type:
            if mem_type not in self._mem_types:
                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                logger.error(
                    'Unsupported mem_type of bandwidth test - benchmark: {}, mem_type: {}, expected: {}.'
                    .format(self._name, mem_type, ' '.join(self._mem_types)))
                return False

        return True

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        mlc_path = os.path.join(self._args.bin_dir, self._bin_name)
        ret_val = os.access(mlc_path, os.X_OK | os.F_OK)
        if not ret_val:
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(
                    self._bin_name, self._args.bin_dir))
            return False

        # The mlc command requires hugepages to be enabled
        mlc_wrapper = ' '.join([
            'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;',
            'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;', 'err=$?;',
            'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
        ])
        for test in self._args.tests:
            command = mlc_path + ' --%s' % test
            self._commands.append(mlc_wrapper % command)
        return True
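
A small sketch of the shell line the hugepage wrapper above produces for one test; the binary path and the --bandwidth_matrix test name are illustrative values only.

mlc_wrapper = ' '.join([
    'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;',
    'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;', 'err=$?;',
    'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
])
command = '/usr/local/bin/mlc --bandwidth_matrix'   # hypothetical mlc_path and test
print(mlc_wrapper % command)
# Printed as a single line (wrapped here for readability):
# nr_hugepages=`cat /proc/sys/vm/nr_hugepages`; echo 4000 > /proc/sys/vm/nr_hugepages;
# /usr/local/bin/mlc --bandwidth_matrix; err=$?;
# echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages; (exit $err)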

Example #8
    def _process_numeric_result(self, metric, result, reduce_type=None, cal_percentile=False):
        """Function to save the numerical results.

        Args:
            metric (str): metric name which is the key.
            result (List[numbers.Number]): numerical result.
            reduce_type (ReduceType): The type of reduce function.
            cal_percentile (bool): Whether to calculate the percentile results.

        Return:
            True if result list is not empty.
        """
        if len(result) == 0:
            logger.error(
                'Numerical result of benchmark is empty - round: {}, name: {}.'.format(
                    self._curr_run_index, self._name
                )
            )
            return False

        self._result.add_raw_data(metric, result, self._args.log_raw_data)
        self._result.add_result(metric, statistics.mean(result), reduce_type)
        if cal_percentile:
            self._process_percentile_result(metric, result, reduce_type)

        return True

Example #9
    def __prepare_general_ib_command_params(self):
        """Prepare general params for ib commands.

        Returns:
            String of ib command params if the arguments are valid, otherwise False.
        """
        # Format the ib command type
        self._args.command = self._args.command.lower()
        # Add message size for ib command
        msg_size = f'-s {self._args.msg_size}' if self._args.msg_size > 0 else '-a'
        # Add GPUDirect for ib command
        gpu_dev = ''
        if self._args.gpu_dev is not None:
            gpu = GPU()
            if gpu.vendor == 'nvidia':
                gpu_dev = f'--use_cuda={self._args.gpu_dev}'
            elif gpu.vendor == 'amd':
                gpu_dev = f'--use_rocm={self._args.gpu_dev}'
            else:
                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                logger.error('No GPU found - benchmark: {}'.format(self._name))
                return False
        # Generate ib command params
        command_params = f'-F -n {self._args.iters} -d {self._args.ib_dev} {msg_size} {gpu_dev}'
        command_params = f'{command_params.strip()} --report_gbits'
        return command_params
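
A hedged illustration of the parameter string __prepare_general_ib_command_params() returns; the iteration count, IB device, and GPU index are made-up values.

iters, ib_dev, gpu_index = 2000, 'mlx5_0', 0        # made-up values
msg_size = '-a'                                     # '-a' sweeps all sizes when msg_size <= 0
gpu_dev = f'--use_cuda={gpu_index}'                 # NVIDIA path; AMD would use --use_rocm=<index>
command_params = f'-F -n {iters} -d {ib_dev} {msg_size} {gpu_dev}'
print(f'{command_params.strip()} --report_gbits')
# -F -n 2000 -d mlx5_0 -a --use_cuda=0 --report_gbits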

Example #10
    def parse_args(self, ignore_invalid=False):
        """Parse the arguments.

        Return:
            ret (bool): whether the parsing succeeded or not.
            args (argparse.Namespace): parsed arguments.
            unknown (list): unknown arguments.
        """
        try:
            args, unknown = self._parser.parse_known_args(self._argv)
        except BaseException as e:
            if ignore_invalid:
                logger.info(
                    'Missing or invalid parameters, will ignore the error and skip the argument check.'
                )
                return True, None, []
            else:
                logger.error(
                    'Invalid argument - benchmark: {}, message: {}.'.format(
                        self._name, str(e)))
                return False, None, []

        ret = True
        if len(unknown) > 0:
            logger.error(
                'Unknown arguments - benchmark: {}, unknown arguments: {}'.
                format(self._name, ' '.join(unknown)))
            ret = False

        return ret, args, unknown
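
A minimal sketch of the argparse.parse_known_args() behavior parse_args() relies on; the parser and flags here are illustrative, not the benchmark's real arguments.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--run_count', type=int, default=1)
args, unknown = parser.parse_known_args(['--run_count', '2', '--no_such_flag'])
print(args.run_count)   # 2
print(unknown)          # ['--no_such_flag'] -> reported as unknown arguments above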

Example #11
    def _set_binary_path(self):
        """Search the binary from self._args.bin_dir or from system environment path and set the binary directory.

        If self._args.bin_dir is specified, the binary is only searched inside it. Otherwise, the binary is searched
        from system environment path.

        Return:
            True if the binary exists.
        """
        if self._bin_name is None:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_BINARY_NAME_NOT_SET)
            logger.error('The binary name is not set - benchmark: {}.'.format(self._name))
            return False

        self._args.bin_dir = shutil.which(self._bin_name, mode=os.X_OK, path=self._args.bin_dir)

        if self._args.bin_dir is None:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_BINARY_NOT_EXIST)
            logger.error(
                'The binary does not exist - benchmark: {}, binary name: {}, binary directory: {}.'.format(
                    self._name, self._bin_name, self._args.bin_dir
                )
            )
            return False

        self._args.bin_dir = os.path.dirname(self._args.bin_dir)

        return True
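
A small sketch of the shutil.which() behavior _set_binary_path() builds on: with an explicit path argument only that location is searched, and with path=None the system PATH is used. The binary name below is illustrative.

import os
import shutil

full_path = shutil.which('ls', mode=os.X_OK, path='/bin')   # e.g. '/bin/ls', or None if absent
if full_path is not None:
    print(os.path.dirname(full_path))                        # '/bin'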

Example #12
    def __check_raw_data(self):
        """Check the validation of raw data.

        Return:
            True if the raw data is:
              instance of List[List[Number]] for BenchmarkType.MODEL.
              instance of List[str] for BenchmarkType.DOCKER.
              instance of List[List[Number]] or List[str] for BenchmarkType.MICRO.
        """
        for metric in self._result.raw_data:
            is_valid = True
            if self._benchmark_type == BenchmarkType.MODEL:
                is_valid = self.__is_list_list_type(
                    self._result.raw_data[metric], numbers.Number)
            elif self._benchmark_type == BenchmarkType.DOCKER:
                is_valid = self.__is_list_type(self._result.raw_data[metric],
                                               str)
            elif self._benchmark_type == BenchmarkType.MICRO:
                is_valid = self.__is_list_type(
                    self._result.raw_data[metric],
                    str) or self.__is_list_list_type(
                        self._result.raw_data[metric], numbers.Number)
            if not is_valid:
                logger.error(
                    'Invalid raw data type - benchmark: {}, metric: {}, raw data: {}.'
                    .format(self._name, metric, self._result.raw_data[metric]))
                return False

        return True

Example #13
    def run(self):
        """Function to launch the benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        ret = True
        try:
            ret &= self._preprocess()
            if ret:
                self._start_time = datetime.utcnow().strftime(
                    '%Y-%m-%d %H:%M:%S')
                for self._curr_run_index in range(self._args.run_count):
                    ret &= self._benchmark()
                self._end_time = datetime.utcnow().strftime(
                    '%Y-%m-%d %H:%M:%S')
                self._result.set_timestamp(self._start_time, self._end_time)

                if ret:
                    ret &= self.__check_result_format()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.RUNTIME_EXCEPTION_ERROR)
            logger.error(
                'Run benchmark failed - benchmark: {}, message: {}'.format(
                    self._name, str(e)))
        finally:
            ret &= self._postprocess()

        return ret

Example #14
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        self.add_parser_arguments()
        ret, self._args, unknown = self.parse_args()

        if not ret:
            self._result = BenchmarkResult(self._name, self._benchmark_type,
                                           ReturnCode.INVALID_ARGUMENT)
            return False

        self._result = BenchmarkResult(self._name,
                                       self._benchmark_type,
                                       ReturnCode.SUCCESS,
                                       run_count=self._args.run_count)

        if not isinstance(self._benchmark_type, BenchmarkType):
            logger.error(
                'Invalid benchmark type - benchmark: {}, type: {}'.format(
                    self._name, type(self._benchmark_type)))
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE)
            return False

        return True

Example #15
    def __init__(self, shape, world_size, dtype=torch.float):
        """Constructor.

        Args:
            shape (List[int]): Shape of dataset.
            world_size (int): Number of workers.
            dtype (torch.dtype): Type of the elements.
        """
        self._len = 0
        self._data = None

        try:
            if dtype in [torch.float32, torch.float64]:
                self._data = torch.randn(*shape, dtype=dtype)
            elif dtype in [torch.int8, torch.int16, torch.int32, torch.int64]:
                self._data = torch.randint(0, 128, tuple(shape), dtype=dtype)
            else:
                logger.error(
                    'Unsupported precision for RandomDataset - data type: {}.'.
                    format(dtype))
                return
        except BaseException as e:
            logger.error(
                'Generate random dataset failed - data type: {}, shape: {}, message: {}.'
                .format(dtype, shape, str(e)))
            return

        self._len = shape[0] * world_size
        self._world_size = world_size

Example #16
    def __get_network_bytes(self):
        """Method to get the network traffic information, unit: bytes.

        Return:
            The bytes transferred on the network; None means the data could not be obtained.
        """
        net_info = dict()
        try:
            with open(self._net_file, 'r') as f:
                for line in f:
                    items = line.split()
                    if len(items) != 17:
                        continue
                    else:
                        receive_bytes = int(items[1])
                        transmit_bytes = int(items[9])
                        net_info[items[0].strip()[:-1]] = [
                            receive_bytes, transmit_bytes
                        ]
            return net_info
        except BaseException as e:
            logger.error(
                'Failed to read network traffic information - error message: {}'
                .format(str(e)))

        return None
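
A hedged sample of the /proc/net/dev line format that __get_network_bytes() parses: 17 whitespace-separated fields per interface, with receive bytes in field 1 and transmit bytes in field 9. The interface name and counter values are invented.

sample_line = 'eth0: 170853033 2519 0 0 0 0 0 0 160993721 2772 0 0 0 0 0 0'
items = sample_line.split()
print(len(items))                     # 17
print(items[0].strip()[:-1])          # 'eth0'
print(int(items[1]), int(items[9]))   # 170853033 160993721 (receive bytes, transmit bytes)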

Example #17
    def _create_model(self, precision):
        """Construct the model for benchmarking.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.
        """
        self._config = GPT2Config(n_embd=self._args.hidden_size,
                                  n_layer=self._args.num_hidden_layers,
                                  n_head=self._args.num_attention_heads)

        try:
            self._model = GPT2BenchmarkModel(self._config,
                                             self._args.num_classes)
            self._model = self._model.to(dtype=getattr(torch, precision.value))
            if self._gpu_available:
                self._model = self._model.cuda()
        except BaseException as e:
            logger.error(
                'Create model with specified precision failed - model: {}, precision: {}, message: {}.'
                .format(self._name, precision, str(e)))
            return False

        self._target = torch.LongTensor(self._args.batch_size).random_(
            self._args.num_classes)
        if self._gpu_available:
            self._target = self._target.cuda()

        return True
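
A minimal sketch of the precision cast used in _create_model() above, with the stock transformers GPT2Model standing in for GPT2BenchmarkModel; the config sizes and the 'float16' string (playing the role of Precision.value) are arbitrary.

import torch
from transformers import GPT2Config, GPT2Model

config = GPT2Config(n_embd=256, n_layer=2, n_head=4)           # small, arbitrary sizes
model = GPT2Model(config).to(dtype=getattr(torch, 'float16'))  # cast all weights to fp16
print(next(model.parameters()).dtype)                          # torch.float16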

Example #18
    def run(self):
        """Method representing the process’s activity.

        Return:
            True if launching the process succeeds.
        """
        if self.__running.value == 0:
            if not self.__preprocess():
                return False

            try:
                logger.info('Start monitoring.')
                self.__running.value = 1
                self.__sample()
                self.__scheduler.run()
            except BaseException as e:
                logger.error(
                    'Failed to launch the monitor process - error message: {}'.
                    format(str(e)))
                self.stop()
                return False
        else:
            logger.error('Monitor is still running')

        return True

Example #19
    def __kernel_nccl_pipeline(self, kernel, matA, matB, stages, message,
                               times):
        """Computation and NCCL kernel pipeline with single GPU.

        Args:
            kernel (ComputationKernelType): the type of the computation kernel to run.
            matA (list[tensor]): the matrix list used in matmul or mul for every stage.
            matB (tensor): the matrix used in matmul.
            stages (int): the ratio of computation kernels to communication kernels.
            message (tensor): the data to be transferred through NCCL.
            times (int): number of times to run in one step.

        Return:
            True or False: False if the computation kernel type is invalid, otherwise True.
        """
        if kernel == ComputationKernelType.MUL:
            for i in range(times):
                torch.distributed.all_reduce(message,
                                             op=torch.distributed.ReduceOp.SUM,
                                             async_op=True)
                for stage in range(stages):
                    matA[stage].mul(matA[stage])
        elif kernel == ComputationKernelType.MATMUL:
            for i in range(times):
                torch.distributed.all_reduce(message,
                                             op=torch.distributed.ReduceOp.SUM,
                                             async_op=True)
                for stage in range(stages):
                    matA[stage].matmul(matB)
        else:
            logger.error(
                'Unknown computation kernel type - benchmark: {}, type: {}.'.
                format(self._name, kernel))
            return False
        return True

Example #20
    def output_diagnosis_in_jsonl(self, data_not_accept_df, output_path):
        """Output data_not_accept_df into jsonl file.

        Args:
            data_not_accept_df (DataFrame): the DataFrame to output
            output_path (str): the path of output jsonl file
        """
        p = Path(output_path)
        try:
            # Validate the input before converting it to JSON
            if not isinstance(data_not_accept_df, pd.DataFrame):
                logger.warning('DataDiagnosis: output json data - data_not_accept_df is not DataFrame.')
                return
            if data_not_accept_df.empty:
                logger.warning('DataDiagnosis: output json data - data_not_accept_df is empty.')
                return
            data_not_accept_json = data_not_accept_df.to_json(orient='index')
            data_not_accept = json.loads(data_not_accept_json)
            with p.open('w') as f:
                for node in data_not_accept:
                    line = data_not_accept[node]
                    line['Index'] = node
                    json_str = json.dumps(line)
                    f.write(json_str + '\n')
        except Exception as e:
            logger.error('DataDiagnosis: output json data failed, msg: {}'.format(str(e)))
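
A hedged sketch of the jsonl layout output_diagnosis_in_jsonl() writes: one JSON object per DataFrame row, with the row index stored under 'Index'. The node name and metric below are invented.

import json
import pandas as pd

df = pd.DataFrame({'kernel-launch/event_time': [0.0052]}, index=['node-0'])
for node, line in json.loads(df.to_json(orient='index')).items():
    line['Index'] = node
    print(json.dumps(line))   # {"kernel-launch/event_time": 0.0052, "Index": "node-0"}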

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output,
                                  self._args.log_raw_data)

        try:
            output_lines = [x.strip() for x in raw_output.strip().splitlines()]
            for output_line in output_lines:
                tag, bw_str = output_line.split()
                self._result.add_result(tag + '_bw', float(bw_str))
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, raw_output, str(e)))
            return False

        return True
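
A small sketch (values invented) of the two-column raw output the parser above expects: one '<tag> <bandwidth>' pair per line.

raw_output = 'H2D_Memcpy 26.1\nD2H_Memcpy 24.3\n'
for output_line in [x.strip() for x in raw_output.strip().splitlines()]:
    tag, bw_str = output_line.split()
    print(tag + '_bw', float(bw_str))   # H2D_Memcpy_bw 26.1 / D2H_Memcpy_bw 24.3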

Example #22
    def _parse_rules_and_baseline(self, rules, baseline):
        """Parse and merge rules and baseline read from file.

        Args:
            rules (dict): rules from rule yaml file
            baseline (dict): baseline of metrics from baseline json file

        Returns:
            bool: True if the criteria for all rules are obtained successfully, otherwise False.
        """
        try:
            if not rules:
                logger.error('DataDiagnosis: get criteria failed')
                return False
            self._sb_rules = {}
            self._enable_metrics = set()
            benchmark_rules = rules['superbench']['rules']
            self._raw_rules = benchmark_rules
            for rule in benchmark_rules:
                benchmark_rules[rule] = self._check_and_format_rules(benchmark_rules[rule], rule)
                self._sb_rules[rule] = {}
                self._sb_rules[rule]['name'] = rule
                self._sb_rules[rule]['function'] = benchmark_rules[rule]['function']
                self._sb_rules[rule]['store'] = True if 'store' in benchmark_rules[
                    rule] and benchmark_rules[rule]['store'] is True else False
                self._sb_rules[rule]['criteria'] = benchmark_rules[rule]['criteria']
                self._sb_rules[rule]['categories'] = benchmark_rules[rule]['categories']
                self._sb_rules[rule]['metrics'] = {}
                self.__get_metrics_and_baseline(rule, benchmark_rules, baseline)
            self._enable_metrics = sorted(list(self._enable_metrics))
        except Exception as e:
            logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e)))
            return False

        return True

Example #23
    def _sync_result(self, result):
        """Function to reduce the result to rank 0.

        Args:
            result (list): The result data to sync.

        Return:
            The reduced result if the data is reduced successfully, otherwise None.
        """
        result = super()._sync_result(result)
        if not result:
            return None

        try:
            if self._args.distributed_impl == DistributedImpl.DDP:
                if self._args.distributed_backend == DistributedBackend.NCCL:
                    tensor = torch.as_tensor(result).cuda()
                else:
                    tensor = torch.as_tensor(result)
                torch.distributed.all_reduce(tensor,
                                             op=torch.distributed.ReduceOp.MAX)
                result = tensor.tolist()
        except BaseException as e:
            logger.error(
                'Sync train result failed - model: {}, distributed implementation: {}, message: {}.'
                .format(self._name, self._args.distributed_impl, str(e)))
            return None

        return result

Example #24
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        if self._args.distributed_impl != DistributedImpl.DDP:
            self._result.set_return_code(
                ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
            logger.error(
                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'
                .format(self._name, self._args.distributed_impl))
            return False

        if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
            try:
                torch.distributed.init_process_group(backend='nccl')
                self.__world_size = int(os.environ['WORLD_SIZE'])
                self.__local_rank = int(os.environ['LOCAL_RANK'])
            except BaseException as e:
                self._result.set_return_code(
                    ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
                torch.distributed.destroy_process_group()
                logger.error(
                    'Initialize distributed env failed - benchmark: {}, message: {}.'
                    .format(self._name, str(e)))
                return False

        if torch.cuda.is_available():
            torch.cuda.set_device(self.__local_rank)

        return True

    def _benchmark(self):
        """Implementation for benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        logger.info('TCP validation - round: {0}, name: {1}'.format(
            self._curr_run_index, self._name))

        # Run TCPing against each host in the hostfile in parallel
        try:
            outputs = Parallel(
                n_jobs=min(len(self.__hosts), self._args.parallel))(
                    delayed(run_tcping)(self.__hosts[i], self._args.port,
                                        self._args.count, self._args.timeout)
                    for i in (range(len(self.__hosts))))
        except Exception as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
            logger.error(
                'Microbenchmark execution failed - round: {}, benchmark: {}, error message: {}.'
                .format(self._curr_run_index, self._name, str(e)))
            return False

        # Parse the output and get the results
        for host_index, out in enumerate(outputs):
            if not self._process_raw_result(host_index, out):
                self._result.set_return_code(
                    ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
                return False

        return True

Example #26
    def _benchmark(self):
        """Implementation for benchmarking."""
        M = self._args.m
        K = self._args.k
        N = self._args.n
        for mode in self._args.mode:
            if mode == ShardingMode.NOSHARDING:
                elapse_times = self.__matmul_nosharding(M, K, N)
            elif mode == ShardingMode.ALLREDUCE:
                elapse_times = self.__matmul_allreduce(M, K, N)
            elif mode == ShardingMode.ALLGATHER:
                elapse_times = self.__matmul_allgather(M, K, N)
            else:
                logger.error(
                    'Unknown sharding mode - benchmark: {}, mode: {}.'.format(
                        self._name, mode))
                return False

            metric = '{}_time'.format(mode)
            if not self._process_numeric_result(
                    metric, elapse_times, reduce_type=ReduceType.MAX):
                return False

            logger.info(
                'Matmul sharding - round: {0}, name: {1}, shape: ({2}, {3}) * ({3}, {4}), mode: {5}, cost: {6} ms'
                .format(self._curr_run_index, self._name, M, K, N, mode,
                        statistics.mean(elapse_times)))

        return True

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        # Check if the content of hostfile is valid and not empty
        valid = True
        try:
            with open(self._args.hostfile, 'r') as f:
                self.__hosts = f.readlines()
            for i in range(0, len(self.__hosts)):
                self.__hosts[i] = self.__hosts[i].rstrip('\n')
        except Exception:
            valid = False
        if not valid or len(self.__hosts) == 0:
            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
            logger.error(
                'Invalid hostfile - benchmark: {}, hostfile: {}.'.format(
                    self._name, self._args.hostfile))
            return False

        return True

Example #28
def correlation(raw_data_df):
    """Get the correlations.

    Args:
        raw_data_df (DataFrame): raw data

    Returns:
        DataFrame: correlations
    """
    data_corr_df = pd.DataFrame()
    if not isinstance(raw_data_df, pd.DataFrame):
        logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
        return data_corr_df
    if len(raw_data_df) == 0:
        logger.warning('DataAnalyzer: empty data.')
        return data_corr_df
    try:
        data_corr_df = raw_data_df.corr()
        statistics_error = []
        for column in list(raw_data_df.columns):
            if column not in list(data_corr_df.columns
                                  ) and not raw_data_df[column].isnull().all():
                statistics_error.append(column)
        if statistics_error:
            logger.warning(
                'DataAnalyzer: [{}] is missing in correlation results.'.format(
                    ','.join(str(x) for x in statistics_error)))
    except Exception as e:
        logger.error('DataAnalyzer: correlation failed, msg: {}'.format(
            str(e)))
    return data_corr_df
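
A small sketch, not from the source, of the pandas behavior correlation() builds on: DataFrame.corr() keeps only numeric columns, which is why the function above warns about metrics missing from the result. Metric names and values are invented; numeric_only=True is passed so the sketch runs on recent pandas versions.

import pandas as pd

raw_data_df = pd.DataFrame({
    'kernel-launch/event_time': [0.0052, 0.0049, 0.0061],
    'kernel-launch/wall_time': [0.0063, 0.0058, 0.0075],
    'hostname': ['node-0', 'node-1', 'node-2'],   # non-numeric, dropped from the correlation
})
data_corr_df = raw_data_df.corr(numeric_only=True)
# Simplified check; the function above also skips columns that are entirely null.
missing = [c for c in raw_data_df.columns if c not in data_corr_df.columns]
print(missing)   # ['hostname']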

Example #29
    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output,
                                  self._args.log_raw_data)

        pattern = r'\d+\.\d+'
        result = re.findall(pattern, raw_output)
        if len(result) != 2:
            logger.error(
                'Cannot extract kernel launch overhead in event and wall mode - round: {}, benchmark: {}, raw data: {}.'
                .format(self._curr_run_index, self._name, raw_output))
            return False

        try:
            result = [float(item) for item in result]
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, result: {}, message: {}.'
                .format(self._curr_run_index, self._name, result, str(e)))
            return False

        self._result.add_result('event_time', result[0])
        self._result.add_result('wall_time', result[1])

        return True
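
A hedged sketch of the raw output the regex above matches: exactly two floating-point numbers, the event-mode and wall-mode overheads. Wording and numbers are illustrative.

import re

raw_output = 'Kernel launch overhead: event mode 0.00520 ms, wall mode 0.00630 ms'
result = re.findall(r'\d+\.\d+', raw_output)
print(result)                        # ['0.00520', '0.00630']
print([float(x) for x in result])    # [0.0052, 0.0063]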

Example #30
    def add_result(self, metric, value, reduce_type=None):
        """Add summarized data into result.

        Args:
            metric (str): metric name which is the key.
            value (float): summarized data.
              For e2e model benchmarks, the value is step-time or throughput.
              For micro-benchmarks, the value is FLOPS, bandwidth, etc.
            reduce_type (ReduceType): type of reduce function.

        Return:
            True if the result is added successfully.
        """
        if not metric or not isinstance(metric, str):
            logger.error(
                'metric name of benchmark is not string, name: {}, metric type: {}'
                .format(self.__name, type(metric)))
            return False

        if metric not in self.__result:
            self.__result[metric] = list()
            self.__reduce_op[metric] = reduce_type.value if isinstance(
                reduce_type, Enum) else None
        self.__result[metric].append(value)

        return True