示例#1
0
def collect_local_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy one category of log files from a local report directory.

    The destination is ``<collect_path>/<basename(report_path)>``; it is
    validated once via ``utils.check_path_valid`` before anything is copied.

    :param report_path: the local report path
    :param key: the key in slog_conf_path; ``Constant.DIR_SLOG`` and
        ``'MNTN_PATH'`` are handled, any other key copies nothing
    :param collect_path: the collect path
    :return: the destination directory for the collected files
    :raises utils.AicErrException: when ``key`` is ``Constant.DIR_SLOG`` and
        the slog directory is missing from ``report_path``
    """
    collect_target_path = os.path.join(collect_path,
                                       os.path.basename(report_path))
    utils.check_path_valid(collect_target_path, isdir=True, output=True)

    if key == Constant.DIR_SLOG:
        slog_report_path = os.path.join(report_path, key)
        # os.path.isdir() is already False for a path that does not exist.
        if not os.path.isdir(slog_report_path):
            # slog is mandatory: without it the collection cannot continue.
            utils.print_error_log('There is no %s in %s.' % (key, report_path))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        copy_file_to_dest(slog_report_path, key, collect_target_path,
                          report_path)
    elif key == 'MNTN_PATH':
        # The target path was computed and validated above; doing it a
        # second time here was redundant.
        hisi_report_path = os.path.join(report_path, Constant.DIR_BBOX)
        if os.path.isdir(hisi_report_path):
            copy_file_to_dest(hisi_report_path, Constant.DIR_BBOX,
                              collect_target_path, report_path)
        else:
            # bbox logs are optional: warn and carry on.
            utils.print_warn_log('There is no hisi_logs in %s.' % report_path)
    return collect_target_path
示例#2
0
    def copy_kernel_meta(self, report_path: str, collect_compile_path: str, kernel_name: str) -> bool:
        """
        Copy the kernel_meta files of one kernel into the collection.

        Files are searched under ``<self.report_path>/extra-info/ops``; every
        file whose name starts with ``kernel_name`` is copied to
        ``<collect_compile_path>/kernel_meta``.

        :param report_path: the local compile path (only used in the warning
            message; the search itself uses ``self.report_path``)
        :param collect_compile_path: the collect compile path
        :param kernel_name: the kernel name
        :return: True when at least one file was copied, otherwise False
        """
        ops_dir = os.path.join(self.report_path, "extra-info", "ops")
        target_dir = os.path.join(collect_compile_path, "kernel_meta")
        found = False

        if os.path.exists(ops_dir):
            for current_dir, _, file_names in os.walk(ops_dir):
                for file_name in file_names:
                    if not file_name.startswith(kernel_name):
                        continue
                    # Validate the target only once a match exists, so the
                    # kernel_meta directory is not touched without a match.
                    utils.check_path_valid(target_dir, isdir=True,
                                           output=True)
                    utils.copy_file(os.path.join(current_dir, file_name),
                                    os.path.join(target_dir, file_name))
                    found = True

        if not found:
            utils.print_warn_log('There is no kernel_meta file for "%s" in %s.'
                                 % (kernel_name, report_path))
        return found
示例#3
0
 def collect_plog_file(collect_path: str) -> None:
     """
     Copy the plog files from the current user's home directory.

     The source is ``~/<Constant.DIR_ASCEND>/<Constant.DIR_LOG>``; the
     destination is a directory of the same base name under ``collect_path``.

     :param collect_path: the collect path
     """
     log_root = os.path.join(os.path.expanduser("~"),
                             Constant.DIR_ASCEND,
                             Constant.DIR_LOG)
     destination = os.path.join(collect_path, os.path.basename(log_root))
     utils.check_path_valid(destination, isdir=True, output=True)
     copy_file_to_dest(log_root, Constant.DIR_PLOG, destination, log_root)
示例#4
0
    def collect(self: any) -> None:
        """
        Collect every artifact needed for the later analysis.

        Steps, in order: validate the arguments, prepare
        ``<output_path>/collection``, collect slog and plog, parse the host
        log to obtain the ai core error list together with the node and
        kernel name lists, then collect compile, dump and bbox files.
        The resulting locations are stored on ``self`` (``collect_slog_path``,
        ``collect_applog_path``, ``collect_compile_path``,
        ``collect_dump_path``, ``collect_bbox_path``).
        """
        self.check_argument_valid()
        collect_path = os.path.join(self.output_path, 'collection')
        utils.check_path_valid(collect_path, isdir=True, output=True)
        utils.print_info_log('******************Collection******************')

        # collect slog
        utils.print_info_log('Start to collect slog file.')
        self.collect_slog_path = self.collect_slog_file(
            self.report_path, collect_path)
        utils.print_info_log('The slog file is saved in %s.' %
                             self.collect_slog_path)

        # collect plog
        utils.print_info_log('Start to collect plog file.')
        # NOTE(review): ``self`` is passed explicitly in addition to the bound
        # call; this only works if collect_plog_file is declared to accept it
        # (e.g. a staticmethod taking ``self``) - confirm against its definition.
        self.collect_plog_file(self, collect_path)
        # plog is searched from the collection root, not from a sub-directory
        self.collect_applog_path = collect_path
        utils.print_info_log('The plog file is saved in %s.' %
                             self.collect_applog_path)

        # Disabled alternative: parse with both slog and plog when device
        # logs are present in the report.
        # if os.path.exists(os.path.join(self.report_path, "log", "device")):
        #     utils.print_info_log(
        #         'Start to parse ai core error by slog and plog file.')
        #     log_parser = DeviceLogParser(self.collect_applog_path, self.collect_slog_path)
        # else:
        #     # in some scenarios the device log cannot be obtained
        utils.print_info_log('Start to parse ai core error only by plog file.')
        log_parser = HostLogParser(self.collect_applog_path)
        self.ai_core_error_list, self.node_name_list, self.kernel_name_list = log_parser.get_op_info(
        )
        utils.print_info_log('The ai core error occurs in %s.' %
                             self.node_name_list)

        # collect compile
        utils.print_info_log('Start to collect compile file.')
        self.collect_compile_path = self.collect_compile_file(
            collect_path, self.kernel_name_list)

        # collect dump
        utils.print_info_log('Start to collect dump file.')
        self.collect_dump_path = self.collect_dump_file(
            collect_path, self.node_name_list)

        # collect bbox
        utils.print_info_log('Start to collect bbox file.')
        self.collect_bbox_path = self.collect_bbox_file(
            self.report_path, collect_path)
        utils.print_info_log('The bbox file is saved in %s.' %
                             self.collect_bbox_path)
示例#5
0
    def collect_dump_file(self: any, collect_path: str,
                          op_name_list: list) -> str:
        """
        Collect the dump files of every op in ``op_name_list``.

        Dump files are read from ``self.compile_path`` and copied into
        ``<collect_path>/dump``.

        :param collect_path: the collect path
        :param op_name_list: the op name list
        :return: the directory the dump files are collected into
        """
        # dump files are in compile_path
        utils.check_path_valid(self.compile_path, isdir=True)
        collect_dump_path = os.path.join(collect_path, 'dump')
        utils.check_path_valid(collect_dump_path, isdir=True, output=True)

        # Accumulate the status: overwriting it on each iteration would let
        # the last op alone decide whether the "saved" message is printed.
        copied_any = False
        for op_name in op_name_list:
            if utils.copy_dump_file(self.compile_path, collect_dump_path,
                                    op_name):
                copied_any = True

        if copied_any:
            utils.print_info_log('The dump file is saved in %s.' %
                                 collect_dump_path)

        return collect_dump_path
示例#6
0
 def _parse_dump_file(self: any, dump_file: str) -> any:
     """
     Parse a dump file stored in the big dump data format.

     The file starts with an unsigned 64-bit header length, followed by the
     header content and the payload. Both the file size and the header
     length are sanity-checked before the content is handed to
     ``self._get_dump_data``.

     :param dump_file: the dump file
     :return: DumpData
     :raises utils.AicErrException: when the file is too small, the header
         length is inconsistent with the file size, or the file cannot be read
     """
     utils.check_path_valid(dump_file)
     try:
         total_size = os.path.getsize(dump_file)
         # The file must hold more than the 8-byte length prefix itself.
         if total_size <= Constant.UINT64_SIZE:
             size_message = (
                 'The size of %s is at least greater then %d, but the file'
                 ' size is %d. Please check the dump file.')
             utils.print_error_log(
                 size_message % (dump_file, Constant.UINT64_SIZE, total_size))
             raise utils.AicErrException(
                 Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)

         with open(dump_file, 'rb') as stream:
             # The first 8 bytes encode the length of the header content.
             raw_length = stream.read(Constant.UINT64_SIZE)
             header_length = struct.unpack(Constant.UINT64_FMT, raw_length)[0]
             # The header cannot be longer than what follows the prefix.
             max_header_length = total_size - Constant.UINT64_SIZE
             if header_length > max_header_length:
                 header_message = (
                     'The header content size(%d) of %s must be less then '
                     'or equal to %d(file size) - %d(header length).'
                     ' Please check the dump file.')
                 utils.print_error_log(
                     header_message % (header_length, dump_file, total_size,
                                       Constant.UINT64_SIZE))
                 raise utils.AicErrException(
                     Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
             # Parse header content and payload from the open stream.
             return self._get_dump_data(stream, header_length, total_size)
     except IOError as io_error:
         utils.print_error_log('Failed to read the dump file %s. %s'
                               % (dump_file, str(io_error)))
         raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
示例#7
0
 def check_argument_valid(self: any) -> None:
     """
     Validate the report, compile and output paths held on the instance.

     The paths are checked in that order with ``utils.check_path_valid``;
     only the output path is checked with ``output=True``.
     """
     path_checks = (
         ("report_path", {"isdir": True}),
         ("compile_path", {"isdir": True}),
         ("output_path", {"isdir": True, "output": True}),
     )
     for attribute, options in path_checks:
         # Attributes are read lazily, right before their own check.
         utils.check_path_valid(getattr(self, attribute), **options)
示例#8
0
def collect_remote_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy one category of log files out of a collected report directory.

    Supported keys and their source directories below ``report_path``:
    ``Constant.DIR_SLOG`` -> ``log/device/firmware``,
    ``'MNTN_PATH'`` -> ``log/device/system``,
    ``Constant.DIR_PLOG`` -> ``log/host/cann``.
    A missing source directory is only logged; nothing is raised here.

    :param report_path: the report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: the destination directory,
        ``<collect_path>/<basename(report_path)>``
    """
    target_dir = os.path.join(collect_path, os.path.basename(report_path))
    utils.check_path_valid(target_dir, isdir=True, output=True)

    # Select the per-key settings first, then run the shared copy logic once.
    if key == Constant.DIR_SLOG:
        sub_dirs = ("log", "device", "firmware")
        dest_key = key
        report_missing = utils.print_error_log
        missing_message = 'There is no %s in %s.' % (key, report_path)
    elif key == 'MNTN_PATH':
        sub_dirs = ("log", "device", "system")
        dest_key = Constant.DIR_BBOX
        report_missing = utils.print_warn_log
        missing_message = 'There is no hisi_logs in %s.' % report_path
    elif key == Constant.DIR_PLOG:
        sub_dirs = ("log", "host", "cann")
        dest_key = Constant.DIR_PLOG
        report_missing = utils.print_warn_log
        missing_message = 'There is no plog in %s.' % report_path
    else:
        # Unknown keys copy nothing.
        return target_dir

    source_dir = os.path.join(report_path, *sub_dirs)
    # isdir() is False for a missing path, so no separate exists() is needed.
    if os.path.isdir(source_dir):
        copy_file_to_dest(source_dir, dest_key, target_dir, report_path)
    else:
        report_missing(missing_message)
    return target_dir
示例#9
0
    def collect_compile_file(self: any, collect_path: str, kernel_name_list: list) -> str:
        """
        Collect the compile artifacts (kernel_meta and proto files).

        Everything is read from ``self.report_path`` and copied into
        ``<collect_path>/compile``.

        :param collect_path: the collect path
        :param kernel_name_list: the kernel name list
        :return: the directory the compile files are collected into
        """
        utils.check_path_valid(self.report_path, isdir=True)
        collect_compile_path = os.path.join(collect_path, 'compile')
        utils.check_path_valid(collect_compile_path, isdir=True, output=True)

        # Accumulate the status: overwriting it on each iteration would let
        # the last kernel alone decide the outcome.
        copied_kernel_meta = False
        for kernel_name in kernel_name_list:
            if self.copy_kernel_meta(self.report_path, collect_compile_path,
                                     kernel_name):
                copied_kernel_meta = True

        # Always attempt the proto copy, whatever the kernel_meta result was.
        copied_proto_file = self.copy_proto_file(self.report_path,
                                                 collect_compile_path)

        if copied_kernel_meta or copied_proto_file:
            utils.print_info_log(
                'The compile file is saved in %s.' % collect_compile_path)

        return collect_compile_path
示例#10
0
    def parse(self: any) -> None:
        """
        Analyze every collected ai core error and write the result files.

        The method first makes sure a ``cce-objdump`` executable is reachable
        through ``PATH`` (bundled ``tools`` directory, then ``PATH`` itself,
        then the default Ascend install location). For each entry in
        ``self.collection.ai_core_error_list`` it then creates a dedicated
        ``aicerror_<index>_<time>`` folder under ``self.output_path``, gathers
        hisi log, graph, decompile, address and dump information, and writes
        an info file. A summary file is written at the end.

        :raises utils.AicErrException: when no ``cce-objdump`` can be found
        """

        def _prepend_to_env_path(directory: str) -> None:
            # Put ``directory`` first in PATH; tolerate an unset/empty PATH.
            current = os.environ.get("PATH", "")
            os.environ["PATH"] = (directory + os.pathsep + current
                                  if current else directory)

        utils.print_info_log('******************Analysis******************')
        aicore_error_data_list = self._aicore_error_data()
        utils.print_info_log('Start to analyze each ai core error.')
        summary_info_list = []

        # decompile
        is_aarch64 = "aarch64" in platform.machine()
        obj_dump_name = "cce-objdump_aarch64" if is_aarch64 else "cce-objdump"

        tools_dir = os.path.join(os.getcwd(), "tools")
        obj_dump_file = os.path.join(tools_dir, obj_dump_name)
        if os.path.exists(obj_dump_file):
            # Make the bundled tool executable without going through a shell,
            # so paths with spaces or shell metacharacters are handled.
            try:
                os.chmod(obj_dump_file, 0o755)
            except OSError as chmod_error:
                # Best effort, as before: a failed chmod must not abort.
                utils.print_warn_log('Failed to chmod %s. %s'
                                     % (obj_dump_file, str(chmod_error)))
            _prepend_to_env_path(tools_dir)
        else:
            cce_dump = shutil.which("cce-objdump")
            if not cce_dump:
                # guess where is cce-objdump
                parent_path = "aarch64-linux" if is_aarch64 else "x86_64-linux"
                # The install root must be absolute; a relative
                # "usr/local/..." would be resolved against the current
                # working directory.
                cce_dump_guess = os.path.join("/usr/local/Ascend/latest",
                                              parent_path,
                                              "ccec_compiler/bin/cce-objdump")
                if os.path.exists(cce_dump_guess):
                    cce_dump = cce_dump_guess
                    # Expose the guessed tool through PATH, mirroring what is
                    # done for the bundled tools directory above.
                    _prepend_to_env_path(os.path.dirname(cce_dump_guess))

            if not cce_dump:
                utils.print_error_log(
                    'Cannot find  cce-objdump! please add cce-objdump path in env PATH.'
                )
                raise utils.AicErrException(
                    Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)

        for i, current_pc in enumerate(self.collection.ai_core_error_list):
            # parser aic error by slog
            info = AicErrorInfo()
            # Each entry is a 9-field record; unpack it onto the info object.
            (info.err_time, info.dev_id, info.stream_id, info.task_id,
             info.core_id, info.aic_error, info.start_pc, info.extra_info,
             info.current_pc) = current_pc

            utils.print_info_log(
                "******************No.%d %s******************" %
                (i, info.err_time))
            info.err_time_obj = utils.strplogtime(info.err_time)
            err_i_folder_name = "aicerror_%d_%s" % (
                i, time.strftime("%Y%m%d%H%M%S",
                                 info.err_time_obj.timetuple()))
            err_i_folder = os.path.join(self.output_path, err_i_folder_name)
            utils.check_path_valid(err_i_folder, isdir=True, output=True)
            # node/kernel name lists are indexed in parallel with the errors
            info.node_name = self.collection.node_name_list[i]
            info.kernel_name = self.collection.kernel_name_list[i]
            # get hisi log
            self._get_hisi_log(info, err_i_folder)
            # get op info in build proto file
            self._get_op_by_graph(aicore_error_data_list[Constant.GRAPH_FILE],
                                  info)
            kernel_meta_path = os.path.join(
                self.collection.collect_compile_path, 'kernel_meta')
            if os.path.exists(kernel_meta_path):
                # decompile to locate the failing instruction
                result = self._decompile([info.kernel_name, kernel_meta_path],
                                         err_i_folder, info)
                if result is False:
                    utils.print_warn_log(
                        "decompile kernel_meta file %s failed." % os.path.join(
                            kernel_meta_path, info.kernel_name + ".o"))
            else:
                utils.print_warn_log("kernel_meta path %s not exist" %
                                     kernel_meta_path)
            try:
                # input output address
                info.aval_addrs = self._get_available_addrs(info.err_time)
                info.necessary_addr = self._get_necessary_addrs(
                    info.kernel_name)
                self._check_addr(info.aval_addrs, info.necessary_addr)
            except Exception as e:
                # The address check is best effort, but KeyboardInterrupt and
                # SystemExit must still propagate, hence Exception rather
                # than BaseException.
                import logging
                logging.exception(e)
                print("Check addr error failed")

            info.input_output_addrs = self._get_input_output_addrs(
                info, err_i_folder,
                aicore_error_data_list[Constant.ALLOC_ADDR],
                aicore_error_data_list[Constant.ACTUAL_ADDR])

            # collect address out-of-bounds information
            info.addr_overflow = aicore_error_data_list[Constant.ADDR_OVERFLOW]
            # operator code address and args addresses
            info.op_addr, info.args_addr, info.multi_args_addr = \
                self._get_op_and_args_addr(info.start_pc)

            # parse dump
            if self.collection.collect_dump_path:
                parser = DumpDataParser(self.collection.collect_dump_path,
                                        info.node_name, info.kernel_name)
                info.dump_info = parser.parse()

            # write info file
            self._write_errorinfo_file(err_i_folder, info, i)

            summary_info_list.append(
                "%s   %s   device_id=%s   core_id=%s   task_id=%s   node=%s   "
                "kernel=%s" %
                (err_i_folder_name, info.aic_error, info.dev_id, info.core_id,
                 info.task_id, info.node_name, info.kernel_name))
        utils.print_info_log('Finish to analyze each ai core error.')
        # write summary info
        self._write_summary_file(summary_info_list)
示例#11
0
文件: msaicerr.py 项目: Ascend/tools
def main() -> int:
    """
    Command-line entry point: collect the report, then analyze it.

    Either ``--report_path`` or ``--tar_file`` must be given. Results are
    written to ``<output>/info_<timestamp>``.

    :return: ``Constant.MS_AICERR_NONE_ERROR`` on success,
        ``Constant.MS_AICERR_INVALID_PARAM_ERROR`` for missing arguments, or
        the ``error_info`` of a raised ``utils.AicErrException``
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-p", "--report_path", dest="report_path", default="",
        help="<Optional> the tar dir from npucollector", required=False)
    arg_parser.add_argument(
        "-f", "--tar_file", dest="tar_file", default="",
        help="<Optional> the tar.gz path from npucollector", required=False)
    arg_parser.add_argument(
        "-out", "--output", dest="output_path", default="",
        help="<Optional> the output path")

    # Called without any argument: show the usage instead of failing later.
    if len(sys.argv) <= 1:
        arg_parser.print_usage()
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    args = arg_parser.parse_args(sys.argv[1:])
    if not (args.report_path or args.tar_file):
        utils.print_error_log("report_path and tar_file must have one ")
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    try:
        collect_time = time.localtime()
        timestamp = time.strftime("%Y%m%d%H%M%S", collect_time)

        output_root = os.path.realpath(args.output_path)
        utils.check_path_valid(output_root, isdir=True, output=True)
        output_path = os.path.join(output_root, "info_" + timestamp)
        utils.check_path_valid(output_path, isdir=True, output=True)

        if args.tar_file:
            # A tarball replaces report_path with the extracted directory.
            print("Start to unzip tar.gz, ")
            extract_path = "extract_" + timestamp
            extract_tar(args.tar_file, extract_path)
            args.report_path = get_select_dir(extract_path)

        # collect info
        collection = RemoteCollection(args.report_path, output_path)
        collection.collect()

        # clear local script.sh
        utils.rm_path(
            os.path.join(output_path, 'collection', Constant.SCRIPT),
            output_path, isdir=True)

        # parse ai core error
        error_parser = AicoreErrorParser(collection, output_path,
                                         collect_time)
        error_parser.parse()

        SingleOpCase(collection, output_path, collect_time).run()
    except utils.AicErrException as error:
        return error.error_info
    return Constant.MS_AICERR_NONE_ERROR
示例#12
0
 def check_arguments_valid(self: any) -> None:
     """
     Validate that ``self.input_path`` passes the directory check of
     ``utils.check_path_valid``.
     """
     input_dir = self.input_path
     utils.check_path_valid(input_dir, isdir=True)