示例#1
0
    def _get_hisi_log(self: any, info: any, err_i_folder: str) -> None:
        """
        Copy the hisi ts log that mentions the failed task into the error
        folder as "ts.log".
        :param info: the ai core error info; dev_id, stream_id and task_id
                     are read from it
        :param err_i_folder: the output folder of this ai core error
        """
        bbox_path = self.collection.collect_bbox_path
        hisi_log_devid_path = os.path.join(bbox_path, Constant.DIR_BBOX,
                                           "device-" + info.dev_id)
        if not os.path.exists(hisi_log_devid_path):
            utils.print_warn_log(
                'There is no hisi log for device_id(%s), the path=%s.' %
                (info.dev_id, hisi_log_devid_path))
            return

        key_word = "device_id=%s, stream_id=%s, task_id=%s" % (
            info.dev_id, info.stream_id, info.task_id)
        cmd = ['grep', key_word, '-nr', bbox_path]
        _, data = utils.execute_command(cmd)
        # grep prints the search path literally in front of every hit, so the
        # path and the file name are escaped before being put in the pattern;
        # otherwise characters such as "." or "+" act as regex operators.
        regexp = r"(%s.+?(\d+)-(\d+).+%s)" % (
            re.escape(bbox_path), re.escape('ts.txt'))
        ret = re.findall(regexp, data, re.M)
        if not ret:
            utils.print_warn_log(
                "Failed to get hisi log for device_id(%s) stream_id(%s) "
                "task_id(%s), you may reboot and try again." %
                (info.dev_id, info.stream_id, info.task_id))
            return

        # use the log with the latest (max) time
        max_hisi_file_path = self._get_max_hisi_file_path(ret)
        utils.copy_file(max_hisi_file_path,
                        os.path.join(err_i_folder, "ts.log"))
示例#2
0
    def _get_tiling_info(self, kernel_name) -> tuple:
        """
        Read block_dim and tiling_data of a kernel from the [AIC_INFO] lines
        of the collected application logs.
        :param kernel_name: the kernel (dev_func) name to grep for
        :return: (block_dim, tiling_data); block_dim is an int and
                 tiling_data is utf-8 encoded bytes; each one is "" when it
                 is missing from the log
        """
        aic_info_cmd = [
            'grep', '-r', '-C', '7',
            r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
            self.collection.collect_applog_path
        ]
        _, aic_info = utils.execute_command(aic_info_cmd)

        # Defaults keep the return statement valid when a line is missing.
        block_dim = ""
        tiling_data = ""

        aic_info_blockdim_regexp = r"\[AIC_INFO\]\sblock_dim:(\d+)"
        aic_info_blockdim_ret = re.findall(aic_info_blockdim_regexp, aic_info,
                                           re.M)
        if not aic_info_blockdim_ret:
            utils.print_warn_log(f"Failed to get {aic_info_blockdim_regexp}")
        else:
            # findall with a single group yields plain strings, so element 0
            # is the whole number; indexing it again would keep one digit.
            block_dim = int(aic_info_blockdim_ret[0])

        # A trailing lazy group would always capture "", so the rest of the
        # line is captured greedily instead.
        aic_info_tiling_data_regex = r"\[AIC_INFO\]\stiling_data:(.*)"
        aic_info_tiling_data_ret = re.findall(aic_info_tiling_data_regex,
                                              aic_info, re.M)
        if not aic_info_tiling_data_ret:
            utils.print_warn_log(f"Failed to get {aic_info_tiling_data_regex}")
        elif not aic_info_tiling_data_ret[0]:
            utils.print_info_log(f"get {aic_info_tiling_data_regex} is null")
        else:
            tiling_data = bytes(aic_info_tiling_data_ret[0],
                                encoding="utf-8")

        return (block_dim, tiling_data)
示例#3
0
def collect_local_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy the logs selected by key from a local report into the collect path.
    :param report_path: the local report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: the folder under collect_path that receives the files
    :raises utils.AicErrException: when key is the slog key and the report
        has no folder of that name
    """
    target_dir = os.path.join(collect_path, os.path.basename(report_path))
    utils.check_path_valid(target_dir, isdir=True, output=True)

    if key == Constant.DIR_SLOG:
        slog_dir = os.path.join(report_path, key)
        if not os.path.isdir(slog_dir):
            utils.print_error_log('There is no %s in %s.' % (key, report_path))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        copy_file_to_dest(slog_dir, key, target_dir, report_path)
        return target_dir

    if key == 'MNTN_PATH':
        # the target folder is computed and validated once more here, exactly
        # as the branch has always done
        target_dir = os.path.join(collect_path,
                                  os.path.basename(report_path))
        utils.check_path_valid(target_dir, isdir=True, output=True)
        hisi_dir = os.path.join(report_path, Constant.DIR_BBOX)
        if os.path.isdir(hisi_dir):
            copy_file_to_dest(hisi_dir, Constant.DIR_BBOX, target_dir,
                              report_path)
        else:
            utils.print_warn_log('There is no hisi_logs in %s.' % report_path)

    return target_dir
示例#4
0
    def _save_tensor_to_file(self: any, tensor_list: list, tensor_type: str, dump_file: str) -> str:
        result_info = ''
        if len(tensor_list) == 0:
            utils.print_warn_log(
                'There is no %s in "%s".' % (tensor_type, dump_file))
            return result_info
        dump_file_path, _ = os.path.split(dump_file)
        for (index, tensor) in enumerate(tensor_list):
            try:
                array = np.frombuffer(tensor.data,
                                      dtype=self._get_dtype_by_data_type(
                                          tensor.data_type))
                npy_file_name = ".".join([self.kernel_name, tensor_type, str(index), "npy"])
                np.save(os.path.join(dump_file_path, npy_file_name), array)
                if (np.isinf(array).any() or np.isnan(array).any()) and tensor_type == "input":
                    result_info += '%s[%d] NaN/INF\n' % (tensor_type, index)
                    utils.print_error_log('%s[%d] NaN/INF\n' % (tensor_type, index))
                    raise utils.AicErrException(
                        Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            except (ValueError, IOError, OSError, MemoryError) as error:
                utils.print_error_log('Failed to parse the data of %s:%d of "%s". %s' % (
                    tensor_type, index, dump_file, error))
                raise utils.AicErrException(
                    Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            finally:
                pass

        return result_info
示例#5
0
    def copy_kernel_meta(self, report_path: str, collect_compile_path: str, kernel_name: str) -> bool:
        """
        Copy the kernel_meta files of a kernel into the collect compile path.
        The files are searched under "<self.report_path>/extra-info/ops".
        :param report_path: the local compile path (only used in the warning)
        :param collect_compile_path: the collect compile path
        :param kernel_name: the kernel name; files whose name starts with it
                            are copied
        :return: True when at least one file was copied
        """
        found = False
        ops_dir = os.path.join(self.report_path, "extra-info", "ops")
        walker = os.walk(ops_dir) if os.path.exists(ops_dir) else ()
        for root, _, names in walker:
            for name in names:
                if not name.startswith(kernel_name):
                    continue
                source = os.path.join(root, name)
                kernel_meta_dir = os.path.join(collect_compile_path,
                                               "kernel_meta")
                utils.check_path_valid(kernel_meta_dir, isdir=True,
                                       output=True)
                utils.copy_file(source, os.path.join(kernel_meta_dir, name))
                found = True

        if not found:
            utils.print_warn_log('There is no kernel_meta file for "%s" in %s.'
                                 % (kernel_name, report_path))
        return found
示例#6
0
    def _get_occur_before_mark(decompile_file: str, diff_str: str,
                               info: any) -> bool:
        #      504:    04c20000    ST.b64         X1, [X0], #0
        with open(decompile_file, "r") as fo_file:
            text = fo_file.read()

        regexp = r'(^\s+(\S+):\s+\S+\s+\S.+$)'
        ret = re.findall(regexp, text, re.M)
        find_i = -1
        for i, (_, line_diff) in enumerate(ret):
            if line_diff == diff_str:
                find_i = i
                break

        if find_i == -1:
            utils.print_warn_log(
                "Get fault instruction failed, file(%s) diff(%s)" %
                (decompile_file, diff_str))
            return False

        begin_i = 0 if find_i < 9 else find_i - 9
        instr_str_list = []
        for i in range(begin_i, find_i + 1):
            instr_str_list.append(ret[i][0] + "\n")
        instr_str = "".join(instr_str_list).strip("\n")

        info.instr += "\nrelated instructions (error occured before the mark *):\n\n"
        info.instr += instr_str[:instr_str.rfind('\n') + 1] + '*'
        info.instr += instr_str[instr_str.rfind('\n') + 2:]
        info.instr += "\n\nFor complete instructions, please view %s" % decompile_file

        return True
示例#7
0
 def _get_aicerror_args(data):
     regexp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?device\((\d+)\),.*?core id is (\d+),\s+error code = (\S+)," \
              r".*?pc start:\s(\S+),\scurrent:\s(\S+),\svec error info:\s(\S+),\smte error info:\s(\S+)," \
              r"\sifu error info:\s(\S+),\sccu error info:\s(\S+),\scube error info:\s(\S+)," \
              r"\sbiu error info:\s(\S+),\saic error mask:\s(\S+),\spara base:\s(\S+)."
     ret = re.findall(regexp, data, re.M | re.S)
     if len(ret) == 0:
         utils.print_warn_log(
             "aic error info does not match in  plog \"aicore kernel execute failed\"")
         return None
     return ret
示例#8
0
 def _get_imas_log(self: any) -> None:
     """
     Grep the IMAS lines from the collected application logs and save them
     to "imas.log" under the output path.
     :raises utils.AicErrException: when the grep command ends with a status
         other than 0 or 1
     """
     imas_log_file = os.path.join(self.output_path, "imas.log")
     cmd = ['grep', 'IMAS', '-nr', self.collection.collect_applog_path]
     utils.print_info_log('Start to analyze IMAS log.')
     status, data = utils.execute_command(cmd)

     if status == 0:
         utils.write_file(imas_log_file, data)
         utils.print_info_log('The IMAS log is saved in %s.' % imas_log_file)
         return

     if status == 1:
         # grep exits with 1 when no line was selected
         utils.print_warn_log("There is no IMAS log in %s" %
                              self.output_path)
         return

     utils.print_error_log("Failed to execute command: %s. %s" %
                           (" ".join(cmd), " ".join(data)))
     raise utils.AicErrException(
         Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
示例#9
0
    def _get_cce_tbe_code_number(self: any, decompile_file: str,
                                 loc_json_file: str, err_pc: str,
                                 info: any) -> bool:
        # txt code to cce number
        if os.path.exists(decompile_file) is False:
            utils.print_error_log("The decompile file does not exist.")
            return False

        if err_pc != "":
            cce_code_num = self._read_decompile_file(decompile_file, err_pc,
                                                     info)
            # cce to tbe code number
            if os.path.exists(loc_json_file) is False:
                utils.print_warn_log("file %s not exist" % loc_json_file)
                return False
            self._read_loc_json_file(loc_json_file, cce_code_num, info)
        return True
示例#10
0
 def _get_node_name_by_kernel_name(self: any, kernel_name: any) -> str:
     """
     get node name by kernel name
     :param kernel_name: the kernel (dev_func) name to grep for in the
                         collected application logs
     :return: the first node_name found near the kernel's [AIC_INFO] lines,
              or '' when there is none
     """
     # raw string: "\[" is not a valid escape in a normal string literal
     aic_info_cmd = ['grep', '-r', '-C', '7',
                     r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
                     self.collect_applog_path]
     _, aic_info = utils.execute_command(aic_info_cmd)
     aic_info_dev_func_regex = r"\[AIC_INFO\]\snode_name:(.*?),"
     aic_info_dev_func_ret = re.findall(aic_info_dev_func_regex, aic_info)
     if not aic_info_dev_func_ret:
         utils.print_warn_log("Failed to get node name by kernel name.")
         return ''
     return aic_info_dev_func_ret[0]
示例#11
0
    def _decompile(self: any, kernel_info: list, dir_path: str,
                   info: any) -> bool:
        """
        Copy the kernel files into the error folder and decompile the .o file.
        :param kernel_info: [kernel_name, kernel_meta_path]
        :param dir_path: the folder that receives the copies and the
                         decompiled "<kernel_name>.o.txt"
        :param info: the ai core error info; it is extended with the code
                     location and the instructions around the fault
        :return: False when the .o file is missing or cannot be decompiled,
                 otherwise True
        """
        kernel_name = kernel_info[0]
        kernel_meta_path = kernel_info[1]
        diff_str, err_pc = self._get_info_for_decompile(info)

        # keep a copy of the .cce source when it is available
        cce_file = os.path.join(kernel_meta_path, kernel_name + ".cce")
        if os.path.exists(cce_file) is False:
            utils.print_warn_log(".cce file %s not exist" % cce_file)
        else:
            utils.copy_file(cce_file,
                            os.path.join(dir_path, kernel_name + ".cce"))

        # the .o file is mandatory for decompiling
        o_file = os.path.join(kernel_meta_path, kernel_name + ".o")
        if os.path.exists(o_file) is False:
            utils.print_warn_log(".o file %s not exist" % o_file)
            return False

        utils.copy_file(o_file, os.path.join(dir_path, kernel_name + ".o"))

        # Copy the kernel's own .json file. The object file must not be
        # stored under the .json name; a missing .json only gets a warning
        # so decompiling still goes on.
        json_file = os.path.join(kernel_meta_path, kernel_name + ".json")
        if os.path.exists(json_file):
            utils.copy_file(json_file,
                            os.path.join(dir_path, kernel_name + ".json"))
        else:
            utils.print_warn_log(".json file %s not exist" % json_file)

        decompile_file_name = kernel_name + ".o.txt"
        decompile_file = os.path.join(dir_path, decompile_file_name)

        status = self._get_decompile_status(o_file, decompile_file)
        if status != 0:
            utils.print_error_log(
                "Failed to decompile %s, you can fix problem according to the "
                "message above, or copy %s and %s to another host and execute : "
                "%s -d -mcpu=%s %s > %s" %
                (o_file, Constant.OBJ_DUMP_FILE, o_file,
                 Constant.OBJ_DUMP_FILE, "dav-m100", kernel_name + ".o",
                 decompile_file_name))
            return False

        loc_json_file = os.path.join(kernel_meta_path,
                                     kernel_name + "_loc.json")
        self._get_cce_tbe_code_number(decompile_file, loc_json_file, err_pc,
                                      info)
        self._get_occur_before_mark(decompile_file, diff_str, info)

        return True
示例#12
0
 def _get_graph_file(self: any) -> any:
     """
     Pick the graph (build proto) file to read the op info from.
     Every file under the collect compile path whose name matches
     Constant.BUILD_PROTO_FILE_PATTERN is a candidate; the one with the
     greatest first pattern group is chosen.
     :return: the chosen file path, or '' when there is no candidate
     """
     compile_path = self.collection.collect_compile_path
     proto_pattern = re.compile(Constant.BUILD_PROTO_FILE_PATTERN)
     candidates = []
     for top, _, files in os.walk(compile_path):
         for name in files:
             matched = proto_pattern.match(name)
             if matched:
                 candidates.append(
                     (matched.group(1), os.path.join(top, name)))

     if not candidates:
         utils.print_warn_log('There is no graph file in %s.' %
                              self.collection.collect_compile_path)
         return ''

     # among equal keys the first candidate found wins
     choose_file = max(candidates, key=lambda item: item[0])[1]
     utils.print_info_log('Choose %s to read op info.' % choose_file)
     return choose_file
示例#13
0
    def _get_node_and_kernel_name(self: any, dev_id: any, task_id: any,
                                  stream_id: any, err_time: any) -> tuple:
        data = self._get_node_and_kernel_name_execute_command()
        regexp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?device_id=\d+\s*,\s*stream_id=" \
                 r"%s\s*.+?\s*task_id=%s\s*,\s*fault kernel_name=" \
                 r"[-\d_]{0,}(\S+?),\s*func_name=(\S+)__kernel\d+" % (
                     stream_id, task_id)
        ret = re.findall(regexp, data, re.M | re.S)
        if len(ret) == 0:
            utils.print_warn_log(
                "There is no node name and kernel name for dev id(%s) "
                "task id(%s) stream id(%s) in plog."
                % (dev_id, task_id, stream_id))
            return '', ''

        if len(ret) > 1:
            max_i = self._get_the_latest_aicerr_form_ret(ret, err_time)
            return ret[max_i][1:]
        return ret[0][1:]
示例#14
0
def copy_file_to_dest(log_path: str, target: str, collect_target_path: str, report_path: str) -> None:
    """
    Copy every file under log_path into the collect path, keeping the folder
    layout relative to report_path.
    :param log_path: the local log path
    :param target: the target in log (only used in the warning)
    :param collect_target_path: the collect path
    :param report_path: the local report path
    """
    copied = False
    # number of leading characters to cut: report_path plus one separator
    prefix_len = len(report_path) + 1
    for top, _, files in os.walk(log_path):
        relative_dir = top[prefix_len:]
        for name in files:
            utils.copy_file(
                os.path.join(top, name),
                os.path.join(collect_target_path, relative_dir, name))
            copied = True

    if not copied:
        utils.print_warn_log(
            'There is no %s file in %s.' % (target, report_path))
示例#15
0
    def _analyse_op_graph(info: any, input_output_addrs: list) -> None:
        # 分析图op信息
        if info.operator:
            regexp = r'input_desc\s+\{\s+dtype:\s+(\S+).+?\s+size:\s+(\d+)'
            input_ret = re.findall(regexp, info.operator, re.M | re.S)

            regexp = r'output_desc\s+\{\s+dtype:\s+(\S+).+?\s+size:\s+(\d+)'
            output_ret = re.findall(regexp, info.operator, re.M | re.S)

            if len(input_ret) + len(output_ret) != len(input_output_addrs):
                utils.print_warn_log(
                    'The number(%d) of input/output in logs does not match with'
                    ' that(%d) in graph.' % (len(input_output_addrs),
                                             len(input_ret) + len(output_ret)))
            for i in input_output_addrs:
                ret = input_ret if i.is_input else output_ret
                if i.idx <= len(ret) - 1:
                    rec = ret[i.idx]
                    i.dtype = rec[0]  # DT_FLOAT
                    i.size = int(rec[1])  # bytes
示例#16
0
 def _get_op_by_graph(graph_file: str, info: any) -> None:
     if graph_file == '':
         return
     try:
         with open(graph_file, 'r') as graph:
             text = graph.read()
             regexp = r'(op\s+\{\s+name:\s+"%s".+?%s.+?\})\s+' \
                      r'op\s+\{' % (info.node_name, info.kernel_name)
             ret = re.findall(regexp, text, re.M | re.S)
             if len(ret) == 0:
                 utils.print_warn_log(
                     'Failed to get op for node(%s) kernel(%s).' %
                     (info.node_name, info.kernel_name))
                 return
             info.operator = ret[0]
     except IOError as io_error:
         utils.print_error_log('Failed to open file %s. %s' %
                               (graph_file, io_error))
         raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
     finally:
         pass
示例#17
0
def collect_remote_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy the logs selected by key from a report that uses the
    "log/device/..." and "log/host/..." layout into the collect path.
    :param report_path: the report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: the folder under collect_path that receives the files
    """
    collect_target_path = os.path.join(collect_path,
                                       os.path.basename(report_path))
    utils.check_path_valid(collect_target_path, isdir=True, output=True)

    # per key: folders below "log", the target name handed to the copy, the
    # logger for a missing folder and its message
    if key == Constant.DIR_SLOG:
        sub_dirs = ("device", "firmware")
        target = key
        report_missing = utils.print_error_log
        missing_msg = 'There is no %s in %s.' % (key, report_path)
    elif key == 'MNTN_PATH':
        sub_dirs = ("device", "system")
        target = Constant.DIR_BBOX
        report_missing = utils.print_warn_log
        missing_msg = 'There is no hisi_logs in %s.' % report_path
    elif key == Constant.DIR_PLOG:
        sub_dirs = ("host", "cann")
        target = Constant.DIR_PLOG
        report_missing = utils.print_warn_log
        missing_msg = 'There is no plog in %s.' % report_path
    else:
        return collect_target_path

    log_dir = os.path.join(report_path, "log", *sub_dirs)
    if os.path.isdir(log_dir):
        copy_file_to_dest(log_dir, target, collect_target_path, report_path)
    else:
        report_missing(missing_msg)
    return collect_target_path
示例#18
0
    def copy_proto_file(self, report_path: str, collect_compile_path: str) -> bool:
        """
        copy proto file:
        Every file under report_path whose name matches
        Constant.BUILD_PROTO_FILE_PATTERN is copied, without its sub folder,
        into the collect compile path.
        :param report_path: the local compile path
        :param collect_compile_path: the collect compile path
        :return: True when at least one file was copied
        """
        match = False
        # compiled once: the pattern is the same for every file
        file_name_pattern = re.compile(Constant.BUILD_PROTO_FILE_PATTERN)
        for root, _, names in os.walk(report_path):
            for name in names:
                if file_name_pattern.match(name):
                    src = os.path.join(root, name)
                    dest = os.path.join(collect_compile_path, name)
                    utils.copy_file(src, dest)
                    match = True

        if not match:
            utils.print_warn_log('There is no graph file in %s.' % report_path)

        return match
示例#19
0
 def parse(self: any) -> str:
     """
     Function Description: dump data parse.
     Every file under the input path whose name contains ".<op_name>."
     (with "/" in the op name replaced by "_") is parsed.
     :return: for each dump file, its path followed by a line break and its
              parse result, all joined into one string
     """
     # 1. check arguments valid
     self.check_arguments_valid()
     match_name = '.' + self.op_name.replace('/', '_') + '.'
     match_dump_list = [
         os.path.join(top, name)
         for top, _, files in os.walk(self.input_path)
         for name in files
         if match_name in name
     ]

     pieces = []
     for dump_file in match_dump_list:
         pieces.append('%s\n' % dump_file)
         pieces.append(self.parse_dump_data(dump_file))
     result_info = "".join(pieces)

     if not match_dump_list:
         utils.print_warn_log('There is no dump file for "%s". Please '
                              'check the dump path.' % self.op_name)
     utils.print_info_log(f"Parse dump file finished,result_info:{result_info}")
     return result_info
示例#20
0
    def _get_input_output_addrs(self: any, info: any, err_i_folder: str,
                                alloc_addr: str, actual_addr: str) -> list:
        """
        Build the input/output address records of the failed op.
        :param info: the ai core error info
        :param err_i_folder: the output folder of this ai core error
        :param alloc_addr: the allocated addresses; the range check is
                           skipped when it is empty
        :param actual_addr: a lookup from logged address to actual address
        :return: the list of OpInputOutput records, [] when no address is
                 found
        """
        records = self._get_input_output_addrs_cmd_process(info, err_i_folder)
        if len(records) == 0:
            utils.print_warn_log('Failed to get input address and output '
                                 'address for %s.' % info.node_name)
            return []

        # keep only the newest record of every flag
        latest = {}
        for time_str, flag, addr in records:
            time_obj = utils.strplogtime(time_str)
            previous = latest.get(flag)
            if previous is not None and time_obj <= previous[0]:
                continue
            latest[flag] = [time_obj, flag, addr]

        input_output_addrs = []
        for _, flag, addr in latest.values():
            op_io = OpInputOutput()
            op_io.name = flag  # 'input[0]'
            op_io.addr = addr
            op_io.idx = int(flag[flag.find('[') + 1:flag.find(']')])
            op_io.is_input = flag.lower().startswith("input")
            # look up the actual address
            if addr in actual_addr:
                op_io.actual_addr = actual_addr[addr]
            input_output_addrs.append(op_io)

        # analyse the op information of the graph
        self._analyse_op_graph(info, input_output_addrs)

        # check whether the addresses are out of range
        if len(alloc_addr) > 0:
            self._analyse_alloc_addr_range(alloc_addr, input_output_addrs)

        return input_output_addrs
示例#21
0
    def parse(self: any) -> None:
        """
        parse by collection info

        A cce-objdump is located first: the one bundled under "./tools", one
        found on PATH, or the guessed location below
        "/usr/local/Ascend/latest". Then, for every entry of
        self.collection.ai_core_error_list, the hisi log, graph op,
        decompiled instructions, addresses and dump data are gathered and
        written to an "aicerror_<index>_<time>" folder under
        self.output_path. The summary file is written at the end.
        :raises utils.AicErrException: when no cce-objdump can be found
        """
        utils.print_info_log('******************Analysis******************')
        aicore_error_data_list = self._aicore_error_data()
        utils.print_info_log('Start to analyze each ai core error.')
        summary_info_list = []

        # decompile
        is_aarch64 = "aarch64" in platform.machine()
        obj_dump_file = "cce-objdump_aarch64" if is_aarch64 else "cce-objdump"
        tools_path = os.path.join(os.getcwd(), "tools")
        obj_dump_file = os.path.join(tools_path, obj_dump_file)
        if os.path.exists(obj_dump_file):
            # os.chmod avoids building a shell command from the path; a
            # failure stays non-fatal, as it was with the shell call
            try:
                os.chmod(obj_dump_file, 0o755)
            except OSError as error:
                utils.print_warn_log(
                    "Failed to chmod %s. %s" % (obj_dump_file, error))
            os.environ["PATH"] = tools_path + ":" + os.environ["PATH"]
        else:
            cce_dump = shutil.which("cce-objdump")
            if not cce_dump:
                # guess where is cce-objdump; the path must be absolute,
                # otherwise it is resolved against the working directory
                parent_path = "aarch64-linux" if is_aarch64 else "x86_64-linux"
                cce_dump_guess = os.path.join("/usr/local/Ascend/latest",
                                              parent_path,
                                              "ccec_compiler/bin/cce-objdump")
                if os.path.exists(cce_dump_guess):
                    cce_dump = cce_dump_guess
                    # the guessed tool is not on PATH yet (which() failed),
                    # so its folder is appended to make it callable by name
                    guess_dir = os.path.dirname(cce_dump_guess)
                    current_path = os.environ.get("PATH")
                    os.environ["PATH"] = guess_dir if not current_path \
                        else current_path + ":" + guess_dir

            if not cce_dump:
                utils.print_error_log(
                    'Cannot find  cce-objdump! please add cce-objdump path in env PATH.'
                )
                raise utils.AicErrException(
                    Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
        for i, current_pc in enumerate(self.collection.ai_core_error_list):
            # parser aic error by slog
            info = AicErrorInfo()
            info.err_time, info.dev_id, info.stream_id, info.task_id, \
            info.core_id, info.aic_error, info.start_pc, info.extra_info, \
            info.current_pc = current_pc

            utils.print_info_log(
                "******************No.%d %s******************" %
                (i, info.err_time))
            info.err_time_obj = utils.strplogtime(info.err_time)
            err_i_folder_name = "aicerror_%d_%s" % (
                i, time.strftime("%Y%m%d%H%M%S",
                                 info.err_time_obj.timetuple()))
            err_i_folder = os.path.join(self.output_path, err_i_folder_name)
            utils.check_path_valid(err_i_folder, isdir=True, output=True)
            info.node_name = self.collection.node_name_list[i]
            info.kernel_name = self.collection.kernel_name_list[i]
            # get hisi log
            self._get_hisi_log(info, err_i_folder)
            # get op info in build proto file
            self._get_op_by_graph(aicore_error_data_list[Constant.GRAPH_FILE],
                                  info)
            kernel_meta_path = os.path.join(
                self.collection.collect_compile_path, 'kernel_meta')
            if os.path.exists(kernel_meta_path):
                # decompile to find the failing instruction
                result = self._decompile([info.kernel_name, kernel_meta_path],
                                         err_i_folder, info)
                if result is False:
                    utils.print_warn_log(
                        "decompile kernel_meta file %s failed." % os.path.join(
                            kernel_meta_path, info.kernel_name + ".o"))
            else:
                utils.print_warn_log("kernel_meta path %s not exist" %
                                     kernel_meta_path)
            try:
                # input output address
                info.aval_addrs = self._get_available_addrs(info.err_time)
                info.necessary_addr = self._get_necessary_addrs(
                    info.kernel_name)
                self._check_addr(info.aval_addrs, info.necessary_addr)
            except Exception as e:
                # The address check is best effort; Exception (not
                # BaseException) lets Ctrl-C and SystemExit still end the run.
                import logging
                logging.exception(e)
                print("Check addr error failed")

            info.input_output_addrs = self._get_input_output_addrs(
                info, err_i_folder,
                aicore_error_data_list[Constant.ALLOC_ADDR],
                aicore_error_data_list[Constant.ACTUAL_ADDR])

            # collect the address overflow information
            info.addr_overflow = aicore_error_data_list[Constant.ADDR_OVERFLOW]
            # operator code address and args addresses
            info.op_addr, info.args_addr, info.multi_args_addr = \
                self._get_op_and_args_addr(info.start_pc)

            # parse dump
            if self.collection.collect_dump_path:
                parser = DumpDataParser(self.collection.collect_dump_path,
                                        info.node_name, info.kernel_name)
                info.dump_info = parser.parse()

            # write info file
            self._write_errorinfo_file(err_i_folder, info, i)

            summary_info_list.append(
                "%s   %s   device_id=%s   core_id=%s   task_id=%s   node=%s   "
                "kernel=%s" %
                (err_i_folder_name, info.aic_error, info.dev_id, info.core_id,
                 info.task_id, info.node_name, info.kernel_name))
        utils.print_info_log('Finish to analyze each ai core error.')
        # write summary info
        self._write_summary_file(summary_info_list)
示例#22
0
    def _get_necessary_addrs(self: any, kernal_name: str) -> dict:
        """
        Collect the input/output descriptions and the workspace size that the
        [AIC_INFO] log lines report for a kernel.
        :param kernal_name: the kernel name on which the aicore error occurred
        :return: a dict with the keys "input_addr" and "output_addr" (lists
                 of dicts with index, shape, format, dtype and addr) and
                 "workspace" (text, "0" when it is missing or empty); None
                 when the node, input or output lines are missing
        """
        aic_info_cmd = [
            'grep', '-r', '-C', '7',
            r"\[AIC_INFO\] dev_func:{}".format(kernal_name),
            self.collection.collect_applog_path
        ]
        _, aic_info = utils.execute_command(aic_info_cmd)
        utils.print_info_log(
            "===============================\n{}\n=================================="
            .format(aic_info))

        aic_info_all_regexp = r"\[AIC_INFO\]\snode_name:(.*?),\snode_type:(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
        if not re.findall(aic_info_all_regexp, aic_info, re.M):
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\snode_name(.*?),\snode_tye(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
            )
            return None

        param_keys = ("index", "shape", "format", "dtype", "addr")

        aic_info_input_regexp = r"\[AIC_INFO\]\sinput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
        aic_info_input_ret = re.findall(aic_info_input_regexp, aic_info, re.M)
        if not aic_info_input_ret:
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\sinput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
            )
            return None
        input_params = [dict(zip(param_keys, input_info))
                        for input_info in aic_info_input_ret]

        aic_info_output_regexp = r"\[AIC_INFO\]\soutput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
        aic_info_output_ret = re.findall(aic_info_output_regexp, aic_info,
                                         re.M)
        if not aic_info_output_ret:
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\soutput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
            )
            return None
        output_params = [dict(zip(param_keys, output_info))
                         for output_info in aic_info_output_ret]

        # "0" is the fallback, so the result is always complete. The rest of
        # the line is captured greedily: a trailing lazy group would always
        # capture "", and findall with one group yields plain strings.
        workspace = "0"
        aic_info_workspace_regex = r"\[AIC_INFO\]\sworkspace_bytes:(.*)"
        aic_info_workspace_ret = re.findall(aic_info_workspace_regex, aic_info,
                                            re.M)
        if not aic_info_workspace_ret:
            utils.print_warn_log(f"Failed to get {aic_info_workspace_regex}")
        elif not aic_info_workspace_ret[0].strip():
            utils.print_info_log(f"get {aic_info_workspace_regex} is null")
        else:
            workspace = aic_info_workspace_ret[0].strip()

        # block_dim and tiling_data are not part of the result; a missing
        # line is only reported
        for optional_regex in (r"\[AIC_INFO\]\sblock_dim:(\d+)",
                               r"\[AIC_INFO\]\stiling_data:(.*?)"):
            if re.search(optional_regex, aic_info, re.M) is None:
                utils.print_warn_log(f"Failed to get {optional_regex}")

        return {
            "input_addr": input_params,
            "output_addr": output_params,
            "workspace": workspace,
        }