示例#1
0
    def _get_available_addrs(self: any, occur_time: str) -> list:
        """
        Get the addresses that are available at occur_time.

        Greps the collected application logs for successful DevMalloc
        records and for DevFree records. Every allocation logged strictly
        before occur_time contributes an (addr, size) tuple; every free
        logged strictly before occur_time is then handed to
        _remove_first_found_addr together with the current list.

        :param occur_time: time at which the aicore error occurred
        :return: list of (addr, size) tuples; addr as logged, size as int
        """
        timestamp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+)"

        _, malloc_log = utils.execute_command([
            'grep', 'DevMalloc: Succ,', '-nr',
            self.collection.collect_applog_path
        ])
        malloc_pattern = (timestamp
                          + r".+?size\s*=\s*([\d]+).+?ptr\s*=\s*([\da-zA-Z]+)")
        malloc_records = re.findall(malloc_pattern, malloc_log, re.M)

        _, free_log = utils.execute_command([
            'grep', 'DevFree: mem', '-nr', self.collection.collect_applog_path
        ])
        free_pattern = timestamp + r".+?mem\s*=\s*([\da-zA-Z]+)"
        free_records = re.findall(free_pattern, free_log, re.M)

        occur_time_obj = utils.strplogtime(occur_time)

        # Allocations that happened before the error are candidates.
        avl_addr = [
            (addr, int(size))
            for alloc_time, size, addr in malloc_records
            if utils.strplogtime(alloc_time) < occur_time_obj
        ]

        # Frees that happened before the error cancel an earlier allocation.
        for free_time, addr in free_records:
            if utils.strplogtime(free_time) < occur_time_obj:
                avl_addr = self._remove_first_found_addr(addr, avl_addr)

        utils.print_info_log("get available addr: {}".format(avl_addr))
        return avl_addr
示例#2
0
 def _get_input_output_addrs_cmd_process(self: any, info: any,
                                         err_i_folder: str) -> list:
     """
     Find the input/output memory addresses logged for info.node_name.

     All "memaddr" lines of the collected application logs are written to
     a temporary file in err_i_folder, which is then grepped for the node
     name. The temporary file is removed afterwards, even when the second
     grep raises.

     :param info: object providing the node_name to look for
     :param err_i_folder: folder in which the temporary file is created
     :return: list of (time, "input[N]"/"output[N]", memaddr) tuples
     """
     cmd = ['grep', "memaddr", '-nr', self.collection.collect_applog_path]
     _, data = utils.execute_command(cmd)
     tmp_file = os.path.join(err_i_folder, 'tmp.txt')
     utils.write_file(tmp_file, data)
     try:
         cmd = ['grep', info.node_name, '-nr', tmp_file]
         _, data = utils.execute_command(cmd)
     finally:
         # Never leave the scratch file behind, whatever the grep did.
         utils.rm_path(tmp_file, self.output_path)
     regexp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?\[IMAS\].+(input\[\d+\]|" \
              r"output\[\d+\]) *memaddr\[(\S+)\]"
     return re.findall(regexp, data, re.M)
示例#3
0
    def _get_hisi_log(self: any, info: any, err_i_folder: str) -> None:
        """
        Copy the hisi ts log matching the failing task into err_i_folder.

        Greps the collected bbox directory for the device/stream/task id
        triple, picks one of the matching ts.txt files through
        _get_max_hisi_file_path and copies it to <err_i_folder>/ts.log.
        Only a warning is logged when the device folder or a matching log
        is missing.

        :param info: object providing dev_id, stream_id and task_id
        :param err_i_folder: destination folder for ts.log
        """
        bbox_path = self.collection.collect_bbox_path
        hisi_log_devid_path = os.path.join(bbox_path, Constant.DIR_BBOX,
                                           "device-" + info.dev_id)
        if not os.path.exists(hisi_log_devid_path):
            utils.print_warn_log(
                'There is no hisi log for device_id(%s), the path=%s.' %
                (info.dev_id, hisi_log_devid_path))
            return

        key_word = "device_id=%s, stream_id=%s, task_id=%s" % (
            info.dev_id, info.stream_id, info.task_id)
        cmd = ['grep', key_word, '-nr', bbox_path]
        _, data = utils.execute_command(cmd)
        # The path is literal text: escape it so characters such as '.',
        # '+' or '(' in the directory name cannot alter the pattern or add
        # capture groups.
        regexp = r"(%s.+?(\d+)-(\d+).+%s)" % (re.escape(bbox_path), 'ts.txt')
        ret = re.findall(regexp, data, re.M)
        if not ret:
            utils.print_warn_log(
                "Failed to get hisi log for device_id(%s) stream_id(%s) "
                "task_id(%s), you may reboot and try again." %
                (info.dev_id, info.stream_id, info.task_id))
            return

        # find the last time(max time)
        max_hisi_file_path = self._get_max_hisi_file_path(ret)
        utils.copy_file(max_hisi_file_path,
                        os.path.join(err_i_folder, "ts.log"))
示例#4
0
    def _get_tiling_info(self, kernel_name) -> list:
        """
        Read block_dim and tiling_data of a kernel from the [AIC_INFO] logs.

        Greps the collected application logs for the kernel's dev_func line
        (with 7 lines of context) and extracts the first block_dim and
        tiling_data entries from that output.

        :param kernel_name: kernel name as it appears after "dev_func:"
        :return: tuple (block_dim, tiling_data); block_dim is an int and
                 tiling_data is UTF-8 encoded bytes. Either one is "" when
                 the entry is missing or empty.
        """
        aic_info_cmd = [
            'grep', '-r', '-C', '7',
            r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
            self.collection.collect_applog_path
        ]
        _, aic_info = utils.execute_command(aic_info_cmd)

        # Defaults keep the return statement valid when an entry is missing.
        block_dim = ""
        tiling_data = ""

        aic_info_blockdim_regexp = r"\[AIC_INFO\]\sblock_dim:(\d+)"
        blockdim_match = re.search(aic_info_blockdim_regexp, aic_info, re.M)
        if blockdim_match is None:
            utils.print_warn_log(f"Failed to get {aic_info_blockdim_regexp}")
        else:
            # Use the whole captured number, not only its first digit.
            block_dim = int(blockdim_match.group(1))

        # Greedy capture up to the end of the line; a trailing lazy group
        # would always match the empty string.
        aic_info_tiling_data_regex = r"\[AIC_INFO\]\stiling_data:(.*)"
        tiling_match = re.search(aic_info_tiling_data_regex, aic_info, re.M)
        if tiling_match is None:
            utils.print_warn_log(f"Failed to get {aic_info_tiling_data_regex}")
        elif not tiling_match.group(1):
            utils.print_info_log(f"get {aic_info_tiling_data_regex} is null")
        else:
            tiling_data = bytes(tiling_match.group(1), encoding="utf-8")

        return (block_dim, tiling_data)
示例#5
0
 def _get_decompile_status(o_file: str, decompile_file: str) -> int:
     """
     Run the object dump tool on o_file and return the command status.

     :param o_file: object file to disassemble
     :param decompile_file: passed to utils.execute_command as file_out
                            (presumably receives the disassembly)
     :return: status reported by utils.execute_command
     """
     target_cpu = "dav-m100"
     dump_cmd = [Constant.OBJ_DUMP_FILE, '-d']
     dump_cmd.append('-mcpu=' + target_cpu)
     dump_cmd.extend(['-line-numbers', o_file])
     exit_status, _unused = utils.execute_command(dump_cmd,
                                                  file_out=decompile_file)
     return exit_status
示例#6
0
 def _get_air_error_execute_command(self):
     """
     Grep the application logs for the aicore error report line.

     :return: grep output
     :raises utils.AicErrException: when the command status is not 0
     """
     pattern = 'PrintCoreErrorInfo:.*?there is an aicore error'
     grep_cmd = ['grep', pattern, '-inrE', self.collect_applog_path]
     status, data = utils.execute_command(grep_cmd)
     if status == 0:
         return data
     utils.print_error_log("Failed to execute command: %s.Maybe rts break when report Core log to host." %
                           " ".join(grep_cmd))
     raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
示例#7
0
 def _get_node_and_kernel_name_execute_command(self: any) -> any:
     """
     Grep the application logs for "aicore kernel execute failed" lines.

     :return: grep output
     :raises utils.AicErrException: when the command status is not 0
     """
     pattern = 'PrintErrorInfo:.*?aicore kernel execute failed'
     grep_cmd = ['grep', pattern, '-inrE', self.collect_applog_path]
     status, data = utils.execute_command(grep_cmd)
     if status == 0:
         return data
     utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
     raise utils.AicErrException(
         Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
示例#8
0
 def _get_addr_overflow_mini(self: any) -> list:
     """
     Report the virtual addresses logged by devmm_page_fault_d2h_query_flag.

     :return: list of "<time> <va> is out of range" strings, one per
              matching slog line
     """
     _, slog_text = utils.execute_command([
         'grep', 'devmm_page_fault_d2h_query_flag', '-nr',
         self.collection.collect_slog_path
     ])
     fault_pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?va=([\da-zA-Z]+)"
     return [
         "%s %s is out of range" % (time_str, value)
         for time_str, value in re.findall(fault_pattern, slog_text, re.M)
     ]
示例#9
0
 def _get_all_error_log(self: any) -> None:
     """
     Save every [ERROR] line of the collected slog to <output_path>/error.log.

     :raises utils.AicErrException: when the grep command status is not 0
     """
     error_log_file = os.path.join(self.output_path, "error.log")
     utils.print_info_log('Start to analyze error slog.')
     cmd = ['grep', r'\[ERROR\]', '-nr', self.collection.collect_slog_path]
     status, data = utils.execute_command(cmd)
     if status != 0:
         # data is the command output as text (it is written to a file
         # below), so it is appended as-is; joining it with spaces would
         # put a blank between every character of the message.
         utils.print_error_log("Failed to execute command: %s. %s" %
                               (" ".join(cmd), data))
         raise utils.AicErrException(
             Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
     utils.write_file(error_log_file, data)
     utils.print_info_log('The error slog is saved in %s.' % error_log_file)
示例#10
0
    def _get_op_param(self, kernel_name) -> list:
        """
        Rebuild the op parameters of a kernel from the host log.

        Greps the "BuildSingleOp Prebuilding op" record of kernel_name,
        parses its inputs, outputs and attrs with ast.literal_eval and
        returns them as one flat list: input dicts, then output dicts,
        then the "value" of every attr that is a dict. Each input also gets
        its dumped data loaded from
        <collect_dump_path>/<kernel_name>.input.<index>.npy.

        :param kernel_name: kernel name as logged in kernelName[...]
        :return: list of parameter dicts followed by attribute values
        :raises utils.AicErrException: when the record cannot be found
        """
        get_param_cmd = [
            'grep',
            rf'BuildSingleOp Prebuilding op: kernelName\[{kernel_name}\]',
            '-hr', '-A', '3', self.collection.collect_applog_path
        ]
        _, get_param_data = utils.execute_command(get_param_cmd)
        # Strip the per-line log prefixes, then join the record on one line.
        purified_data = re.sub(r"\[INFO\].*?\[fusion_op.cc:\d+?\].*?\s", "",
                               get_param_data)
        purified_data = re.sub(r"[\n]", "", purified_data)
        get_param_regexp = r"op inputs:\s*\((.*?)\),\s*outputs:\s*\((.*?)\),\s*attrs:\s*\((.*?)\)\."
        get_param_ret = re.findall(get_param_regexp, purified_data, re.M)
        if not get_param_ret:
            utils.print_error_log(
                f"Fail to get op params of kernel [{kernel_name}] in host log ."
            )
            raise utils.AicErrException(
                Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
        input_str, output_str, attr_str = get_param_ret[0]

        result_list = []
        input_list = ast.literal_eval("[" + input_str + "]")
        for index, input_item in enumerate(input_list):
            input_item["param_type"] = "input"
            input_item["run_shape"] = input_item.get("shape")
            data_file = os.path.join(
                self.collection.collect_dump_path,
                ".".join([kernel_name, "input", str(index), "npy"]))
            input_item["value"] = np.load(data_file)
            # Each dimension is pinned to its logged size: (dim, dim).
            input_item["range"] = [(dim, dim)
                                   for dim in input_item.get("shape")]
        result_list.extend(input_list)

        output_list = ast.literal_eval("[" + output_str + "]")
        for output_item in output_list:
            output_item["param_type"] = "output"
            output_item["run_shape"] = output_item.get("shape")
            output_item["range"] = [(dim, dim)
                                    for dim in output_item.get("shape")]
        result_list.extend(output_list)

        attr_list_ori = ast.literal_eval("[" + attr_str + "]")
        result_list.extend(attr_item.get("value")
                           for attr_item in attr_list_ori
                           if isinstance(attr_item, dict))
        return result_list
示例#11
0
 def _get_addr_overflow_cloud(self: any) -> list:
     """
     Report addresses logged with their "previous alloced" range.

     :return: list of "<time> <va> is out of range [<start>, <end>]"
              strings, one per matching slog line
     """
     _, slog_text = utils.execute_command([
         'grep', 'previous alloced start_va', '-nr',
         self.collection.collect_slog_path
     ])
     overflow_pattern = (
         r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?va=([\da-zA-Z]+)\s+"
         r"previous alloced start_va=([\da-zA-Z]+), end_va=([\da-zA-Z]+),"
     )
     return [
         "%s %s is out of range [%s, %s]" % (time_str, value, start, end)
         for time_str, value, start, end
         in re.findall(overflow_pattern, slog_text, re.M)
     ]
示例#12
0
 def _get_alloc_addr(self: any) -> list:
     """
     List every successful DevMalloc found in the application logs.

     Example of a matching record:
         DevMalloc: Succ, size=512, type=2, ptr=0x108040014000

     :return: list of (ptr, size) tuples; ptr as logged, size as int
     """
     _, applog_text = utils.execute_command([
         'grep', 'DevMalloc: Succ,', '-nr',
         self.collection.collect_applog_path
     ])
     malloc_pattern = (
         r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?size\s*=\s*([\d]+)"
         r".+?ptr\s*=\s*([\da-zA-Z]+)"
     )
     return [
         (addr, int(size))
         for _time, size, addr in re.findall(malloc_pattern, applog_text, re.M)
     ]
示例#13
0
    def _get_op_and_args_addr(self: any, pc_start: str) -> tuple:
        """
        Derive the op address from pc_start and look up its args addresses.

        :param pc_start: hex string of the start PC
        :return: (op_addr, args_addr_late, multi_args_addr), the last two
                 as returned by _get_args_addr_late
        """
        # Only the low 48 bits of pc_start are valid.
        low_bits = utils.get_01_from_hexstr(pc_start, 47, 0)
        op_addr = hex(int(low_bits, 2))

        grep_cmd = [
            'grep',
            "ToCommandBody: funcAddr=%s" % (op_addr.upper()),
            '-nr',
            self.collection.collect_applog_path,
        ]
        _, applog_text = utils.execute_command(grep_cmd)

        func_pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?funcAddr=([\da-zA-Z]+).+?args=([\da-zA-Z]+)"
        func_records = re.findall(func_pattern, applog_text, re.M)
        args_addr_late, multi_args_addr = self._get_args_addr_late(
            op_addr, func_records)
        return op_addr, args_addr_late, multi_args_addr
示例#14
0
 def _get_module_str(self, kernel_name) -> str:
     """
     Look up the module logged for a kernel in the host log.

     :param kernel_name: kernel name as logged in kernel[...]
     :return: content of the first module[...] entry found for the kernel
     :raises utils.AicErrException: when no such entry is found
     """
     get_module_cmd = [
         'grep', rf'kernel\[{kernel_name}\].*module\[', '-hr',
         self.collection.collect_applog_path
     ]
     _, get_module_data = utils.execute_command(get_module_cmd)
     # The kernel name is literal text here; escape it so regex
     # metacharacters in the name cannot change what is matched.
     escaped_name = re.escape(str(kernel_name))
     get_module_regexp = rf"kernel\[{escaped_name}\].*?module\[(.*?)\]"
     get_module_ret = re.findall(get_module_regexp, get_module_data, re.M)
     if not get_module_ret:
         utils.print_error_log(
             f"Fail to get op module of kernel [{kernel_name}] in host log ."
         )
         raise utils.AicErrException(
             Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
     return get_module_ret[0]
示例#15
0
 def _get_imas_log(self: any) -> None:
     """
     Save every IMAS line of the application logs to <output_path>/imas.log.

     A command status of 1 is treated as "nothing found": a warning is
     logged and no file is written.

     :raises utils.AicErrException: for any other non-zero command status
     """
     imas_log_file = os.path.join(self.output_path, "imas.log")
     cmd = ['grep', 'IMAS', '-nr', self.collection.collect_applog_path]
     utils.print_info_log('Start to analyze IMAS log.')
     status, data = utils.execute_command(cmd)
     if status == 1:
         # NOTE(review): the search ran on collect_applog_path, yet the
         # message reports output_path - confirm which path is intended.
         utils.print_warn_log("There is no IMAS log in %s" %
                              self.output_path)
         return
     if status != 0:
         # data is the command output as text (it is written to a file
         # below), so it is appended as-is; joining it with spaces would
         # put a blank between every character of the message.
         utils.print_error_log("Failed to execute command: %s. %s" %
                               (" ".join(cmd), data))
         raise utils.AicErrException(
             Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
     utils.write_file(imas_log_file, data)
     utils.print_info_log('The IMAS log is saved in %s.' % imas_log_file)
示例#16
0
 def _get_addr_overflow_diff_incorrect_device(self: any) -> list:
     """
     Report addresses that were visited on a device other than their owner.

     :return: list of messages, one per matching slog line, naming the
              address, the device it was allocated for and the device
              that visited it
     """
     _, slog_text = utils.execute_command([
         'grep', 'devmm_svm_get_vaflgs_by_pid', '-nr',
         self.collection.collect_slog_path
     ])
     mapped_pattern = (
         r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?addr is mapped.+va="
         r"([\da-zA-Z]+).devid=(\d+),bitmap=([\da-zA-Z]+)"
     )
     messages = []
     for time_str, value, devid, bitmap in re.findall(mapped_pattern,
                                                      slog_text, re.M):
         # Bits [31:26] of bitmap tell on which device the address was
         # allocated.
         owner_bits = utils.get_01_from_hexstr(bitmap, 31, 26)
         allocated_dev_id = str(int(owner_bits, 2))
         messages.append(
             "%s %s, allocated for device %s, is visited on wrong "
             "device whose id is %s" % (
                 time_str, value, allocated_dev_id, devid))
     return messages
示例#17
0
 def _get_node_name_by_kernel_name(self: any, kernel_name: any) -> str:
     """
     Get the node name that belongs to a kernel name.

     Greps the application logs for the kernel's [AIC_INFO] dev_func line
     (with 7 lines of context) and returns the first node_name found in
     that output.

     :param kernel_name: kernel name as it appears after "dev_func:"
     :return: node name, or '' when none is found
     """
     aic_info_cmd = ['grep', '-r', '-C', '7',
                     r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
                     self.collect_applog_path]
     _, aic_info = utils.execute_command(aic_info_cmd)
     aic_info_dev_func_regex = r"\[AIC_INFO\]\snode_name:(.*?),"
     aic_info_dev_func_ret = re.findall(aic_info_dev_func_regex, aic_info)
     if not aic_info_dev_func_ret:
         utils.print_warn_log("Failed to get node name by kernel name.")
         return ''
     return aic_info_dev_func_ret[0]
示例#18
0
 def _get_actual_addr(self: any) -> dict:
     """
     Map logged addresses to their actual addresses.

     Parses the "[ZCPY] Copy Blobs" records of the collected slog. When an
     address is logged several times, the record with the latest timestamp
     wins; on equal timestamps the first record is kept.

     :return: dict mapping the logged "addr" value to the "data" value
     """
     # The brackets are escaped so grep matches the literal "[ZCPY]" tag;
     # unescaped, "[ZCPY]" is a bracket expression for one single character.
     cmd = [
         'grep', r'\[ZCPY\] Copy Blobs', '-nr',
         self.collection.collect_slog_path
     ]
     _, data = utils.execute_command(cmd)
     regexp = r'(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?Copy Blobs.+?addr:\s*([' \
              r'\da-zA-Z]+).+?data:' \
              r'\s*([\da-zA-Z]+)'
     latest = {}
     for time_str, old_addr, new_addr in re.findall(regexp, data, re.M):
         time_obj = utils.strplogtime(time_str)
         known = latest.get(old_addr)
         # Keep only the most recent mapping for each address.
         if known is None or time_obj > known[1]:
             latest[old_addr] = (new_addr, time_obj)
     return {old_addr: new_addr
             for old_addr, (new_addr, _) in latest.items()}
示例#19
0
 def get_op_info(self: any) -> tuple:
     """
     Collect the aicore errors of the slog that can be matched to a node.

     Greps the slog for exception records, parses them with
     Constant.EXCEPTION_PATTERN and resolves node and kernel name for each
     one through _get_node_and_kernel_name. Records for which both names
     come back empty are skipped; the others are appended to the instance
     lists.

     :return: (ai_core_error_list, node_name_list, kernel_name_list)
     :raises utils.AicErrException: when the grep fails, no record is
         found, a record is incomplete, or no record could be matched
     """
     invalid_slog = Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR

     grep_cmd = ['grep', '<exception_print>TIME.*4060006', '-nr', '-A',
                 '120', self.collect_slog_path]
     status, data = utils.execute_command(grep_cmd)
     if status != 0:
         utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
         raise utils.AicErrException(invalid_slog)

     aic_errors = re.findall(Constant.EXCEPTION_PATTERN, data, re.M | re.S)
     if not aic_errors:
         utils.print_info_log("No AIC_ERROR found.")
         raise utils.AicErrException(invalid_slog)

     for device_aic_err in aic_errors:
         if len(device_aic_err) != Constant.AIC_ERROR_TUPLE_LEN:
             utils.print_info_log("The AIC_ERROR is not complete.")
             raise utils.AicErrException(invalid_slog)
         # Field order: log time, device id, stream id, task id.
         log_time = device_aic_err[0]
         dev_id = device_aic_err[1]
         stream_id = device_aic_err[2]
         task_id = device_aic_err[3]
         node_name, kernel_name = self._get_node_and_kernel_name(
             dev_id, task_id, stream_id, utils.strplogtime(log_time))
         if node_name != '' or kernel_name != '':
             self.ai_core_error_list.append(device_aic_err)
             self.node_name_list.append(node_name)
             self.kernel_name_list.append(kernel_name)

     if not self.ai_core_error_list:
         utils.print_error_log(
             "The AIC_ERROR of device does not match the host.")
         raise utils.AicErrException(invalid_slog)
     return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
示例#20
0
    def _get_necessary_addrs(self: any, kernal_name: str) -> list:
        """
        Collect the addresses a kernel needs from its [AIC_INFO] logs.

        Greps the collected application logs for the kernel's dev_func line
        (with 7 lines of context) and extracts every input and output
        record plus the workspace_bytes entry from that output.

        :param kernal_name: name of the kernel on which the aicore error
                            occurred
        :return: dict with the keys "input_addr" and "output_addr" (lists
                 of dicts with the keys index, shape, format, dtype, addr)
                 and "workspace" (the logged workspace_bytes text, "0" when
                 missing or empty); None when the node header, the inputs
                 or the outputs cannot be found
        """
        aic_info_cmd = [
            'grep', '-r', '-C', '7',
            r"\[AIC_INFO\] dev_func:{}".format(kernal_name),
            self.collection.collect_applog_path
        ]
        _, aic_info = utils.execute_command(aic_info_cmd)
        utils.print_info_log(
            "===============================\n{}\n=================================="
            .format(aic_info))

        # The node header is only checked for presence.
        aic_info_all_regexp = r"\[AIC_INFO\]\snode_name:(.*?),\snode_type:(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
        if re.search(aic_info_all_regexp, aic_info, re.M) is None:
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\snode_name(.*?),\snode_tye(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
            )
            return None

        param_keys = ("index", "shape", "format", "dtype", "addr")

        aic_info_input_regexp = r"\[AIC_INFO\]\sinput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
        aic_info_input_ret = re.findall(aic_info_input_regexp, aic_info, re.M)
        if not aic_info_input_ret:
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\sinput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
            )
            return None
        input_params = [dict(zip(param_keys, input_info))
                        for input_info in aic_info_input_ret]

        aic_info_output_regexp = r"\[AIC_INFO\]\soutput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
        aic_info_output_ret = re.findall(aic_info_output_regexp, aic_info,
                                         re.M)
        if not aic_info_output_ret:
            utils.print_warn_log(
                r"Failed to get [AIC_INFO]\soutput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
            )
            return None
        output_params = [dict(zip(param_keys, output_info))
                         for output_info in aic_info_output_ret]

        # block_dim is not part of the result; only its absence is reported.
        aic_info_blockdim_regexp = r"\[AIC_INFO\]\sblock_dim:(\d+)"
        if re.search(aic_info_blockdim_regexp, aic_info, re.M) is None:
            utils.print_warn_log(f"Failed to get {aic_info_blockdim_regexp}")

        # Greedy capture up to the end of the line; a trailing lazy group
        # would always match the empty string. "0" is the fallback so the
        # result can always be built.
        aic_info_workspace_regex = r"\[AIC_INFO\]\sworkspace_bytes:(.*)"
        workspace = "0"
        workspace_match = re.search(aic_info_workspace_regex, aic_info, re.M)
        if workspace_match is None:
            utils.print_warn_log(f"Failed to get {aic_info_workspace_regex}")
        elif not workspace_match.group(1).strip():
            utils.print_info_log(f"get {aic_info_workspace_regex} is null")
        else:
            workspace = workspace_match.group(1).strip()

        # tiling_data is not part of the result; only its absence is
        # reported.
        aic_info_tiling_data_regex = r"\[AIC_INFO\]\stiling_data:(.*?)"
        if re.search(aic_info_tiling_data_regex, aic_info, re.M) is None:
            utils.print_warn_log(f"Failed to get {aic_info_tiling_data_regex}")

        return {
            "input_addr": input_params,
            "output_addr": output_params,
            "workspace": workspace,
        }