def _save_tensor_to_file(self: any, tensor_list: list, tensor_type: str, dump_file: str) -> str:
    """Save every tensor of one kind from a dump file as .npy files.

    Files are written next to dump_file, named
    ``<kernel_name>.<tensor_type>.<index>.npy``.

    :param tensor_list: tensors parsed from the dump file
    :param tensor_type: tensor kind label (e.g. "input"/"output"), used in
        file names and messages
    :param dump_file: path of the source dump file
    :return: accumulated result info text ('' when nothing was flagged)
    :raise AicErrException: when tensor data cannot be parsed, or an input
        tensor contains NaN/INF
    """
    result_info = ''
    if len(tensor_list) == 0:
        utils.print_warn_log(
            'There is no %s in "%s".' % (tensor_type, dump_file))
        return result_info
    dump_file_path, _ = os.path.split(dump_file)
    for (index, tensor) in enumerate(tensor_list):
        try:
            array = np.frombuffer(
                tensor.data,
                dtype=self._get_dtype_by_data_type(tensor.data_type))
            npy_file_name = ".".join(
                [self.kernel_name, tensor_type, str(index), "npy"])
            np.save(os.path.join(dump_file_path, npy_file_name), array)
            # Inputs containing NaN/INF are treated as invalid dump data.
            if (np.isinf(array).any() or np.isnan(array).any()) \
                    and tensor_type == "input":
                result_info += '%s[%d] NaN/INF\n' % (tensor_type, index)
                utils.print_error_log('%s[%d] NaN/INF\n' % (tensor_type, index))
                # NOTE(review): raising here aborts the loop and discards
                # result_info; the AicErrException is not caught below.
                raise utils.AicErrException(
                    Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
        except (ValueError, IOError, OSError, MemoryError) as error:
            utils.print_error_log('Failed to parse the data of %s:%d of "%s". %s' % (
                tensor_type, index, dump_file, error))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
    return result_info
def get_op_info(self: any) -> tuple:
    """Collect ai core error records together with node/kernel names.

    :return: (ai_core_error_list, node_name_list, kernel_name_list)
    :raise AicErrException: when no device error matches the host log
    """
    data = self._get_air_error_execute_command()
    for aic_err in self._get_aicerror_args(data):
        log_time, dev_id = aic_err[0], aic_err[1]
        err_time = utils.strplogtime(log_time)
        stream_id, task_id, node_name, kernel_name = \
            self._get_node_and_kernel_name(dev_id, err_time)
        if node_name == '' and kernel_name == '':
            continue
        # Keep the original device_aic_err field layout (9 entries).
        device_aic_err = [
            aic_err[0],                     # err time
            aic_err[1],                     # dev id
            stream_id,                      # stream id
            task_id,                        # task id
            aic_err[2],                     # core id
            aic_err[3],                     # aic error code
            aic_err[4],                     # start pc
            self._get_extra_info(aic_err),  # extra info
            aic_err[5],                     # current pc
        ]
        self.ai_core_error_list.append(device_aic_err)
        self.node_name_list.append(node_name)
        self.kernel_name_list.append(kernel_name)
    if not self.ai_core_error_list:
        utils.print_error_log(
            "The AIC_ERROR of device does not match the host.")
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
def _get_node_and_kernel_name(self: any, dev_id: any, err_time: any) -> tuple:
    """Find stream/task ids plus node and kernel name for a device error.

    :param dev_id: device id (used only in the error message)
    :param err_time: time of the AIC error, used to pick the latest
        matching plog record
    :return: [stream_id, task_id, node_name, kernel_name]
    :raise AicErrException: when no matching record exists in plog
    """
    data = self._get_node_and_kernel_name_execute_command()
    regexp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?device_id=\d+\s*,\s*stream_id=" \
             r"(\d+)\s*.+?\s*task_id=(\d+)\s*,.*?fault kernel_name=" \
             r"[-\d_]{0,}(\S+?),\s*func_name=(\S+),"
    ret = re.findall(regexp, data, re.M | re.S)
    if len(ret) == 0:
        utils.print_error_log(
            "There is no node name and kernel name for dev id(%s) in plog." % dev_id)
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    if len(ret) > 1:
        # BUGFIX: the record chosen by err_time was previously overwritten
        # by an unconditional `result = ret[0][1:]` right after this line,
        # making the selection dead code. Keep the selected record.
        max_i = self._get_the_latest_aicerr_form_ret(ret, err_time)
        result = ret[max_i][1:]
    else:
        result = ret[0][1:]
    # Normalize the kernel name: remove empty fragments and drop the last
    # '_'-separated segment, matching the original behavior.
    kernel_name_list = result[3].split('_')
    if "" in kernel_name_list:
        kernel_name_list.remove("")
    kernel_name_list = kernel_name_list[:-1]
    kernel_name = '_'.join(kernel_name_list)
    node_name = self._get_node_name_by_kernel_name(kernel_name)
    result_list = list(result)
    result_list[2] = node_name
    result_list[3] = kernel_name
    return result_list
def _get_op_impl_type(self, params: list, module_name: str) -> str: has_dynamic_shape = False for para_item in params: if not isinstance(para_item, dict) or para_item.get("param_type") != "input": continue for i in para_item.get("shape"): if i < 0: has_dynamic_shape = True break if has_dynamic_shape: break if ".dynamic." in module_name: if has_dynamic_shape: return "dynamic" else: return "static" else: if has_dynamic_shape: utils.print_error_log( "There is dynamic shape in param, but call static impl") raise utils.AicErrException( Constant.MS_AICERR_EXECUTE_COMMAND_ERROR) else: return "pre-static"
def _get_dump_data(self: any, dump_file: any, header_length: int, file_size: int) -> any:
    """Parse the serialized dump header and read every tensor payload.

    :param dump_file: an open binary file object positioned right after
        the 8-byte header-length field
    :param header_length: serialized header size in bytes
    :param file_size: total dump file size, used for validation
    :return: the populated DumpData message
    :raise AicErrException: when the header cannot be decoded or the
        sizes do not add up
    """
    # read header content
    content = dump_file.read(header_length)
    dump_data = DD.DumpData()
    try:
        dump_data.ParseFromString(content)
    except DecodeError as de_error:
        # NOTE(review): dump_file is a file object here, so the message
        # shows its repr rather than a path — confirm intended.
        utils.print_error_log('Failed to parse the serialized header content of %s. '
                              'Please check the dump file. %s ' % (dump_file, str(de_error)))
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
    self._check_dump_data_vaild(dump_data, dump_file, header_length, file_size)
    # Payloads follow the header in input, output, buffer order.
    # (Empty-collection guards removed: iterating an empty list is a no-op.)
    for index, _ in enumerate(dump_data.input):
        dump_data.input[index].data = dump_file.read(dump_data.input[index].size)
    for index, _ in enumerate(dump_data.output):
        dump_data.output[index].data = dump_file.read(dump_data.output[index].size)
    for index, _ in enumerate(dump_data.buffer):
        dump_data.buffer[index].data = dump_file.read(dump_data.buffer[index].size)
    return dump_data
def collect_local_file(report_path: str, key: str, collect_path: str) -> str:
    """
    collect local file:
    :param report_path: the local report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: the local path
    :raise AicErrException: when the slog directory is missing
    """
    collect_target_path = os.path.join(collect_path,
                                       os.path.basename(report_path))
    utils.check_path_valid(collect_target_path, isdir=True, output=True)
    if key == Constant.DIR_SLOG:
        slog_report_path = os.path.join(report_path, key)
        # os.path.isdir already implies existence.
        if os.path.isdir(slog_report_path):
            copy_file_to_dest(slog_report_path, key, collect_target_path,
                              report_path)
        else:
            utils.print_error_log('There is no %s in %s.' % (key, report_path))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    elif key == 'MNTN_PATH':
        # Redundant recomputation/validation of collect_target_path removed:
        # it was identical to the one performed above.
        hisi_report_path = os.path.join(report_path, Constant.DIR_BBOX)
        if os.path.isdir(hisi_report_path):
            copy_file_to_dest(hisi_report_path, Constant.DIR_BBOX,
                              collect_target_path, report_path)
        else:
            # Missing hisi logs are tolerated (best-effort collection).
            utils.print_warn_log('There is no hisi_logs in %s.' % report_path)
    return collect_target_path
def _get_air_error_execute_command(self):
    """Grep the collected applog for aicore-error marker lines."""
    command = ['grep', 'PrintCoreErrorInfo:.*?there is an aicore error',
               '-inrE', self.collect_applog_path]
    status, data = utils.execute_command(command)
    if status != 0:
        utils.print_error_log(
            "Failed to execute command: %s.Maybe rts break when report Core log to host."
            % " ".join(command))
        raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    return data
def _get_node_and_kernel_name_execute_command(self: any) -> any:
    """Grep the collected applog for 'aicore kernel execute failed' lines."""
    command = ['grep', 'PrintErrorInfo:.*?aicore kernel execute failed',
               '-inrE', self.collect_applog_path]
    status, data = utils.execute_command(command)
    if status != 0:
        utils.print_error_log("Failed to execute command: %s." % " ".join(command))
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    return data
def _get_all_error_log(self: any) -> None:
    """Extract all [ERROR] lines from the collected slog into error.log."""
    utils.print_info_log('Start to analyze error slog.')
    target_file = os.path.join(self.output_path, "error.log")
    command = ['grep', r'\[ERROR\]', '-nr', self.collection.collect_slog_path]
    status, data = utils.execute_command(command)
    if status != 0:
        utils.print_error_log("Failed to execute command: %s. %s"
                              % (" ".join(command), " ".join(data)))
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    utils.write_file(target_file, data)
    utils.print_info_log('The error slog is saved in %s.' % target_file)
def _parse_dump_file(self: any, dump_file: str) -> any:
    """
    Parse the dump file path by big dump data format
    :param: dump_file the dump file
    :return: DumpData
    :exception when read or parse file error
    """
    utils.check_path_valid(dump_file)
    try:
        # get file size
        file_size = os.path.getsize(dump_file)
        # the file must contain more than the 8-byte header-length field
        if file_size <= Constant.UINT64_SIZE:
            utils.print_error_log('The size of %s is at least greater then %d, but the file'
                                  ' size is %d. Please check the dump file.'
                                  % (dump_file, Constant.UINT64_SIZE, file_size))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
        with open(dump_file, 'rb') as dump_data_file:
            # read the 8-byte header length
            header_length = dump_data_file.read(Constant.UINT64_SIZE)
            header_length = struct.unpack(Constant.UINT64_FMT,
                                          header_length)[0]
            # the serialized header must fit in the remaining file
            if header_length > file_size - Constant.UINT64_SIZE:
                utils.print_error_log(
                    'The header content size(%d) of %s must be less then '
                    'or equal to %d(file size) - %d(header length).'
                    ' Please check the dump file.'
                    % (header_length, dump_file, file_size,
                       Constant.UINT64_SIZE))
                raise utils.AicErrException(
                    Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            # read header content and tensor payloads
            return self._get_dump_data(dump_data_file, header_length,
                                       file_size)
    except IOError as io_error:
        utils.print_error_log('Failed to read the dump file %s. %s'
                              % (dump_file, str(io_error)))
        raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
def _get_op_param(self, kernel_name) -> list:
    """Rebuild the op's input/output/attr parameter list from the host log.

    Input params additionally get their dumped values loaded from the
    matching ``<kernel_name>.input.<index>.npy`` files.

    :param kernel_name: kernel whose "Prebuilding op" log line is searched
    :return: list of input dicts, output dicts and raw attr values
    :raise AicErrException: when the op params cannot be found in the log
    """
    # Raw f-string: '\[' is a literal bracket in grep's regex, not a
    # Python escape (the non-raw form triggers an invalid-escape warning).
    get_param_cmd = [
        'grep', rf'BuildSingleOp Prebuilding op: kernelName\[{kernel_name}\]',
        '-hr', '-A', '3', self.collection.collect_applog_path
    ]
    _, get_param_data = utils.execute_command(get_param_cmd)
    # Strip log prefixes and line breaks so the params regex can match
    # across what was a multi-line log record.
    purified_data = re.sub(r"\[INFO\].*?\[fusion_op.cc:\d+?\].*?\s", "",
                           get_param_data)
    purified_data = re.sub(r"[\n]", "", purified_data)
    get_param_regexp = r"op inputs:\s*\((.*?)\),\s*outputs:\s*\((.*?)\),\s*attrs:\s*\((.*?)\)\."
    get_param_ret = re.findall(get_param_regexp, purified_data, re.M)
    if len(get_param_ret) == 0:
        utils.print_error_log(
            f"Fail to get op params of kernel [{kernel_name}] in host log ."
        )
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    input_str, output_str, attr_str = get_param_ret[0]
    result_list = []
    input_list = ast.literal_eval("[" + input_str + "]")
    for (index, input_item) in enumerate(input_list):
        input_item["param_type"] = "input"
        input_item["run_shape"] = input_item.get("shape")
        data_file = os.path.join(
            self.collection.collect_dump_path,
            ".".join([kernel_name, "input", str(index), "npy"]))
        input_item["value"] = np.load(data_file)
        # Static shape: each dim's range collapses to (dim, dim).
        input_item["range"] = [(dim, dim) for dim in input_item.get("shape")]
    result_list.extend(input_list)
    output_list = ast.literal_eval("[" + output_str + "]")
    for output_item in output_list:
        output_item["param_type"] = "output"
        output_item["run_shape"] = output_item.get("shape")
        output_item["range"] = [(dim, dim) for dim in output_item.get("shape")]
    result_list.extend(output_list)
    # Attrs: only dict entries carry a usable "value".
    for attr_item in ast.literal_eval("[" + attr_str + "]"):
        if isinstance(attr_item, dict):
            result_list.append(attr_item.get("value"))
    return result_list
def _get_module_str(self, kernel_name) -> str:
    """Return the module string recorded for kernel_name in the host log.

    :raise AicErrException: when no module entry is found
    """
    command = [
        'grep', rf'kernel\[{kernel_name}\].*module\[', '-hr',
        self.collection.collect_applog_path
    ]
    _, module_data = utils.execute_command(command)
    pattern = rf"kernel\[{kernel_name}\].*?module\[(.*?)\]"
    matches = re.findall(pattern, module_data, re.M)
    if not matches:
        utils.print_error_log(
            f"Fail to get op module of kernel [{kernel_name}] in host log ."
        )
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    return matches[0]
def get_op_info(self: any) -> tuple:
    """Collect AIC_ERROR records from the device slog with node/kernel names.

    :return: (ai_core_error_list, node_name_list, kernel_name_list)
    :raise AicErrException: when no complete, matching AIC_ERROR is found
    """
    command = ['grep', '<exception_print>TIME.*4060006', '-nr', '-A', '120',
               self.collect_slog_path]
    status, data = utils.execute_command(command)
    if status != 0:
        utils.print_error_log("Failed to execute command: %s." % " ".join(command))
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    matches = re.findall(Constant.EXCEPTION_PATTERN, data, re.M | re.S)
    if not matches:
        utils.print_info_log("No AIC_ERROR found.")
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    for device_aic_err in matches:
        if len(device_aic_err) != Constant.AIC_ERROR_TUPLE_LEN:
            utils.print_info_log("The AIC_ERROR is not complete.")
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        log_time, dev_id, stream_id, task_id = device_aic_err[:4]
        err_time = utils.strplogtime(log_time)
        node_name, kernel_name = self._get_node_and_kernel_name(
            dev_id, task_id, stream_id, err_time)
        # Records without a resolved node/kernel are skipped.
        if node_name == '' and kernel_name == '':
            continue
        self.ai_core_error_list.append(device_aic_err)
        self.node_name_list.append(node_name)
        self.kernel_name_list.append(kernel_name)
    if not self.ai_core_error_list:
        utils.print_error_log(
            "The AIC_ERROR of device does not match the host.")
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
def _get_imas_log(self: any) -> None:
    """Extract IMAS lines from the collected applog into imas.log."""
    utils.print_info_log('Start to analyze IMAS log.')
    command = ['grep', 'IMAS', '-nr', self.collection.collect_applog_path]
    status, data = utils.execute_command(command)
    if status == 1:
        # grep exits with 1 when nothing matched — not an error here.
        utils.print_warn_log("There is no IMAS log in %s" % self.output_path)
        return
    if status != 0:
        utils.print_error_log("Failed to execute command: %s. %s"
                              % (" ".join(command), " ".join(data)))
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    imas_log_file = os.path.join(self.output_path, "imas.log")
    utils.write_file(imas_log_file, data)
    utils.print_info_log('The IMAS log is saved in %s.' % imas_log_file)
def _get_op_by_graph(graph_file: str, info: any) -> None: if graph_file == '': return try: with open(graph_file, 'r') as graph: text = graph.read() regexp = r'(op\s+\{\s+name:\s+"%s".+?%s.+?\})\s+' \ r'op\s+\{' % (info.node_name, info.kernel_name) ret = re.findall(regexp, text, re.M | re.S) if len(ret) == 0: utils.print_warn_log( 'Failed to get op for node(%s) kernel(%s).' % (info.node_name, info.kernel_name)) return info.operator = ret[0] except IOError as io_error: utils.print_error_log('Failed to open file %s. %s' % (graph_file, io_error)) raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR) finally: pass
def _get_the_latest_aicerr_form_ret(ret: list, err_time: any) -> int:
    """Pick the index of the most relevant log record.

    Prefers the latest record not later than err_time (assumes host and
    device clocks are synchronized; otherwise every record would tie on
    the latest one). Falls back to the latest record overall when none
    precedes err_time.

    :param ret: regex matches whose first element is a log time string
    :param err_time: the AIC error time to compare against
    :return: index of the chosen record in ret
    :raise AicErrException: when ret is empty
    """
    best_index = -1
    best_time = None
    for idx, (time_str, _, _) in enumerate(ret):
        record_time = utils.strplogtime(time_str)
        if err_time >= record_time and (best_time is None
                                        or record_time > best_time):
            best_time = record_time
            best_index = idx
    if best_index == -1:
        # No record precedes err_time: take the latest one instead.
        for idx, (time_str, _, _) in enumerate(ret):
            record_time = utils.strplogtime(time_str)
            if best_time is None or record_time > best_time:
                best_time = record_time
                best_index = idx
    if best_index == -1:
        utils.print_error_log("Failed to get node and kernel name.")
        raise utils.AicErrException(Constant.MS_AICERR_FIND_DATA_ERROR)
    return best_index
def _check_dump_data_vaild(dump_data: any, dump_file: str, header_length: int, file_size: int) -> None:
    """Verify the dump file size equals header fields plus all payloads.

    Expected layout: 8-byte length field + serialized header + every
    input/output/buffer payload.

    :raise AicErrException: when the sizes do not add up
    """
    input_data_size = sum(item.size for item in dump_data.input)
    output_data_size = sum(item.size for item in dump_data.output)
    buffer_data_size = sum(item.size for item in dump_data.buffer)
    expected_size = (header_length + Constant.UINT64_SIZE + input_data_size
                     + output_data_size + buffer_data_size)
    if expected_size != file_size:
        utils.print_error_log(
            'The file size(%d) of %s is not equal to %d(header '
            'length) + %d(the size of header content) + %d(the sum'
            ' of input data) + %d(the sum of output data) + %d(the'
            ' sum of buffer data). Please check the dump file.'
            % (file_size, dump_file, Constant.UINT64_SIZE, header_length,
               input_data_size, output_data_size, buffer_data_size))
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
def parse(self: any) -> None:
    """
    parse by collection info

    For every collected ai core error: locate a cce-objdump binary,
    decompile the faulting kernel, gather addresses/dump data, and write
    a per-error folder plus a summary file under self.output_path.
    """
    utils.print_info_log('******************Analysis******************')
    aicore_error_data_list = self._aicore_error_data()
    utils.print_info_log('Start to analyze each ai core error.')
    summary_info_list = []
    # decompile: prefer the bundled cce-objdump for this machine's arch
    if "aarch64" in platform.machine():
        obj_dump_file = "cce-objdump_aarch64"
    else:
        obj_dump_file = "cce-objdump"
    obj_dump_file = os.path.join(os.getcwd(), "tools", obj_dump_file)
    if os.path.exists(obj_dump_file):
        # make the bundled binary executable and put tools/ on PATH
        os.system("chmod 755 " + obj_dump_file)
        os.environ["PATH"] = os.path.join(
            os.getcwd(), "tools") + ":" + os.environ["PATH"]
    else:
        # fall back to a cce-objdump already on PATH
        cce_dump = shutil.which("cce-objdump")
        if not cce_dump:
            # guess where is cce-objdump
            parent_path = "aarch64-linux" if "aarch64" in platform.machine(
            ) else "x86_64-linux"
            cce_dump_guess = os.path.join("usr/local/Ascend/latest",
                                          parent_path,
                                          "ccec_compiler/bin/cce-objdump")
            if os.path.exists(cce_dump_guess):
                cce_dump = cce_dump_guess
        if not cce_dump:
            utils.print_error_log(
                'Cannot find cce-objdump! please add cce-objdump path in env PATH.'
            )
            raise utils.AicErrException(
                Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    for i, current_pc in enumerate(self.collection.ai_core_error_list):
        # parser aic error by slog
        info = AicErrorInfo()
        info.err_time, info.dev_id, info.stream_id, info.task_id, \
            info.core_id, info.aic_error, info.start_pc, info.extra_info, \
            info.current_pc = current_pc
        utils.print_info_log(
            "******************No.%d %s******************" % (i, info.err_time))
        info.err_time_obj = utils.strplogtime(info.err_time)
        # one output folder per error, named by index and timestamp
        err_i_folder_name = "aicerror_%d_%s" % (
            i, time.strftime("%Y%m%d%H%M%S", info.err_time_obj.timetuple()))
        err_i_folder = os.path.join(self.output_path, err_i_folder_name)
        utils.check_path_valid(err_i_folder, isdir=True, output=True)
        info.node_name = self.collection.node_name_list[i]
        info.kernel_name = self.collection.kernel_name_list[i]
        # get hisi log
        self._get_hisi_log(info, err_i_folder)
        # get op info in build proto file
        self._get_op_by_graph(aicore_error_data_list[Constant.GRAPH_FILE],
                              info)
        kernel_meta_path = os.path.join(
            self.collection.collect_compile_path, 'kernel_meta')
        if os.path.exists(kernel_meta_path):
            # decompile the faulting instruction
            result = self._decompile([info.kernel_name, kernel_meta_path],
                                     err_i_folder, info)
            if result is False:
                utils.print_warn_log(
                    "decompile kernel_meta file %s failed."
                    % os.path.join(
                        kernel_meta_path, info.kernel_name + ".o"))
        else:
            utils.print_warn_log("kernel_meta path %s not exist"
                                 % kernel_meta_path)
        try:
            # input output address
            info.aval_addrs = self._get_available_addrs(info.err_time)
            info.necessary_addr = self._get_necessary_addrs(
                info.kernel_name)
            self._check_addr(info.aval_addrs, info.necessary_addr)
            # self.print_summary(avl_addr, necessary_addr)
        except BaseException as e:
            # best-effort: address check failures must not stop the analysis
            import logging
            logging.exception(e)
            print("Check addr error failed")
        info.input_output_addrs = self._get_input_output_addrs(
            info, err_i_folder,
            aicore_error_data_list[Constant.ALLOC_ADDR],
            aicore_error_data_list[Constant.ACTUAL_ADDR])
        # collected address-overflow info
        info.addr_overflow = aicore_error_data_list[Constant.ADDR_OVERFLOW]
        # operator code address and args address
        info.op_addr, info.args_addr, info.multi_args_addr = \
            self._get_op_and_args_addr(info.start_pc)
        # parse dump
        if self.collection.collect_dump_path:
            parser = DumpDataParser(self.collection.collect_dump_path,
                                    info.node_name, info.kernel_name)
            info.dump_info = parser.parse()
        # write info file
        self._write_errorinfo_file(err_i_folder, info, i)
        summary_info_list.append(
            "%s %s device_id=%s core_id=%s task_id=%s node=%s "
            "kernel=%s" % (
                err_i_folder_name, info.aic_error, info.dev_id,
                info.core_id, info.task_id, info.node_name,
                info.kernel_name))
    utils.print_info_log('Finish to analyze each ai core error.')
    # write summary info
    self._write_summary_file(summary_info_list)
def _get_dtype_by_data_type(self: any, data_type: any) -> any:
    """Map a dump data_type value to its dtype via DATA_TYPE_TO_DTYPE_MAP.

    :raise AicErrException: when data_type is not in the map
    """
    dtype_map = self.DATA_TYPE_TO_DTYPE_MAP
    if data_type not in dtype_map:
        utils.print_error_log("The output data type(%s) does not support."
                              % data_type)
        raise utils.AicErrException(
            Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
    return dtype_map[data_type].get(Constant.DTYPE)