def phase_pb_file(file_path: str) -> Union[MSGraph, None]:
    """
    Parse a pb file into a graph.

    Args:
        file_path (str): The file path of the pb file.

    Returns:
        MSGraph, the graph if the pb file is loaded and the graph is built successfully;
        otherwise None.
    """
    if not CONFIG.VERBOSE:
        logger.setLevel(logging.ERROR)
    logger.info("Start to load graph from pb file, file path: %s.", file_path)
    model_proto = anf_ir_pb2.ModelProto()
    try:
        model_proto.ParseFromString(FileHandler(file_path).read())
    except ParseError:
        logger.warning("The given file is not a valid pb file, file path: %s.", file_path)
        return None

    graph = MSGraph()

    try:
        graph.build_graph(model_proto.graph)
    except Exception as ex:
        logger.error("Build graph failed, file path: %s.", file_path)
        logger.exception(ex)
        raise UnknownError(str(ex))

    logger.info("Build graph success, file path: %s.", file_path)
    return graph
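# Hedged usage sketch for phase_pb_file: the pb path below is hypothetical and
# must point to a real MindSpore-exported pb file; assumes this module's
# imports (FileHandler, MSGraph, CONFIG, logger) are in place.
def _demo_phase_pb_file():
    graph = phase_pb_file('/summary/base/dir/output.pb')
    if graph is None:
        print('Not a valid pb file.')
    return graph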
def _update_node_name_of_cache(self, node, new_name, update_parent=False):
    """
    Update a node name which is stored in cache.

    Args:
        node (Node): The node that will be renamed.
        new_name (str): The new name.
        update_parent (bool): Determines whether the input and output of the parent node need to be updated.
    """
    logger.debug('Update node name of cache, node(%s), new name is %s.', str(node), new_name)
    origin_name = node.name
    node.name = new_name

    # Find all nodes whose input and output need to be modified.
    update_node_map = {}
    for method in ['input', 'output', 'proxy_input', 'proxy_output']:
        for target_name in getattr(node, method):
            target_node = self._get_normal_node(node_name=target_name)
            if target_node is None:
                message = f"Node should not be None, name: {target_name}, {method}: {list(getattr(node, method))}."
                logger.error(message)
                continue
            update_node_map.update({target_name: target_node})

            if not update_parent:
                continue

            slash_index = target_name.find('/')
            while slash_index != -1:
                scope_name = target_name[:slash_index]
                slash_index = target_name.find('/', slash_index + 1)

                if update_node_map.get(scope_name):
                    continue

                scope_node = self._get_normal_node(node_name=scope_name)
                if scope_node is None:
                    message = f"Can not find the scope node by scope name({scope_name}), " \
                              f"maybe this scope node has not been built."
                    logger.debug(message)
                    continue

                update_node_map.update({scope_name: scope_node})

    # Update the input and output of the nodes.
    for target_node in update_node_map.values():
        for method in ['input', 'output', 'proxy_input', 'proxy_output']:
            attr_temp = getattr(target_node, method).get(origin_name)
            if attr_temp is None:
                # This method does not have this node, so it is skipped.
                continue

            # Delete the old attribute and add the new name to the source or destination node.
            getattr(target_node, f'delete_{method}')(origin_name)
            getattr(target_node, f'add_{method}')(new_name, attr_temp)

    # Delete the original node from the cache.
    self._delete_nodes_of_cache(node_names=[origin_name])
    self._cache_node(node)
def check_int(param_name, value, min_num, max_num):
    """Check that the value is an integer within [min_num, max_num]."""
    if isinstance(value, int) and min_num <= value <= max_num:
        return
    log.error("Invalid param `%s`. The integer should be in [%d, %d].",
              param_name, min_num, max_num)
    raise DebuggerParamTypeError(f"Invalid param `{param_name}`.")
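# Hedged usage sketch for check_int: the parameter name and range bounds are
# illustrative only.
def _demo_check_int():
    check_int('watchpoint_id', 3, 1, 10)  # in range, returns None silently
    try:
        check_int('watchpoint_id', 'three', 1, 10)  # wrong type
    except DebuggerParamTypeError as err:
        print(err)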
def is_summary_directory(self, summary_base_dir, relative_path):
    """
    Check if the given summary directory is valid.

    Args:
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to
            summary base directory, starting with "./".

    Returns:
        bool, indicates if the given summary directory is valid.

    Examples:
        >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
        >>> summary_watcher = SummaryWatcher()
        >>> is_valid = summary_watcher.is_summary_directory('/summary/base/dir', './job-01')
    """
    if contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
        return False

    if not self._is_valid_summary_directory(summary_base_dir, relative_path):
        return False

    summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
    try:
        entries = os.scandir(summary_directory)
    except PermissionError:
        logger.error('Path of summary base directory is not accessible.')
        raise FileSystemPermissionError('Path of summary base directory is not accessible.')

    for entry in entries:
        if entry.is_symlink():
            continue

        summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        if summary_pattern is not None and entry.is_file():
            return True

        pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
        if pb_pattern is not None and entry.is_file():
            return True

        if entry.is_dir():
            profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
            cluster_profiler_pattern = re.search(self.CLUSTER_PROFILER_DIRECTORY_REGEX, entry.name)
            if profiler_pattern is not None or cluster_profiler_pattern is not None:
                full_path = os.path.realpath(os.path.join(summary_directory, entry.name))
                if self._is_valid_profiler_directory(full_path)[0] or \
                        self._is_valid_cluster_profiler_directory(full_path)[0]:
                    return True
    return False
def _convert_to_numpy_array(self):
    """Convert list data to a numpy array."""
    try:
        ndarray = np.array(self._data).reshape(self._dims)
    except ValueError as ex:
        logger.error("Failed to reshape array, detail: %r", str(ex))
        return

    self._np_array = ndarray
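# A self-contained sketch of the same reshape-with-fallback pattern, using
# plain numpy; the function name and prints are illustrative, not part of the
# original class.
import numpy as np

def _to_numpy_array(data, dims):
    """Return data reshaped to dims, or None when the element count mismatches."""
    try:
        return np.array(data).reshape(dims)
    except ValueError as ex:
        print(f"Failed to reshape array, detail: {ex!r}")
        return None

# _to_numpy_array([1, 2, 3, 4], (2, 2)) -> array([[1, 2], [3, 4]])
# _to_numpy_array([1, 2, 3], (2, 2))    -> None (size mismatch)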
def validate_stack_pattern(stack_pattern):
    """Check stack pattern."""
    if stack_pattern:
        if not isinstance(stack_pattern, str):
            log.error("Invalid stack pattern. String type is required, but got %s.",
                      type(stack_pattern))
            raise DebuggerParamTypeError("stack_pattern is not string type.")
        pattern_limit = 255
        if len(stack_pattern) > pattern_limit:
            log.error("The length of stack_pattern is %s, which should be no greater than %s.",
                      len(stack_pattern), pattern_limit)
            raise DebuggerParamValueError("stack_pattern is over the length limit.")
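# Hedged usage sketch for validate_stack_pattern: the inputs are illustrative.
def _demo_validate_stack_pattern():
    validate_stack_pattern('Net/Conv2d')  # valid, returns None
    try:
        validate_stack_pattern('x' * 300)  # exceeds the 255-char limit
    except DebuggerParamValueError as err:
        print(err)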
def _parse_pb_file(summary_dir, filename):
    """
    Parse pb file and write content to `EventsData`.

    Args:
        summary_dir (str): The directory that contains the pb file.
        filename (str): The name of the pb file.

    Returns:
        TensorEvent, a tensor event if the pb file is loaded and the graph is built
        successfully; otherwise None.
    """
    file_path = FileHandler.join(summary_dir, filename)
    logger.info("Start to load graph from pb file, file path: %s.", file_path)
    filehandler = FileHandler(file_path)
    model_proto = anf_ir_pb2.ModelProto()
    try:
        model_proto.ParseFromString(filehandler.read())
    except ParseError:
        logger.warning("The given file is not a valid pb file, file path: %s.", file_path)
        return None

    graph = MSGraph()

    try:
        graph.build_graph(model_proto.graph)
    except Exception as ex:
        # Normally there are no exceptions; this can only happen when users on the
        # MindSpore side dump other, non-default graphs.
        logger.error("Build graph failed, file path: %s.", file_path)
        logger.exception(ex)
        raise UnknownError(str(ex))

    tensor_event = TensorEvent(wall_time=FileHandler.file_stat(file_path).mtime,
                               step=0,
                               tag=filename,
                               plugin_name=PluginNameEnum.GRAPH.value,
                               value=graph,
                               filename=filename)
    logger.info("Build graph success, file path: %s.", file_path)
    return tensor_event
def cache_train_jobs(self, train_ids):
    """
    Cache train jobs.

    Args:
        train_ids (list): Specify list of train_ids to be cached.

    Returns:
        list, a list of dicts, each containing a train job ID and its current cache status.

    Raises:
        ParamTypeError: If train_ids is not a list, or any train_id in it is not a str.
    """
    if not isinstance(train_ids, list):
        logger.error("train_ids must be list.")
        raise ParamTypeError('train_ids', list)

    cache_result = []
    for train_id in train_ids:
        if not isinstance(train_id, str):
            logger.error("train_id must be str.")
            raise ParamTypeError('train_id', str)

        try:
            train_job = self._data_manager.get_train_job(train_id)
        except exceptions.TrainJobNotExistError:
            logger.warning('Train job %s does not exist.', train_id)
            continue

        if train_job.cache_status == CacheStatus.NOT_IN_CACHE:
            self._data_manager.cache_train_job(train_id)
            # Update loader cache status to CACHING for consistency in response.
            train_job.cache_status = CacheStatus.CACHING

        cache_result.append(dict(
            train_id=train_id,
            cache_status=train_job.cache_status.value,
        ))
    return cache_result
def list_summary_directories(self, summary_base_dir, overall=True):
    """
    List summary directories within base directory.

    Args:
        summary_base_dir (str): Path of summary base directory.
        overall (bool): If False, stop scanning once the number of scanned
            entries exceeds MAX_SCAN_COUNT.

    Returns:
        list, list of summary directory info, each of which includes the following attributes.

            - relative_path (str): Relative path of summary directory, referring to
              settings.SUMMARY_BASE_DIR, starting with "./".
            - create_time (datetime): Creation time of summary file.
            - update_time (datetime): Modification time of summary file.
            - profiler (dict): Profiler info, including the profiler subdirectory path,
              profiler creation time and profiler modification time.

    Examples:
        >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
        >>> summary_watcher = SummaryWatcher()
        >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
    """
    if contains_null_byte(summary_base_dir=summary_base_dir):
        return []

    relative_path = os.path.join('.', '')
    if not self._is_valid_summary_directory(summary_base_dir, relative_path):
        return []

    summary_dict = {}
    counter = Counter(max_count=None if overall else self.MAX_SCAN_COUNT)

    try:
        entries = os.scandir(summary_base_dir)
    except PermissionError:
        logger.error('Path of summary base directory is not accessible.')
        raise FileSystemPermissionError('Path of summary base directory is not accessible.')

    for entry in entries:
        if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
            break
        try:
            counter.add()
        except MaxCountExceededError:
            logger.info('Stop further scanning due to overall is False and '
                        'number of scanned files exceeds upper limit.')
            break
        if entry.is_symlink():
            pass
        elif entry.is_file():
            self._update_summary_dict(summary_dict, summary_base_dir, relative_path, entry)
        elif entry.is_dir():
            entry_path = os.path.realpath(os.path.join(summary_base_dir, entry.name))
            self._scan_subdir_entries(summary_dict, summary_base_dir, entry_path, entry.name, counter)

    directories = []
    for key, value in summary_dict.items():
        directory = {
            'relative_path': key,
            'profiler': None,
            'create_time': value['ctime'],
            'update_time': value['mtime'],
        }
        profiler = value.get('profiler')
        if profiler is not None:
            directory['profiler'] = {
                'directory': profiler['directory'],
                'create_time': profiler['ctime'],
                'update_time': profiler['mtime'],
            }
        directories.append(directory)

    # Sort by update time in descending order and relative path in ascending order.
    directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['relative_path']))
    return directories
def list_summaries(self, summary_base_dir, relative_path='./'):
    """
    List summary files within the given summary directory.

    Args:
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to
            summary base directory, starting with "./".

    Returns:
        list, list of summary file info, each of which includes the following attributes.

            - file_name (str): Summary file name.
            - create_time (datetime): Creation time of summary file.
            - update_time (datetime): Modification time of summary file.

    Examples:
        >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
        >>> summary_watcher = SummaryWatcher()
        >>> summaries = summary_watcher.list_summaries('/summary/base/dir', './job-01')
    """
    if contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
        return []

    if not self._is_valid_summary_directory(summary_base_dir, relative_path):
        return []

    summaries = []
    summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
    try:
        entries = os.scandir(summary_directory)
    except PermissionError:
        logger.error('Path of summary directory is not accessible.')
        raise FileSystemPermissionError('Path of summary directory is not accessible.')

    for entry in entries:
        if entry.is_symlink() or not entry.is_file():
            continue

        pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        if pattern is None:
            continue

        timestamp = int(pattern.groupdict().get('timestamp'))
        try:
            # Extract the creation time from the filename.
            ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
        except OverflowError:
            continue

        try:
            stat = entry.stat()
        except FileNotFoundError:
            logger.warning('File %s not found.', entry.name)
            continue
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()

        summaries.append({
            'file_name': entry.name,
            'create_time': ctime,
            'update_time': mtime,
        })

    # Sort by update time in descending order and filename in ascending order.
    summaries.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['file_name']))
    return summaries
def _load_single_file(self, file_handler, executor, events_data):
    """
    Load a log file data.

    Args:
        file_handler (FileHandler): A file handler.
        executor (Executor): The executor instance.
        events_data (EventsData): The container of event data.

    Returns:
        bool, True if the summary file is finished loading.
    """
    while True:
        start_offset = file_handler.offset
        try:
            event_str = self.event_load(file_handler)
            if event_str is None:
                file_handler.reset_offset(start_offset)
                return True
            if len(event_str) > MAX_EVENT_STRING:
                logger.warning("file_path: %s, event string length %d exceeds %d, dropping it.",
                               file_handler.file_path, len(event_str), MAX_EVENT_STRING)
                continue

            future = executor.submit(self._event_parse, event_str, self._latest_filename)

            def _add_tensor_event_callback(future_value):
                tensor_values = future_value.result()
                for tensor_value in tensor_values:
                    if tensor_value.plugin_name == PluginNameEnum.GRAPH.value:
                        try:
                            graph_tags = events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
                        except KeyError:
                            graph_tags = []
                        summary_tags = self.filter_files(graph_tags)
                        for tag in summary_tags:
                            events_data.delete_tensor_event(tag)
                    events_data.add_tensor_event(tensor_value)

            future.add_done_callback(exception_no_raise_wrapper(_add_tensor_event_callback))
            return False
        except (exceptions.CRCFailedError, exceptions.CRCLengthFailedError) as exc:
            file_handler.reset_offset(start_offset)
            file_size = file_handler.file_stat(file_handler.file_path).size
            logger.error("CRC check failed, ignoring this file; please check the integrity of the file, "
                         "file_path: %s, offset: %s, file size: %s. Detail: %s.",
                         file_handler.file_path, file_handler.offset, file_size, str(exc))
            return True
        except (OSError, DecodeError, exceptions.MindInsightException) as ex:
            logger.error("Failed to parse log file, ignoring this file, detail: %r, "
                         "file path: %s.", str(ex), file_handler.file_path)
            return True
        except Exception as ex:
            logger.exception(ex)
            raise UnknownError(str(ex))
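# A self-contained illustration of the submit-then-callback pattern used
# above, built on the standard concurrent.futures API; the parse function and
# payload are stand-ins for self._event_parse and the event string.
from concurrent.futures import ThreadPoolExecutor

def _parse_event(payload):
    return payload.upper()

def _demo_future_callback():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(_parse_event, 'event')
        # The callback runs once parsing finishes, mirroring how
        # _add_tensor_event_callback consumes future_value.result().
        future.add_done_callback(lambda f: print(f.result()))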
def wrapper(*args, **kwargs):
    try:
        return exception_wrapper(func)(*args, **kwargs)
    except UnknownError as err:
        logger.error(str(err))
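# The wrapper above reads like the inner function of a "log instead of raise"
# decorator such as exception_no_raise_wrapper. A self-contained sketch of that
# pattern under this assumption; the names here are illustrative, not the
# original module's.
import functools
import logging

_demo_logger = logging.getLogger(__name__)

def _exception_no_raise(func):
    """Log any exception raised by func instead of propagating it."""
    @functools.wraps(func)
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as err:  # broad by design: this sketch never raises
            _demo_logger.error(str(err))
    return inner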
def list_summary_directories(self, summary_base_dir, overall=True):
    """
    List summary directories within base directory.

    Args:
        summary_base_dir (str): Path of summary base directory.
        overall (bool): If False, stop scanning once the number of scanned
            entries exceeds MAX_SCAN_COUNT.

    Returns:
        list, list of summary directory info, each of which includes the following attributes.

            - relative_path (str): Relative path of summary directory, referring to
              settings.SUMMARY_BASE_DIR, starting with "./".
            - create_time (datetime): Creation time of summary file.
            - update_time (datetime): Modification time of summary file.

    Examples:
        >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
        >>> summary_watcher = SummaryWatcher()
        >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
    """
    if self._contains_null_byte(summary_base_dir=summary_base_dir):
        return []

    if not os.path.exists(summary_base_dir):
        logger.warning('Path of summary base directory does not exist.')
        return []

    if not os.path.isdir(summary_base_dir):
        logger.warning('Path of summary base directory is not a valid directory.')
        return []

    summary_dict = {}
    scan_count = 0

    try:
        entries = os.scandir(summary_base_dir)
    except PermissionError:
        logger.error('Path of summary base directory is not accessible.')
        raise FileSystemPermissionError('Path of summary base directory is not accessible.')

    for entry in entries:
        if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
            break
        relative_path = os.path.join('.', '')
        if entry.is_symlink():
            pass
        elif entry.is_file():
            self._update_summary_dict(summary_dict, relative_path, entry)
        elif entry.is_dir():
            full_path = os.path.realpath(os.path.join(summary_base_dir, entry.name))
            try:
                subdir_entries = os.scandir(full_path)
            except PermissionError:
                logger.warning('Path of %s under summary base directory is not accessible.', entry.name)
            else:
                for subdir_entry in subdir_entries:
                    if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                        break
                    subdir_relative_path = os.path.join('.', entry.name)
                    if subdir_entry.is_symlink():
                        pass
                    elif subdir_entry.is_file():
                        self._update_summary_dict(summary_dict, subdir_relative_path, subdir_entry)
                    scan_count += 1
                    if not overall and scan_count >= self.MAX_SCAN_COUNT:
                        break
        scan_count += 1
        if not overall and scan_count >= self.MAX_SCAN_COUNT:
            logger.info('Stop further scanning due to overall is False and '
                        'number of scanned files exceeds upper limit.')
            break

    directories = [{
        'relative_path': key,
        'create_time': value['ctime'],
        'update_time': value['mtime'],
    } for key, value in summary_dict.items()]

    # Sort by update time in descending order and relative path in ascending order.
    directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['relative_path']))
    return directories