def _parse_op_nodes(self, node_protos):
    """
    Parse `anf_ir_pb2.NodeProto` objects and create normal nodes.

    Args:
        node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
    """
    logger.debug("Start to parse op nodes from proto.")
    for topological_index, node_proto in enumerate(node_protos):
        if not node_proto.name:
            logger.warning("Found a node with an empty name; it will not be saved.")
            continue

        if not node_proto.full_name or any(
                node_proto.full_name.lower().endswith(f'[:{plugin.value.lower()}]')
                for plugin in PluginNameEnum):
            node_name = Node.create_node_name(
                scope=node_proto.scope,
                base_name=f'{node_proto.op_type}{node_proto.name}')
        else:
            node_name = node_proto.full_name

        # The Graphviz plug-in that the UI uses can not handle these special characters.
        check_invalid_character(node_name)

        node = Node(name=node_name, node_id=node_proto.name, topological_index=topological_index)
        node.full_name = node_proto.full_name
        node.type = node_proto.op_type
        self._parse_attributes(node_proto.attribute, node)
        self._parse_inputs(node_proto.input, node)

        node.output_i = node_proto.output_i
        node.scope = node_proto.scope
        node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
        node.output_nums = len(node.output_shape)
        node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type, node)

        self._cache_node(node)
def get_train_job_by_plugin(self, train_id, plugin_name):
    """
    Get a train job by train job ID.

    If the given train job does not have data for the given plugin, the tag list will be empty.

    Args:
        train_id (str): Get train job info by the given ID.
        plugin_name (str): Get tags by the given plugin.

    Returns:
        TypedDict('TrainJobEntity', {'id': str, 'name': str, 'tags': List[str]}), a train job object.
    """
    self._check_status_valid()
    self._check_train_job_exist(train_id, self._loader_pool)

    loader = self._get_loader(train_id)
    if loader is None:
        logger.warning("No valid summary log in train job %s, or it is not in the cache.", train_id)
        return None

    name = loader.name
    data_loader = loader.data_loader

    tags = []
    try:
        events_data = data_loader.get_events_data()
        tags = events_data.list_tags_by_plugin(plugin_name)
    except KeyError:
        logger.debug("Plugin name %r does not exist in train job %r; set tags to an empty list.",
                     plugin_name, name)
    except AttributeError:
        logger.debug("Train job %r has been deleted or has not loaded data; set tags to an empty list.",
                     name)

    result = dict(id=train_id, name=name, tags=tags)
    return result
def _parse_pb_file(summary_dir, filename):
    """
    Parse a pb file and write its content to `EventsData`.

    Args:
        summary_dir (str): The directory that contains the pb file.
        filename (str): The file name of the pb file.

    Returns:
        TensorEvent, the tensor event if the pb file is loaded and the graph is built successfully,
            otherwise None.
    """
    file_path = FileHandler.join(summary_dir, filename)
    logger.info("Start to load graph from pb file, file path: %s.", file_path)
    filehandler = FileHandler(file_path)
    model_proto = anf_ir_pb2.ModelProto()
    try:
        model_proto.ParseFromString(filehandler.read())
    except ParseError:
        logger.warning("The given file is not a valid pb file, file path: %s.", file_path)
        return None

    graph = MSGraph()

    try:
        graph.build_graph(model_proto.graph)
    except Exception as ex:
        # Normally there are no exceptions; it is only possible for users on the MindSpore side
        # to dump other non-default graphs.
        logger.error("Build graph failed, file path: %s.", file_path)
        logger.exception(ex)
        raise UnknownError(str(ex))

    tensor_event = TensorEvent(
        wall_time=FileHandler.file_stat(file_path).mtime,
        step=0,
        tag=filename,
        plugin_name=PluginNameEnum.GRAPH.value,
        value=graph,
        filename=filename)

    logger.info("Build graph success, file path: %s.", file_path)
    return tensor_event
def parse_files(self, executor, filenames, events_data):
    """
    Load summary files and parse file content.

    Args:
        executor (Executor): The executor instance.
        filenames (list[str]): File name list.
        events_data (EventsData): The container of event data.

    Returns:
        bool, True if all the summary files are finished loading.
    """
    self._events_data = events_data
    summary_files = self.filter_files(filenames)
    summary_files = self.sort_files(summary_files)
    if self._latest_filename in summary_files:
        index = summary_files.index(self._latest_filename)
        summary_files = summary_files[index:]

    for filename in summary_files:
        file_path = FileHandler.join(self._summary_dir, filename)

        if filename != self._latest_filename:
            self._summary_file_handler = FileHandler(file_path, 'rb')
            self._latest_filename = filename
            self._latest_file_size = 0

        new_size = FileHandler.file_stat(file_path).size
        if new_size == self._latest_file_size:
            continue

        try:
            if not self._load_single_file(self._summary_file_handler, executor):
                self._latest_file_size = self._summary_file_handler.offset
            else:
                self._latest_file_size = new_size
            # Wait for data in this file to be processed to avoid loading multiple files at the same time.
            logger.info("Parse summary file offset %d, file path: %s.", self._latest_file_size, file_path)
            return False
        except UnknownError as ex:
            logger.warning("Parse summary file failed, detail: %r, file path: %s.", str(ex), file_path)

    return True
def _get_train_job_item(self, train_id):
    """
    Get a train job item.

    Args:
        train_id (str): Specify the train ID.

    Returns:
        dict, a dict of train job item.
    """
    try:
        train_job = self._data_manager.get_train_job(train_id)
    except exceptions.TrainJobNotExistError:
        logger.warning('Train job %s does not exist.', train_id)
        return None

    basic_info = train_job.get_basic_info()
    train_job_item = dict(
        train_id=basic_info.train_id,
        relative_path=basic_info.train_id,
        create_time=basic_info.create_time.strftime('%Y-%m-%d %H:%M:%S'),
        update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'),
        profiler_dir=basic_info.profiler_dir,
        cache_status=train_job.cache_status.value,
        profiler_type=basic_info.profiler_type,
        summary_files=basic_info.summary_files,
        graph_files=basic_info.graph_files,
        lineage_files=basic_info.lineage_files
    )

    if train_job.cache_status == CacheStatus.CACHED:
        plugins = self.get_plugins(train_id)
    else:
        plugins = dict(plugins={
            'graph': [],
            'scalar': [],
            'image': [],
            'histogram': [],
        })

    train_job_item.update(plugins)
    return train_job_item
def _check_and_normalize_summary_path(self, summary_path):
    """
    Check and normalize the summary path.

    Args:
        summary_path (str): A directory path, e.g. '/data/ImageNet/'.

    Returns:
        str, normalized summary path.
    """
    if summary_path is None:
        logger.warning("Summary path is None. The data loader generator will not be initialized.")
        raise ParamValueError("Summary path is None.")

    summary_path = os.path.realpath(summary_path)

    return summary_path
def get_train_job(self, train_id):
    """
    Get a train job by train ID.

    This method overrides the parent method.

    Args:
        train_id (str): Train ID for the train job.

    Returns:
        CachedTrainJob, the cached train job containing the plugin data; returns None if no data can be found.
    """
    self._check_train_job_exist(train_id, self._loader_pool)

    loader = self._get_loader(train_id)
    if loader is None:
        logger.warning("No valid summary log in train job %s, or it is not in the cache.", train_id)
        return None

    train_job = loader.to_dict()
    train_job.pop('data_loader')

    plugin_data = {}
    for plugin_name in PluginNameEnum.list_members():
        job = self.get_train_job_by_plugin(train_id, plugin_name=plugin_name)
        if job is None:
            plugin_data[plugin_name] = []
        else:
            plugin_data[plugin_name] = job['tags']

    train_job.update({DATAVISUAL_PLUGIN_KEY: plugin_data})

    # Will fill basic_info value in future.
    train_job_obj = CachedTrainJob(basic_info=None)
    train_job_obj.set(DATAVISUAL_CACHE_KEY, train_job)
    train_job_obj.cache_status = loader.cache_status

    return train_job_obj
def cache_train_jobs(self, train_ids):
    """
    Cache train jobs.

    Args:
        train_ids (list): Specify the list of train IDs to be cached.

    Returns:
        list, the train job IDs and their current cache status.

    Raises:
        ParamTypeError: If the given train_ids parameter is not of a valid type.
    """
    if not isinstance(train_ids, list):
        logger.error("train_ids must be list.")
        raise ParamTypeError('train_ids', list)

    cache_result = []
    for train_id in train_ids:
        if not isinstance(train_id, str):
            logger.error("train_id must be str.")
            raise ParamTypeError('train_id', str)

        try:
            train_job = self._data_manager.get_train_job(train_id)
        except exceptions.TrainJobNotExistError:
            logger.warning('Train job %s does not exist.', train_id)
            continue

        if train_job.cache_status == CacheStatus.NOT_IN_CACHE:
            self._data_manager.cache_train_job(train_id)
            # Update loader cache status to CACHING for consistency in response.
            train_job.cache_status = CacheStatus.CACHING

        cache_result.append(dict(
            train_id=train_id,
            cache_status=train_job.cache_status.value,
        ))

    return cache_result
def _is_valid_summary_directory(self, summary_base_dir, relative_path):
    """
    Check whether the given summary directory is valid.

    Args:
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to summary base directory,
            starting with "./".

    Returns:
        bool, indicates whether the summary directory is valid.
    """
    summary_base_dir = os.path.realpath(summary_base_dir)
    summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))

    if not os.path.exists(summary_directory):
        logger.warning('Path of summary directory does not exist.')
        return False

    if not os.path.isdir(summary_directory):
        logger.warning('Path of summary directory is not a valid directory.')
        return False

    try:
        Path(summary_directory).relative_to(Path(summary_base_dir))
    except ValueError:
        logger.warning('Relative path %s is not a subdirectory of summary_base_dir.', relative_path)
        return False

    return True
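# A minimal, self-contained sketch (not part of the original module; directory names are
# hypothetical) showing why the `Path.relative_to` check above rejects relative paths that
# escape the base directory, e.g. paths containing "..".
import os
from pathlib import Path


def _is_under_base(base_dir, relative_path):
    """Return True if base_dir/relative_path resolves to a location inside base_dir."""
    base = os.path.realpath(base_dir)
    target = os.path.realpath(os.path.join(base, relative_path))
    try:
        Path(target).relative_to(Path(base))
    except ValueError:
        # realpath() resolved the target outside the base directory.
        return False
    return True


# _is_under_base('/tmp/summaries', './job-01')  -> True (stays under the base)
# _is_under_base('/tmp/summaries', '../etc')    -> False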
def _parse_op_nodes(self, node_protos):
    """
    Parse `anf_ir_pb2.NodeProto` objects and create normal nodes.

    Args:
        node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
    """
    logger.debug("Start to parse op nodes from proto.")
    for node_proto in node_protos:
        if not node_proto.name:
            logger.warning("Found a node with an empty name; it will not be saved.")
            continue

        if not node_proto.full_name or any(
                node_proto.full_name.lower().endswith(f'[:{plugin.value.lower()}]')
                for plugin in PluginNameEnum):
            node_name = Node.create_node_name(
                scope=node_proto.scope,
                base_name=f'{node_proto.op_type}{node_proto.name}')
        else:
            node_name = node_proto.full_name

        node = Node(name=node_name, node_id=node_proto.name)
        node.type = node_proto.op_type
        logger.debug("For each graph proto node, node id: %s, node name: %s, node def name: %s, "
                     "input count: %s", node.node_id, node.name, node_proto.name, len(node_proto.input))

        self._parse_attributes(node_proto.attribute, node)
        self._parse_inputs(node_proto.input, node)

        node.output_i = node_proto.output_i
        node.scope = node_proto.scope
        node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
        node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type)

        self._cache_node(node)
def _get_train_job_item(self, train_id):
    """
    Get a train job item.

    Args:
        train_id (str): Specify the train ID.

    Returns:
        dict, a dict of train job item.
    """
    try:
        train_job = self._data_manager.get_train_job(train_id)
    except exceptions.TrainJobNotExistError:
        logger.warning('Train job %s does not exist.', train_id)
        return None

    basic_info = train_job.get_basic_info()
    train_job_item = dict(
        train_id=basic_info.train_id,
        relative_path=basic_info.train_id,
        create_time=basic_info.create_time.strftime('%Y-%m-%d %H:%M:%S'),
        update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'),
        profiler_dir=basic_info.profiler_dir,
        cache_status=train_job.cache_status.value,
        profiler_type=basic_info.profiler_type,
        summary_files=basic_info.summary_files,
        graph_files=basic_info.graph_files,
        lineage_files=basic_info.lineage_files,
        dump_dir=basic_info.dump_dir)

    if train_job.cache_status != CacheStatus.NOT_IN_CACHE:
        plugins = self.get_plugins(train_id, manual_update=False)
    else:
        plugins = dict(plugins={plugin: [] for plugin in PluginNameEnum.list_members()})

    train_job_item.update(plugins)
    return train_job_item
def load(self, computing_resource_mgr):
    """
    Load the data when the loader exists.

    Args:
        computing_resource_mgr (ComputingResourceManager): The ComputingResourceManager instance.
    """
    if self._loader is None:
        ms_dataloader = MSDataLoader(self._summary_dir)
        loaders = [ms_dataloader]
        for loader in loaders:
            if loader.filter_valid_files():
                self._loader = loader
                break

        if self._loader is None:
            logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
            raise exceptions.SummaryLogPathInvalid()

    self._loader.load(computing_resource_mgr)
def _load_single_file(self, file_handler, executor):
    """
    Load data from a single log file.

    Args:
        file_handler (FileHandler): A file handler.
        executor (Executor): The executor instance.

    Returns:
        bool, True if the summary file is finished loading.
    """
    while True:
        start_offset = file_handler.offset
        try:
            event_str = self._event_load(file_handler)
            if event_str is None:
                file_handler.reset_offset(start_offset)
                return True
            if len(event_str) > MAX_EVENT_STRING:
                logger.warning("file_path: %s, event string: %d exceeds %d and drop it.",
                               file_handler.file_path, len(event_str), MAX_EVENT_STRING)
                continue

            future = executor.submit(self._event_parse, event_str, self._latest_filename)

            def _add_tensor_event_callback(future_value):
                try:
                    tensor_values = future_value.result()
                    for tensor_value in tensor_values:
                        if tensor_value.plugin_name == PluginNameEnum.GRAPH.value:
                            try:
                                graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
                            except KeyError:
                                graph_tags = []

                            summary_tags = self.filter_files(graph_tags)
                            for tag in summary_tags:
                                self._events_data.delete_tensor_event(tag)

                        self._events_data.add_tensor_event(tensor_value)
                except Exception as exc:
                    # Log the exception for debugging.
                    logger.exception(exc)
                    raise

            future.add_done_callback(_add_tensor_event_callback)
            return False
        except exceptions.CRCFailedError:
            file_handler.reset_offset(start_offset)
            logger.warning("CRC check failed; ignoring this file, file_path=%s, offset=%s.",
                           file_handler.file_path, file_handler.offset)
            return True
        except (OSError, DecodeError, exceptions.MindInsightException) as ex:
            logger.warning("Parse log file failed; ignoring this file, detail: %r, file path: %s.",
                           str(ex), file_handler.file_path)
            return True
        except Exception as ex:
            logger.exception(ex)
            raise UnknownError(str(ex))
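# A minimal standalone sketch (using concurrent.futures directly, whereas the module above uses
# its own Executor wrapper; names are illustrative) of the submit/add_done_callback pattern:
# parsing is handed to a worker, and the parsed result is collected once the future completes.
from concurrent.futures import ThreadPoolExecutor

parsed_events = []


def parse_event(event_str):
    """Pretend to parse a serialized event string into a value."""
    return len(event_str)


def on_done(future_value):
    # Runs when the worker finishes; collect the result (or re-raise its exception).
    parsed_events.append(future_value.result())


with ThreadPoolExecutor(max_workers=2) as executor:
    future = executor.submit(parse_event, b'\x08\x01')
    future.add_done_callback(on_done)

# After the pool shuts down, parsed_events == [2].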
def _parse_op_nodes(self, node_protos):
    """
    Parse `anf_ir_pb2.NodeProto` objects and create normal nodes.

    Args:
        node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
    """
    logger.debug("Start to parse op nodes from proto.")
    for topological_index, node_proto in enumerate(node_protos):
        if not node_proto.name:
            logger.warning("Found a node with an empty name; it will not be saved.")
            continue

        node_name = node_proto.name
        # The Graphviz plug-in that the UI uses can not handle these special characters.
        check_invalid_character(node_name)

        node = Node(name=node_name, node_id=node_proto.name, topological_index=topological_index)
        node.full_name = node_proto.full_name
        node.type = node_proto.op_type
        if getattr(node_proto, 'source_address', None):
            node.stack = DebuggerSource.build_stack_from_source_address(node_proto.source_address)
        self._parse_attributes(node_proto.attribute, node)
        self._parse_inputs(node_proto.input, node)

        node.output_i = node_proto.output_i
        node.scope = node_proto.scope
        node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
        node.output_nums = len(node.output_shape)
        node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type, node)

        self._cache_node(node)
def parse_files(self, executor, filenames, events_data):
    """
    Load summary files and parse file content.

    Args:
        executor (Executor): The executor instance.
        filenames (list[str]): File name list.
        events_data (EventsData): The container of event data.
    """
    self._events_data = events_data
    summary_files = self.filter_files(filenames)
    summary_files = self.sort_files(summary_files)

    for filename in summary_files:
        if self._latest_filename and \
                self._compare_summary_file(self._latest_filename, filename):
            continue

        file_path = FileHandler.join(self._summary_dir, filename)

        if filename != self._latest_filename:
            self._summary_file_handler = FileHandler(file_path, 'rb')
            self._latest_filename = filename
            self._latest_file_size = 0

        new_size = FileHandler.file_stat(file_path).size
        if new_size == self._latest_file_size:
            continue

        self._latest_file_size = new_size
        try:
            self._load_single_file(self._summary_file_handler, executor)
            # Wait for data in this file to be processed to avoid loading multiple files at the same time.
            executor.wait_all_tasks_finish()
        except UnknownError as ex:
            logger.warning("Parse summary file failed, detail: %r, file path: %s.", str(ex), file_path)
def _execute_loader(self, loader_id, executor):
    """
    Load data from a data_loader.

    If something goes wrong while loading, log it and delete the loader.

    Args:
        loader_id (str): An ID for `Loader`.
        executor (Executor): The Executor instance.

    Returns:
        bool, True if the loader is finished loading.
    """
    try:
        with self._loader_pool_mutex:
            loader = self._loader_pool.get(loader_id, None)
            if loader is None:
                logger.debug("Loader %r has been deleted, will not load data.", loader_id)
                return True

        loader.cache_status = CacheStatus.CACHING
        if loader.data_loader.load(executor):
            # Update loader cache status to CACHED.
            # Loader with cache status CACHED should remain the same cache status.
            loader.cache_status = CacheStatus.CACHED
            return True
        return False
    except MindInsightException as ex:
        logger.warning("Data loader %r load data failed. Delete data_loader. Detail: %s", loader_id, ex)
        with self._loader_pool_mutex:
            self._delete_loader(loader_id)
        return True
def _parse_summary_value(value, plugin):
    """
    Parse a summary value and create the corresponding container according to the plugin.

    Args:
        value (Summary.Value): Value message in summary file.
        plugin (str): Plugin value.

    Returns:
        Union[Summary.Value, HistogramContainer, TensorContainer, ImageContainer], original summary value
            or an instance of HistogramContainer, TensorContainer or ImageContainer.
    """
    tensor_event_value = getattr(value, plugin)

    if plugin == PluginNameEnum.HISTOGRAM.value:
        tensor_event_value = HistogramContainer(tensor_event_value)
        # Drop steps if original_buckets_count exceeds HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT
        # to avoid a time-consuming re-sample process.
        if tensor_event_value.histogram.original_buckets_count > Histogram.MAX_ORIGINAL_BUCKETS_COUNT:
            logger.info('original_buckets_count exceeds HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT')
            return None

    elif plugin == PluginNameEnum.TENSOR.value:
        tensor_event_value = TensorContainer(tensor_event_value)
        tensor_count = 1
        for d in tensor_event_value.dims:
            tensor_count *= d
        if tensor_count > MAX_TENSOR_COUNT:
            logger.warning('tag: %s/tensor, dims: %s, tensor count: %d exceeds %d and drop it.',
                           value.tag, tensor_event_value.dims, tensor_count, MAX_TENSOR_COUNT)
            return None

    elif plugin == PluginNameEnum.IMAGE.value:
        tensor_event_value = ImageContainer(tensor_event_value)

    return tensor_event_value
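# A minimal standalone sketch (constant and names are hypothetical, not from the original module)
# of the size guard used above for tensors: multiply all dimensions to get the element count and
# drop the value when it exceeds a cap, so oversized tensors never enter the cache.
import math

MAX_TENSOR_COUNT_EXAMPLE = 16 * 1024 * 1024  # hypothetical cap on element count


def should_keep_tensor(dims):
    """Return True if a tensor with the given dims is small enough to cache."""
    tensor_count = math.prod(dims) if dims else 1
    return tensor_count <= MAX_TENSOR_COUNT_EXAMPLE


# should_keep_tensor([32, 3, 224, 224])   -> True  (about 4.8M elements)
# should_keep_tensor([1024, 1024, 1024])  -> False (about 1.07B elements)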
def _scan_subdir_entries(self, summary_dict, summary_base_dir, entry_path, entry_name, counter, list_explain):
    """
    Scan subdir entries.

    Args:
        summary_dict (dict): Temporary data structure to hold summary directory info.
        summary_base_dir (str): Path of summary base directory.
        entry_path (str): Path entry.
        entry_name (str): Name of entry.
        counter (Counter): An instance of CountLimiter.
        list_explain (bool): Indicates whether to list only the mindexplain folder.
    """
    try:
        subdir_entries = os.scandir(entry_path)
    except PermissionError:
        logger.warning('Path of %s under summary base directory is not accessible.', entry_name)
        return

    for subdir_entry in subdir_entries:
        if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
            break
        try:
            counter.add()
        except MaxCountExceededError:
            logger.info('Stop scanning because overall is False and the number of scanned files '
                        'exceeds the upper limit.')
            break
        subdir_relative_path = os.path.join('.', entry_name)
        if subdir_entry.is_symlink():
            pass
        self._update_summary_dict(summary_dict, summary_base_dir, subdir_relative_path,
                                  subdir_entry, list_explain)
def _parse_consts(self, consts):
    """
    Parse `anf_ir_pb2.NameValueProto` objects and create const nodes.

    Args:
        consts (list[anf_ir_pb2.NameValueProto]): Refer to `anf_ir_pb2.NameValueProto` object.
    """
    logger.debug("Start to parse consts from proto.")
    for const in consts:
        if not const.key:
            logger.warning("Found a const with an empty key; it will not be saved.")
            continue
        node = Node(name=const.key, node_id=const.key)
        node.type = NodeTypeEnum.CONST.value
        node.add_attr({const.key: str(const.value)})

        if const.value.dtype == DataType.DT_TENSOR:
            shape = []
            for dim in const.value.tensor_val.dims:
                shape.append(dim)
            node.output_shape = shape

        self._cache_node(node)
def get_single_train_job(self, train_id, manual_update=False):
    """
    Get a train job by train ID.

    Args:
        train_id (str): Train ID for the train job.
        manual_update (bool): True if this is a manual update.

    Returns:
        dict, a single train job; returns None if no data can be found.
    """
    self._check_status_valid()
    self._check_train_job_exist(train_id, self._loader_pool)

    loader = self._get_loader(train_id, manual_update)
    if loader is None:
        logger.warning("No valid summary log in train job %s, or it is not in the cache.", train_id)
        return None

    train_job = loader.to_dict()
    train_job.pop('data_loader')

    plugin_data = {}
    for plugin_name in PluginNameEnum.list_members():
        job = self.get_train_job_by_plugin(train_id, plugin_name=plugin_name)
        if job is None:
            plugin_data[plugin_name] = []
        else:
            plugin_data[plugin_name] = job['tags']

    train_job.update({'tag_mapping': plugin_data})

    return train_job
def _load(self, executor):
    """
    Load all valid log files.

    When a file is reloaded, loading continues from where it left off.

    Args:
        executor (Executor): The Executor instance.

    Returns:
        bool, True if the train job is finished loading.
    """
    filenames = self.filter_valid_files()
    if not filenames:
        logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
        raise exceptions.SummaryLogPathInvalid()
    old_filenames = list(self._valid_filenames)
    self._valid_filenames = filenames
    self._check_files_deleted(filenames, old_filenames)

    finished = True
    for parser in self._parser_list:
        finished = parser.parse_files(executor, filenames, events_data=self._events_data) and finished
    return finished
def _parse_consts(self, consts):
    """
    Parse `anf_ir_pb2.NameValueProto` objects and create const nodes.

    Args:
        consts (list[anf_ir_pb2.NameValueProto]): Refer to `anf_ir_pb2.NameValueProto` object.
    """
    logger.debug("Start to parse consts from proto.")
    for const in consts:
        if not const.key:
            logger.warning("Found a const with an empty key; it will not be saved.")
            continue
        check_invalid_character(const.key)
        node = Node(name=const.key, node_id=const.key)
        node.type = NodeTypeEnum.CONST.value
        if const.value.ByteSize() > self.MAX_NODE_ATTRIBUTE_VALUE_BYTES:
            node.add_attr({const.key: 'dtype: ' + DataType.Name(const.value.dtype)})
        else:
            node.add_attr({const.key: str(const.value)})

        if const.value.dtype == DataType.DT_TENSOR:
            shape = list(const.value.tensor_val.dims)
            node.output_shape.append(shape)
            if const.value.tensor_val.HasField('data_type'):
                node.elem_types.append(DataType.Name(const.value.tensor_val.data_type))
        else:
            node.elem_types.append(DataType.Name(const.value.dtype))
            # dim is zero
            node.output_shape.append([])

        node.output_nums = len(node.output_shape)

        self._cache_node(node)
def load(self, computing_resource_mgr):
    """
    Load all valid log files.

    When a file is reloaded, loading continues from where it left off.

    Args:
        computing_resource_mgr (ComputingResourceManager): The ComputingResourceManager instance.
    """
    logger.debug("Start to load data in ms data loader.")
    filenames = self.filter_valid_files()
    if not filenames:
        logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
        raise exceptions.SummaryLogPathInvalid()
    old_filenames = list(self._valid_filenames)
    self._valid_filenames = filenames
    self._check_files_deleted(filenames, old_filenames)

    with computing_resource_mgr.get_executor() as executor:
        for parser in self._parser_list:
            parser.parse_files(executor, filenames, events_data=self._events_data)
def delete_tensor_event(self, tag):
    """
    Delete the tensor event with the given tag from the in-memory record.

    Args:
        tag (str): The tag name.
    """
    if len(self._deleted_tags) < _MAX_DELETED_TAGS_SIZE:
        self._deleted_tags.add(tag)
    else:
        logger.warning('Too many deleted tags; the upper limit of %d has been reached, '
                       'and tag updating may not function hereafter.', _MAX_DELETED_TAGS_SIZE)
    logger.info('%r and all related samples are going to be deleted.', tag)
    self._tags.remove(tag)
    for plugin_name, lock in self._tags_by_plugin_mutex_lock.items():
        with lock:
            if tag in self._tags_by_plugin[plugin_name]:
                self._tags_by_plugin[plugin_name].remove(tag)
                break

    with self._reservoir_mutex_lock:
        if tag in self._reservoir_by_tag:
            self._reservoir_by_tag.pop(tag)
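# A small self-contained sketch (names and cap are hypothetical) of the bounded bookkeeping used
# above: deleted tags are remembered in a set only up to a fixed size, so the record cannot grow
# without bound even if many tags are deleted over time.
_EXAMPLE_MAX_DELETED_TAGS_SIZE = 3  # hypothetical cap

deleted_tags = set()


def remember_deleted_tag(tag):
    """Record a deleted tag; return False when the cap is reached and the tag is dropped."""
    if len(deleted_tags) < _EXAMPLE_MAX_DELETED_TAGS_SIZE:
        deleted_tags.add(tag)
        return True
    return False


# remember_deleted_tag('loss/scalar')  -> True
# After three distinct tags, further calls return False instead of growing the set.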
def list_summaries(self, summary_base_dir, relative_path='./'):
    """
    Get info of summary files within the given summary directory.

    Args:
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to summary base directory,
            starting with "./".

    Returns:
        list, list of summary files including the following attributes.

            - file_name (str): Summary file name.
            - create_time (datetime): Creation time of summary file.
            - update_time (datetime): Modification time of summary file.

    Examples:
        >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
        >>> summary_watcher = SummaryWatcher()
        >>> summaries = summary_watcher.list_summaries('/summary/base/dir', './job-01')
    """
    if contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
        return []

    if not self._is_valid_summary_directory(summary_base_dir, relative_path):
        return []

    summaries = []
    summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
    try:
        entries = os.scandir(summary_directory)
    except PermissionError:
        logger.error('Path of summary directory is not accessible.')
        raise FileSystemPermissionError('Path of summary directory is not accessible.')

    for entry in entries:
        if entry.is_symlink() or not entry.is_file():
            continue

        pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        if pattern is None:
            continue

        timestamp = int(pattern.groupdict().get('timestamp'))
        try:
            # extract created time from filename
            ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
        except OverflowError:
            continue

        try:
            stat = entry.stat()
        except FileNotFoundError:
            logger.warning('File %s not found.', entry.name)
            continue

        mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()

        summaries.append({
            'file_name': entry.name,
            'create_time': ctime,
            'update_time': mtime,
        })

    # sort by update time in descending order and filename in ascending order
    summaries.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['file_name']))

    return summaries
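# A small self-contained sketch (the regex and file name below are illustrative, not the module's
# actual SUMMARY_FILENAME_REGEX) of the filename handling above: a creation timestamp embedded in
# the summary file name is extracted via a named group and converted to a timezone-aware datetime.
import datetime
import re

EXAMPLE_SUMMARY_FILENAME_REGEX = r'summary\.(?P<timestamp>\d+)'


def created_time_from_filename(file_name):
    """Return the creation time parsed from the file name, or None if it does not match."""
    pattern = re.search(EXAMPLE_SUMMARY_FILENAME_REGEX, file_name)
    if pattern is None:
        return None
    timestamp = int(pattern.groupdict().get('timestamp'))
    return datetime.datetime.fromtimestamp(timestamp).astimezone()


# created_time_from_filename('events.summary.1590000000.hostname')
# -> 2020-05-20 ... (local timezone); returns None for names that do not match.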
def _update_summary_dict(self, summary_dict, summary_base_dir, relative_path, entry):
    """
    Update summary_dict with ctime and mtime.

    Args:
        summary_dict (dict): Temporary data structure to hold summary directory info.
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to summary base directory,
            starting with "./".
        entry (DirEntry): Directory entry instance needed to check with regular expression.
    """
    try:
        stat = entry.stat()
    except FileNotFoundError:
        logger.warning('File %s not found', entry.name)
        return

    ctime = datetime.datetime.fromtimestamp(stat.st_ctime).astimezone()
    mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()

    if entry.is_file():
        summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
        if summary_pattern is None and pb_pattern is None:
            return
        if summary_pattern is not None:
            timestamp = int(summary_pattern.groupdict().get('timestamp'))
            try:
                # extract created time from filename
                ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
            except OverflowError:
                return

        if relative_path not in summary_dict:
            summary_dict[relative_path] = {
                'ctime': ctime,
                'mtime': mtime,
                'profiler': None,
            }
        elif summary_dict[relative_path]['ctime'] < ctime:
            summary_dict[relative_path].update({
                'ctime': ctime,
                'mtime': mtime,
            })
    elif entry.is_dir():
        profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name)
        full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name)
        if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path):
            return

        profiler = {
            'directory': os.path.join('.', entry.name),
            'ctime': ctime,
            'mtime': mtime,
        }

        summary_dict[relative_path] = {
            'ctime': ctime,
            'mtime': mtime,
            'profiler': profiler,
        }
def generate_loaders(self, loader_pool):
    """
    Generate loaders from the summary path. If the summary path is empty, an empty dict is returned.

    Args:
        loader_pool (dict[str, LoaderStruct]): Current loader pool in data_manager.

    Returns:
        dict[str, LoaderStruct], a dict of `Loader`.
    """
    loader_dict = {}

    if not FileHandler.exists(self._summary_path):
        logger.warning("Summary path does not exist. It will not start loading events data. "
                       "Current path is %r.", self._summary_path)
        return loader_dict

    dir_map_mtime_dict = {}
    min_modify_time = None
    summaries_info = self._summary_watcher.list_summary_directories(self._summary_path)

    for item in summaries_info:
        relative_path = item.get("relative_path")
        current_dir = FileHandler.join(self._summary_path, relative_path)
        dataloader = DataLoader(current_dir)

        if not dataloader.has_valid_files():
            logger.debug("Cannot find a valid train log file in folder %s; it will be ignored.", relative_path)
            continue

        modify_time = item.get("update_time").timestamp()

        # If the loader exists in the loader pool and has a newer time, use its time.
        loader_id = self._generate_loader_id(relative_path)
        loader = loader_pool.get(loader_id)
        if loader is not None and loader.latest_update_time > modify_time:
            modify_time = loader.latest_update_time

        if not min_modify_time:
            # The first load; init min modify time.
            min_modify_time = modify_time

        # We need to find the `MAX_DATA_LOADER_SIZE` most recently modified folders.
        if len(dir_map_mtime_dict) < MAX_DATA_LOADER_SIZE:
            if modify_time < min_modify_time:
                min_modify_time = modify_time
            dir_map_mtime_dict.update({relative_path: modify_time})
        else:
            if modify_time >= min_modify_time:
                dir_map_mtime_dict.update({relative_path: modify_time})

    sorted_dir_tuple = sorted(dir_map_mtime_dict.items(), key=lambda d: d[1])[-MAX_DATA_LOADER_SIZE:]

    for relative_path, modify_time in sorted_dir_tuple:
        loader_id = self._generate_loader_id(relative_path)
        loader = self._generate_loader_by_relative_path(relative_path)
        loader_dict.update({loader_id: loader})

    return loader_dict
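# A compact standalone sketch (hypothetical names and data) of the selection step above: given a
# mapping of relative paths to modification times, keep only the most recently modified entries
# by sorting on mtime and slicing from the end.
EXAMPLE_MAX_DATA_LOADER_SIZE = 2  # hypothetical limit


def newest_dirs(dir_map_mtime, limit=EXAMPLE_MAX_DATA_LOADER_SIZE):
    """Return (relative_path, mtime) tuples for the `limit` most recently modified directories."""
    return sorted(dir_map_mtime.items(), key=lambda d: d[1])[-limit:]


# newest_dirs({'./job-01': 100.0, './job-02': 300.0, './job-03': 200.0})
# -> [('./job-03', 200.0), ('./job-02', 300.0)]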
def _update_summary_dict(self, summary_dict, summary_base_dir, relative_path, entry, list_explain):
    """
    Update summary_dict with ctime and mtime.

    Args:
        summary_dict (dict): Temporary data structure to hold summary directory info.
        summary_base_dir (str): Path of summary base directory.
        relative_path (str): Relative path of summary directory, referring to summary base directory,
            starting with "./".
        entry (DirEntry): Directory entry instance needed to check with regular expression.
        list_explain (bool): Indicates whether to list only the mindexplain folder.
    """
    try:
        stat = entry.stat()
    except FileNotFoundError:
        logger.warning('File %s not found', entry.name)
        return

    ctime = datetime.datetime.fromtimestamp(stat.st_ctime).astimezone()
    mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()

    if entry.is_file():
        summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
        if not self._is_valid_pattern_result(summary_pattern, pb_pattern, list_explain, entry):
            return

        if summary_pattern is not None:
            timestamp = int(summary_pattern.groupdict().get('timestamp'))
            try:
                # extract created time from filename
                ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
            except OverflowError:
                return

        if relative_path not in summary_dict:
            summary_dict[relative_path] = _new_entry(ctime, mtime)

        if summary_dict[relative_path]['create_time'] < ctime:
            summary_dict[relative_path].update({
                'create_time': ctime,
                'update_time': mtime,
            })

        if not summary_pattern:
            summary_dict[relative_path]['graph_files'] += 1
        elif entry.name.endswith(LINEAGE_SUMMARY_SUFFIX):
            summary_dict[relative_path]['lineage_files'] += 1
        elif entry.name.endswith(EXPLAIN_SUMMARY_SUFFIX):
            summary_dict[relative_path]['explain_files'] += 1
        else:
            summary_dict[relative_path]['summary_files'] += 1
    elif entry.is_dir():
        if list_explain:
            return

        profiler_type, is_find = self._find_profiler_dir(entry, summary_base_dir, relative_path)
        if not is_find:
            return

        profiler = {
            'directory': os.path.join('.', entry.name),
            'create_time': ctime,
            'update_time': mtime,
            "profiler_type": profiler_type,
        }

        if relative_path in summary_dict:
            summary_dict[relative_path]['profiler'] = profiler
        else:
            summary_dict[relative_path] = _new_entry(ctime, mtime, profiler)
def _event_parse(self, event):
    """
    Transform `Event` data to tensor_event and update it to EventsData.

    Args:
        event (Event): Message event in summary proto, data read from file handler.
    """
    plugins = {
        'scalar_value': PluginNameEnum.SCALAR,
        'image': PluginNameEnum.IMAGE,
        'histogram': PluginNameEnum.HISTOGRAM,
    }

    if event.HasField('summary'):
        for value in event.summary.value:
            for plugin in plugins:
                if not value.HasField(plugin):
                    continue
                plugin_name_enum = plugins[plugin]
                tensor_event_value = getattr(value, plugin)

                if plugin == 'histogram':
                    tensor_event_value = HistogramContainer(tensor_event_value)
                    # Drop steps if original_buckets_count exceeds HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT
                    # to avoid time-consuming re-sample process.
                    if tensor_event_value.original_buckets_count > HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT:
                        logger.warning('original_buckets_count exceeds '
                                       'HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT')
                        continue

                tensor_event = TensorEvent(wall_time=event.wall_time,
                                           step=event.step,
                                           tag='{}/{}'.format(value.tag, plugin_name_enum.value),
                                           plugin_name=plugin_name_enum.value,
                                           value=tensor_event_value,
                                           filename=self._latest_filename)
                self._events_data.add_tensor_event(tensor_event)

    elif event.HasField('graph_def'):
        graph = MSGraph()
        graph.build_graph(event.graph_def)
        tensor_event = TensorEvent(wall_time=event.wall_time,
                                   step=event.step,
                                   tag=self._latest_filename,
                                   plugin_name=PluginNameEnum.GRAPH.value,
                                   value=graph,
                                   filename=self._latest_filename)

        try:
            graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
        except KeyError:
            graph_tags = []
        summary_tags = self.filter_files(graph_tags)
        for tag in summary_tags:
            self._events_data.delete_tensor_event(tag)

        self._events_data.add_tensor_event(tensor_event)
def _event_parse(self, event):
    """
    Transform `Event` data to tensor_event and update it to EventsData.

    Args:
        event (Event): Message event in summary proto, data read from file handler.
    """
    if event.HasField('summary'):
        for value in event.summary.value:
            if value.HasField('scalar_value'):
                tag = '{}/{}'.format(value.tag, PluginNameEnum.SCALAR.value)
                tensor_event = TensorEvent(wall_time=event.wall_time,
                                           step=event.step,
                                           tag=tag,
                                           plugin_name=PluginNameEnum.SCALAR.value,
                                           value=value.scalar_value,
                                           filename=self._latest_filename)
                self._events_data.add_tensor_event(tensor_event)

            if value.HasField('image'):
                tag = '{}/{}'.format(value.tag, PluginNameEnum.IMAGE.value)
                tensor_event = TensorEvent(wall_time=event.wall_time,
                                           step=event.step,
                                           tag=tag,
                                           plugin_name=PluginNameEnum.IMAGE.value,
                                           value=value.image,
                                           filename=self._latest_filename)
                self._events_data.add_tensor_event(tensor_event)

            if value.HasField('histogram'):
                histogram_msg = HistogramContainer(value.histogram)
                # Drop steps if original_buckets_count exceeds HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT
                # to avoid time-consuming re-sample process.
                if histogram_msg.original_buckets_count > HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT:
                    logger.warning('original_buckets_count exceeds '
                                   'HistogramContainer.MAX_ORIGINAL_BUCKETS_COUNT')
                else:
                    tag = '{}/{}'.format(value.tag, PluginNameEnum.HISTOGRAM.value)
                    tensor_event = TensorEvent(wall_time=event.wall_time,
                                               step=event.step,
                                               tag=tag,
                                               plugin_name=PluginNameEnum.HISTOGRAM.value,
                                               value=histogram_msg,
                                               filename=self._latest_filename)
                    self._events_data.add_tensor_event(tensor_event)

    if event.HasField('graph_def'):
        graph_proto = event.graph_def
        graph = MSGraph()
        graph.build_graph(graph_proto)
        tensor_event = TensorEvent(wall_time=event.wall_time,
                                   step=event.step,
                                   tag=self._latest_filename,
                                   plugin_name=PluginNameEnum.GRAPH.value,
                                   value=graph,
                                   filename=self._latest_filename)

        try:
            graph_tags = self._events_data.list_tags_by_plugin(PluginNameEnum.GRAPH.value)
        except KeyError:
            graph_tags = []
        summary_tags = self.filter_files(graph_tags)
        for tag in summary_tags:
            self._events_data.delete_tensor_event(tag)

        self._events_data.add_tensor_event(tensor_event)
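# A plain-Python stand-in (not using protobuf; field and plugin names are illustrative) for the
# dispatch pattern above: each summary value carries at most one of several optional fields, and
# the first field that is present selects the plugin under whose tag the value is recorded.
EXAMPLE_PLUGINS = {
    'scalar_value': 'scalar',
    'image': 'image',
    'histogram': 'histogram',
}


def dispatch_value(tag, value_fields):
    """Return (full_tag, plugin_name, payload) for the first plugin field present, or None."""
    for field, plugin_name in EXAMPLE_PLUGINS.items():
        payload = value_fields.get(field)
        if payload is None:
            continue
        return '{}/{}'.format(tag, plugin_name), plugin_name, payload
    return None


# dispatch_value('loss', {'scalar_value': 0.25})
# -> ('loss/scalar', 'scalar', 0.25)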