def get_latest_lineage_summaries(summary_base_dir):
    """
    Get all latest lineage summary logs in summary base dir.

    Args:
        summary_base_dir (str): Summary base dir.

    Returns:
        list[str], all latest lineage summary logs in summary base dir.
            Each lineage summary log is an absolute path.
    """
    summary_watcher = SummaryWatcher()
    relative_dirs = summary_watcher.list_summary_directories(
        summary_base_dir=summary_base_dir)

    latest_summaries = []
    for item in relative_dirs:
        relative_dir = item.get('relative_path')
        summaries = summary_watcher.list_summaries(
            summary_base_dir=summary_base_dir, relative_path=relative_dir)
        latest_file_name = SummaryPathParser._get_latest_lineage_file(summaries)
        if latest_file_name is None:
            continue
        latest_file = os.path.realpath(
            os.path.join(summary_base_dir, relative_dir, latest_file_name))
        latest_summaries.append(latest_file)
    return latest_summaries
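# Usage sketch for get_latest_lineage_summaries (hedged: '/data/summaries' is a
# made-up path, not part of the source).
latest_logs = get_latest_lineage_summaries('/data/summaries')
for log_path in latest_logs:
    # Each entry is the absolute path of the newest lineage log in one run dir.
    print(log_path)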
def query_train_jobs():
    """Query train jobs."""
    # `type=int` ensures query params arrive as integers rather than strings.
    offset = request.args.get("offset", default=0, type=int)
    limit = request.args.get("limit", default=10, type=int)

    summary_watcher = SummaryWatcher()
    total, directories = summary_watcher.list_summary_directories_by_pagination(
        settings.SUMMARY_BASE_DIR, offset, limit)

    train_jobs = [{
        'train_id': directory['relative_path'],
        'relative_path': directory['relative_path'],
        'create_time': directory['create_time'].strftime('%Y-%m-%d %H:%M:%S'),
        'update_time': directory['update_time'].strftime('%Y-%m-%d %H:%M:%S'),
    } for directory in directories]

    return jsonify({
        'name': os.path.basename(os.path.realpath(settings.SUMMARY_BASE_DIR)),
        'total': total,
        'train_jobs': train_jobs,
    })
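# A hedged usage sketch for query_train_jobs: assumes the view above is
# registered on a Flask app at the hypothetical URL '/train-jobs' (the real
# blueprint prefix in the project may differ).
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/train-jobs', view_func=query_train_jobs)

with app.test_client() as client:
    body = client.get('/train-jobs?offset=0&limit=10').get_json()
    # Expected shape: {'name': '<base dir name>', 'total': <int>,
    #                  'train_jobs': [{'train_id': ..., 'relative_path': ...,
    #                                  'create_time': ..., 'update_time': ...}, ...]}
    print(body['total'], len(body['train_jobs']))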
def _organize_from_disk(self):
    """Organize lineage objs from disk."""
    if self._summary_base_dir is None:
        return
    summary_watcher = SummaryWatcher()
    relative_dirs = summary_watcher.list_summary_directories(
        summary_base_dir=self._summary_base_dir)

    no_lineage_count = 0
    for item in relative_dirs:
        relative_dir = item.get('relative_path')
        update_time = item.get('update_time')
        abs_summary_dir = os.path.realpath(
            os.path.join(self._summary_base_dir, relative_dir))

        try:
            lineage_parser = LineageParser(abs_summary_dir, update_time)
            super_lineage_obj = lineage_parser.super_lineage_obj
            if super_lineage_obj is not None:
                self._super_lineage_objs.update({abs_summary_dir: super_lineage_obj})
        except LineageFileNotFoundError:
            no_lineage_count += 1

    if no_lineage_count == len(relative_dirs):
        logger.info('There is no summary log file under summary_base_dir.')
        raise LineageFileNotFoundError(
            'There is no summary log file under summary_base_dir.')
def get_lineage_summaries(summary_dir, is_sorted=False, reverse=True):
    """
    Get lineage summaries according to summary dir.

    Args:
        summary_dir (str): Summary dir.
        is_sorted (bool): If True, the file names will be sorted.
        reverse (bool): If True, sort by timestamp in ascending order and
            by file name in descending order.

    Returns:
        list, the lineage summary file names if any exist, else [].
    """
    try:
        summary_watcher = SummaryWatcher()
        summaries = summary_watcher.list_summaries(summary_base_dir=summary_dir)
    except MindInsightException as err:
        logger.warning(str(err))
        return []
    summary_files = [summary.get('file_name') for summary in summaries]
    lineage_files_name = list(
        filter(
            lambda filename: filename.endswith(SummaryPathParser.LINEAGE_SUMMARY_SUFFIX),
            summary_files))
    if is_sorted:
        lineage_files_name = SummaryPathParser._sorted_summary_files(
            lineage_files_name, reverse)
    return lineage_files_name
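# Usage sketch for get_lineage_summaries (hedged: '/data/summaries/run1' is a
# made-up path; which names qualify depends on
# SummaryPathParser.LINEAGE_SUMMARY_SUFFIX).
unsorted_names = get_lineage_summaries('/data/summaries/run1')
sorted_names = get_lineage_summaries('/data/summaries/run1', is_sorted=True, reverse=True)
# `sorted_names` contains lineage file names ordered by the timestamp embedded
# in each name; depending on `reverse`, the newest file sits at one end of the list.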
def __init__(self, summary_base_dir: str):
    self._summary_base_dir = summary_base_dir
    self._loader_pool = OrderedDict()
    self._loading_status = _ExplainManagerStatus.INIT.value
    self._status_mutex = threading.Lock()
    self._loader_pool_mutex = threading.Lock()
    self._max_loaders_num = _MAX_LOADERS_NUM
    self._summary_watcher = SummaryWatcher()
def __init__(self, summary_path):
    """
    Init DataLoaderGenerator.

    Args:
        summary_path (str): A directory path, e.g. '/data/ImageNet/'.
    """
    self._summary_path = self._check_and_normalize_summary_path(summary_path)
    self._summary_watcher = SummaryWatcher()
def test_list_explain_directories_only_base_dir(self, job_count):
    """Test listing explain directories with only the summary base dir, without offset and limit."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    if job_count:
        for i in range(job_count):
            gen_explain_directories_and_files(summary_base_dir, f'run{i}')
    summary_watcher = SummaryWatcher()
    total, _ = summary_watcher.list_explain_directories(summary_base_dir)
    assert total == job_count
    shutil.rmtree(summary_base_dir)
def test_list_summary_directories_with_overall_on(self):
    """Test list_summary_directories method success."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    file_count = 10
    directory_count = 10
    gen_directories_and_files(summary_base_dir, file_count, directory_count)

    summary_watcher = SummaryWatcher()
    directories = summary_watcher.list_summary_directories(summary_base_dir, overall=True)

    expected_directory_count = directory_count + 1
    assert len(directories) == min(expected_directory_count, SummaryWatcher.MAX_SUMMARY_DIR_COUNT)
    shutil.rmtree(summary_base_dir)
def test_list_explain_dir_with_offset_limit(self, offset, limit):
    """Test list explain dir with offset and limit."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    gen_directories_and_files(summary_base_dir, file_count=1, directory_count=3)
    for i in range(10):
        gen_explain_directories_and_files(summary_base_dir, f'run_{i}')
    summary_watcher = SummaryWatcher()
    _, result = summary_watcher.list_explain_directories(summary_base_dir, offset, limit)
    if offset == 3:
        assert len(result) == 1
    else:
        assert len(result) == limit
def test_list_summaries(self):
    """Test list_summaries method success."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    file_count = 10
    directory_count = 1
    gen_directories_and_files(summary_base_dir, file_count, directory_count)

    summary_watcher = SummaryWatcher()
    summaries = summary_watcher.list_summaries(summary_base_dir)
    assert len(summaries) == file_count

    summaries = summary_watcher.list_summaries(summary_base_dir, './\x00')
    assert not summaries
    shutil.rmtree(summary_base_dir)
def test_is_summary_directory(self):
    """Test is_summary_directory method success."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    file_count = 1
    directory_count = 1
    gen_directories_and_files(summary_base_dir, file_count, directory_count)

    summary_watcher = SummaryWatcher()
    flag = summary_watcher.is_summary_directory(summary_base_dir, './')
    assert flag

    flag = summary_watcher.is_summary_directory(summary_base_dir, './\x00')
    assert not flag
    shutil.rmtree(summary_base_dir)
def get_latest_lineage_summary(summary_dir):
    """
    Get latest lineage summary log path according to summary dir.

    Args:
        summary_dir (str): Summary dir.

    Returns:
        Union[str, None], the path if the lineage summary log exists, else None.
            The lineage summary log path is an absolute path.
    """
    summary_watcher = SummaryWatcher()
    summaries = summary_watcher.list_summaries(summary_base_dir=summary_dir)
    latest_file_name = SummaryPathParser._get_latest_lineage_file(summaries)
    return os.path.join(summary_dir, latest_file_name) \
        if latest_file_name is not None else None
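# Usage sketch for get_latest_lineage_summary (hedged: the path is a made-up
# example). Unlike get_latest_lineage_summaries above, this inspects a single
# run directory and returns one path or None.
latest = get_latest_lineage_summary('/data/summaries/run1')
if latest is None:
    print('No lineage summary log found.')
else:
    print('Latest lineage log:', latest)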
def test_list_summary_directories_by_pagination(self):
    """Test list_summary_directories_by_pagination method success."""
    summary_base_dir = tempfile.mkdtemp(dir=self.base_dir)
    file_count = 10
    directory_count = 10
    gen_directories_and_files(summary_base_dir, file_count, directory_count)

    summary_watcher = SummaryWatcher()
    total, directories = summary_watcher.list_summary_directories_by_pagination(
        summary_base_dir, offset=0, limit=10)

    if (file_count + 1) * directory_count + file_count >= SummaryWatcher.MAX_SCAN_COUNT:
        expected_directory_count = math.ceil(
            (SummaryWatcher.MAX_SCAN_COUNT - file_count) / (file_count + 1) + 1)
        assert total == len(directories) == expected_directory_count
    else:
        expected_directory_count = directory_count + 1
        assert total == min(expected_directory_count, SummaryWatcher.MAX_SUMMARY_DIR_COUNT)
    shutil.rmtree(summary_base_dir)
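# A worked check of the branch above (hedged: the real value of
# SummaryWatcher.MAX_SCAN_COUNT is not shown in this section, so 20000 is only
# an illustrative assumption). With file_count=10 and directory_count=10 the
# scan touches (10 + 1) * 10 + 10 = 120 entries, which stays below the assumed
# limit, so the test takes the else branch and expects directory_count + 1 = 11
# directories (the base dir itself plus its 10 sub dirs).
file_count, directory_count = 10, 10
assumed_max_scan_count = 20000  # illustrative only
scanned = (file_count + 1) * directory_count + file_count
assert scanned == 120 and scanned < assumed_max_scan_count
assert directory_count + 1 == 11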
def get_summary_dirs(summary_base_dir):
    """
    Get summary dirs according to summary base dir.

    Args:
        summary_base_dir (str): Summary base dir.

    Returns:
        list[str], all summary dirs in summary base dir. Each summary dir
            is an absolute path.
    """
    summary_watcher = SummaryWatcher()
    relative_dirs = summary_watcher.list_summary_directories(
        summary_base_dir=summary_base_dir)
    summary_dirs = list(
        map(
            lambda item: os.path.realpath(
                os.path.join(summary_base_dir, item.get('relative_path'))),
            relative_dirs))
    return summary_dirs
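# Usage sketch for get_summary_dirs (hedged: the base dir is a made-up path).
for summary_dir in get_summary_dirs('/data/summaries'):
    # Each item is an absolute path such as '/data/summaries/run1'.
    print(summary_dir)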
def update_cache(self, executor):
    """Update cache."""
    logger.info('Start to update BriefCacheManager.')
    summaries_info = SummaryWatcher().list_summary_directories(self._summary_base_dir)

    basic_train_jobs = []
    for info in summaries_info:
        basic_train_jobs.append(
            _BasicTrainJob(abs_summary_base_dir=self._summary_base_dir, entry=info))

    with self._lock:
        new_cache_items = self._merge_with_disk(basic_train_jobs)
        self._cache_items = new_cache_items
    for updater in self._updaters.values():
        for cache_item in self._cache_items.values():
            updater.update_item(cache_item)
def _load_data(self):
    """This function will load data once and ignore it if the status is loading."""
    logger.info("Start to load data, reload interval: %r.", self._reload_interval)
    with self._status_mutex:
        if self.status == DataManagerStatus.LOADING.value:
            logger.debug("Current status is %s, will ignore to load data.", self.status)
            return
        self.status = DataManagerStatus.LOADING.value

    summaries_info = SummaryWatcher().list_summary_directories(self._summary_base_dir)

    basic_train_jobs = []
    for info in summaries_info:
        profiler = info['profiler']
        basic_train_jobs.append(
            _BasicTrainJob(
                train_id=info['relative_path'],
                abs_summary_base_dir=self._summary_base_dir,
                abs_summary_dir=os.path.realpath(
                    os.path.join(self._summary_base_dir, info['relative_path'])),
                create_time=info['create_time'],
                update_time=info['update_time'],
                profiler_dir=None if profiler is None else profiler['directory'],
            ))

    self._brief_cache.update_cache(basic_train_jobs)
    self._detail_cache.update_cache(basic_train_jobs)
    if not self._brief_cache.has_content() and not self._detail_cache.has_content():
        self.status = DataManagerStatus.INVALID.value
    else:
        self.status = DataManagerStatus.DONE.value

    logger.info("Load event data end, status: %r, and loader pool size is %r.",
                self.status, self._detail_cache.loader_pool_size())
def __init__(self, summary_base_dir):
    super(_BriefCacheManager, self).__init__(summary_base_dir)
    self._summary_watcher = SummaryWatcher()
class DataLoaderGenerator(LoaderGenerator):
    """
    DataLoaderGenerator generates a loader_dict of loaders from summary logs.

    Each loader handles the data of its events. It helps DataManager to
    generate loaders.
    """
    def __init__(self, summary_path):
        """
        Init DataLoaderGenerator.

        Args:
            summary_path (str): A directory path, e.g. '/data/ImageNet/'.
        """
        self._summary_path = self._check_and_normalize_summary_path(summary_path)
        self._summary_watcher = SummaryWatcher()

    def _check_and_normalize_summary_path(self, summary_path):
        """
        Check and normalize summary path.

        Args:
            summary_path (str): A directory path, e.g. '/data/ImageNet/'.

        Returns:
            str, normalized summary path.
        """
        if summary_path is None:
            logger.warning("Summary path is None. It will not init data loader generator.")
            raise ParamValueError("Summary path is None.")
        summary_path = os.path.realpath(summary_path)
        return summary_path

    def generate_loaders(self, loader_pool):
        """
        Generate loaders from the summary path. If the summary path is empty,
        an empty dict is returned.

        Args:
            loader_pool (dict[str, LoaderStruct]): Current loader pool in data_manager.

        Returns:
            dict[str, LoaderStruct], a dict of `Loader`.
        """
        loader_dict = {}
        if not FileHandler.exists(self._summary_path):
            logger.warning("Summary path does not exist. It will not start loading events data. "
                           "Current path is %r.", self._summary_path)
            return loader_dict

        dir_map_mtime_dict = {}
        min_modify_time = None
        summaries_info = self._summary_watcher.list_summary_directories(self._summary_path)
        for item in summaries_info:
            relative_path = item.get("relative_path")
            current_dir = FileHandler.join(self._summary_path, relative_path)
            dataloader = DataLoader(current_dir)

            if not dataloader.has_valid_files():
                logger.debug("Can not find valid train log file in folder %s, will ignore.",
                             relative_path)
                continue

            modify_time = item.get("update_time").timestamp()

            # If the loader already exists in the loader pool with a newer time,
            # use that time instead.
            loader_id = self._generate_loader_id(relative_path)
            loader = loader_pool.get(loader_id)
            if loader is not None and loader.latest_update_time > modify_time:
                modify_time = loader.latest_update_time

            if not min_modify_time:
                # The first load, init min modify time.
                min_modify_time = modify_time

            # We need to find `MAX_DATA_LOADER_SIZE` newly modified folders.
            if len(dir_map_mtime_dict) < MAX_DATA_LOADER_SIZE:
                if modify_time < min_modify_time:
                    min_modify_time = modify_time
                dir_map_mtime_dict.update({relative_path: modify_time})
            else:
                if modify_time >= min_modify_time:
                    dir_map_mtime_dict.update({relative_path: modify_time})

        sorted_dir_tuple = sorted(dir_map_mtime_dict.items(),
                                  key=lambda d: d[1])[-MAX_DATA_LOADER_SIZE:]

        for relative_path, modify_time in sorted_dir_tuple:
            loader_id = self._generate_loader_id(relative_path)
            loader = self._generate_loader_by_relative_path(relative_path)
            loader_dict.update({loader_id: loader})

        return loader_dict

    def _generate_loader_by_relative_path(self, relative_path):
        """
        Generate loader by relative path.

        Args:
            relative_path (str): Relative path of a summary directory, e.g. './log1'.

        Returns:
            LoaderStruct, a `Loader` object.
        """
        current_dir = os.path.realpath(FileHandler.join(self._summary_path, relative_path))
        data_loader = DataLoader(current_dir)
        loader_id = self._generate_loader_id(relative_path)
        loader = LoaderStruct(loader_id=loader_id,
                              name=self._generate_loader_name(relative_path),
                              path=current_dir,
                              latest_update_time=FileHandler.file_stat(current_dir).mtime,
                              data_loader=data_loader)
        return loader

    def _generate_loader_id(self, relative_path):
        """
        Generate loader id from relative path.

        Args:
            relative_path (str): Relative path of a summary directory, e.g. './log1'.

        Returns:
            str, loader_id for `Loader`.
        """
        loader_id = relative_path
        return loader_id

    def _generate_loader_name(self, relative_path):
        """
        Generate loader name from relative path.

        Args:
            relative_path (str): Relative path of a summary directory, e.g. './log1'.

        Returns:
            str, loader_name for `Loader`.
        """
        loader_name = relative_path
        return loader_name

    def _get_relative_path_from_train_id(self, train_id):
        """
        Get relative path from train_id.

        Args:
            train_id (str): Train ID of a summary directory, e.g. './log1'.

        Returns:
            str, relative path of `Loader`.
        """
        relative_path = train_id
        return relative_path

    def check_train_job_exist(self, train_id):
        """
        Check if train job exists.

        Args:
            train_id (str): Train ID of a summary directory, e.g. './log1'.

        Returns:
            bool, True if the train job exists.
        """
        if not self._is_train_id_valid(train_id):
            return False
        relative_path = self._get_relative_path_from_train_id(train_id)
        if self._summary_watcher.is_summary_directory(self._summary_path, relative_path):
            return True
        return False

    def _is_train_id_valid(self, train_id):
        """
        Check if train_id is valid.

        Args:
            train_id (str): Train ID of a summary directory, e.g. './log1'.

        Returns:
            bool, True if the train id is valid.
        """
        if not train_id.startswith('./'):
            logger.warning("The train_id does not start with './'.")
            return False
        if len(train_id.split("/")) > 2:
            logger.warning("The train_id contains multiple '/'.")
            return False
        return True

    def generate_loader_by_train_id(self, train_id):
        """
        Generate loader by train_id.

        Args:
            train_id (str): Train ID of a summary directory, e.g. './log1'.

        Returns:
            LoaderStruct, a `Loader` object.
        """
        relative_path = self._get_relative_path_from_train_id(train_id)
        loader = self._generate_loader_by_relative_path(relative_path)
        return loader
def get_timestamp(filename):
    """Get timestamp from filename."""
    timestamp = int(re.search(SummaryWatcher().SUMMARY_FILENAME_REGEX, filename)[1])
    return timestamp
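# Usage sketch for get_timestamp (hedged: the exact pattern comes from
# SummaryWatcher.SUMMARY_FILENAME_REGEX, which is not shown here, so this
# filename is only an assumed example of a name whose capture group 1 is the
# epoch timestamp).
example_name = 'model.summary.1596084619.hostname'  # assumed layout
print(get_timestamp(example_name))  # expected: 1596084619, if the regex matches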
class _BriefCacheManager(_BaseCacheManager):
    """A cache manager that holds brief info of all train jobs on disk."""
    def __init__(self, summary_base_dir):
        super(_BriefCacheManager, self).__init__(summary_base_dir)
        self._summary_watcher = SummaryWatcher()

    def cache_train_job(self, train_id):
        """
        Cache given train job.

        All disk train jobs are cached on every reload, so this method always
        returns False.

        Args:
            train_id (str): Train Id.
        """
        if train_id in self._cache_items:
            self._cache_items[train_id].update_access_time()
        return False

    def update_cache(self, executor):
        """Update cache."""
        logger.info('Start to update BriefCacheManager.')
        summaries_info = self._summary_watcher.list_summary_directories(self._summary_base_dir)

        basic_train_jobs = []
        for info in summaries_info:
            basic_train_jobs.append(
                _BasicTrainJob(abs_summary_base_dir=self._summary_base_dir, entry=info))

        with self._lock:
            new_cache_items = self._merge_with_disk(basic_train_jobs)
            self._cache_items = new_cache_items
        for updater in self._updaters.values():
            for cache_item in self._cache_items.values():
                updater.update_item(cache_item)

    def _merge_with_disk(self, disk_train_jobs: Iterable[_BasicTrainJob]):
        """
        Merge train jobs in cache with train jobs from disk.

        This method will remove train jobs that are not on disk. Call this
        function with the lock held for thread safety.

        Args:
            disk_train_jobs (Iterable[_BasicTrainJob]): Basic train jobs info from disk.

        Returns:
            dict, a dict containing train jobs to be cached.
        """
        new_cache_items = {}
        for train_job in disk_train_jobs:
            if train_job.train_id not in self._cache_items:
                new_cache_items[train_job.train_id] = CachedTrainJob(train_job)
            else:
                reused_train_job = self._cache_items[train_job.train_id]
                reused_train_job.basic_info = train_job
                new_cache_items[train_job.train_id] = reused_train_job

        return new_cache_items

    def register_folder_analyzer(self, analyzer):
        """Register folder analyzer."""
        self._summary_watcher.register_folder_analyzer(analyzer)

    @property
    def cache_items(self):
        """Get cache items."""
        return self._cache_items
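# A standalone sketch of the _merge_with_disk semantics above, using plain
# dicts instead of _BasicTrainJob/CachedTrainJob (these names are illustrative,
# not from the source): entries missing from disk are dropped, cached entries
# are reused with refreshed basic info, and new entries are created.
def merge_with_disk_sketch(cache_items, disk_jobs):
    new_cache_items = {}
    for train_id, basic_info in disk_jobs.items():
        if train_id not in cache_items:
            new_cache_items[train_id] = {'basic_info': basic_info}
        else:
            reused = cache_items[train_id]
            reused['basic_info'] = basic_info
            new_cache_items[train_id] = reused
    return new_cache_items

cache = {'./run1': {'basic_info': 'old'}, './gone': {'basic_info': 'stale'}}
disk = {'./run1': 'new', './run2': 'fresh'}
assert merge_with_disk_sketch(cache, disk) == {
    './run1': {'basic_info': 'new'}, './run2': {'basic_info': 'fresh'}}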
class ExplainManager:
    """ExplainManager."""
    def __init__(self, summary_base_dir: str):
        self._summary_base_dir = summary_base_dir
        self._loader_pool = OrderedDict()
        self._loading_status = _ExplainManagerStatus.INIT.value
        self._status_mutex = threading.Lock()
        self._loader_pool_mutex = threading.Lock()
        self._max_loaders_num = _MAX_LOADERS_NUM
        self._summary_watcher = SummaryWatcher()

    @property
    def summary_base_dir(self):
        """Return the base directory for summary records."""
        return self._summary_base_dir

    def start_load_data(self, reload_interval: int = 0):
        """
        Start an individual thread to cache explain_jobs and load summary data periodically.

        Args:
            reload_interval (int): Specify the loading period in seconds. If interval == 0,
                data will only be loaded once. Default: 0.
        """
        thread = threading.Thread(target=self._repeat_loading,
                                  name='start_load_thread',
                                  args=(reload_interval,),
                                  daemon=True)
        time.sleep(1)
        thread.start()

    def get_job(self, loader_id: str) -> Optional[ExplainLoader]:
        """
        Return the ExplainLoader for the given loader_id.

        If no explain job matches the given loader_id, None is returned.

        Args:
            loader_id (str): The id of the expected ExplainLoader.

        Returns:
            ExplainLoader, the explain job, or None if it is not found.
        """
        self._check_status_valid()

        with self._loader_pool_mutex:
            if loader_id in self._loader_pool:
                self._loader_pool[loader_id].query_time = datetime.now().timestamp()
                self._loader_pool.move_to_end(loader_id, last=False)
                return self._loader_pool[loader_id]

        try:
            loader = self._generate_loader_from_relative_path(loader_id)
            loader.query_time = datetime.now().timestamp()
            self._add_loader(loader)
            self._reload_data_again()
        except ParamValueError:
            logger.warning('Cannot find summary in path: %s. No explain_job will be returned.',
                           loader_id)
            return None
        return loader

    def get_job_list(self, offset=0, limit=None):
        """
        Return a list of explain jobs, including job ID, create time and update time.

        Args:
            offset (int): Page offset; for example, offset 0 means the first page.
                Default value is 0.
            limit (int): The max number of data items per page. Default value is 10.

        Returns:
            tuple[total, directories], total indicates the overall number of explain
                directories and directories indicate list of summary directory info
                including the following attributes.

                - relative_path (str): Relative path of summary directory, referring to
                    settings.SUMMARY_BASE_DIR, starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.
        """
        total, dir_infos = \
            self._summary_watcher.list_explain_directories(self._summary_base_dir,
                                                           offset=offset, limit=limit)
        return total, dir_infos

    def _repeat_loading(self, repeat_interval):
        """Periodically load summary."""
        while True:
            try:
                logger.info('Start to load data, repeat interval: %r.', repeat_interval)
                self._load_data()
                if not repeat_interval:
                    return
                time.sleep(repeat_interval)
            except UnknownError as ex:
                logger.error('Unexpected error happens when loading data. '
                             'Loading status: %s, loading pool size: %d. Detail: %s',
                             self._loading_status, len(self._loader_pool), str(ex))

    def _load_data(self):
        """
        Prepare loaders in cache and start loading the data from summaries.

        Only a limited number of loaders are cached, ranked by updated_time or
        query_time. The size of the cache pool is determined by _MAX_LOADERS_NUM.
        When the manager starts loading data, only the latest _MAX_LOADERS_NUM
        summaries are loaded into the cache. If a cached loader is queried via
        'get_job', the query_time of the loader is updated and the loader is
        moved to the end of the cache. If an uncached summary is queried, a new
        loader instance is generated and put at the end of the cache.
        """
        try:
            with self._status_mutex:
                if self._loading_status == _ExplainManagerStatus.LOADING.value:
                    logger.info('Current status is %s, will ignore to load data.',
                                self._loading_status)
                    return

                self._loading_status = _ExplainManagerStatus.LOADING.value

                self._cache_loaders()
                self._execute_loading()

                if not self._loader_pool:
                    self._loading_status = _ExplainManagerStatus.INVALID.value
                else:
                    self._loading_status = _ExplainManagerStatus.DONE.value

                logger.info('Load event data end, status: %s, and loader pool size: %d',
                            self._loading_status, len(self._loader_pool))
        except Exception as ex:
            self._loading_status = _ExplainManagerStatus.INVALID.value
            logger.exception(ex)
            raise UnknownError(str(ex))

    def _cache_loaders(self):
        """Cache explain loaders in the cache pool."""
        dir_map_mtime_dict = []
        _, summaries_info = self._summary_watcher.list_explain_directories(self._summary_base_dir)

        for summary_info in summaries_info:
            summary_path = summary_info.get('relative_path')
            summary_update_time = summary_info.get('update_time').timestamp()

            if summary_path in self._loader_pool:
                summary_update_time = max(summary_update_time,
                                          self._loader_pool[summary_path].query_time)

            dir_map_mtime_dict.append((summary_info, summary_update_time))

        sorted_summaries_info = sorted(dir_map_mtime_dict, key=lambda x: x[1])[-_MAX_LOADERS_NUM:]

        with self._loader_pool_mutex:
            for summary_info, query_time in sorted_summaries_info:
                summary_path = summary_info['relative_path']
                if summary_path not in self._loader_pool:
                    loader = self._generate_loader_from_relative_path(summary_path)
                    self._add_loader(loader)
                else:
                    self._loader_pool[summary_path].query_time = query_time
                    self._loader_pool.move_to_end(summary_path, last=False)

    def _generate_loader_from_relative_path(self, relative_path: str) -> ExplainLoader:
        """Generate explain loader from the given relative path."""
        self._check_summary_exist(relative_path)
        current_dir = os.path.realpath(FileHandler.join(self._summary_base_dir, relative_path))
        loader_id = self._generate_loader_id(relative_path)
        loader = ExplainLoader(loader_id=loader_id, summary_dir=current_dir)
        return loader

    def _add_loader(self, loader):
        """Add loader to the loader_pool."""
        if loader.train_id not in self._loader_pool:
            self._loader_pool[loader.train_id] = loader
        else:
            self._loader_pool.move_to_end(loader.train_id)

        while len(self._loader_pool) > self._max_loaders_num:
            self._loader_pool.popitem(last=False)

    def _execute_loading(self):
        """Execute the data loading."""
        for loader_id in list(self._loader_pool.keys()):
            try:
                with self._loader_pool_mutex:
                    loader = self._loader_pool.get(loader_id, None)
                    if loader is None:
                        logger.debug('Loader %r has been deleted, will not load data', loader_id)
                        return
                loader.load()
            except MindInsightException as ex:
                logger.warning('Data loader %r load data failed. Delete data_loader. Detail: %s',
                               loader_id, ex)
                with self._loader_pool_mutex:
                    self._delete_loader(loader_id)

    def _delete_loader(self, loader_id):
        """Delete loader given loader_id."""
        if loader_id in self._loader_pool:
            self._loader_pool.pop(loader_id)
            logger.debug('delete loader %s', loader_id)

    def _check_status_valid(self):
        """Check manager status."""
        if self._loading_status == _ExplainManagerStatus.INIT.value:
            raise exceptions.SummaryLogIsLoading(
                'Data is loading, current status is %s' % self._loading_status)

    def _check_summary_exist(self, loader_id):
        """Verify that the train_job exists for the given loader_id."""
        if not self._summary_watcher.is_summary_directory(self._summary_base_dir, loader_id):
            raise ParamValueError('Can not find the train job in the manager.')

    def _reload_data_again(self):
        """Reload the data one more time."""
        logger.debug('Start to reload data again.')
        thread = threading.Thread(target=self._load_data, name='reload_data_thread')
        thread.daemon = False
        thread.start()

    @staticmethod
    def _generate_loader_id(relative_path):
        """Generate loader id for the given path."""
        loader_id = relative_path
        return loader_id
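# A hedged usage sketch for ExplainManager: the base dir is a made-up path and
# './run1' stands in for a real explain job directory.
manager = ExplainManager(summary_base_dir='/data/summaries')
manager.start_load_data(reload_interval=0)  # load once in a background thread
total, dir_infos = manager.get_job_list(offset=0, limit=10)
print('explain jobs on disk:', total)
# get_job returns None if './run1' has no loadable summary; it may raise
# SummaryLogIsLoading while the first load is still pending.
job = manager.get_job('./run1')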