def _parse_summary_logs(self, summary_path):
    """
    Parse one or more summary logs.

    Args:
        summary_path (Union[str, list[str]]): A single summary log path or
            a list of summary log paths.
    """
    if not summary_path:
        raise LineageQuerierParamException('summary_path', 'The summary path is empty.')

    if isinstance(summary_path, str):
        self._parse_summary_log(summary_path, 0)
    elif isinstance(summary_path, list):
        # Only successfully parsed logs advance the index given to the parser.
        next_index = 0
        for single_path in summary_path:
            if self._parse_summary_log(single_path, next_index):
                next_index += 1
    else:
        raise LineageParamTypeError('Summary path is not str or list.')

    if self._parse_failed_paths:
        logger.info('Parse failed paths: %s', str(self._parse_failed_paths))
    if not self._lineage_objects:
        raise LineageSummaryParseException()
def _organize_from_disk(self):
    """Organize lineage objects by scanning summary directories on disk."""
    if self._summary_base_dir is None:
        return
    relative_dirs = SummaryWatcher().list_summary_directories(
        summary_base_dir=self._summary_base_dir)

    missing_lineage = 0
    for dir_info in relative_dirs:
        abs_summary_dir = os.path.realpath(
            os.path.join(self._summary_base_dir, dir_info.get('relative_path')))
        try:
            parser = LineageParser(abs_summary_dir, dir_info.get('update_time'))
            lineage_obj = parser.super_lineage_obj
            if lineage_obj is not None:
                self._super_lineage_objs[abs_summary_dir] = lineage_obj
        except LineageFileNotFoundError:
            missing_lineage += 1

    # Every directory (or an empty listing) lacked lineage files: surface it.
    if missing_lineage == len(relative_dirs):
        logger.info('There is no summary log file under summary_base_dir.')
        raise LineageFileNotFoundError(
            'There is no summary log file under summary_base_dir.')
def load(self):
    """Find sorted lineage summaries and parse any whose size changed."""
    lineage_files = SummaryPathParser.get_lineage_summaries(self._summary_dir, is_sorted=True)
    if not lineage_files:
        logger.info('There is no summary log file under summary_dir %s.', self._summary_dir)
        raise LineageFileNotFoundError(
            'There is no summary log file under summary_dir.'
        )
    self._init_if_files_deleted(lineage_files)

    # Resume from the file parsed last time, if any.
    start = 0
    if self._latest_filename is not None:
        start = lineage_files.index(self._latest_filename)

    for filename in lineage_files[start:]:
        if filename != self._latest_filename:
            self._latest_filename = filename
            self._latest_file_size = 0
        file_path = os.path.join(self._summary_dir, filename)
        new_size = FileHandler(file_path).size
        if new_size == self._latest_file_size:
            # Nothing new was written to this file; skip re-parsing.
            continue
        self._latest_file_size = new_size
        try:
            self._parse_summary_log()
        except (LineageSummaryAnalyzeException,
                LineageEventNotExistException,
                LineageEventFieldNotExistException) as error:
            logger.debug("Parse file failed, file_path is %s. Detail: %s",
                         file_path, str(error))
        except MindInsightException as error:
            logger.exception(error)
            logger.debug("Parse file failed, file_path is %s.", file_path)
def load(self):
    """Find and parse lineage summaries under the summary directory."""
    lineage_files = SummaryPathParser.get_lineage_summaries(
        self._summary_dir, is_sorted=True)
    if not lineage_files:
        logger.info('There is no summary log file under summary_dir %s.',
                    self._summary_dir)
        raise LineageFileNotFoundError(
            'There is no summary log file under summary_dir.')
    self._init_if_files_deleted(lineage_files)

    # Resume from the file parsed last time, if any.
    start = 0
    if self._latest_filename is not None:
        start = lineage_files.index(self._latest_filename)

    for name in lineage_files[start:]:
        if name != self._latest_filename:
            self._latest_filename = name
            self._latest_file_size = 0
        path = os.path.join(self._summary_dir, name)
        size = FileHandler(path).size
        if size == self._latest_file_size:
            # File did not grow; nothing new to parse.
            continue
        self._latest_file_size = size
        self._parse_summary_log()
def _delete_lineage_in_cache(self, cache_item, key, relative_path):
    """
    Delete a lineage entry from the cache after a parse failure.

    Args:
        cache_item: Cached train job holding the lineage entry.
        key: Cache key to delete.
        relative_path (str): Relative path of the train job, used for logging.
    """
    with cache_item.lock_key(key):
        try:
            cache_item.delete(key=key)
            # Fixed typo in the log message: "tran job" -> "train job".
            logger.info("Parse failed, delete the train job %s.", relative_path)
        except ParamValueError:
            # Entry already absent; nothing to delete.
            logger.debug("Parse failed, and it is not in cache, "
                         "no need to delete the train job %s.", relative_path)
def _init_if_files_deleted(self, file_list): """Init variables if files deleted.""" cached_file_list = self._cached_file_list self._cached_file_list = file_list if cached_file_list is None: return deleted_files = set(cached_file_list) - set(file_list) if deleted_files: logger.info("There are some files has been deleted, " "all files will be reloaded in path %s.", self._summary_dir) self._init_variables()
def begin(self, run_context):
    """
    Initialize the training progress when the training job begins.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        MindInsightException: If validating parameter fails.
    """
    log.info('Initialize training lineage collection...')

    if self.user_defined_info:
        self.lineage_summary.record_user_defined_info(self.user_defined_info)

    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    if not self.initial_learning_rate:
        optimizer = run_context_args.get('optimizer')
        if optimizer and not isinstance(optimizer, Optimizer):
            log.error(
                "The parameter optimizer is invalid. It should be an instance of "
                "mindspore.nn.optim.optimizer.Optimizer.")
            raise MindInsightException(
                error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
        if optimizer:
            log.info('Obtaining initial learning rate...')
        else:
            # No optimizer supplied; derive one from the training network.
            optimizer = AnalyzeObject.get_optimizer_by_network(
                run_context_args.get('train_network'))
        self.initial_learning_rate = AnalyzeObject.analyze_optimizer(optimizer)
        log.debug('initial_learning_rate: %s', self.initial_learning_rate)

    # Serialize the train dataset pipeline, then round-trip through JSON so the
    # recorded graph is a plain JSON-safe structure.
    train_dataset = run_context_args.get('train_dataset')
    dataset_graph_dict = json.loads(
        json.dumps(ds.serialize(train_dataset), indent=2))

    log.info('Logging dataset graph...')
    try:
        self.lineage_summary.record_dataset_graph(dataset_graph=dataset_graph_dict)
    except Exception as error:
        error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
        log.error(error_msg)
        raise LineageLogError(error_msg)
    log.info('Dataset graph logged successfully.')
def _organize_from_cache(self):
    """Organize lineage objects from the data manager's brief cache."""
    if self._data_manager is None:
        return
    brief_cache = self._data_manager.get_brief_cache()
    for relative_dir, cache_train_job in brief_cache.cache_items.items():
        try:
            super_lineage_obj = cache_train_job.get("lineage").super_lineage_obj
            self._super_lineage_objs.update({relative_dir: super_lineage_obj})
        except ParamValueError:
            # Fixed typo in the log message: "This is no" -> "There is no".
            logger.info("There is no lineage info in train job %s.", relative_dir)
def get_network_args(run_context_args, train_lineage):
    """
    Get the parameters related to the network, such as optimizer, loss function.

    Args:
        run_context_args (dict): It contains all information of the training job.
        train_lineage (dict): A dict contains lineage metadata.

    Returns:
        dict, the lineage metadata.
    """
    network = run_context_args.get('train_network')
    validate_network(network)

    optimizer = run_context_args.get('optimizer') \
        or AnalyzeObject.get_optimizer_by_network(network)

    loss_fn = run_context_args.get('loss_fn')
    # Loss is only read from the run context when a loss function was supplied.
    loss = run_context_args.get('net_outputs') if loss_fn else None
    if not loss_fn:
        loss_fn = AnalyzeObject.get_loss_fn_by_network(network)

    if loss:
        log.info('Calculating loss...')
        loss = float(np.atleast_1d(loss.asnumpy())[0])
        log.debug('loss: %s', loss)
        train_lineage[Metadata.loss] = loss
    else:
        train_lineage[Metadata.loss] = None

    # Record class names of optimizer, loss function and the backbone network.
    train_lineage[Metadata.optimizer] = type(optimizer).__name__ \
        if optimizer else None
    train_lineage[Metadata.train_network] = AnalyzeObject.get_backbone_network(network)
    train_lineage[Metadata.loss_function] = type(loss_fn).__name__ \
        if loss_fn else None
    return train_lineage
def _get_columns_name(lineage_objects):
    """
    Get column names across all lineage objects.

    Args:
        lineage_objects (list[dict]): Lineage objects; each may contain a
            "model_lineage" dict with "metric" and "user_defined" sub-dicts.

    Returns:
        set, the column names (always includes "train_id").
    """
    column_names = set()
    user_defined_num = 0
    for lineage in lineage_objects:
        model_lineage = lineage.get("model_lineage", {})
        metric = model_lineage.get("metric", {})
        metric_names = tuple('{}{}'.format(_METRIC_PREFIX, key)
                             for key in metric.keys())
        user_defined = model_lineage.get("user_defined", {})
        user_defined_names = tuple('{}{}'.format(_USER_DEFINED_PREFIX, key)
                                   for key in user_defined.keys())

        # Bug fix: the original removed items from a list while iterating it,
        # which skips an element when "metric" and "user_defined" are adjacent
        # in the key list; build a filtered list instead.
        other_names = [key for key in model_lineage
                       if key not in ("metric", "user_defined")]
        column_names.update(other_names)
        column_names.update(metric_names)

        if user_defined_num + len(user_defined_names) <= USER_DEFINED_INFO_LIMIT:
            column_names.update(user_defined_names)
            user_defined_num += len(user_defined_names)
        elif user_defined_num < USER_DEFINED_INFO_LIMIT <= user_defined_num + len(
                user_defined_names):
            # Only room for part of this object's user defined names;
            # keep just enough to reach the limit.
            names = list(user_defined_names[:USER_DEFINED_INFO_LIMIT - user_defined_num])
            column_names.update(names)
            user_defined_num += len(names)
            log.info(
                "Partial user_defined_info is deleted. Currently saved length is: %s.",
                user_defined_num)
        else:
            log.info(
                "The quantity of user_defined_info has reached the limit %s.",
                USER_DEFINED_INFO_LIMIT)
    column_names.update(["train_id"])
    return column_names
def update_item(self, cache_item: CachedTrainJob):
    """
    Update cache item in place.

    Args:
        cache_item (CachedTrainJob): Cached train job whose lineage entry is
            refreshed, or deleted when the lineage files are gone.
    """
    summary_base_dir = cache_item.summary_base_dir
    summary_dir = cache_item.abs_summary_dir

    # The summary_base_dir and summary_dir have been normalized in data_manager.
    if summary_base_dir == summary_dir:
        relative_path = "./"
    else:
        relative_path = f'./{os.path.basename(summary_dir)}'

    try:
        lineage_parser = self._lineage_parsing(cache_item)
    except LineageFileNotFoundError:
        # Lineage files disappeared; drop the stale cache entry.
        with cache_item.lock_key(LINEAGE):
            cache_item.delete(key=LINEAGE)
        return

    super_lineage_obj = lineage_parser.super_lineage_obj
    if super_lineage_obj is None:
        # Fixed typo in the log message: "tran job" -> "train job".
        logger.info("There is no lineage to update in train job %s.", relative_path)
        return
    cache_item.set(key=LINEAGE, value=lineage_parser)
def end(self, run_context):
    """
    Collect lineage information when the training job ends.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.
    """
    if self.user_defined_info:
        self.lineage_summary.record_user_defined_info(self.user_defined_info)

    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid EvalLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    validate_eval_run_context(EvalParameter, run_context_args)

    valid_dataset = run_context_args.get('valid_dataset')

    eval_lineage = dict()
    metrics = run_context_args.get('metrics')
    eval_lineage[Metadata.metrics] = json.dumps(metrics)
    eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

    log.info('Analyzing dataset object...')
    eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset, eval_lineage, 'valid')

    log.info('Logging evaluation job lineage...')
    try:
        self.lineage_summary.record_evaluation_lineage(eval_lineage)
    except Exception as error:
        # The original had two byte-identical handlers for IOError and
        # Exception; IOError is a subclass of Exception, so one suffices.
        # Chain the cause so the original traceback is preserved.
        error_msg = f'End error in EvalLineage: {error}'
        log.error(error_msg)
        log.error('Fail to log the lineage of the evaluation job.')
        raise LineageLogError(error_msg) from error
    log.info('The lineage of the evaluation job has logged successfully.')
def end(self, run_context):
    """
    Collect lineage information when the training job ends.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageLogError: If recording lineage information fails.
    """
    log.info('Start to collect training lineage...')
    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    validate_train_run_context(RunContextArgs, run_context_args)

    train_lineage = dict()
    train_lineage = AnalyzeObject.get_network_args(run_context_args, train_lineage)

    train_dataset = run_context_args.get('train_dataset')
    callbacks = run_context_args.get('list_callback')
    list_callback = getattr(callbacks, '_callbacks', [])

    log.info('Obtaining model files...')
    ckpt_file_path, _ = AnalyzeObject.get_file_path(list_callback)

    train_lineage[Metadata.learning_rate] = self.initial_learning_rate
    train_lineage[Metadata.epoch] = run_context_args.get('epoch_num')
    train_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')
    train_lineage[Metadata.parallel_mode] = run_context_args.get('parallel_mode')
    train_lineage[Metadata.device_num] = run_context_args.get('device_number')
    train_lineage[Metadata.batch_size] = run_context_args.get('batch_num')

    model_path_dict = {'ckpt': ckpt_file_path}
    train_lineage[Metadata.model_path] = json.dumps(model_path_dict)

    log.info('Calculating model size...')
    train_lineage[Metadata.model_size] = AnalyzeObject.get_model_size(ckpt_file_path)
    log.debug('model_size: %s', train_lineage[Metadata.model_size])

    log.info('Analyzing dataset object...')
    train_lineage = AnalyzeObject.analyze_dataset(train_dataset, train_lineage, 'train')

    log.info('Logging lineage information...')
    try:
        lineage_summary = LineageSummary(self.lineage_log_path)
        lineage_summary.record_train_lineage(train_lineage)
    except Exception as error:
        # Reconstructed the error-message literal, which was broken across
        # lines in the original source; merged the duplicated IOError and
        # Exception handlers (IOError is a subclass of Exception) and chained
        # the cause so the original traceback is preserved.
        error_msg = f'End error in TrainLineage: {error}'
        log.error(error_msg)
        log.error('Fail to log the lineage of the training job.')
        raise LineageLogError(error_msg) from error
    log.info('The lineage of the training job has logged successfully.')