def make_directory(path):
    """
    Resolve a directory path and create the directory when it is missing.

    Args:
        path (str): Directory path, either relative or absolute.

    Returns:
        str, the resolved absolute path of the directory.

    Raises:
        LineageParamTypeError: If `path` is not a non-blank string, or if
            creating the directory fails with a PermissionError.
    """
    is_usable = isinstance(path, str) and bool(path.strip())
    if not is_usable:
        log.error("Invalid input path: %r.", path)
        raise LineageParamTypeError("Invalid path type")

    # Work on the canonical absolute form of the path from here on.
    abs_path = os.path.realpath(path)
    log.debug("The abs path is %r", abs_path)

    # Nothing to create when the path is already present.
    if os.path.exists(abs_path):
        return abs_path

    # Creating the directory may be refused because of missing permissions.
    log.debug("The directory(%s) doesn't exist, will create it", abs_path)
    try:
        os.makedirs(abs_path, exist_ok=True)
    except PermissionError as err:
        log.error("No write permission on the directory(%r), error = %r", abs_path, err)
        raise LineageParamTypeError("No write permission on the directory.")
    return abs_path
def begin(self, run_context):
    """
    Prepare lineage collection when the training job begins.

    Records the user defined info (when present), determines the initial
    learning rate if it has not been set yet, and records the graph of the
    training dataset.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageParamRunContextError: If `run_context` is not a RunContext.
        MindInsightException: If the optimizer found in the run context is
            not an Optimizer instance.
        LineageLogError: If recording the dataset graph fails.
    """
    log.info('Initialize training lineage collection...')
    if self.user_defined_info:
        self.lineage_summary.record_user_defined_info(self.user_defined_info)

    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    context_args = run_context.original_args()

    if not self.initial_learning_rate:
        optimizer = context_args.get('optimizer')
        if optimizer and not isinstance(optimizer, Optimizer):
            log.error(
                "The parameter optimizer is invalid. It should be an instance of "
                "mindspore.nn.optim.optimizer.Optimizer.")
            raise MindInsightException(
                error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)

        if optimizer:
            log.info('Obtaining initial learning rate...')
        else:
            # No optimizer was passed explicitly: look it up on the network.
            train_network = context_args.get('train_network')
            optimizer = AnalyzeObject.get_optimizer_by_network(train_network)

        self.initial_learning_rate = AnalyzeObject.analyze_optimizer(optimizer)
        log.debug('initial_learning_rate: %s', self.initial_learning_rate)

    # Serialize the train dataset graph and round-trip it through JSON
    # before handing it to the summary writer.
    serialized_graph = ds.serialize(context_args.get('train_dataset'))
    graph_dict = json.loads(json.dumps(serialized_graph, indent=2))

    log.info('Logging dataset graph...')
    try:
        self.lineage_summary.record_dataset_graph(dataset_graph=graph_dict)
    except Exception as error:
        error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
        log.error(error_msg)
        raise LineageLogError(error_msg)
    log.info('Dataset graph logged successfully.')
def get_summary_infos(cls, file_path):
    """
    Get lineage summary information from summary log file.

    Args:
        file_path (str): The file path of summary log.

    Returns:
        LineageInfo, the lineage summary information.

    Raises:
        LineageSummaryAnalyzeException: If failed to get lineage information.
    """
    analyzer = cls(file_path)
    try:
        return analyzer.get_latest_info()
    except Exception as err:
        # Every failure while reading the summary file is reported to the
        # caller as one analyze exception. The original error is attached
        # as the explicit cause so it stays visible in the traceback.
        log.debug("Can not analyze lineage info, file path is %s. Detail: %s",
                  file_path, str(err))
        raise LineageSummaryAnalyzeException() from err
def load(self):
    """
    Find the lineage summary files and parse the ones that changed.

    Parsing resumes from the file remembered in `self._latest_filename`
    (or from the first file when none is remembered). A file whose size
    equals the remembered size is skipped.

    Raises:
        LineageFileNotFoundError: If no summary log file is found under
            the summary directory.
    """
    # Sorted list of lineage summary files.
    summary_files = SummaryPathParser.get_lineage_summaries(self._summary_dir, is_sorted=True)
    if not summary_files:
        logger.info('There is no summary log file under summary_dir %s.', self._summary_dir)
        raise LineageFileNotFoundError(
            'There is no summary log file under summary_dir.'
        )

    self._init_if_files_deleted(summary_files)

    start = 0 if self._latest_filename is None else summary_files.index(self._latest_filename)

    for current_name in summary_files[start:]:
        # Moving on to another file resets the remembered size.
        if current_name != self._latest_filename:
            self._latest_filename = current_name
            self._latest_file_size = 0

        file_path = os.path.join(self._summary_dir, current_name)
        current_size = FileHandler(file_path).size
        if current_size == self._latest_file_size:
            continue
        self._latest_file_size = current_size

        try:
            self._parse_summary_log()
        except (LineageSummaryAnalyzeException,
                LineageEventNotExistException,
                LineageEventFieldNotExistException) as error:
            logger.debug("Parse file failed, file_path is %s. Detail: %s", file_path, str(error))
        except MindInsightException as error:
            logger.exception(error)
            logger.debug("Parse file failed, file_path is %s.", file_path)
def _delete_lineage_in_cache(self, cache_item, key, relative_path):
    """
    Delete the entry stored under `key` from a cache item.

    The deletion runs while holding the cache item's lock for `key`. A
    ParamValueError raised by the deletion is only logged and not
    propagated.

    Args:
        cache_item: Cache item providing `lock_key` and `delete`.
        key: Key of the cached entry to delete.
        relative_path (str): Train job path, used in the log messages only.
    """
    with cache_item.lock_key(key):
        try:
            cache_item.delete(key=key)
            logger.info("Parse failed, delete the train job %s.", relative_path)
        except ParamValueError:
            logger.debug("Parse failed, and it is not in cache, "
                         "no need to delete the train job %s.", relative_path)
def _organize_from_cache(self):
    """
    Collect the super lineage objects of the cached train jobs.

    Does nothing when no data manager is set. A train job whose lookup
    raises ParamValueError is skipped and only logged.
    """
    if self._data_manager is None:
        return

    cached_jobs = self._data_manager.get_brief_cache().cache_items
    for job_dir, train_job in cached_jobs.items():
        try:
            lineage_obj = train_job.get("lineage").super_lineage_obj
            self._super_lineage_objs.update({job_dir: lineage_obj})
        except ParamValueError:
            logger.debug("This is no lineage info in train job %s.", job_dir)
def _check_crc(source_str, crc_str):
    """
    Verify a source string against its CRC value.

    Args:
        source_str (bytes): Source string in bytes.
        crc_str (bytes): CRC string of source string in bytes.

    Raises:
        LineageVerificationException: Raise when verification failed.
    """
    crc_matches = crc32.CheckValueAgainstData(crc_str, source_str, len(source_str))
    if crc_matches:
        return

    log.debug("The CRC verification not pass. source_str: %s. crc_str: %s.", source_str, crc_str)
    raise LineageVerificationException("The CRC verification failed.")
def _remove_unsupported_columns(self):
    """
    Drop the columns whose dtype is not a simple numpy number.

    Every dropped column whose name starts with USER_DEFINED_PREFIX is also
    appended to `self._drop_columns_info`, marked as unselected with the
    NOT_ALL_NUMBERS reason code.
    """
    # `DataFrame.iteritems()` was removed in pandas 2.0. `items()` yields the
    # same (column name, column data) pairs and exists in older releases too.
    columns_to_drop = [
        name for name, data in self._df.items()
        if not is_simple_numpy_number(data.dtype)
    ]
    if not columns_to_drop:
        return

    log.debug("Unsupported columns: %s", columns_to_drop)
    self._df = self._df.drop(columns=columns_to_drop)

    # Only user defined columns are reported back as dropped.
    for name in columns_to_drop:
        if not name.startswith(USER_DEFINED_PREFIX):
            continue
        self._drop_columns_info.append({
            "name": name,
            "unselected": True,
            "reason_code": ReasonCode.NOT_ALL_NUMBERS.value
        })
def get_network_args(run_context_args, train_lineage):
    """
    Fill in the network related lineage metadata.

    The loss value and the class names of the optimizer, the loss function
    and the training network are written into `train_lineage`.

    Args:
        run_context_args (dict): It contains all information of the training job.
        train_lineage (dict): A dict contains lineage metadata.

    Returns:
        dict, the lineage metadata.
    """
    network = run_context_args.get('train_network')
    validate_network(network)

    # Fall back to looking the optimizer up on the network.
    optimizer = run_context_args.get('optimizer') or AnalyzeObject.get_optimizer_by_network(network)

    loss_fn = run_context_args.get('loss_fn')
    if loss_fn:
        net_outputs = run_context_args.get('net_outputs')
    else:
        # Without an explicit loss function the net outputs are not read.
        loss_fn = AnalyzeObject.get_loss_fn_by_network(network)
        net_outputs = None

    loss_value = None
    if net_outputs:
        log.info('Calculating loss...')
        loss_value = float(np.atleast_1d(net_outputs.asnumpy())[0])
        log.debug('loss: %s', loss_value)
    train_lineage[Metadata.loss] = loss_value

    # Analyze classname of optimizer, loss function and training network.
    if optimizer:
        train_lineage[Metadata.optimizer] = type(optimizer).__name__
    else:
        train_lineage[Metadata.optimizer] = None

    train_lineage[Metadata.train_network] = AnalyzeObject.get_backbone_network(network)

    if loss_fn:
        train_lineage[Metadata.loss_function] = type(loss_fn).__name__
    else:
        train_lineage[Metadata.loss_function] = None

    return train_lineage
def analyze_dataset(dataset, lineage_dict, dataset_type):
    """
    Analyze a MindSpore Dataset object and record its path and size.

    Depending on `dataset_type`, the train or the valid dataset path and
    dataset size entries of `lineage_dict` are written.

    Args:
        dataset (Dataset): See mindspore.dataengine.datasets.Dataset.
        lineage_dict (dict): A dict contains lineage metadata.
        dataset_type (str): Dataset type, train or valid.

    Returns:
        dict, the lineage metadata.
    """
    reported_size = dataset.get_dataset_size()
    if reported_size is not None:
        validate_int_params(reported_size, 'dataset_batch_size')
        log.debug('dataset_batch_size: %d', reported_size)

    dataset_path = AnalyzeObject.get_dataset_path_wrapped(dataset)
    if dataset_path:
        # Keep everything in front of the last '/' separator.
        dataset_path = dataset_path.rpartition('/')[0]

    step_num = lineage_dict.get('step_num')
    validate_int_params(step_num, 'step_num')
    log.debug('step_num: %d', step_num)

    if dataset_type == 'train':
        lineage_dict[Metadata.train_dataset_path] = dataset_path
        epoch = lineage_dict.get('epoch')
        steps_per_epoch = step_num / epoch
        lineage_dict[Metadata.train_dataset_size] = int(reported_size * steps_per_epoch)
    elif dataset_type == 'valid':
        lineage_dict[Metadata.valid_dataset_path] = dataset_path
        lineage_dict[Metadata.valid_dataset_size] = reported_size * step_num

    return lineage_dict
def analyze_dataset(dataset, lineage_dict, dataset_type):
    """
    Analyze Dataset, a Dataset object of MindSpore.

    The dataset size is computed as `get_dataset_size() * get_batch_size()`
    and stored, together with the dataset path, under the train or valid
    keys of `lineage_dict` depending on `dataset_type`.

    Args:
        dataset (Dataset): See mindspore.dataengine.datasets.Dataset.
        lineage_dict (dict): A dict contains lineage metadata.
        dataset_type (str): Dataset type, train or valid.

    Returns:
        dict, the lineage metadata.
    """
    batch_num = dataset.get_dataset_size()
    batch_size = dataset.get_batch_size()
    if batch_num is not None:
        validate_int_params(batch_num, 'dataset_batch_num')
        # Validate the batch size itself; checking batch_num a second time
        # under the 'dataset_batch_size' label left batch_size unchecked.
        validate_int_params(batch_size, 'dataset_batch_size')
        log.debug('dataset_batch_num: %d', batch_num)
        log.debug('dataset_batch_size: %d', batch_size)

    dataset_path = AnalyzeObject.get_dataset_path_wrapped(dataset)
    if dataset_path and os.path.isfile(dataset_path):
        # A file path is reduced to the directory that contains the file.
        dataset_path, _ = os.path.split(dataset_path)

    dataset_size = int(batch_num * batch_size)
    if dataset_type == 'train':
        lineage_dict[Metadata.train_dataset_path] = dataset_path
        lineage_dict[Metadata.train_dataset_size] = dataset_size
    elif dataset_type == 'valid':
        lineage_dict[Metadata.valid_dataset_path] = dataset_path
        lineage_dict[Metadata.valid_dataset_size] = dataset_size

    return lineage_dict
def update_item(self, cache_item: CachedTrainJob):
    """
    Update the lineage entry of a cache item in place.

    When no lineage file is found, the lineage entry is deleted from the
    cache item. When parsing yields no super lineage object, the cache
    item is left unchanged.

    Args:
        cache_item (CachedTrainJob): The cached train job to update.
    """
    base_dir = cache_item.summary_base_dir
    job_dir = cache_item.abs_summary_dir

    # The summary_base_dir and summary_dir have been normalized in
    # data_manager, so they can be compared directly.
    relative_path = "./" if base_dir == job_dir else f'./{os.path.basename(job_dir)}'

    try:
        parser = self._lineage_parsing(cache_item)
    except LineageFileNotFoundError:
        self._delete_lineage_in_cache(cache_item, LINEAGE, relative_path)
        return

    if parser.super_lineage_obj is None:
        logger.debug("There is no lineage to update in train job %s.", relative_path)
        return

    cache_item.set(key=LINEAGE, value=parser)
def end(self, run_context):
    """
    Collect and record the lineage information when the training job ends.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageParamRunContextError: If `run_context` is not a RunContext.
        LineageLogError: If recording lineage information fails.
    """
    log.info('Start to collect training lineage...')
    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    context_args = run_context.original_args()
    validate_train_run_context(RunContextArgs, context_args)

    lineage = AnalyzeObject.get_network_args(context_args, dict())

    train_dataset = context_args.get('train_dataset')
    callback_list = getattr(context_args.get('list_callback'), '_callbacks', [])

    log.info('Obtaining model files...')
    ckpt_file_path, _ = AnalyzeObject.get_file_path(callback_list)

    lineage[Metadata.learning_rate] = self.initial_learning_rate
    # Metadata entries copied straight from the run context arguments.
    # NOTE(review): batch_size is read from the 'batch_num' argument - confirm
    # this is the intended source.
    copied_entries = (
        (Metadata.epoch, 'epoch_num'),
        (Metadata.step_num, 'cur_step_num'),
        (Metadata.parallel_mode, 'parallel_mode'),
        (Metadata.device_num, 'device_number'),
        (Metadata.batch_size, 'batch_num'),
    )
    for metadata_key, arg_name in copied_entries:
        lineage[metadata_key] = context_args.get(arg_name)

    lineage[Metadata.model_path] = json.dumps({'ckpt': ckpt_file_path})

    log.info('Calculating model size...')
    lineage[Metadata.model_size] = AnalyzeObject.get_model_size(ckpt_file_path)
    log.debug('model_size: %s', lineage[Metadata.model_size])

    log.info('Analyzing dataset object...')
    lineage = AnalyzeObject.analyze_dataset(train_dataset, lineage, 'train')

    log.info('Logging lineage information...')
    try:
        summary_writer = LineageSummary(self.lineage_log_path)
        summary_writer.record_train_lineage(lineage)
    except IOError as error:
        error_msg = f'End error in TrainLineage: {error}'
        log.error(error_msg)
        raise LineageLogError(error_msg)
    except Exception as error:
        error_msg = f'End error in TrainLineage: {error}'
        log.error(error_msg)
        log.error('Fail to log the lineage of the training job.')
        raise LineageLogError(error_msg)
    log.info('The lineage of the training job has logged successfully.')