def __init__(self, summary_record, raise_exception=False, user_defined_info=None):
    """
    Validate the constructor arguments and prepare the lineage summary writer.

    Args:
        summary_record (Union[SummaryRecord, str]): Either a log directory
            given as a `str` (passed to `make_directory`), or an object
            whose `full_file_name` locates the summary file; in the latter
            case the lineage log directory is the directory of that file.
        raise_exception (bool): When true, a `MindInsightException` raised
            while validating is re-raised after being logged; otherwise it
            is only logged. Default: False.
        user_defined_info (dict): Optional user defined information. It is
            checked with `validate_user_defined_info` when non-empty.
            Default: None.

    Raises:
        MindInsightException: If validating a parameter fails and
            `raise_exception` is true.
    """
    super(TrainLineage, self).__init__()
    # Default first, so a validation failure below can never leave
    # unvalidated user data on the instance.
    self.user_defined_info = None
    try:
        validate_raise_exception(raise_exception)
        self.raise_exception = raise_exception

        if isinstance(summary_record, str):
            # make directory if not exist
            self.lineage_log_dir = make_directory(summary_record)
        else:
            validate_summary_record(summary_record)
            summary_log_path = summary_record.full_file_name
            validate_file_path(summary_log_path)
            self.lineage_log_dir = os.path.dirname(summary_log_path)

        self.lineage_summary = LineageSummary(self.lineage_log_dir)
        self.initial_learning_rate = None

        # Validate before storing: when raise_exception is False the error
        # is swallowed, and the rejected value must not be kept around.
        if user_defined_info:
            validate_user_defined_info(user_defined_info)
        self.user_defined_info = user_defined_info
    except MindInsightException as err:
        log.error(err)
        if raise_exception:
            raise
def begin(self, run_context):
    """
    Initialize the training progress when the training job begins.

    The initial learning rate is resolved from the `optimizer` entry of the
    run context (or from the optimizer found on `train_network` when that
    entry is empty), then the serialized train dataset graph is written to
    the lineage summary.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageParamRunContextError: If `run_context` is not a `RunContext`.
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording the dataset graph fails.
    """
    log.info('Initialize training lineage collection...')

    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    if not self.initial_learning_rate:
        optimizer = run_context_args.get('optimizer')
        if optimizer and not isinstance(optimizer, Optimizer):
            log.error(
                "The parameter optimizer is invalid. It should be an instance of "
                "mindspore.nn.optim.optimizer.Optimizer.")
            raise MindInsightException(
                error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
        if optimizer:
            log.info('Obtaining initial learning rate...')
        else:
            # No optimizer was passed explicitly: look it up on the network.
            network = run_context_args.get('train_network')
            validate_network(network)
            optimizer = AnalyzeObject.get_optimizer_by_network(network)
        self.initial_learning_rate = AnalyzeObject.analyze_optimizer(optimizer)
        log.debug('initial_learning_rate: %s', self.initial_learning_rate)

    # get train dataset graph
    train_dataset = run_context_args.get('train_dataset')
    # Round trip through JSON text; presumably this normalizes the serialized
    # graph to plain JSON types - TODO confirm.
    dataset_graph_dict = json.loads(json.dumps(ds.serialize(train_dataset)))

    log.info('Logging dataset graph...')
    try:
        # NOTE(review): `lineage_log_path` is not assigned in the code visible
        # here - confirm the enclosing class sets it.
        lineage_summary = LineageSummary(self.lineage_log_path)
        lineage_summary.record_dataset_graph(dataset_graph=dataset_graph_dict)
    except Exception as error:
        error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
        log.error(error_msg)
        raise LineageLogError(error_msg) from error
    log.info('Dataset graph logged successfully.')
def end(self, run_context):
    """
    Collect lineage information when the evaluation job ends.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageParamRunContextError: If `run_context` is not a `RunContext`.
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.
    """
    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid EvalLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    validate_eval_run_context(EvalParameter, run_context_args)

    valid_dataset = run_context_args.get('valid_dataset')

    eval_lineage = dict()
    metrics = run_context_args.get('metrics')
    eval_lineage[Metadata.metrics] = json.dumps(metrics)
    eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

    log.info('Analyzing dataset object...')
    eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset, eval_lineage, 'valid')

    log.info('Logging evaluation job lineage...')
    try:
        # NOTE(review): `lineage_log_path` is not assigned in the code visible
        # here - confirm the enclosing class sets it.
        lineage_summary = LineageSummary(self.lineage_log_path)
        lineage_summary.record_evaluation_lineage(eval_lineage)
    except Exception as error:
        # A separate IOError branch used to precede this one with an identical
        # body; IOError is an Exception, so one handler covers both.
        error_msg = f'End error in EvalLineage: {error}'
        log.error(error_msg)
        log.error('Fail to log the lineage of the evaluation job.')
        raise LineageLogError(error_msg) from error
    log.info('The lineage of the evaluation job has logged successfully.')
def test_package_train_message(self):
    """Check the network, optimizer and dataset path of the packaged train event."""
    args = self.run_context_args
    packaged = LineageSummary.package_train_message(args)

    actual_network = packaged.train_lineage.algorithm.network
    self.assertEqual(actual_network, args.get("train_network"))

    actual_optimizer = packaged.train_lineage.hyper_parameters.optimizer
    self.assertEqual(actual_optimizer, args.get("optimizer"))

    actual_path = packaged.train_lineage.train_dataset.train_dataset_path
    self.assertEqual(actual_path, args.get("train_dataset_path"))
def test_write_event_to_file(self):
    """Write a serialized train event to `self.log_path` and read the network back."""
    args = {"train_network": "res"}
    serialized = LineageSummary.package_train_message(args).SerializeToString()

    writer = EventWriter(self.log_path, True)
    writer.write_event_to_file(serialized)

    summary_infos = LineageSummaryAnalyzer.get_summary_infos(self.log_path)
    recorded_network = summary_infos.train_lineage.train_lineage.algorithm.network
    self.assertEqual(recorded_network, args.get("train_network"))
def test_record_eval_lineage(self, write_file):
    """Record `self.eval_args` as evaluation lineage; passes if nothing raises."""
    # `write_file` is presumably a mock injected by a patch decorator that is
    # outside this view - verify against the decorator.
    write_file.return_value = True
    recorder = LineageSummary(lineage_log_dir="test.log")
    recorder.record_evaluation_lineage(self.eval_args)
def test_package_evaluation_message(self):
    """Check that the packaged evaluation event carries the metrics of `self.eval_args`."""
    args = self.eval_args
    packaged = LineageSummary.package_evaluation_message(args)
    actual_metric = packaged.evaluation_lineage.metric
    self.assertEqual(actual_metric, args.get("metrics"))
def test_record_train_lineage(self, write_file):
    """Record `self.run_context_args` as train lineage; passes if nothing raises."""
    # `write_file` is presumably a mock injected by a patch decorator that is
    # outside this view - verify against the decorator.
    write_file.return_value = True
    recorder = LineageSummary(lineage_log_dir="test.log")
    recorder.record_train_lineage(self.run_context_args)
def end(self, run_context):
    """
    Collect lineage information when the training job ends.

    Args:
        run_context (RunContext): It contains all lineage information,
            see mindspore.train.callback.RunContext.

    Raises:
        LineageParamRunContextError: If `run_context` is not a `RunContext`.
        LineageLogError: If recording lineage information fails.
    """
    log.info('Start to collect training lineage...')
    if not isinstance(run_context, RunContext):
        error_msg = 'Invalid TrainLineage run_context.'
        log.error(error_msg)
        raise LineageParamRunContextError(error_msg)

    run_context_args = run_context.original_args()
    validate_train_run_context(RunContextArgs, run_context_args)

    train_lineage = AnalyzeObject.get_network_args(run_context_args, dict())

    train_dataset = run_context_args.get('train_dataset')
    callbacks = run_context_args.get('list_callback')
    # Falls back to an empty list when the object has no `_callbacks`.
    list_callback = getattr(callbacks, '_callbacks', [])

    log.info('Obtaining model files...')
    ckpt_file_path, _ = AnalyzeObject.get_file_path(list_callback)

    train_lineage[Metadata.learning_rate] = self.initial_learning_rate
    train_lineage[Metadata.epoch] = run_context_args.get('epoch_num')
    train_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')
    train_lineage[Metadata.parallel_mode] = run_context_args.get('parallel_mode')
    train_lineage[Metadata.device_num] = run_context_args.get('device_number')
    train_lineage[Metadata.batch_size] = run_context_args.get('batch_num')
    model_path_dict = {'ckpt': ckpt_file_path}
    train_lineage[Metadata.model_path] = json.dumps(model_path_dict)

    log.info('Calculating model size...')
    train_lineage[Metadata.model_size] = AnalyzeObject.get_model_size(ckpt_file_path)
    log.debug('model_size: %s', train_lineage[Metadata.model_size])

    log.info('Analyzing dataset object...')
    train_lineage = AnalyzeObject.analyze_dataset(train_dataset, train_lineage, 'train')

    log.info('Logging lineage information...')
    try:
        # NOTE(review): `lineage_log_path` is not assigned in the code visible
        # here - confirm the enclosing class sets it.
        lineage_summary = LineageSummary(self.lineage_log_path)
        lineage_summary.record_train_lineage(train_lineage)
    except IOError as error:
        error_msg = f'End error in TrainLineage: {error}'
        log.error(error_msg)
        raise LineageLogError(error_msg) from error
    except Exception as error:
        error_msg = f'End error in TrainLineage: {error}'
        log.error(error_msg)
        log.error('Fail to log the lineage of the training job.')
        raise LineageLogError(error_msg) from error
    log.info('The lineage of the training job has logged successfully.')
class TrainLineage(Callback):
    """
    Collect lineage of a training job.

    Args:
        summary_record (Union[SummaryRecord, str]): The `SummaryRecord` object which
            is used to record the summary value(see mindspore.train.summary.SummaryRecord),
            or a log dir(as a `str`) to be passed to `LineageSummary` to create
            a lineage summary recorder. It should be noted that instead of making
            use of summary_record to record lineage info directly, we obtain
            log dir from it then create a new summary file to write lineage info.
        raise_exception (bool): Whether to raise exception when error occurs in
            TrainLineage. If True, raise exception. If False, catch exception
            and continue. Default: False.
        user_defined_info (dict): User defined information. Only flatten dict with
            str key and int/float/str value is supported. Default: None.

    Raises:
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.

    Examples:
        >>> from mindinsight.lineagemgr import TrainLineage
        >>> from mindspore.train.callback import ModelCheckpoint, SummaryStep
        >>> from mindspore.train.summary import SummaryRecord
        >>> model = Model(train_network)
        >>> model_ckpt = ModelCheckpoint(directory='/dir/to/save/model/')
        >>> summary_writer = SummaryRecord(log_dir='./')
        >>> summary_callback = SummaryStep(summary_writer, flush_step=2)
        >>> lineagemgr = TrainLineage(summary_record=summary_writer)
        >>> model.train(epoch_num, dataset, callbacks=[model_ckpt, summary_callback, lineagemgr])
    """
    def __init__(self,
                 summary_record,
                 raise_exception=False,
                 user_defined_info=None):
        super(TrainLineage, self).__init__()
        # Default first, so a validation failure below can never leave
        # unvalidated user data on the instance.
        self.user_defined_info = None
        try:
            validate_raise_exception(raise_exception)
            self.raise_exception = raise_exception

            if isinstance(summary_record, str):
                # make directory if not exist
                self.lineage_log_dir = make_directory(summary_record)
            else:
                validate_summary_record(summary_record)
                summary_log_path = summary_record.full_file_name
                validate_file_path(summary_log_path)
                self.lineage_log_dir = os.path.dirname(summary_log_path)

            self.lineage_summary = LineageSummary(self.lineage_log_dir)
            self.initial_learning_rate = None

            # Validate before storing: when raise_exception is False the
            # error is swallowed, and the rejected value must not be kept
            # (begin() would otherwise record it).
            if user_defined_info:
                validate_user_defined_info(user_defined_info)
            self.user_defined_info = user_defined_info
        except MindInsightException as err:
            log.error(err)
            if raise_exception:
                raise

    @try_except(log)
    def begin(self, run_context):
        """
        Initialize the training progress when the training job begins.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
        """
        log.info('Initialize training lineage collection...')

        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(self.user_defined_info)

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        if not self.initial_learning_rate:
            optimizer = run_context_args.get('optimizer')
            if optimizer and not isinstance(optimizer, Optimizer):
                log.error(
                    "The parameter optimizer is invalid. It should be an instance of "
                    "mindspore.nn.optim.optimizer.Optimizer.")
                raise MindInsightException(
                    error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                    message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
            if optimizer:
                log.info('Obtaining initial learning rate...')
            else:
                # No optimizer was passed explicitly: look it up on the network.
                network = run_context_args.get('train_network')
                validate_network(network)
                optimizer = AnalyzeObject.get_optimizer_by_network(network)
            self.initial_learning_rate = AnalyzeObject.analyze_optimizer(optimizer)
            log.debug('initial_learning_rate: %s', self.initial_learning_rate)

        # get train dataset graph
        train_dataset = run_context_args.get('train_dataset')
        # Round trip through JSON text; presumably this normalizes the
        # serialized graph to plain JSON types - TODO confirm.
        dataset_graph_dict = json.loads(json.dumps(ds.serialize(train_dataset)))

        log.info('Logging dataset graph...')
        try:
            self.lineage_summary.record_dataset_graph(dataset_graph=dataset_graph_dict)
        except Exception as error:
            error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
            log.error(error_msg)
            raise LineageLogError(error_msg) from error
        log.info('Dataset graph logged successfully.')

    @try_except(log)
    def end(self, run_context):
        """
        Collect lineage information when the training job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageLogError: If recording lineage information fails.
        """
        log.info('Start to collect training lineage...')
        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_train_run_context(RunContextArgs, run_context_args)

        train_lineage = AnalyzeObject.get_network_args(run_context_args, dict())

        train_dataset = run_context_args.get('train_dataset')
        callbacks = run_context_args.get('list_callback')
        # Falls back to an empty list when the object has no `_callbacks`.
        list_callback = getattr(callbacks, '_callbacks', [])

        log.info('Obtaining model files...')
        ckpt_file_path, _ = AnalyzeObject.get_file_path(list_callback)

        train_lineage[Metadata.learning_rate] = self.initial_learning_rate
        train_lineage[Metadata.epoch] = run_context_args.get('epoch_num')
        train_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')
        train_lineage[Metadata.parallel_mode] = run_context_args.get('parallel_mode')
        train_lineage[Metadata.device_num] = run_context_args.get('device_number')
        train_lineage[Metadata.batch_size] = run_context_args.get('batch_num')
        model_path_dict = {'ckpt': ckpt_file_path}
        train_lineage[Metadata.model_path] = json.dumps(model_path_dict)

        log.info('Calculating model size...')
        train_lineage[Metadata.model_size] = AnalyzeObject.get_model_size(ckpt_file_path)
        log.debug('model_size: %s', train_lineage[Metadata.model_size])

        log.info('Analyzing dataset object...')
        train_lineage = AnalyzeObject.analyze_dataset(train_dataset, train_lineage, 'train')

        log.info('Logging lineage information...')
        try:
            self.lineage_summary.record_train_lineage(train_lineage)
        except IOError as error:
            error_msg = f'End error in TrainLineage: {error}'
            log.error(error_msg)
            raise LineageLogError(error_msg) from error
        except Exception as error:
            error_msg = f'End error in TrainLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the training job.')
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the training job has logged successfully.')
class EvalLineage(Callback):
    """
    Collect lineage of an evaluation job.

    Args:
        summary_record (Union[SummaryRecord, str]): The `SummaryRecord` object which
            is used to record the summary value(see mindspore.train.summary.SummaryRecord),
            or a log dir(as a `str`) to be passed to `LineageSummary` to create
            a lineage summary recorder. It should be noted that instead of making
            use of summary_record to record lineage info directly, we obtain
            log dir from it then create a new summary file to write lineage info.
        raise_exception (bool): Whether to raise exception when error occurs in
            EvalLineage. If True, raise exception. If False, catch exception
            and continue. Default: False.
        user_defined_info (dict): User defined information. Only flatten dict with
            str key and int/float/str value is supported. Default: None.

    Raises:
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.

    Examples:
        >>> from mindinsight.lineagemgr import EvalLineage
        >>> from mindspore.train.callback import ModelCheckpoint, SummaryStep
        >>> from mindspore.train.summary import SummaryRecord
        >>> model = Model(train_network)
        >>> model_ckpt = ModelCheckpoint(directory='/dir/to/save/model/')
        >>> summary_writer = SummaryRecord(log_dir='./')
        >>> summary_callback = SummaryStep(summary_writer, flush_step=2)
        >>> lineagemgr = EvalLineage(summary_record=summary_writer)
        >>> model.eval(epoch_num, dataset, callbacks=[model_ckpt, summary_callback, lineagemgr])
    """
    def __init__(self,
                 summary_record,
                 raise_exception=False,
                 user_defined_info=None):
        super(EvalLineage, self).__init__()
        # Default first, so a validation failure below can never leave
        # unvalidated user data on the instance.
        self.user_defined_info = None
        try:
            validate_raise_exception(raise_exception)
            self.raise_exception = raise_exception

            if isinstance(summary_record, str):
                # make directory if not exist
                self.lineage_log_dir = make_directory(summary_record)
            else:
                validate_summary_record(summary_record)
                summary_log_path = summary_record.full_file_name
                validate_file_path(summary_log_path)
                self.lineage_log_dir = os.path.dirname(summary_log_path)

            self.lineage_summary = LineageSummary(self.lineage_log_dir)

            # Validate before storing: when raise_exception is False the
            # error is swallowed, and the rejected value must not be kept
            # (end() would otherwise record it).
            if user_defined_info:
                validate_user_defined_info(user_defined_info)
            self.user_defined_info = user_defined_info
        except MindInsightException as err:
            log.error(err)
            if raise_exception:
                raise

    @try_except(log)
    def end(self, run_context):
        """
        Collect lineage information when the evaluation job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
            LineageLogError: If recording lineage information fails.
        """
        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(self.user_defined_info)

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid EvalLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_eval_run_context(EvalParameter, run_context_args)

        valid_dataset = run_context_args.get('valid_dataset')

        eval_lineage = dict()
        metrics = run_context_args.get('metrics')
        eval_lineage[Metadata.metrics] = json.dumps(metrics)
        eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

        log.info('Analyzing dataset object...')
        eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset, eval_lineage, 'valid')

        log.info('Logging evaluation job lineage...')
        try:
            self.lineage_summary.record_evaluation_lineage(eval_lineage)
        except Exception as error:
            # A separate IOError branch used to precede this one with an
            # identical body; IOError is an Exception, so one handler
            # covers both.
            error_msg = f'End error in EvalLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the evaluation job.')
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the evaluation job has logged successfully.')