Пример #1
0
    def __init__(self,
                 summary_record,
                 raise_exception=False,
                 user_defined_info=None):
        """
        Resolve the lineage log directory and create the lineage summary.

        Args:
            summary_record (Union[SummaryRecord, str]): Either a log dir given
                as a `str`, or an object exposing `full_file_name`; in the
                latter case the directory of that file is used.
            raise_exception (bool): When truthy, a MindInsightException caught
                here is re-raised after being logged; otherwise it is only
                logged. Default: False.
            user_defined_info (dict): User defined information. It is validated
                only when it is non-empty. Default: None.

        Raises:
            MindInsightException: If a step below raises it and
                `raise_exception` is truthy.
        """
        super(TrainLineage, self).__init__()
        try:
            # Attributes are assigned step by step inside the try block, so a
            # failing step leaves every later attribute unset.
            validate_raise_exception(raise_exception)
            self.raise_exception = raise_exception

            if not isinstance(summary_record, str):
                # Summary record object: reuse the directory of its file.
                validate_summary_record(summary_record)
                record_file = summary_record.full_file_name
                validate_file_path(record_file)
                log_dir = os.path.dirname(record_file)
            else:
                # Plain path: make directory if not exist.
                log_dir = make_directory(summary_record)
            self.lineage_log_dir = log_dir

            self.lineage_summary = LineageSummary(log_dir)

            self.initial_learning_rate = None

            self.user_defined_info = user_defined_info
            if user_defined_info:
                validate_user_defined_info(user_defined_info)

        except MindInsightException as err:
            log.error(err)
            if raise_exception:
                raise
Пример #2
0
    def begin(self, run_context):
        """
        Prepare lineage collection when the training job begins.

        Determines the initial learning rate (unless one is already stored on
        the instance) and records the serialized training dataset graph.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageParamRunContextError: If run_context is not a RunContext.
            MindInsightException: If validating parameter fails.
            LineageLogError: If recording the dataset graph fails.
        """
        log.info('Initialize training lineage collection...')

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        context_args = run_context.original_args()
        if not self.initial_learning_rate:
            optimizer = context_args.get('optimizer')
            if optimizer:
                if not isinstance(optimizer, Optimizer):
                    log.error(
                        "The parameter optimizer is invalid. It should be an instance of "
                        "mindspore.nn.optim.optimizer.Optimizer.")
                    raise MindInsightException(
                        error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                        message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
                log.info('Obtaining initial learning rate...')
            else:
                # No optimizer in the run context: take it from the network.
                network = context_args.get('train_network')
                validate_network(network)
                optimizer = AnalyzeObject.get_optimizer_by_network(network)
            self.initial_learning_rate = AnalyzeObject.analyze_optimizer(
                optimizer)
            log.debug('initial_learning_rate: %s',
                      self.initial_learning_rate)

        # get train dataset graph
        graph = ds.serialize(context_args.get('train_dataset'))
        # Dump the graph to JSON text and parse it back before recording.
        graph = json.loads(json.dumps(graph, indent=2))
        log.info('Logging dataset graph...')
        try:
            summary = LineageSummary(self.lineage_log_path)
            summary.record_dataset_graph(dataset_graph=graph)
        except Exception as error:
            error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
            log.error(error_msg)
            raise LineageLogError(error_msg)
        log.info('Dataset graph logged successfully.')
Пример #3
0
    def end(self, run_context):
        """
        Collect lineage information when the evaluation job ends.

        Packs the metrics, the current step number and the analyzed
        validation dataset into one record and writes it to the lineage
        summary.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageParamRunContextError: If run_context is not a RunContext.
            MindInsightException: If validating parameter fails.
            LineageLogError: If recording lineage information fails.
        """
        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid EvalLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_eval_run_context(EvalParameter, run_context_args)

        valid_dataset = run_context_args.get('valid_dataset')

        eval_lineage = dict()
        metrics = run_context_args.get('metrics')
        eval_lineage[Metadata.metrics] = json.dumps(metrics)
        eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

        log.info('Analyzing dataset object...')
        eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset,
                                                     eval_lineage, 'valid')

        log.info('Logging evaluation job lineage...')
        try:
            lineage_summary = LineageSummary(self.lineage_log_path)
            lineage_summary.record_evaluation_lineage(eval_lineage)
        except Exception as error:
            # IOError is a subclass of Exception and used to have an identical
            # handler of its own, so one handler covers both cases.
            error_msg = f'End error in EvalLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the evaluation job.')
            # Chain the original error so its traceback stays attached.
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the evaluation job has logged successfully.')
Пример #4
0
 def test_package_train_message(self):
     """Check network, optimizer and dataset path of the packaged train event."""
     context_args = self.run_context_args
     event = LineageSummary.package_train_message(context_args)

     packed_network = event.train_lineage.algorithm.network
     self.assertEqual(packed_network, context_args.get("train_network"))

     packed_optimizer = event.train_lineage.hyper_parameters.optimizer
     self.assertEqual(packed_optimizer, context_args.get("optimizer"))

     packed_path = event.train_lineage.train_dataset.train_dataset_path
     self.assertEqual(packed_path, context_args.get("train_dataset_path"))
Пример #5
0
    def test_write_event_to_file(self):
        """Write a packaged train event to the log file and read it back."""
        expected_args = {"train_network": "res"}
        train_event = LineageSummary.package_train_message(expected_args)
        serialized = train_event.SerializeToString()
        writer = EventWriter(self.log_path, True)
        writer.write_event_to_file(serialized)

        summary_info = LineageSummaryAnalyzer.get_summary_infos(self.log_path)
        recorded_network = \
            summary_info.train_lineage.train_lineage.algorithm.network
        self.assertEqual(recorded_network,
                         expected_args.get("train_network"))
Пример #6
0
 def test_record_eval_lineage(self, write_file):
     """Record evaluation lineage while `write_file` is set to return True."""
     write_file.return_value = True
     LineageSummary(lineage_log_dir="test.log").record_evaluation_lineage(
         self.eval_args)
Пример #7
0
 def test_package_evaluation_message(self):
     """Check that the packaged evaluation event carries the given metrics."""
     eval_args = self.eval_args
     packed_event = LineageSummary.package_evaluation_message(eval_args)
     packed_metric = packed_event.evaluation_lineage.metric
     self.assertEqual(packed_metric, eval_args.get("metrics"))
Пример #8
0
 def test_record_train_lineage(self, write_file):
     """Record train lineage while `write_file` is set to return True."""
     write_file.return_value = True
     LineageSummary(lineage_log_dir="test.log").record_train_lineage(
         self.run_context_args)
Пример #9
0
    def end(self, run_context):
        """
        Gather and record the training lineage when the training job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageParamRunContextError: If run_context is not a RunContext.
            LineageLogError: If recording lineage information fails.
        """
        log.info('Start to collect training lineage...')
        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        context_args = run_context.original_args()
        validate_train_run_context(RunContextArgs, context_args)

        lineage = AnalyzeObject.get_network_args(context_args, dict())

        dataset = context_args.get('train_dataset')
        callback_holder = context_args.get('list_callback')
        callback_list = getattr(callback_holder, '_callbacks', [])

        log.info('Obtaining model files...')
        ckpt_path, _ = AnalyzeObject.get_file_path(callback_list)

        # Values are read in the listed order, then stored under their keys.
        entries = (
            (Metadata.learning_rate, self.initial_learning_rate),
            (Metadata.epoch, context_args.get('epoch_num')),
            (Metadata.step_num, context_args.get('cur_step_num')),
            (Metadata.parallel_mode, context_args.get('parallel_mode')),
            (Metadata.device_num, context_args.get('device_number')),
            (Metadata.batch_size, context_args.get('batch_num')),
            (Metadata.model_path, json.dumps({'ckpt': ckpt_path})),
        )
        for key, value in entries:
            lineage[key] = value

        log.info('Calculating model size...')
        model_size = AnalyzeObject.get_model_size(ckpt_path)
        lineage[Metadata.model_size] = model_size
        log.debug('model_size: %s', lineage[Metadata.model_size])

        log.info('Analyzing dataset object...')
        lineage = AnalyzeObject.analyze_dataset(dataset, lineage, 'train')

        log.info('Logging lineage information...')
        try:
            summary = LineageSummary(self.lineage_log_path)
            summary.record_train_lineage(lineage)
        except Exception as error:
            error_msg = f'End error in TrainLineage: {error}'
            log.error(error_msg)
            # Only non-IOError failures emit the extra failure line.
            if not isinstance(error, IOError):
                log.error('Fail to log the lineage of the training job.')
            raise LineageLogError(error_msg)
        log.info('The lineage of the training job has logged successfully.')
Пример #10
0
class TrainLineage(Callback):
    """
    Callback that gathers the lineage of a training job.

    Args:
        summary_record (Union[SummaryRecord, str]): A `SummaryRecord` object
            (see mindspore.train.summary.SummaryRecord) or a log dir given as
            a `str`. The record is not used to write lineage info directly;
            only its log dir is taken, and `LineageSummary` creates a new
            summary file there to hold the lineage info.
        raise_exception (bool): Whether to raise exception when error occurs in
            TrainLineage. If True, raise exception. If False, catch exception
            and continue. Default: False.
        user_defined_info (dict): User defined information. Only flatten dict with
            str key and int/float/str value is supported. Default: None.

    Raises:
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.

    Examples:
        >>> from mindinsight.lineagemgr import TrainLineage
        >>> from mindspore.train.callback import ModelCheckpoint, SummaryStep
        >>> from mindspore.train.summary import SummaryRecord
        >>> model = Model(train_network)
        >>> model_ckpt = ModelCheckpoint(directory='/dir/to/save/model/')
        >>> summary_writer = SummaryRecord(log_dir='./')
        >>> summary_callback = SummaryStep(summary_writer, flush_step=2)
        >>> lineagemgr = TrainLineage(summary_record=summary_writer)
        >>> model.train(epoch_num, dataset, callbacks=[model_ckpt, summary_callback, lineagemgr])
    """
    def __init__(self,
                 summary_record,
                 raise_exception=False,
                 user_defined_info=None):
        super().__init__()
        try:
            # Attributes are assigned step by step inside the try block, so a
            # failing step leaves every later attribute unset.
            validate_raise_exception(raise_exception)
            self.raise_exception = raise_exception

            if not isinstance(summary_record, str):
                # Summary record object: reuse the directory of its file.
                validate_summary_record(summary_record)
                record_file = summary_record.full_file_name
                validate_file_path(record_file)
                log_dir = os.path.dirname(record_file)
            else:
                # Plain path: make directory if not exist.
                log_dir = make_directory(summary_record)
            self.lineage_log_dir = log_dir

            self.lineage_summary = LineageSummary(log_dir)

            self.initial_learning_rate = None

            self.user_defined_info = user_defined_info
            if user_defined_info:
                validate_user_defined_info(user_defined_info)

        except MindInsightException as err:
            log.error(err)
            if raise_exception:
                raise

    @try_except(log)
    def begin(self, run_context):
        """
        Prepare lineage collection when the training job begins.

        Records the user defined info (if any), determines the initial
        learning rate (unless one is already stored) and records the
        serialized training dataset graph.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
        """
        log.info('Initialize training lineage collection...')

        user_info = self.user_defined_info
        if user_info:
            self.lineage_summary.record_user_defined_info(user_info)

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        context_args = run_context.original_args()
        if not self.initial_learning_rate:
            optimizer = context_args.get('optimizer')
            if optimizer:
                if not isinstance(optimizer, Optimizer):
                    log.error(
                        "The parameter optimizer is invalid. It should be an instance of "
                        "mindspore.nn.optim.optimizer.Optimizer.")
                    raise MindInsightException(
                        error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                        message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
                log.info('Obtaining initial learning rate...')
            else:
                # No optimizer in the run context: take it from the network.
                network = context_args.get('train_network')
                validate_network(network)
                optimizer = AnalyzeObject.get_optimizer_by_network(network)
            self.initial_learning_rate = AnalyzeObject.analyze_optimizer(
                optimizer)
            log.debug('initial_learning_rate: %s',
                      self.initial_learning_rate)

        # get train dataset graph
        graph = ds.serialize(context_args.get('train_dataset'))
        # Dump the graph to JSON text and parse it back before recording.
        graph = json.loads(json.dumps(graph, indent=2))
        log.info('Logging dataset graph...')
        try:
            self.lineage_summary.record_dataset_graph(dataset_graph=graph)
        except Exception as error:
            error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
            log.error(error_msg)
            raise LineageLogError(error_msg)
        log.info('Dataset graph logged successfully.')

    @try_except(log)
    def end(self, run_context):
        """
        Gather and record the training lineage when the training job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageLogError: If recording lineage information fails.
        """
        log.info('Start to collect training lineage...')
        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        context_args = run_context.original_args()
        validate_train_run_context(RunContextArgs, context_args)

        lineage = AnalyzeObject.get_network_args(context_args, dict())

        dataset = context_args.get('train_dataset')
        callback_holder = context_args.get('list_callback')
        callback_list = getattr(callback_holder, '_callbacks', [])

        log.info('Obtaining model files...')
        ckpt_path, _ = AnalyzeObject.get_file_path(callback_list)

        # Values are read in the listed order, then stored under their keys.
        entries = (
            (Metadata.learning_rate, self.initial_learning_rate),
            (Metadata.epoch, context_args.get('epoch_num')),
            (Metadata.step_num, context_args.get('cur_step_num')),
            (Metadata.parallel_mode, context_args.get('parallel_mode')),
            (Metadata.device_num, context_args.get('device_number')),
            (Metadata.batch_size, context_args.get('batch_num')),
            (Metadata.model_path, json.dumps({'ckpt': ckpt_path})),
        )
        for key, value in entries:
            lineage[key] = value

        log.info('Calculating model size...')
        model_size = AnalyzeObject.get_model_size(ckpt_path)
        lineage[Metadata.model_size] = model_size
        log.debug('model_size: %s', lineage[Metadata.model_size])

        log.info('Analyzing dataset object...')
        lineage = AnalyzeObject.analyze_dataset(dataset, lineage, 'train')

        log.info('Logging lineage information...')
        try:
            self.lineage_summary.record_train_lineage(lineage)
        except Exception as error:
            error_msg = f'End error in TrainLineage: {error}'
            log.error(error_msg)
            # Only non-IOError failures emit the extra failure line.
            if not isinstance(error, IOError):
                log.error('Fail to log the lineage of the training job.')
            raise LineageLogError(error_msg)
        log.info('The lineage of the training job has logged successfully.')
Пример #11
0
class EvalLineage(Callback):
    """
    Collect lineage of an evaluation job.

    Args:
        summary_record (Union[SummaryRecord, str]): The `SummaryRecord` object which
            is used to record the summary value(see mindspore.train.summary.SummaryRecord),
            or a log dir(as a `str`) to be passed to `LineageSummary` to create
            a lineage summary recorder. It should be noted that instead of making
            use of summary_record to record lineage info directly, we obtain
            log dir from it then create a new summary file to write lineage info.
        raise_exception (bool): Whether to raise exception when error occurs in
            EvalLineage. If True, raise exception. If False, catch exception
            and continue. Default: False.
        user_defined_info (dict): User defined information. Only flatten dict with
            str key and int/float/str value is supported. Default: None.

    Raises:
        MindInsightException: If validating parameter fails.
        LineageLogError: If recording lineage information fails.

    Examples:
        >>> from mindinsight.lineagemgr import EvalLineage
        >>> from mindspore.train.callback import ModelCheckpoint, SummaryStep
        >>> from mindspore.train.summary import SummaryRecord
        >>> model = Model(train_network)
        >>> model_ckpt = ModelCheckpoint(directory='/dir/to/save/model/')
        >>> summary_writer = SummaryRecord(log_dir='./')
        >>> summary_callback = SummaryStep(summary_writer, flush_step=2)
        >>> lineagemgr = EvalLineage(summary_record=summary_writer)
        >>> model.eval(epoch_num, dataset, callbacks=[model_ckpt, summary_callback, lineagemgr])
    """
    def __init__(self,
                 summary_record,
                 raise_exception=False,
                 user_defined_info=None):
        super(EvalLineage, self).__init__()
        try:
            # Attributes are assigned step by step inside the try block, so a
            # failing step leaves every later attribute unset.
            validate_raise_exception(raise_exception)
            self.raise_exception = raise_exception

            if isinstance(summary_record, str):
                # make directory if not exist
                self.lineage_log_dir = make_directory(summary_record)
            else:
                # Summary record object: reuse the directory of its file.
                validate_summary_record(summary_record)
                summary_log_path = summary_record.full_file_name
                validate_file_path(summary_log_path)
                self.lineage_log_dir = os.path.dirname(summary_log_path)

            self.lineage_summary = LineageSummary(self.lineage_log_dir)

            self.user_defined_info = user_defined_info
            if self.user_defined_info:
                validate_user_defined_info(self.user_defined_info)

        except MindInsightException as err:
            log.error(err)
            if raise_exception:
                raise

    @try_except(log)
    def end(self, run_context):
        """
        Collect lineage information when the evaluation job ends.

        Records the user defined info (if any), then packs the metrics, the
        current step number and the analyzed validation dataset into one
        record and writes it to the lineage summary.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
            LineageLogError: If recording lineage information fails.
        """
        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(
                self.user_defined_info)

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid EvalLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_eval_run_context(EvalParameter, run_context_args)

        valid_dataset = run_context_args.get('valid_dataset')

        eval_lineage = dict()
        metrics = run_context_args.get('metrics')
        eval_lineage[Metadata.metrics] = json.dumps(metrics)
        eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

        log.info('Analyzing dataset object...')
        eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset,
                                                     eval_lineage, 'valid')

        log.info('Logging evaluation job lineage...')
        try:
            self.lineage_summary.record_evaluation_lineage(eval_lineage)
        except Exception as error:
            # IOError is a subclass of Exception and used to have an identical
            # handler of its own, so one handler covers both cases.
            error_msg = f'End error in EvalLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the evaluation job.')
            # Chain the original error so its traceback stays attached.
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the evaluation job has logged successfully.')