예제 #1
0
    def _parse_summary_logs(self, summary_path):
        """
        Parse one or more summary logs.

        Args:
            summary_path (Union[str, list[str]]): The single summary log path or
                a list of summary log path.

        Raises:
            LineageQuerierParamException: If the summary path is empty.
            LineageParamTypeError: If the summary path is neither str nor list.
            LineageSummaryParseException: If no lineage object could be parsed.
        """
        if not summary_path:
            raise LineageQuerierParamException('summary_path',
                                               'The summary path is empty.')
        if isinstance(summary_path, str):
            self._parse_summary_log(summary_path, 0)
        elif isinstance(summary_path, list):
            # Only successfully parsed logs advance the index.
            success_count = 0
            for single_path in summary_path:
                if self._parse_summary_log(single_path, success_count):
                    success_count += 1
        else:
            raise LineageParamTypeError('Summary path is not str or list.')

        if self._parse_failed_paths:
            logger.info('Parse failed paths: %s',
                        str(self._parse_failed_paths))

        if not self._lineage_objects:
            raise LineageSummaryParseException()
예제 #2
0
    def _organize_from_disk(self):
        """Organize lineage objs from disk."""
        if self._summary_base_dir is None:
            return
        watcher = SummaryWatcher()
        summary_dirs = watcher.list_summary_directories(
            summary_base_dir=self._summary_base_dir)

        missing_lineage_count = 0
        for summary_info in summary_dirs:
            abs_summary_dir = os.path.realpath(os.path.join(
                self._summary_base_dir, summary_info.get('relative_path')))

            try:
                parser = LineageParser(
                    abs_summary_dir, summary_info.get('update_time'))
                lineage_obj = parser.super_lineage_obj
                if lineage_obj is not None:
                    self._super_lineage_objs.update(
                        {abs_summary_dir: lineage_obj})
            except LineageFileNotFoundError:
                missing_lineage_count += 1

        # Every directory lacked lineage info: surface it as an error.
        if missing_lineage_count == len(summary_dirs):
            logger.info('There is no summary log file under summary_base_dir.')
            raise LineageFileNotFoundError(
                'There is no summary log file under summary_base_dir.')
예제 #3
0
    def load(self):
        """Find and load summaries."""
        # get sorted lineage files
        lineage_files = SummaryPathParser.get_lineage_summaries(self._summary_dir, is_sorted=True)
        if not lineage_files:
            logger.info('There is no summary log file under summary_dir %s.', self._summary_dir)
            raise LineageFileNotFoundError(
                'There is no summary log file under summary_dir.'
            )
        self._init_if_files_deleted(lineage_files)

        # Resume from the file parsed last time, if any.
        start_index = 0
        if self._latest_filename is not None:
            start_index = lineage_files.index(self._latest_filename)

        for current_name in lineage_files[start_index:]:
            if current_name != self._latest_filename:
                # Switched to a newer file; start tracking it from zero bytes.
                self._latest_filename = current_name
                self._latest_file_size = 0

            current_path = os.path.join(self._summary_dir, current_name)
            current_size = FileHandler(current_path).size
            if current_size == self._latest_file_size:
                # Unchanged since the last load; nothing new to parse.
                continue

            self._latest_file_size = current_size
            try:
                self._parse_summary_log()
            except (LineageSummaryAnalyzeException,
                    LineageEventNotExistException,
                    LineageEventFieldNotExistException) as error:
                logger.debug("Parse file failed, file_path is %s. Detail: %s", current_path, str(error))
            except MindInsightException as error:
                logger.exception(error)
                logger.debug("Parse file failed, file_path is %s.", current_path)
예제 #4
0
    def load(self):
        """Find and load summaries."""
        # get sorted lineage files
        lineage_files = SummaryPathParser.get_lineage_summaries(
            self._summary_dir, is_sorted=True)
        if not lineage_files:
            logger.info('There is no summary log file under summary_dir %s.',
                        self._summary_dir)
            raise LineageFileNotFoundError(
                'There is no summary log file under summary_dir.')
        self._init_if_files_deleted(lineage_files)

        # Skip files already handled during the previous load.
        start = 0
        if self._latest_filename is not None:
            start = lineage_files.index(self._latest_filename)

        for name in lineage_files[start:]:
            if name != self._latest_filename:
                # A newer file appeared; begin tracking it from the start.
                self._latest_filename = name
                self._latest_file_size = 0

            path = os.path.join(self._summary_dir, name)
            size = FileHandler(path).size
            if size == self._latest_file_size:
                # File did not grow; no new events to parse.
                continue

            self._latest_file_size = size
            self._parse_summary_log()
예제 #5
0
 def _delete_lineage_in_cache(self, cache_item, key, relative_path):
     """Delete the lineage entry of a train job from the cache.

     Args:
         cache_item: Cache item holding the train job data.
         key: Cache key of the lineage entry to delete.
         relative_path: Relative path of the train job, used for logging.
     """
     with cache_item.lock_key(key):
         try:
             cache_item.delete(key=key)
             # Fixed typo in message: "tran job" -> "train job".
             logger.info("Parse failed, delete the train job %s.", relative_path)
         except ParamValueError:
             # Entry was never cached; deletion is a no-op.
             logger.debug("Parse failed, and it is not in cache, "
                          "no need to delete the train job %s.", relative_path)
예제 #6
0
    def _init_if_files_deleted(self, file_list):
        """Init variables if files deleted.

        Args:
            file_list (list): Current list of lineage summary file names.
        """
        cached_file_list = self._cached_file_list
        self._cached_file_list = file_list
        if cached_file_list is None:
            # First load: nothing cached yet, so nothing can have been deleted.
            return

        deleted_files = set(cached_file_list) - set(file_list)
        if deleted_files:
            # Fixed grammar in the log message ("There are some files has been deleted").
            logger.info("Some files have been deleted, "
                        "all files will be reloaded in path %s.", self._summary_dir)
            self._init_variables()
예제 #7
0
    def begin(self, run_context):
        """
        Initialize the training progress when the training job begins.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
            LineageParamRunContextError: If run_context is not a RunContext.
            LineageLogError: If recording the dataset graph fails.
        """
        log.info('Initialize training lineage collection...')

        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(
                self.user_defined_info)

        if not isinstance(run_context, RunContext):
            # Constant message: no f-string prefix needed.
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        if not self.initial_learning_rate:
            optimizer = run_context_args.get('optimizer')
            if optimizer and not isinstance(optimizer, Optimizer):
                log.error(
                    "The parameter optimizer is invalid. It should be an instance of "
                    "mindspore.nn.optim.optimizer.Optimizer.")
                raise MindInsightException(
                    error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                    message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
            if optimizer:
                log.info('Obtaining initial learning rate...')
            else:
                # Fall back to extracting the optimizer from the train network.
                network = run_context_args.get('train_network')
                optimizer = AnalyzeObject.get_optimizer_by_network(network)
            self.initial_learning_rate = AnalyzeObject.analyze_optimizer(
                optimizer)
            log.debug('initial_learning_rate: %s',
                      self.initial_learning_rate)

        # get train dataset graph
        train_dataset = run_context_args.get('train_dataset')
        dataset_graph_dict = ds.serialize(train_dataset)
        # Round-trip through JSON to normalize the graph into plain dict/list types.
        dataset_graph_json_str = json.dumps(dataset_graph_dict, indent=2)
        dataset_graph_dict = json.loads(dataset_graph_json_str)
        log.info('Logging dataset graph...')
        try:
            self.lineage_summary.record_dataset_graph(
                dataset_graph=dataset_graph_dict)
        except Exception as error:
            error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
            log.error(error_msg)
            # Chain the original exception so the root cause is preserved.
            raise LineageLogError(error_msg) from error
        log.info('Dataset graph logged successfully.')
예제 #8
0
 def _organize_from_cache(self):
     """Organize lineage objs from cache."""
     if self._data_manager is None:
         return
     brief_cache = self._data_manager.get_brief_cache()
     cache_items = brief_cache.cache_items
     for relative_dir, cache_train_job in cache_items.items():
         try:
             super_lineage_obj = cache_train_job.get(
                 "lineage").super_lineage_obj
             self._super_lineage_objs.update(
                 {relative_dir: super_lineage_obj})
         except ParamValueError:
             # Fixed wording: "This is no" -> "There is no".
             logger.info("There is no lineage info in train job %s.",
                         relative_dir)
예제 #9
0
    def get_network_args(run_context_args, train_lineage):
        """
        Get the parameters related to the network,
        such as optimizer, loss function.

        Args:
            run_context_args (dict): It contains all information of the training job.
            train_lineage (dict): A dict contains lineage metadata.

        Returns:
            dict, the lineage metadata.
        """
        network = run_context_args.get('train_network')
        validate_network(network)

        optimizer = run_context_args.get('optimizer') \
            or AnalyzeObject.get_optimizer_by_network(network)

        loss_fn = run_context_args.get('loss_fn')
        if loss_fn:
            loss = run_context_args.get('net_outputs')
        else:
            # No explicit loss function: derive it from the network and skip loss.
            loss_fn = AnalyzeObject.get_loss_fn_by_network(network)
            loss = None

        if loss:
            log.info('Calculating loss...')
            loss_value = float(np.atleast_1d(loss.asnumpy())[0])
            log.debug('loss: %s', loss_value)
            train_lineage[Metadata.loss] = loss_value
        else:
            train_lineage[Metadata.loss] = None

        # Record class names of the optimizer, loss function and backbone network.
        train_lineage[Metadata.optimizer] = type(optimizer).__name__ if optimizer else None
        train_lineage[Metadata.train_network] = AnalyzeObject.get_backbone_network(network)
        train_lineage[Metadata.loss_function] = type(loss_fn).__name__ if loss_fn else None

        return train_lineage
예제 #10
0
def _get_columns_name(lineage_objects):
    """Get columns name.

    Collects the union of model lineage keys (with metric/user_defined keys
    expanded under their prefixes) over all lineage objects, capping the
    total number of user_defined columns at USER_DEFINED_INFO_LIMIT.

    Args:
        lineage_objects (list): Lineage dicts, each with a "model_lineage" dict.

    Returns:
        set, the column names, always including "train_id".
    """
    column_names = set()
    user_defined_num = 0
    for lineage in lineage_objects:
        model_lineage = lineage.get("model_lineage", {})
        metric = model_lineage.get("metric", {})
        metric_names = tuple('{}{}'.format(_METRIC_PREFIX, key)
                             for key in metric.keys())
        user_defined = model_lineage.get("user_defined", {})
        user_defined_names = tuple('{}{}'.format(_USER_DEFINED_PREFIX, key)
                                   for key in user_defined.keys())
        # Bug fix: the original removed items from the list it was iterating,
        # which skips an element when "metric" and "user_defined" are adjacent.
        plain_keys = [key for key in model_lineage
                      if key not in ("metric", "user_defined")]
        column_names.update(plain_keys)
        column_names.update(metric_names)

        remaining = USER_DEFINED_INFO_LIMIT - user_defined_num
        if len(user_defined_names) <= remaining:
            column_names.update(user_defined_names)
            user_defined_num += len(user_defined_names)
        elif remaining > 0:
            # Only part of this object's user_defined columns fit under the cap.
            column_names.update(user_defined_names[:remaining])
            user_defined_num += remaining
            log.info(
                "Partial user_defined_info is deleted. Currently saved length is: %s.",
                user_defined_num)
        else:
            log.info(
                "The quantity of user_defined_info has reached the limit %s.",
                USER_DEFINED_INFO_LIMIT)
    column_names.update(["train_id"])

    return column_names
예제 #11
0
    def update_item(self, cache_item: CachedTrainJob):
        """Update cache item in place.

        Args:
            cache_item (CachedTrainJob): The cached train job to refresh.
        """
        summary_base_dir = cache_item.summary_base_dir
        summary_dir = cache_item.abs_summary_dir

        # The summary_base_dir and summary_dir have been normalized in data_manager.
        if summary_base_dir == summary_dir:
            relative_path = "./"
        else:
            relative_path = f'./{os.path.basename(summary_dir)}'

        try:
            lineage_parser = self._lineage_parsing(cache_item)
        except LineageFileNotFoundError:
            # The summary files are gone; drop the stale lineage entry.
            with cache_item.lock_key(LINEAGE):
                cache_item.delete(key=LINEAGE)
            return

        super_lineage_obj = lineage_parser.super_lineage_obj
        if super_lineage_obj is None:
            # Fixed typo in message: "tran job" -> "train job".
            logger.info("There is no lineage to update in train job %s.", relative_path)
            return

        cache_item.set(key=LINEAGE, value=lineage_parser)
예제 #12
0
    def end(self, run_context):
        """
        Collect lineage information when the training job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
            LineageParamRunContextError: If run_context is not a RunContext.
            LineageLogError: If recording lineage information fails.
        """
        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(
                self.user_defined_info)

        if not isinstance(run_context, RunContext):
            # Constant message: no f-string prefix needed.
            error_msg = 'Invalid EvalLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_eval_run_context(EvalParameter, run_context_args)

        valid_dataset = run_context_args.get('valid_dataset')

        eval_lineage = dict()
        metrics = run_context_args.get('metrics')
        eval_lineage[Metadata.metrics] = json.dumps(metrics)
        eval_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')

        log.info('Analyzing dataset object...')
        eval_lineage = AnalyzeObject.analyze_dataset(valid_dataset,
                                                     eval_lineage, 'valid')

        log.info('Logging evaluation job lineage...')
        try:
            self.lineage_summary.record_evaluation_lineage(eval_lineage)
        except Exception as error:
            # The previous separate IOError clause was identical to this one
            # (IOError is a subclass of Exception), so a single handler suffices.
            error_msg = f'End error in EvalLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the evaluation job.')
            # Chain the original exception so the root cause is preserved.
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the evaluation job has logged successfully.')
예제 #13
0
    def end(self, run_context):
        """
        Collect lineage information when the training job ends.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            LineageParamRunContextError: If run_context is not a RunContext.
            LineageLogError: If recording lineage information fails.
        """
        log.info('Start to collect training lineage...')
        if not isinstance(run_context, RunContext):
            # Constant message: no f-string prefix needed.
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        validate_train_run_context(RunContextArgs, run_context_args)

        train_lineage = dict()
        train_lineage = AnalyzeObject.get_network_args(run_context_args,
                                                       train_lineage)

        train_dataset = run_context_args.get('train_dataset')
        callbacks = run_context_args.get('list_callback')
        list_callback = getattr(callbacks, '_callbacks', [])

        log.info('Obtaining model files...')
        ckpt_file_path, _ = AnalyzeObject.get_file_path(list_callback)

        train_lineage[Metadata.learning_rate] = self.initial_learning_rate
        train_lineage[Metadata.epoch] = run_context_args.get('epoch_num')
        train_lineage[Metadata.step_num] = run_context_args.get('cur_step_num')
        train_lineage[Metadata.parallel_mode] = run_context_args.get(
            'parallel_mode')
        train_lineage[Metadata.device_num] = run_context_args.get(
            'device_number')
        train_lineage[Metadata.batch_size] = run_context_args.get('batch_num')
        model_path_dict = {'ckpt': ckpt_file_path}
        train_lineage[Metadata.model_path] = json.dumps(model_path_dict)

        log.info('Calculating model size...')
        train_lineage[Metadata.model_size] = AnalyzeObject.get_model_size(
            ckpt_file_path)
        log.debug('model_size: %s', train_lineage[Metadata.model_size])

        log.info('Analyzing dataset object...')
        train_lineage = AnalyzeObject.analyze_dataset(train_dataset,
                                                      train_lineage, 'train')

        log.info('Logging lineage information...')
        try:
            lineage_summary = LineageSummary(self.lineage_log_path)
            lineage_summary.record_train_lineage(train_lineage)
        except Exception as error:
            # Single handler replaces the redundant IOError clause (IOError is
            # a subclass of Exception); the IOError path previously and
            # inconsistently skipped the "Fail to log..." message.
            error_msg = f'End error in TrainLineage: {error}'
            log.error(error_msg)
            log.error('Fail to log the lineage of the training job.')
            # Chain the original exception so the root cause is preserved.
            raise LineageLogError(error_msg) from error
        log.info('The lineage of the training job has logged successfully.')