Example #1
def phase_pb_file(file_path: str) -> Union[MSGraph, None]:
    """
    Parse a pb file into a graph.

    Args:
        file_path (str): The file path of the pb file.

    Returns:
        MSGraph, the graph if the pb file is loaded and built successfully; otherwise None.
    """
    if not CONFIG.VERBOSE:
        logger.setLevel(logging.ERROR)
    logger.info("Start to load graph from pb file, file path: %s.", file_path)
    model_proto = anf_ir_pb2.ModelProto()
    try:
        model_proto.ParseFromString(FileHandler(file_path).read())
    except ParseError:
        logger.warning("The given file is not a valid pb file, file path: %s.",
                       file_path)
        return None

    graph = MSGraph()

    try:
        graph.build_graph(model_proto.graph)
    except Exception as ex:
        logger.error("Build graph failed, file path: %s.", file_path)
        logger.exception(ex)
        raise UnknownError(str(ex))

    logger.info("Build graph success, file path: %s.", file_path)
    return graph
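A minimal usage sketch for the function above; the pb-file path is a placeholder and the module's own imports (MSGraph, FileHandler, anf_ir_pb2) are assumed to be in scope:

# Hypothetical usage; '/path/to/model.pb' is a placeholder path.
graph = phase_pb_file('/path/to/model.pb')
if graph is None:
    print('Could not build a graph from the given pb file.')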
Example #2
    def _update_node_name_of_cache(self, node, new_name, update_parent=False):
        """
        Update a node name which is stored in cache.

        Args:
            node (Node): The node that will be renamed.
            new_name (str): The new name.
            update_parent (bool): Determines whether the input and output of the parent node need to be updated.
        """
        logger.debug('Update node name of cache, node(%s), new name is %s.', str(node), new_name)
        origin_name = node.name
        node.name = new_name

        # Find all nodes whose input and output need to be updated
        update_node_map = {}
        for method in ['input', 'output', 'proxy_input', 'proxy_output']:
            for target_name in getattr(node, method):
                target_node = self._get_normal_node(node_name=target_name)
                if target_node is None:
                    message = f"Node should not be None, name: {target_name}, {method}: {list(getattr(node, method))}."
                    logger.error(message)
                    continue

                update_node_map.update({target_name: target_node})

                if not update_parent:
                    continue

                slash_index = target_name.find('/')
                while slash_index != -1:
                    scope_name = target_name[:slash_index]
                    slash_index = target_name.find('/', slash_index+1)

                    if update_node_map.get(scope_name):
                        continue

                    scope_node = self._get_normal_node(node_name=scope_name)
                    if scope_node is None:
                        message = f"Can not find the scope node by scope name({scope_name}), " \
                                  f"may be this scope node has not been built."
                        logger.debug(message)
                        continue

                    update_node_map.update({scope_name: scope_node})

        # Update the input and output of the nodes
        for target_node in update_node_map.values():
            for method in ['input', 'output', 'proxy_input', 'proxy_output']:
                attr_temp = getattr(target_node, method).get(origin_name)
                if attr_temp is None:
                    # The target node's relation of this kind does not reference the origin node, so skip it.
                    continue

                # Delete the old entry and re-add it under the new name on the source or destination node.
                getattr(target_node, f'delete_{method}')(origin_name)
                getattr(target_node, f'add_{method}')(new_name, attr_temp)

        # Delete the origin node in cache.
        self._delete_nodes_of_cache(node_names=[origin_name])
        self._cache_node(node)
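The scope walk in the middle of this method is the least obvious part; here is a self-contained illustration (plain Python, not MindInsight code) of how the slash scan enumerates every ancestor scope of a node name:

# Self-contained illustration of the scope-name scan above.
target_name = 'scope_a/scope_b/node_c'
scopes = []
slash_index = target_name.find('/')
while slash_index != -1:
    scopes.append(target_name[:slash_index])
    slash_index = target_name.find('/', slash_index + 1)
print(scopes)  # ['scope_a', 'scope_a/scope_b']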
Example #3
def check_int(param_name, value, min_num, max_num):
    """Check whether the value is an integer within [min_num, max_num]."""
    if isinstance(value, int) and min_num <= value <= max_num:
        return
    log.error("Invalid param `%s`. The integer should be in [%d, %d].",
              param_name, min_num, max_num)
    raise DebuggerParamTypeError(f"Invalid param `{param_name}`.")
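A hypothetical call sequence; the parameter name is invented for illustration, and DebuggerParamTypeError is assumed importable from the same module:

try:
    check_int('watch_point_id', 3, 1, 10)   # in range: returns silently
    check_int('watch_point_id', 0, 1, 10)   # out of range: logs and raises
except DebuggerParamTypeError as err:
    print(err)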
Example #4
    def is_summary_directory(self, summary_base_dir, relative_path):
        """
        Check if the given summary directory is valid.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .

        Returns:
            bool, indicates if the given summary directory is valid.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.is_summary_directory('/summary/base/dir', './job-01')
        """
        if contains_null_byte(summary_base_dir=summary_base_dir,
                              relative_path=relative_path):
            return False

        if not self._is_valid_summary_directory(summary_base_dir,
                                                relative_path):
            return False

        summary_directory = os.path.realpath(
            os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError(
                'Path of summary base directory is not accessible.')

        for entry in entries:
            if entry.is_symlink():
                continue

            summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX,
                                        entry.name)
            if summary_pattern is not None and entry.is_file():
                return True

            pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
            if pb_pattern is not None and entry.is_file():
                return True

            if entry.is_dir():
                profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX,
                                             entry.name)
                cluster_profiler_pattern = re.search(
                    self.CLUSTER_PROFILER_DIRECTORY_REGEX, entry.name)
                if profiler_pattern is not None or cluster_profiler_pattern is not None:
                    full_path = os.path.realpath(
                        os.path.join(summary_directory, entry.name))
                    if self._is_valid_profiler_directory(full_path)[0] or \
                            self._is_valid_cluster_profiler_directory(full_path)[0]:
                        return True
        return False
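The contains_null_byte guard at the top is defined elsewhere; here is a simplified, hypothetical stand-in showing the intent (reject paths that carry embedded NUL bytes):

# Simplified, hypothetical stand-in for contains_null_byte; the real helper may differ.
def contains_null_byte_demo(**kwargs):
    return any('\x00' in value for value in kwargs.values())

print(contains_null_byte_demo(summary_base_dir='/summary/base/dir',
                              relative_path='./job-01'))  # False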
Example #5
    def _convert_to_numpy_array(self):
        """Convert a list data to numpy array."""
        try:
            ndarray = np.array(self._data).reshape(self._dims)
        except ValueError as ex:
            logger.error("Reshape array fail, detail: %r", str(ex))
            return

        self._np_array = ndarray
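A self-contained illustration of the conversion and of the ValueError branch it guards against:

import numpy as np

data, dims = [1, 2, 3, 4, 5, 6], (2, 3)
print(np.array(data).reshape(dims).shape)  # (2, 3)
try:
    np.array(data).reshape((4, 2))  # 6 elements cannot fill a 4x2 array
except ValueError as ex:
    print("Reshape array failed, detail: %r" % str(ex))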
Example #6
def validate_stack_pattern(stack_pattern):
    """Check stack pattern."""
    if stack_pattern:
        if not isinstance(stack_pattern, str):
            log.error(
                "Invalid stack pattern. String type is required, but got %s.",
                type(stack_pattern))
            raise DebuggerParamTypeError("stack_pattern is not string type.")
        pattern_limit = 255
        if len(stack_pattern) > pattern_limit:
            log.error(
                "The length of stack_pattern is %s, which should no greater than %s.",
                len(stack_pattern), pattern_limit)
            raise DebuggerParamValueError(
                "stack_pattern is over length limit.")
Example #7
    def _parse_pb_file(summary_dir, filename):
        """
        Parse pb file and write content to `EventsData`.

        Args:
            summary_dir (str): The summary directory that contains the pb file.
            filename (str): The file name of the pb file.

        Returns:
            TensorEvent, the tensor event if the pb file is loaded and the graph is built successfully; otherwise None.
        """
        file_path = FileHandler.join(summary_dir, filename)
        logger.info("Start to load graph from pb file, file path: %s.",
                    file_path)
        filehandler = FileHandler(file_path)
        model_proto = anf_ir_pb2.ModelProto()
        try:
            model_proto.ParseFromString(filehandler.read())
        except ParseError:
            logger.warning(
                "The given file is not a valid pb file, file path: %s.",
                file_path)
            return None

        graph = MSGraph()

        try:
            graph.build_graph(model_proto.graph)
        except Exception as ex:
            # Normally, there are no exceptions, and it is only possible for users on the MindSpore side
            # to dump other non-default graphs.
            logger.error("Build graph failed, file path: %s.", file_path)
            logger.exception(ex)
            raise UnknownError(str(ex))

        tensor_event = TensorEvent(
            wall_time=FileHandler.file_stat(file_path).mtime,
            step=0,
            tag=filename,
            plugin_name=PluginNameEnum.GRAPH.value,
            value=graph,
            filename=filename)

        logger.info("Build graph success, file path: %s.", file_path)
        return tensor_event
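A hypothetical inspection of the returned event; this is a static method, so the bare call is schematic, and TensorEvent is assumed to expose its constructor arguments as attributes:

event = _parse_pb_file('/summary/base/dir', 'model.pb')  # placeholder paths
if event is not None:
    print(event.tag, event.plugin_name, event.step)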
Example #8
    def cache_train_jobs(self, train_ids):
        """
        Cache train jobs.

        Args:
            train_ids (list): Specify list of train_ids to be cached.

        Returns:
            dict, indicates train job ID and its current cache status.

        Raises:
            ParamTypeError, if the given train_ids parameter is not of a valid type.
        """
        if not isinstance(train_ids, list):
            logger.error("train_ids must be list.")
            raise ParamTypeError('train_ids', list)

        cache_result = []
        for train_id in train_ids:
            if not isinstance(train_id, str):
                logger.error("train_id must be str.")
                raise ParamTypeError('train_id', str)

            try:
                train_job = self._data_manager.get_train_job(train_id)
            except exceptions.TrainJobNotExistError:
                logger.warning('Train job %s does not exist.', train_id)
                continue

            if train_job.cache_status == CacheStatus.NOT_IN_CACHE:
                self._data_manager.cache_train_job(train_id)
                # Update loader cache status to CACHING for consistency in response.
                train_job.cache_status = CacheStatus.CACHING

            cache_result.append(
                dict(
                    train_id=train_id,
                    cache_status=train_job.cache_status.value,
                ))

        return cache_result
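A hypothetical call, where processor stands for whatever object exposes this method and the train IDs are placeholders:

result = processor.cache_train_jobs(['./job-01', './job-02'])
print(result)  # e.g. [{'train_id': './job-01', 'cache_status': 'CACHING'}, ...]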
Example #9
    def list_summary_directories(self, summary_base_dir, overall=True):
        """
        List summary directories within base directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            overall (bool): If False, limit the total number of entries scanned.

        Returns:
            list, list of summary directory info, each of which includes the following attributes.
                - relative_path (str): Relative path of summary directory, referring to settings.SUMMARY_BASE_DIR,
                                        starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.
                - profiler (dict): Profiler info, including the profiler subdirectory path, its creation time and
                                    its modification time.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
        """
        if contains_null_byte(summary_base_dir=summary_base_dir):
            return []

        relative_path = os.path.join('.', '')
        if not self._is_valid_summary_directory(summary_base_dir,
                                                relative_path):
            return []

        summary_dict = {}
        counter = Counter(max_count=None if overall else self.MAX_SCAN_COUNT)

        try:
            entries = os.scandir(summary_base_dir)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError(
                'Path of summary base directory is not accessible.')

        for entry in entries:
            if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                break
            try:
                counter.add()
            except MaxCountExceededError:
                logger.info(
                    'Stop further scanning because overall is False and the '
                    'number of scanned files exceeds the upper limit.')
                break
            if entry.is_symlink():
                pass
            elif entry.is_file():
                self._update_summary_dict(summary_dict, summary_base_dir,
                                          relative_path, entry)
            elif entry.is_dir():
                entry_path = os.path.realpath(
                    os.path.join(summary_base_dir, entry.name))
                self._scan_subdir_entries(summary_dict, summary_base_dir,
                                          entry_path, entry.name, counter)

        directories = []
        for key, value in summary_dict.items():
            directory = {
                'relative_path': key,
                'profiler': None,
                'create_time': value['ctime'],
                'update_time': value['mtime'],
            }
            profiler = value.get('profiler')
            if profiler is not None:
                directory['profiler'] = {
                    'directory': profiler['directory'],
                    'create_time': profiler['ctime'],
                    'update_time': profiler['mtime'],
                }
            directories.append(directory)

        # sort by update time in descending order and relative path in ascending order
        directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x[
            'relative_path']))

        return directories
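The sort key at the end orders by update time descending, then by relative path ascending; a self-contained illustration:

import datetime

dirs = [
    {'relative_path': './b', 'update_time': datetime.datetime(2021, 1, 1)},
    {'relative_path': './a', 'update_time': datetime.datetime(2021, 1, 1)},
    {'relative_path': './c', 'update_time': datetime.datetime(2022, 1, 1)},
]
dirs.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['relative_path']))
print([d['relative_path'] for d in dirs])  # ['./c', './a', './b']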
Example #10
    def list_summaries(self, summary_base_dir, relative_path='./'):
        """
        Get info of latest summary file within the given summary directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .

        Returns:
            list, list of summary file info, each of which includes the following attributes.
                - file_name (str): Summary file name.
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.list_summaries('/summary/base/dir', './job-01')
        """
        if contains_null_byte(summary_base_dir=summary_base_dir,
                              relative_path=relative_path):
            return []

        if not self._is_valid_summary_directory(summary_base_dir,
                                                relative_path):
            return []

        summaries = []
        summary_directory = os.path.realpath(
            os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary directory is not accessible.')
            raise FileSystemPermissionError(
                'Path of summary directory is not accessible.')

        for entry in entries:
            if entry.is_symlink() or not entry.is_file():
                continue

            pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
            if pattern is None:
                continue

            timestamp = int(pattern.groupdict().get('timestamp'))
            try:
                # Extract the creation time from the filename.
                ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
            except OverflowError:
                continue

            try:
                stat = entry.stat()
            except FileNotFoundError:
                logger.warning('File %s not found.', entry.name)
                continue

            mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()

            summaries.append({
                'file_name': entry.name,
                'create_time': ctime,
                'update_time': mtime,
            })

        # sort by update time in descending order and filename in ascending order
        summaries.sort(
            key=lambda x: (-int(x['update_time'].timestamp()), x['file_name']))

        return summaries
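Note that the creation time comes from the filename, not the filesystem; a self-contained sketch using a simplified stand-in for SUMMARY_FILENAME_REGEX:

import datetime
import re

# Simplified, hypothetical stand-in for SummaryWatcher.SUMMARY_FILENAME_REGEX.
pattern = re.search(r'summary\.(?P<timestamp>\d+)',
                    'events.out.summary.1600000000.hostname')
ctime = datetime.datetime.fromtimestamp(
    int(pattern.groupdict().get('timestamp'))).astimezone()
print(ctime)  # 2020-09-13 ... in the local timezone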
Example #11
    def _load_single_file(self, file_handler, executor, events_data):
        """
        Load a log file data.

        Args:
            file_handler (FileHandler): A file handler.
            executor (Executor): The executor instance.
            events_data (EventsData): The container of event data.

        Returns:
            bool, True if the summary file is finished loading.
        """
        while True:
            start_offset = file_handler.offset
            try:
                event_str = self.event_load(file_handler)
                if event_str is None:
                    file_handler.reset_offset(start_offset)
                    return True
                if len(event_str) > MAX_EVENT_STRING:
                    logger.warning(
                        "file_path: %s, event string: %d exceeds %d and drop it.",
                        file_handler.file_path, len(event_str),
                        MAX_EVENT_STRING)
                    continue

                future = executor.submit(self._event_parse, event_str,
                                         self._latest_filename)

                def _add_tensor_event_callback(future_value):
                    tensor_values = future_value.result()
                    for tensor_value in tensor_values:
                        if tensor_value.plugin_name == PluginNameEnum.GRAPH.value:
                            try:
                                graph_tags = events_data.list_tags_by_plugin(
                                    PluginNameEnum.GRAPH.value)
                            except KeyError:
                                graph_tags = []

                            summary_tags = self.filter_files(graph_tags)
                            for tag in summary_tags:
                                events_data.delete_tensor_event(tag)

                        events_data.add_tensor_event(tensor_value)

                future.add_done_callback(
                    exception_no_raise_wrapper(_add_tensor_event_callback))
                return False
            except (exceptions.CRCFailedError,
                    exceptions.CRCLengthFailedError) as exc:
                file_handler.reset_offset(start_offset)
                file_size = file_handler.file_stat(file_handler.file_path).size
                logger.error(
                    "Check crc failed and ignore this file, please check the integrity of the file, "
                    "file_path: %s, offset: %s, file size: %s. Detail: %s.",
                    file_handler.file_path, file_handler.offset, file_size,
                    str(exc))
                return True
            except (OSError, DecodeError,
                    exceptions.MindInsightException) as ex:
                logger.error(
                    "Parse log file fail, and ignore this file, detail: %r, "
                    "file path: %s.", str(ex), file_handler.file_path)
                return True
            except Exception as ex:
                logger.exception(ex)
                raise UnknownError(str(ex))
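A self-contained sketch of the submit-then-callback pattern used above, built on the standard library only:

from concurrent.futures import ThreadPoolExecutor

def _event_parse_demo(event_str):
    # Stand-in for the real _event_parse; just transforms its input.
    return event_str.upper()

with ThreadPoolExecutor() as executor:
    future = executor.submit(_event_parse_demo, 'event-bytes')
    future.add_done_callback(lambda f: print('parsed:', f.result()))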
Example #12
def wrapper(*args, **kwargs):
    try:
        return exception_wrapper(func)(*args, **kwargs)
    except UnknownError as err:
        logger.error(str(err))
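This wrapper reads like the inner function of the exception_no_raise_wrapper decorator used in Example #11; a hedged reconstruction of the full decorator, assuming the same exception_wrapper, UnknownError, and logger as above:

import functools

def exception_no_raise_wrapper(func):
    """Hypothetical reconstruction: log UnknownError instead of re-raising it."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return exception_wrapper(func)(*args, **kwargs)
        except UnknownError as err:
            logger.error(str(err))
    return wrapper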
Example #13
    def list_summary_directories(self, summary_base_dir, overall=True):
        """
        List summary directories within base directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            overall (bool): If False, limit the total number of entries scanned.

        Returns:
            list, list of summary directory info, each of which includes the following attributes.
                - relative_path (str): Relative path of summary directory, referring to settings.SUMMARY_BASE_DIR,
                                        starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir):
            return []

        if not os.path.exists(summary_base_dir):
            logger.warning('Path of summary base directory does not exist.')
            return []

        if not os.path.isdir(summary_base_dir):
            logger.warning(
                'Path of summary base directory is not a valid directory.')
            return []

        summary_dict = {}
        scan_count = 0

        try:
            entries = os.scandir(summary_base_dir)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError(
                'Path of summary base directory is not accessible.')

        for entry in entries:
            if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                break
            relative_path = os.path.join('.', '')
            if entry.is_symlink():
                pass
            elif entry.is_file():
                self._update_summary_dict(summary_dict, relative_path, entry)
            elif entry.is_dir():
                full_path = os.path.realpath(
                    os.path.join(summary_base_dir, entry.name))

                try:
                    subdir_entries = os.scandir(full_path)
                except PermissionError:
                    logger.warning(
                        'Path of %s under summary base directory is not accessible.',
                        entry.name)
                else:
                    for subdir_entry in subdir_entries:
                        if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                            break
                        subdir_relative_path = os.path.join('.', entry.name)
                        if subdir_entry.is_symlink():
                            pass
                        elif subdir_entry.is_file():
                            self._update_summary_dict(summary_dict,
                                                      subdir_relative_path,
                                                      subdir_entry)

                        scan_count += 1
                        if not overall and scan_count >= self.MAX_SCAN_COUNT:
                            break

            scan_count += 1
            if not overall and scan_count >= self.MAX_SCAN_COUNT:
                logger.info(
                    'Stop further scanning because overall is False and the '
                    'number of scanned files exceeds the upper limit.')
                break

        directories = [{
            'relative_path': key,
            'create_time': value['ctime'],
            'update_time': value['mtime'],
        } for key, value in summary_dict.items()]

        # sort by update time in descending order and relative path in ascending order
        directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x[
            'relative_path']))

        return directories