Example #1
    def _get_pipeline_path(self, source_dir):
        """
        Get the minddata pipeline file path.

        Args:
            source_dir (str): The minddata pipeline source dir.

        Returns:
            str, the minddata pipeline file path.
        """
        pipeline_path = os.path.join(
            source_dir, self._raw_pipeline_file_name.format(self._device_id))

        try:
            pipeline_path = validate_and_normalize_path(
                pipeline_path, 'profiler')
        except ValidationError:
            logger.warning('Minddata pipeline file is invalid.')
            raise ProfilerPathErrorException(
                'Minddata pipeline file is invalid.')
        if not os.path.isfile(pipeline_path):
            logger.warning('The minddata pipeline file <%s> was not found.',
                           pipeline_path)
            raise ProfilerFileNotFoundException(pipeline_path)

        return pipeline_path
Example #2
    def get_proposer(self, proposer_type, *args):
        """
        Get the specified proposer according to the proposer type.

        Args:
            proposer_type (str): The proposer type.
            args (tuple): The parameters required by the specific proposer class.

        Returns:
            Proposer, the specified proposer instance.

        Examples:
            >>> proposer_type = 'step_trace'
            >>> proposer = ProposerFactory.instance().get_proposer(proposer_type, self.profiling_dir, self.device_id)

        """
        logger.debug("The 'proposer_type' is %s,The 'args' is %s",
                     proposer_type, str(args))
        proposer_instance = None
        sub_name = proposer_type.split('_')
        proposer_class_name = ''.join([name.capitalize() for name in sub_name])
        proposer_class_name += 'Proposer'

        if hasattr(proposer_module, proposer_class_name):
            proposer_instance = getattr(proposer_module,
                                        proposer_class_name)(*args)
        else:
            logger.warning("The proposer class %s does not exist.",
                           proposer_class_name)
        return proposer_instance
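
The class-name derivation above is plain string manipulation; a standalone sketch of the same mapping (the helper name here is illustrative, not part of the factory):

def _proposer_class_name(proposer_type):
    """Map a type such as 'step_trace' to the class name 'StepTraceProposer'."""
    sub_name = proposer_type.split('_')
    return ''.join(name.capitalize() for name in sub_name) + 'Proposer'

assert _proposer_class_name('step_trace') == 'StepTraceProposer'
assert _proposer_class_name('minddata_pipeline') == 'MinddataPipelineProposer'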
Example #3
    def _load(self):
        """Load data according to the parsed AICORE operator file."""
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_time.format(self._device_id))
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework_info.format(self._device_id))
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.',
                           op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.',
                           framework_file_path)
            return

        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                framework_infos[info[3]] = self._convert_framework_field_type(
                    info)

        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                detail_info = self._get_op_detail_info(info, framework_infos)
                self._data.append(detail_info)

        del framework_infos
Example #4
    def _search_file(input_dir):
        """Search step trace file under specific input directory."""
        # validate input_dir
        if not os.path.isdir(input_dir):
            raise ProfilerPathErrorException(
                '{} does not exist or is not a dir'.format(input_dir))
        # get step trace files
        files = os.listdir(input_dir)
        step_trace_files = [
            file for file in files
            if file.startswith('training_trace') and not file.endswith('.done')
        ]
        # validate result
        if len(step_trace_files) > 1:
            # the format of file name is like
            # `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
            # use the $number as the sorted key
            try:
                step_trace_files.sort(
                    key=lambda path: int(path.rsplit('_', 1)[-1]))
            except ValueError as err:
                log.warning("Unable to parse file names: %s. %s",
                            step_trace_files, err)
                step_trace_files = []

        file_paths = [
            os.path.join(input_dir, file) for file in step_trace_files
        ]
        log.info("Find %d step trace files.", len(file_paths))
        return file_paths
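
For reference, the sort key keeps only the digits after the final underscore; an illustration with fabricated file names:

# Fabricated names following the `slice_$number` convention noted above.
files = [
    'training_trace.46.dev.profiler_default_tag.1.slice_2',
    'training_trace.46.dev.profiler_default_tag.1.slice_0',
    'training_trace.46.dev.profiler_default_tag.1.slice_1',
]
files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
# files is now ordered ...slice_0, ...slice_1, ...slice_2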
Example #5
    def _record_trace_event(self, step_trace):
        """Record trace event."""
        self._step_num += 1
        start_time = step_trace.get('start')
        end_time = step_trace.get('end')
        fp_time = step_trace.get('fp')
        bp_time = step_trace.get('bp')
        if not (start_time and end_time and fp_time and bp_time):
            log.warning("The step %d is missing basic time.", self._step_num)
            return
        if start_time == '-':
            start_time = fp_time
        row_data = {
            'step_num': self._step_num,
            'start_point': start_time,
            'end_point': end_time,
            'total': end_time - start_time,
            'fp_point': fp_time,
            'bp_point': bp_time,
            'iteration_interval': fp_time - start_time,
            'fp_and_bp': bp_time - fp_time,
            'tail': end_time - bp_time
        }
        # update reduce info
        self._update_reduce_info(step_trace, row_data)
        # save the row data
        if not self._header:
            self._header = list(row_data.keys())
        row_data_list = [
            row_data.get(header_name, 0) for header_name in self._header
        ]
        self._result.append(row_data_list)
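
The derived fields are plain differences of the four recorded points; with made-up values:

step_trace = {'start': 100.0, 'fp': 102.5, 'bp': 110.0, 'end': 111.0}  # made-up
assert step_trace['end'] - step_trace['start'] == 11.0  # total
assert step_trace['fp'] - step_trace['start'] == 2.5    # iteration_interval
assert step_trace['bp'] - step_trace['fp'] == 7.5       # fp_and_bp
assert step_trace['end'] - step_trace['bp'] == 1.0      # tail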
Example #6
    def get_flops_summary(self):
        """
        Get flops summary information for UI display.

        Returns:
            json, the content of flops summary information.
        """
        summary_filename = self._flops_summary_filename.format(self._device_id)

        file_path = os.path.join(self._profiling_dir, summary_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid flops summary path.'
        )

        flops_summary = {}
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    flops_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when reading the flops summary file: %s', err)
                raise ProfilerIOException()
        else:
            logger.warning('No flops summary file. Please check the output path.')

        return flops_summary
Example #7
    def get_analyser_result(self, analyser_type, condition=None):
        """Get the analyser result for the given analyser type and condition."""
        logger.debug("The Proposer 'analyser_type' is %s, 'options' is %s",
                     str(analyser_type), str(condition))
        analyser_result = {}
        try:
            analyser = AnalyserFactory.instance().get_analyser(
                analyser_type, self.profiling_path, self.device_id)
            analyser_result = analyser.query(condition)
            logger.debug("The 'analyser_result' is %s, the 'condition' is %s.",
                         str(analyser_result), str(condition))
        except MindInsightException as e:
            logger.warning(e)
        return analyser_result
Example #8
    def execute(self):
        """Execute the parser, get result data, and write it to the output file."""

        if not os.path.exists(self._source_file_name):
            logger.info("Did not find the aicpu profiling source file")
            return

        with open(self._source_file_name, 'rb') as ai_cpu_data:
            ai_cpu_str = str(ai_cpu_data.read().replace(b'\n\x00', b' ___ ')
                             .replace(b'\x00', b' ___ '))[2:-1]
            ai_cpu_lines = ai_cpu_str.split(" ___ ")

        node_list = list()
        ai_cpu_total_time_summary = 0
        # node serial number
        serial_number = 1
        for i in range(len(ai_cpu_lines)-1):
            node_line = ai_cpu_lines[i]
            thread_line = ai_cpu_lines[i+1]
            if "Node" in node_line and "Thread" in thread_line:
                # get the node data from node_line
                node_name = node_line.split(',')[0].split(':')[-1]
                run_v2_start = node_line.split(',')[1].split(':')[-1]
                compute_start = node_line.split(',')[2].split(':')[-1]
                mercy_start = node_line.split(',')[3].split(':')[-1]
                mercy_end = node_line.split(',')[4].split(':')[-1]
                run_v2_end = node_line.split(',')[5].split(':')[-1]
                # get total_time and dispatch_time from thread line
                total_time = thread_line.split(',')[-1].split('=')[-1].split()[0]
                dispatch_time = thread_line.split(',')[-2].split('=')[-1].split()[0]

                node_data = [serial_number, node_name, total_time, dispatch_time, run_v2_start, compute_start,
                             mercy_start, mercy_end, run_v2_end]

                node_list.append(node_data)
                # calculate the total time
                ai_cpu_total_time_summary += int(total_time)
                # increase node serial number
                serial_number += 1
            elif "Node" in node_line and "Thread" not in thread_line:
                node_name = node_line.split(',')[0].split(':')[-1]
                logger.warning("The node:%s cannot find thread data", node_name)

        node_list.append(["AI CPU Total Time(us):", ai_cpu_total_time_summary])

        if node_list:
            fwrite_format(self._output_filename, data_source=_dst_file_title, is_print=True,
                          is_start=True)
            fwrite_format(self._output_filename,
                          data_source=tabulate(node_list, _dst_file_column_title,
                                               tablefmt='simple'),
                          is_start=True, is_print=True)
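
The byte-level preprocessing in execute turns the NUL-delimited records into one splittable string; a minimal reproduction with a fabricated payload:

# Fabricated two-record payload in the same NUL-delimited layout.
raw = b'Node:opA,a:1,b:2,c:3,d:4,e:5\n\x00Thread, dispatch_time=1 us, total_time=5 us\x00'
text = str(raw.replace(b'\n\x00', b' ___ ').replace(b'\x00', b' ___ '))[2:-1]
print(text.split(' ___ '))
# ['Node:opA,a:1,b:2,c:3,d:4,e:5', 'Thread, dispatch_time=1 us, total_time=5 us', '']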
Example #9
    def _construct_time_point(self, name, start, duration):
        """Construct time point."""
        point = {}
        if start >= 0 and duration >= 0:
            point = {
                self._attr_ui_name: name,
                self._attr_ui_start: round(start, 4),
                self._attr_ui_duration: round(duration, 4)
            }
        else:
            log.warning("Invalid point info: name: %s, start: %s, duration: %s",
                        name, start, duration)
        return point
Example #10
    def _load(self):
        """Load data according to the parsed AICORE operator types file."""
        op_type_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_type_time.format(self._device_id))
        if not os.path.isfile(op_type_file_path):
            logger.warning('The file <%s> does not exist.', op_type_file_path)
            return

        with open(op_type_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                self._data.append(self._convert_field_type(info))
Example #11
    def _load(self):
        """Load data according to the parsed minddata pipeline file."""
        pipeline_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_pipeline.format(self._device_id))
        if not os.path.isfile(pipeline_file_path):
            logger.warning('The file <%s> does not exist.', pipeline_file_path)
            return

        with open(pipeline_file_path, 'r') as file:
            csv.field_size_limit(sys.maxsize)
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                self._data.append(self._convert_field_type(info))
Example #12
    def _update_reduce_info(step_trace, row_data):
        """Extract reduce info."""
        reduce_time = step_trace.get('reduce', {})
        for stream_id, time_points in reduce_time.items():
            time_point_num = len(time_points)
            if time_point_num % 2:
                log.warning("Stream %d has an odd number of reduce time "
                            "points (%d); skip it.", stream_id, time_point_num)
                continue
            for index, point_id in enumerate(range(0, time_point_num, 2)):
                field_name = f'stream_{stream_id}_parallel_{index}'
                row_data[field_name + '_start_point'] = time_points[point_id]
                row_data[field_name + '_end_point'] = time_points[point_id + 1]
                row_data[field_name] = (time_points[point_id + 1]
                                        - time_points[point_id])
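
Reduce points for each stream are consumed as consecutive (start, end) pairs; with fabricated values:

time_points = [10.0, 12.0, 15.0, 18.0]  # fabricated: two (start, end) pairs
for index, point_id in enumerate(range(0, len(time_points), 2)):
    print(index, time_points[point_id + 1] - time_points[point_id])
# 0 2.0  -> stream_<id>_parallel_0
# 1 3.0  -> stream_<id>_parallel_1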
Example #13
    def _load(self):
        """Load data according to the parsed AICPU operator file."""
        aicpu_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicpu_time.format(self._device_id))
        aicpu_file_path = validate_and_normalize_path(
            aicpu_file_path, raise_key='Invalid aicpu file path.')
        if not os.path.isfile(aicpu_file_path):
            logger.warning('The file <%s> does not exist.', aicpu_file_path)
            return

        with open(aicpu_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for info in csv_reader:
                aicpu_info = self._convert_field_type(info)
                self._data.append(aicpu_info)
Example #14
    def _load(self):
        """Load data according to the parsed AICPU operator file."""
        aicpu_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicpu_time.format(self._device_id))
        aicpu_file_path = validate_and_normalize_path(
            aicpu_file_path, raise_key='Invalid aicpu file path.')

        if not os.path.isfile(aicpu_file_path):
            logger.warning('The file <%s> does not exist.', aicpu_file_path)
            return

        type_detail_cache = dict()
        with open(aicpu_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for item in csv_reader:
                op_type = item[1]
                info = type_detail_cache.get(op_type)
                if info:
                    info.append(item)
                else:
                    type_detail_cache[op_type] = [item]
        type_temp_detail_cache = dict()
        total_avg_time = 0
        result = []
        for key, value in type_detail_cache.items():
            exec_frequency = len(value)
            total_time_index = 2
            exec_avg_time = sum([float(i[total_time_index])
                                 for i in value]) / exec_frequency
            exec_avg_time = round(exec_avg_time, 6)
            total_avg_time += exec_avg_time
            type_temp_detail_cache[key] = [key, exec_avg_time, exec_frequency]

        for key, value in type_temp_detail_cache.items():
            execution_time_index = 1
            percent = round(
                (value[execution_time_index] / total_avg_time) * 100, 2)
            value.append(percent)
            result.append(value)

        self._data = result
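
The aggregation above reduces each operator type to [op_type, avg_time, frequency, percent]; a toy walk-through with fabricated rows:

# Fabricated parsed rows in the same layout: [serial_number, op_type, total_time].
rows = [[1, 'Cast', '2.0'], [2, 'Cast', '4.0'], [3, 'TopK', '6.0']]
# Cast: avg = (2.0 + 4.0) / 2 = 3.0, frequency = 2
# TopK: avg = 6.0 / 1 = 6.0, frequency = 1
# total_avg_time = 3.0 + 6.0 = 9.0
# percent: Cast = round(3.0 / 9.0 * 100, 2) = 33.33, TopK = 66.67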
Example #15
    def _get_save_path(self, output_path):
        """
        Get the save path.

        Args:
            output_path (str): The output dir.

        Returns:
            str, the save path.
        """
        try:
            output_dir = validate_and_normalize_path(output_path, 'profiler')
        except ValidationError:
            logger.warning('Output path is invalid.')
            raise ProfilerPathErrorException('Output path is invalid.')
        if not os.path.isdir(output_dir):
            logger.warning('The output dir <%s> was not found.', output_dir)
            raise ProfilerDirNotFoundException(output_dir)
        return os.path.join(
            output_dir,
            self._parsed_pipeline_file_name.format(self._device_id))
Example #16
    def parse(self):
        """
        Parse the minddata pipeline files.

        Raises:
            ProfilerRawFileException: If fails to parse the raw file of
                minddata pipeline or the file is empty.
        """
        with open(self._pipeline_path, 'r') as file:
            try:
                pipeline_info = json.load(file)
            except (json.JSONDecodeError, TypeError) as err:
                logger.exception(err)
                raise ProfilerRawFileException(
                    'Failed to parse the minddata pipeline file.')
        if not pipeline_info:
            logger.warning('The minddata pipeline file is empty.')
            raise ProfilerRawFileException(
                'The minddata pipeline file is empty.')

        self._parse_and_save(pipeline_info)
Example #17
def validate_and_set_job_id_env(job_id_env):
    """
    Validate the job id and set it in environment.

    Args:
        job_id_env (str): The id to be set in the environment variable `JOB_ID`.

    Returns:
        int or None, the valid job id env; None if `job_id_env` is None.
    """
    if job_id_env is None:
        return job_id_env
    # get job_id_env in int type
    valid_id = to_int(job_id_env, 'job_id_env')
    # check the range of valid_id
    if valid_id and 255 < valid_id < sys.maxsize:
        os.environ['JOB_ID'] = job_id_env
    else:
        log.warning(
            "Invalid job_id_env %s. The value should be an int between 255 and %s. "
            "Use the default job id env instead.", job_id_env, sys.maxsize)
    return valid_id
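
A brief usage sketch, assuming to_int returns the parsed integer:

import os

job_id = validate_and_set_job_id_env('256')  # in range: exports JOB_ID='256'
assert os.environ.get('JOB_ID') == '256'
validate_and_set_job_id_env('10')            # out of range: only logs a warning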
Example #18
    def record_point_info(self, point_info, output_path):
        """
        Record point info into json.

        Args:
            point_info (dict): The point info about tag id and relative op name.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.
        """
        points = {
            'fp_start': point_info.get(self._fp_tag, ''),
            'bp_end': point_info.get(self._bp_tag, '')
        }
        try:
            with open(output_path, 'w') as json_file:
                json.dump(points, json_file)
            os.chmod(output_path, stat.S_IREAD)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException
        return points
Example #19
    def _iter_interval_analyze(self, step_trace_condition):
        """Get the proposals of iteration interval."""
        iter_interval_dict = OrderedDict()
        default_iter_interval_lst = [0]
        iter_interval_condition = step_trace_condition.get("iter_interval", {})
        analyser_result = self.get_analyser_result(
            self.__proposer_type, condition=iter_interval_condition)
        iter_interval_length_lst = analyser_result.get("info", {}).get(
            "iteration_interval", default_iter_interval_lst)
        logger.debug("The 'iter_interval_length_lst' is %s",
                     str(iter_interval_length_lst))
        # Check the iter_interval_length_lst.
        if not isinstance(iter_interval_length_lst, list) \
                or not iter_interval_length_lst:
            logger.warning(
                "The 'iter_interval_length_lst' is %s; it is empty or not a list.",
                str(iter_interval_length_lst))
        else:
            if iter_interval_length_lst[0] \
                    > self.__step_trace_iter_interval_threshold:
                iter_interval_dict[self.__iter_interval_label] = [
                    str(self.__step_trace_iter_interval_threshold)
                ]
                self.__proposal_dict.update(iter_interval_dict)
Example #20
    def _load(self):
        """Load data according to the parsed AICORE operator file."""
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_time.format(self._device_id))
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework_info.format(self._device_id))
        flops_file_path = os.path.join(
            self._profiling_dir, self._file_name_flops.format(self._device_id))
        op_detail_file_path = validate_and_normalize_path(
            op_detail_file_path, raise_key='Invalid aicore_detail file path.')
        framework_file_path = validate_and_normalize_path(
            framework_file_path, raise_key='Invalid framework file path.')
        flops_file_path = validate_and_normalize_path(
            flops_file_path, raise_key='Invalid flops file path.')
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.',
                           op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.',
                           framework_file_path)
            return

        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for info in csv_reader:
                framework_infos[info[3]] = self._convert_framework_field_type(
                    info)

        flops_infos = dict()
        if os.path.isfile(flops_file_path):
            with open(flops_file_path, 'r') as f_obj:
                # skip the first line which is header info.
                next(f_obj)
                for line in f_obj:
                    flops_line = line.strip().split(',')
                    # flops_line[0] is full_op_name.
                    flops_infos[flops_line[0]] = flops_line[1:]
        else:
            logger.warning('The file <%s> does not exist.', flops_file_path)

        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for info in csv_reader:
                detail_info = self._get_op_detail_info(info, framework_infos,
                                                       flops_infos)
                self._data.append(detail_info)

        del framework_infos
        del flops_infos
Example #21
    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training.

        Examples:
            >>> from mindinsight.profiler import Profiler
            >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
            >>>                     device_id=int(os.environ["DEVICE_ID"]))
            >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
            >>> model = Model(train_network)
            >>> dataset = get_dataset()
            >>> model.train(2, dataset)
            >>> profiler.analyse()
        """

        try:
            from mindspore.communication.management import release
            release()
        except ImportError:
            logger.error("Profiling: fail to import release from mindspore.")

        job_id = self._get_profiling_job_id()
        logger.info("Profiling: job id is %s ", job_id)

        source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        result = hwtslog_parser.execute()
        if not result:
            logger.error("Profiling: fail to parse hwts log file.")
            return

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return

        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
        optime_parser = OPComputeTimeParser(
            hwts_output_filename, opcompute_output_filename,
            op_task_dict, self._output_path, self._dev_id
        )
        optime_parser.execute()

        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        aicpu_data_parser.execute()

        # Parsing minddata AICPU profiling
        MinddataParser.execute(source_path, self._output_path, self._dev_id)

        # parse minddata pipeline operator and queue
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
            pipeline_parser.parse()
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse op compute time info
        try:
            self._analyser_op_info()
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse step trace info
        try:
            self._analyse_step_trace(source_path, framework_parser)
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse timeline info
        try:
            self._analyse_timeline()
        except (ProfilerIOException, ProfilerFileNotFoundException, ValidationError) as err:
            logger.warning('Failed to write timeline data: %s', err)