Example #1
    def _get_total_step_num(self):
        """Get the num of train step."""
        total_step_num = 0
        # take the data of one of the machines to get the total number of steps.
        host_ip_dir = self._host_ips_dir[0]
        target_dir_path = os.path.join(self._cluster_profiler_dir,
                                       'cluster_profiler', host_ip_dir,
                                       'profiler')
        target_dir_path = validate_and_normalize_path(
            target_dir_path, raise_key="Invalid profiler dir path.")
        if not os.path.exists(target_dir_path):
            log.error('Did not find cluster_profiler dir : %s',
                      target_dir_path)
            raise ProfilerDirNotFoundException(
                msg='Did not find cluster_profiler dir:{}'.format(
                    target_dir_path))

        entries = os.scandir(target_dir_path)
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('step_trace_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The penultimate line holds the information of the last step;
                # the step number is at index 0.
                if len(lines) > 1:
                    total_step_num = int(lines[-2].split(',')[0])
                break
        return total_step_num
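
A hedged sketch of the penultimate-line rule above; the file contents are made up, assuming one row per step followed by a trailing average row:

    # Hypothetical step_trace_raw contents: step rows, then an average row.
    lines = ['1,...', '2,...', '3,...', 'avg,...']
    if len(lines) > 1:
        total_step_num = int(lines[-2].split(',')[0])  # -> 3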
Example #2
    def _get_step_trace_info(self, host_ip, device_id, step_num):
        """Get step trace info."""
        file_name = 'step_trace_raw_{}_detail_time.csv'.format(device_id)
        step_trace_file_path = \
            os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
        step_trace_file_path = validate_and_normalize_path(
            step_trace_file_path, raise_key="Invalid step trace file path.")
        if not os.path.exists(step_trace_file_path):
            log.error('Did not find the file: %s', step_trace_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the file:{}'.format(step_trace_file_path))
        step_trace_info = list()
        step_num = str(step_num)
        with open(step_trace_file_path, 'r') as src_file:
            lines = src_file.readlines()
            # When the step_num value is 0, it means the average value.
            # The last line of step_trace_raw_{}_detail_time.csv records the average value.
            if step_num == '0':
                step_trace_info = lines[-1].strip('\n').split(',')
            else:
                for line in lines:
                    line = line.strip('\n').split(',')
                    if line[0] == step_num:
                        step_trace_info = line
        if not step_trace_info:
            # Guard against an IndexError below when the step is not found.
            log.error('Failed to find step trace info for step: %s', step_num)
            raise ProfilerParamValueErrorException('Invalid step_num.')
        # step_trace_info[6]: iteration_interval time
        # step_trace_info[7]: fp_and_bp time
        # step_trace_info[8]: tail time
        # Divide by 1e5 to convert the raw unit to milliseconds.
        iteration_interval = float(step_trace_info[6]) / 1e5
        fp_and_bp = float(step_trace_info[7]) / 1e5
        tail = float(step_trace_info[8]) / 1e5
        step_trace_info = [iteration_interval, fp_and_bp, tail]
        return step_trace_info
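
Since dividing by 1e5 yields milliseconds, the raw columns are presumably recorded in units of 10 ns (1e5 x 10 ns = 1 ms). A minimal sketch of the conversion, with a hypothetical row laid out as the comments above describe:

    # Only indices 6-8 and the 1e5 divisor come from the example; the values are made up.
    sample_row = '1,a,b,c,d,e,250000,180000,70000'.split(',')
    iteration_interval = float(sample_row[6]) / 1e5  # 2.5 ms
    fp_and_bp = float(sample_row[7]) / 1e5           # 1.8 ms
    tail = float(sample_row[8]) / 1e5                # 0.7 ms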
Example #3
    def get_memory_usage_breakdowns(self, device_type, graph_id, node_id):
        """
        Get memory usage breakdowns for each node.

        Args:
            device_type (str): Device type, e.g., GPU, Ascend.
            graph_id (int): Graph id.
            node_id (int): Node id.

        Returns:
            json, the content of memory usage breakdowns.
        """
        memory_details = self._get_file_content(device_type, FileType.DETAILS.value)
        if graph_id not in memory_details:
            logger.error('Invalid graph id: %s', graph_id)
            raise ParamValueError('Invalid graph id.')

        graph = memory_details[graph_id]
        if not ('breakdowns' in graph and node_id < len(graph['breakdowns'])):
            logger.error('Invalid node id: %s', node_id)
            raise ParamValueError('Invalid node id.')

        memory_breakdowns = graph.get('breakdowns')[node_id]

        return {'breakdowns': memory_breakdowns}
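
A hedged sketch of the details structure the lookups above imply; Example #12 passes graph_id through as a string, so a string key is assumed here, and the breakdown fields are made up:

    memory_details = {
        '0': {  # graph_id
            'breakdowns': [
                [{'tensor_name': 't1', 'size': 1024}],  # node 0 (hypothetical fields)
            ],
        },
    }
    node_breakdowns = memory_details['0'].get('breakdowns')[0]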
Example #4
    def _write_timeline_data_into_file(self, timeline_data):
        """
        Write the timeline information into the file, including
            operator name, stream id, start time and duration.

        Args:
            timeline_data (list): The metadata to be written into the file.
                [
                    ['op_name_1', 'stream_id_1', 'start_time_1', 'duration_1'],
                    ['op_name_2', 'stream_id_2', 'start_time_2', 'duration_2'],
                    [...]
                ]
        """
        # Sort the entries by start time.
        timeline_data.sort(key=lambda x: float(x[2]))
        filename = 'output_timeline_data_{}.txt'.format(self._device_id)
        file_path = os.path.join(self._output_path, filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid file path of timeline data.')

        # write to file
        try:
            with open(file_path, 'w') as f_obj:
                f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
                for timeline in timeline_data:
                    timeline = [str(item) for item in timeline]
                    f_obj.write(','.join(timeline) + '\n')
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when writing intermediate timeline file: %s',
                err)
            raise ProfilerIOException
Example #5
    def _get_minddata_pipeline_info(self):
        """Get the number of thread cores in minddata pipeline operator"""
        file_name = self._minddata_pipeline_display_filename.format(
            self._device_id)
        file_path = os.path.join(self._profiling_dir, file_name)
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid minddata_pipeline_info file path.")
        if not os.path.exists(file_path):
            log.error('Did not find the minddata_pipeline file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the minddata_pipeline file:{}'.format(
                    file_path))

        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                minddata_pipeline_info = json.load(file)
            except json.JSONDecodeError as err:
                log.exception(err)
                raise ProfilerRawFileException(
                    "Failed to parse the minddata pipeline file.")

        minddata_pipeline_op_info = []
        for item in minddata_pipeline_info.get("op_info", []):  # guard against a missing key
            op_info_dict = dict()
            op_info_dict["op_id"] = item.get("op_id")
            op_info_dict["num_workers"] = item.get("num_workers")
            minddata_pipeline_op_info.append(op_info_dict)
        return minddata_pipeline_op_info
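
A hedged sketch of the pipeline file shape this parser expects; only the op_info, op_id and num_workers keys come from the code, the values are illustrative:

    minddata_pipeline_info = {
        "op_info": [
            {"op_id": 0, "num_workers": 4},
            {"op_id": 1, "num_workers": 1},
        ]
    }
    # The method above reduces this to:
    # [{'op_id': 0, 'num_workers': 4}, {'op_id': 1, 'num_workers': 1}]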
Example #6
    def write_timeline_to_json_by_limitation(self):
        """Write timeline to json by limitation."""
        display_filename = self._display_filename.format(self._device_id)
        display_file_path = os.path.join(
            self._profiling_dir,
            display_filename
        )
        display_file_path = validate_and_normalize_path(
            display_file_path, raise_key='Invalid timeline display json path.'
        )

        length = len(self._timeline_meta)
        try:
            with open(display_file_path, 'w') as json_file:
                json_file.write('[')
                for index, item in enumerate(self._timeline_meta):
                    json.dump(item, json_file)
                    # Flush buffered writes so getsize reflects the real file size.
                    json_file.flush()
                    file_size = os.path.getsize(display_file_path)
                    if file_size > SIZE_LIMIT:
                        break
                    if index == length - 1:
                        break
                    json_file.write(',')
                json_file.write(']')
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing the timeline display file: %s', err)
            raise ProfilerIOException
Example #7
    def get_timeline_summary(self):
        """
        Get timeline summary information for UI display.

        Returns:
            json, the content of timeline summary information.
        """
        summary_filename = self._timeline_summary_filename.format(self._device_id)
        file_path = os.path.join(self._profiling_dir, summary_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline summary path.'
        )

        timeline_summary = {}
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when reading the timeline summary file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline summary file. Please check the output path.')

        return timeline_summary
Example #8
    def _get_host_device_rank_relation(self):
        """Get host_ip device_id rank_id relation."""
        rank_table_file_path = self._get_rank_table_file_path()
        if not os.path.exists(rank_table_file_path):
            log.error('Did not find rank table file under %s',
                      self._cluster_profiler_dir)
            raise ProfilerFileNotFoundException(
                msg='Did not find rank table file')
        with open(rank_table_file_path, 'r', encoding='utf-8') as file:
            try:
                relation_info = json.load(file)
            except json.JSONDecodeError as err:
                log.exception(err)
                # Without this raise, relation_info would be unbound below.
                raise ProfilerRawFileException('Failed to parse the rank table file.')
        host_device_rank_relation = list()
        servers_info = relation_info.get("server_list")
        for server_info in servers_info:
            server_id = server_info.get("server_id")
            devices_info = server_info.get("device")
            for device_info in devices_info:
                device_id = device_info.get("device_id")
                rank_id = device_info.get("rank_id")
                host_device_rank_relation.append(
                    [server_id, device_id, rank_id])

        host_ips_mapping_info = self._get_host_ips_mapping_info()
        for item in host_device_rank_relation:
            # host_ip index: 0, host_mapping_ip index: 1
            target_info = [i for i in host_ips_mapping_info if item[0] == i[0]]
            # target_info looks like: [[host_ip, host_mapping_ip]]
            item[0] = target_info[0][1]

        return host_device_rank_relation
Example #9
    def _get_communication_info(self, host_ip, device_id, step_num):
        """Get step trace info."""
        file_name = 'hccl_raw_{}.csv'.format(device_id)
        communication_file_path = \
            os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
        communication_file_path = validate_and_normalize_path(
            communication_file_path,
            raise_key="Invalid  communication file path.")
        if not os.path.exists(communication_file_path):
            log.error('Did not find the file: %s', communication_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the file:{}'.format(communication_file_path))
        communication_info = list()
        step_num = str(step_num)
        with open(communication_file_path, 'r') as src_file:
            csv_reader = csv.reader(src_file)
            # When the step_num value is 0, it means the average value.
            # The last line of hccl_raw_{}.csv records the average value.
            # The first element of the last line is '-'.
            step_num = '-' if step_num == '0' else step_num
            for row in csv_reader:
                if row[0] == step_num:
                    communication_info = row
                    break
        # Convert string to floating point and dictionary
        if communication_info:
            communication_info[1] = float(communication_info[1])
            communication_info[2] = float(communication_info[2])
            communication_info[3] = json.loads(communication_info[3])
            communication_info[4] = json.loads(communication_info[4])

        return communication_info
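
A hedged sketch of one hccl_raw row and the conversion above; beyond what the code shows (index 0 is the step number or '-', indices 1-2 are floats, indices 3-4 hold embedded JSON), the values are made up:

    import json

    sample_row = ['1', '0.5', '1.2', '{"send": 0.3}', '{"wait": 0.2}']
    sample_row[1] = float(sample_row[1])
    sample_row[2] = float(sample_row[2])
    sample_row[3] = json.loads(sample_row[3])
    sample_row[4] = json.loads(sample_row[4])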
Example #10
    def _load_timeline_data(self):
        """Load timeline data from file."""
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        )
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.'
        )
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        timeline_list = []
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        timeline_list.append(line_list)
        except (IOError, OSError) as err:
            logger.error('Error occurred when reading the timeline intermediate file: %s', err)
            raise ProfilerIOException

        return timeline_list
Example #11
    def _get_op_task_id_map(self):
        """
        Read the HWTS data file and get the task time info.

        Returns:
            list, all HWTS task time info.
        """

        op_map_result = []
        hwts_list = []

        if not os.path.exists(self._hwts_output_file):
            logger.error('The hwts output file does not exist.')
            raise ProfilerFileNotFoundException('hwts output file')

        with open(self._hwts_output_file, 'r') as data_file:
            lines = data_file.readlines()
            for line in lines:
                if line.startswith("Start of task") or line.startswith(
                        "End of task"):
                    line_split = line.split()
                    container = HWTSContainer(line_split)
                    hwts_list.append(container)

        # hwts op map by taskId
        for hwts in hwts_list:
            if hwts.task_id in self._op_task_info:
                hwts.op_name = self._op_task_info[hwts.task_id]
                op_map_result.append(hwts)

        return op_map_result
Example #12
def get_memory_usage_breakdowns():
    """
    Get memory breakdowns of each node.

    Returns:
        Response, the memory breakdowns for each node.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/memory-breakdowns
    """
    summary_dir = request.args.get("dir")
    profiler_dir_abs = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    device_id = request.args.get("device_id", default='0')
    to_int(device_id, 'device_id')  # validate only; device_id stays a string
    device_type = request.args.get("device_type", default='ascend')
    graph_id = request.args.get("graph_id", default='0')
    node_id = request.args.get("node_id", default='0')
    node_id = to_int(node_id, 'node_id')
    if device_type not in ['ascend']:
        logger.error(
            "Invalid device_type, Memory Usage only supports Ascend for now.")
        raise ParamValueError("Invalid device_type.")

    analyser = AnalyserFactory.instance().get_analyser('memory_usage',
                                                       profiler_dir_abs,
                                                       device_id)
    breakdowns = analyser.get_memory_usage_breakdowns(device_type, graph_id,
                                                      node_id)

    return breakdowns
Example #13
def validate_and_normalize_profiler_path(summary_dir, summary_base_dir):
    """
    Validate and normalize profiler path.

    Args:
        summary_dir (str): The relative path of summary directory.
        summary_base_dir (str): The summary base directory.

    Returns:
        str, normalized path of profiler directory.
    """
    if not summary_dir:
        raise ProfilerParamValueErrorException('The summary dir is empty.')
    try:
        unquote_path = unquote(summary_dir, errors='strict')
    except UnicodeDecodeError:
        raise ProfilerParamValueErrorException('Unquote error with strict mode')
    profiler_dir = os.path.join(summary_base_dir, unquote_path, 'profiler')
    try:
        profiler_dir = validate_and_normalize_path(profiler_dir, 'profiler')
    except ValidationError:
        log.error('profiler dir <%s> is invalid', profiler_dir)
        raise ProfilerParamValueErrorException('Profiler dir is invalid.')

    return profiler_dir
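
A hedged usage sketch; the base directory and run name are hypothetical:

    # 'run_1' may also arrive URL-encoded (e.g. 'run%5F1'); unquote handles that.
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir='run_1', summary_base_dir='/home/user/summaries')
    # -> '/home/user/summaries/run_1/profiler' after normalization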
Example #14
    def _get_proc_details(self, proc_name, step_id=None, time_type='realtime'):
        """
        Get step trace info for selected step and save the result.

        Args:
            proc_name (str): The selected field name.
            step_id (int): The selected step_id. If not given, all steps are
                required. If the value is 0, the average info for all steps
                except the first is required. Default: None.
            time_type (str): The value type. `systime` keeps the original value.
                `realtime` converts the value to milliseconds. Default: `realtime`.
        """
        if proc_name is None:
            log.error('`proc_name` is required for query.')
            raise ProfilerParamValueErrorException(
                '`proc_name` is required for query.')
        if step_id is None:
            rows_info = self._data[:-1]
        else:
            rows_info = [self._data[step_id - 1]]

        proc_info = [
            get_field_value(row_info, proc_name, self.__column__, time_type)
            for row_info in rows_info
        ]
        self._result['info'] = {proc_name: proc_info}
Example #15
    def get_timeline_summary(self):
        """
        Get timeline summary information for UI display.

        Returns:
            json, the content of timeline summary information.
        """
        file_path = None
        summary_file_name = 'timeline_summary_{}.json'.format(self._device_id)
        if summary_file_name in os.listdir(self._profiling_dir):
            file_path = os.path.join(self._profiling_dir, summary_file_name)
        if file_path is None:
            # validate_and_normalize_path would fail on None.
            logger.info('No timeline summary file. Please check the output path.')
            return {}

        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline summary path.')

        timeline_summary = {}
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading the timeline summary file: %s', err)
                raise ProfilerIOException

        return timeline_summary
Example #16
    def get_display_timeline(self, device_type):
        """
        Get timeline data for UI display.

        Returns:
            json, the content of timeline data.
        """
        if device_type == "ascend":
            display_filename = self._ascend_display_filename.format(
                self._device_id)
        elif device_type == "gpu":
            display_filename = self._gpu_display_filename.format(
                self._device_id)
        else:
            logger.error(
                'Device type should be ascend or gpu. Please check the device type.')
            raise ParamValueError("Invalid device_type.")
        file_path = os.path.join(self._profiling_dir, display_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading the timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
Example #17
    def get_min_cycle_counter_from_file(self):
        """
        Get minimum cycle counter.

        Returns:
            float, the minimum value of the cycle counter.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id))

        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid min cycle counter file path.')

        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    min_cycle_counter = f_obj.read()
                    min_cycle_counter = float(min_cycle_counter) \
                        if min_cycle_counter != 'inf' else 0
            except (IOError, OSError) as err:
                logger.error(
                    'Error occurred when reading the minimum cycle counter: %s', err)
                raise ProfilerIOException
        else:
            min_cycle_counter = 0
            logger.info("No min cycle counter recorded.")

        return min_cycle_counter
Example #18
    def get_display_timeline(self):
        """
        Get timeline data for UI display.

        Returns:
            json, the content of timeline data.
        """
        # Search timeline json file under profiling dir.
        timeline_filename = self._timeline_filename.format(self._device_id)
        display_filename = self._display_filename.format(self._device_id)
        file_list = [
            filename for filename in os.listdir(self._profiling_dir)
            if timeline_filename in filename or display_filename in filename
        ]

        # Check if there is a timeline json file for display
        file_path = os.path.join(self._profiling_dir, display_filename)
        if display_filename not in file_list:
            file_path = os.path.join(self._profiling_dir, timeline_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading the timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
Example #19
    def _get_total_step_num(self):
        """Get the num of train step."""
        total_step_num = 0
        # Take the data of one of the machines to get the total number of steps.
        host_ip_dir = self._host_ips_dir[0]
        target_dir_path = os.path.join(self._cluster_profiler_dir,
                                       'cluster_profiler', host_ip_dir,
                                       'profiler')
        target_dir_path = validate_and_normalize_path(
            target_dir_path, raise_key="Invalid profiler dir path.")
        if not os.path.exists(target_dir_path):
            log.error('Did not find cluster_profiler dir : %s',
                      target_dir_path)
            raise ProfilerDirNotFoundException(
                msg='Did not find cluster_profiler dir:{}'.format(
                    target_dir_path))

        entries = os.scandir(target_dir_path)
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('hccl_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The first row holds the column names; the last row holds the average.
                if len(lines) > 2:
                    total_step_num = len(lines) - 2
                break
        return total_step_num
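
A hedged sketch of the line arithmetic above, with made-up file contents:

    # 1 header row + N step rows + 1 trailing average row.
    lines = ['step_num,...', '1,...', '2,...', '3,...', '-,avg']
    total_step_num = len(lines) - 2  # -> 3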
Example #20
    def load_timeline_data(self):
        """Load timeline data from file."""
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id))
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.')
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        stream_count_dict = {}
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        self._parse_timeline_data(line_list)
                        self._update_num_of_streams(line_list,
                                                    stream_count_dict)
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when reading the timeline intermediate file: %s', err)
            raise ProfilerIOException

        # Update timeline summary info.
        self._timeline_summary['num_of_streams'] = len(stream_count_dict)
Example #21
    def _get_minddata_queue_step_time_info(self):
        """Get the sampling time information at the steps of the host queue"""
        minddata_queue_step_time_info = []
        minddata_analyser = MinddataAnalyser(self._profiling_dir,
                                             self._device_id)
        file_path = minddata_analyser.get_device_queue_file_path()
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid device_queue file path")
        if not os.path.exists(file_path):
            log.error('Did not find the device queue file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the device queue file.')

        with open(file_path) as data_file:
            for line in data_file.readlines():
                op_info = line.split()
                # op_info is a list like:['1','64','8','2','85406783']
                # The value of the first element in op_info is '0' or '1'.
                # '0' means that the time information is recorded.
                # '1' means that the queue information is recorded.
                # '1':queue info , '64':queue capacity, '8':step_num, '2':queue size, '85406783':sampling time.
                if op_info and op_info[0] == "1":
                    minddata_queue_step_time_info.append(
                        [op_info[2], op_info[4]])
        return minddata_queue_step_time_info
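
A hedged sketch of the queue-info line format described in the comments above:

    sample_line = '1 64 8 2 85406783'  # flag, capacity, step_num, size, sampling time
    op_info = sample_line.split()
    if op_info and op_info[0] == '1':
        step_time = [op_info[2], op_info[4]]  # -> ['8', '85406783']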
Example #22
    def _get_file_path(self, device_type, file_type):
        """
        Get memory usage summary file.

        Args:
            device_type (str): Device type, e.g., GPU, Ascend.
            file_type (str): Memory usage file type, e.g., summary, details.

        Returns:
            str, file path of memory usage file corresponding to its file_type.
        """
        filename = ""
        if device_type == "ascend":
            if file_type is FileType.SUMMARY.value:
                filename = self._summary_filename.format(self._device_id)
            elif file_type is FileType.DETAILS.value:
                filename = self._details_filename.format(self._device_id)
        else:
            logger.error('Memory Usage only supports Ascend for now. Please check the device type.')
            raise ParamValueError("Invalid device type.")

        file_path = os.path.join(self._profiling_dir, filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid memory usage file path.'
        )

        return file_path
Example #23
def validate_group_condition(search_condition):
    """
    Verify the group_condition in search_condition is valid or not.

    Args:
        search_condition (dict): The search condition.

    Raises:
        ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
    """
    group_condition = search_condition.get("group_condition")
    if not isinstance(group_condition, dict):
        raise ProfilerGroupConditionException("The group condition must be dict.")
    if "limit" in group_condition:
        limit = group_condition.get("limit", 10)
        if isinstance(limit, bool) \
                or not isinstance(group_condition.get("limit"), int):
            log.error("The limit must be int.")
            raise ProfilerGroupConditionException("The limit must be int.")
        if limit < 1 or limit > 100:
            raise ProfilerGroupConditionException("The limit must in [1, 100].")

    if "offset" in group_condition:
        offset = group_condition.get("offset", 0)
        if isinstance(offset, bool) \
                or not isinstance(group_condition.get("offset"), int):
            log.error("The offset must be int.")
            raise ProfilerGroupConditionException("The offset must be int.")
        if offset < 0:
            raise ProfilerGroupConditionException("The offset must ge 0.")

        if offset > 1000000:
            raise ProfilerGroupConditionException("The offset must le 1000000.")
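
A hedged usage sketch; the search_condition shape is inferred from the checks above:

    search_condition = {'group_condition': {'limit': 20, 'offset': 0}}
    validate_group_condition(search_condition)  # passes

    bad_condition = {'group_condition': {'limit': True}}
    # validate_group_condition(bad_condition)  # raises: bool is not an acceptable int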
Example #24
def validate_minddata_pipeline_condition(condition):
    """
    Verify the minddata pipeline search condition is valid or not.

    Args:
        condition (dict): The minddata pipeline search condition.

    Raises:
        ProfilerParamTypeErrorException: If the type of the search condition is
            invalid.
        ProfilerDeviceIdException: If the device_id param in the search
            condition is invalid.
        ProfilerGroupConditionException: If the group_condition param in the
            search condition is invalid.
        ProfilerSortConditionException: If the sort_condition param in the
            search condition is invalid.
        ProfilerFilterConditionException: If the filter_condition param in the
            search condition is invalid.
    """
    if not isinstance(condition, dict):
        log.error("Invalid condition type, it should be dict.")
        raise ProfilerParamTypeErrorException(
            "Invalid condition type, it should be dict."
        )

    if "device_id" in condition:
        device_id = condition.get("device_id")
        if not isinstance(device_id, str):
            raise ProfilerDeviceIdException(
                "Invalid device_id type, it should be str."
            )

    if "group_condition" in condition:
        validate_group_condition(condition)

    if "sort_condition" in condition:
        validate_sort_condition(condition, MINDDATA_PIPELINE_COL)

    if "filter_condition" in condition:
        filter_condition = condition.get('filter_condition')
        if not isinstance(filter_condition, dict):
            raise ProfilerFilterConditionException(
                "The filter condition must be dict."
            )
        for key, value in filter_condition.items():
            if key == 'op_id':
                validate_op_filter_condition(
                    value, value_type=int, value_type_msg='int'
                )
            elif key == 'op_type':
                validate_op_filter_condition(value)
            elif key == 'is_display_op_detail':
                # Check the value, not the key, which is always a str here.
                if not isinstance(value, bool):
                    raise ProfilerFilterConditionException(
                        "The condition must be bool."
                    )
            else:
                raise ProfilerFilterConditionException(
                    "The key {} of filter_condition is not supported.".format(key)
                )
Example #25
def validate_condition(search_condition):
    """
    Verify the param in search_condition is valid or not.

    Args:
        search_condition (dict): The search condition.

    Raises:
        ProfilerParamTypeErrorException: If the type of the param in search_condition is invalid.
        ProfilerDeviceIdException: If the device_id param in search_condition is invalid.
        ProfilerOpTypeException: If the op_type param in search_condition is invalid.
        ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
        ProfilerSortConditionException: If the sort_condition param in search_condition is invalid.
        ProfilerFilterConditionException: If the filter_condition param in search_condition is invalid.
    """
    if not isinstance(search_condition, dict):
        log.error("Invalid search_condition type, it should be dict.")
        raise ProfilerParamTypeErrorException(
            "Invalid search_condition type, it should be dict.")

    if "device_id" in search_condition:
        device_id = search_condition.get("device_id")
        if not isinstance(device_id, str):
            raise ProfilerDeviceIdException(
                "Invalid device_id type, it should be str.")

    if "op_type" in search_condition:
        op_type = search_condition.get("op_type")
        if op_type == "aicpu_type":
            search_scope = AICPU_TYPE_COL
        elif op_type == "aicpu_detail":
            search_scope = AICPU_DETAIL_COL
        elif op_type == "aicore_type":
            search_scope = AICORE_TYPE_COL
        elif op_type == "aicore_detail":
            search_scope = AICORE_DETAIL_COL
        elif op_type == "gpu_op_type":
            search_scope = GPU_TYPE_COL
        elif op_type == "gpu_op_info":
            search_scope = GPU_DETAIL_COL
        elif op_type == "gpu_cuda_activity":
            search_scope = GPU_ACTIVITY_COL
        else:
            raise ProfilerOpTypeException(
                "The op_type must be in ['aicpu_type', 'aicpu_detail', 'aicore_type', "
                "'aicore_detail', 'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
    else:
        raise ProfilerOpTypeException(
            "The op_type must be in ['aicpu_type', 'aicpu_detail', 'aicore_type', "
            "'aicore_detail', 'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")

    if "group_condition" in search_condition:
        validate_group_condition(search_condition)

    if "sort_condition" in search_condition:
        validate_sort_condition(search_condition, search_scope)

    if "filter_condition" in search_condition:
        validate_filter_condition(search_condition)
Example #26
def _validate_str_param(proc_name, accept_param, error_name=''):
    """Validate proc_name."""
    if proc_name is None or (isinstance(proc_name, str) and proc_name in accept_param):
        return
    log.error("Invalid param %s in request. Acceptable value is %s.",
              error_name, accept_param)
    raise ProfilerParamValueErrorException(f"Invalid {error_name}.")
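
A hedged usage sketch; the accepted values are illustrative:

    _validate_str_param('fp_and_bp', ('fp_and_bp', 'tail'), error_name='proc_name')  # passes
    # _validate_str_param('foo', ('fp_and_bp', 'tail'), 'proc_name')
    # -> raises ProfilerParamValueErrorException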
Example #27
    def _validate_step_id(self, step_id):
        """Validate step_id."""
        if step_id is None or (isinstance(step_id, int) and 0 <= step_id <= self._size):
            return
        log.error("Invalid step_id in request. step_id should be in [0, %d].",
                  self._size)
        raise StepNumNotSupportedException([0, self._size])
Example #28
    def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
                 optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
        # Get device_id and device_target.
        # Initialize dev_id so a failed import below cannot leave it unbound.
        dev_id = ""
        device_target = ""
        try:
            import mindspore.context as context
            dev_id = str(context.get_context("device_id"))
            device_target = context.get_context("device_target")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            logger.error("Profiling: fail to get context, %s", err)

        if not dev_id:
            dev_id = os.getenv('DEVICE_ID')
        if not dev_id:
            dev_id = "0"
            logger.error("Fail to get DEVICE_ID, use 0 instead.")

        if device_target and device_target not in ("Davinci", "Ascend"):
            msg = "Profiling: unsupported backend: %s" % device_target
            raise RuntimeError(msg)

        self._dev_id = dev_id
        self._container_path = os.path.join(self._base_profiling_container_path, dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        self._output_path = validate_and_normalize_path(output_path,
                                                        'Profiler output path (' + output_path + ')')
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
        try:
            import mindspore.context as context
            context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            logger.error("Profiling: fail to set context, %s", err)

        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)
        self._subgraph = check_subgraph(subgraph)
        self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
Example #29
    def _get_file_content(file_path):
        """Get file content."""
        try:
            with open(file_path, 'r') as f_obj:
                file_content = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            log.error('Error occurred when reading the flops file: %s', err)
            raise ProfilerIOException()

        return file_content
Example #30
    def _load(self):
        """Load data from the parsed step trace time file."""
        file_path = query_latest_trace_time_file(self._profiling_dir, self._device_id)
        if not file_path:
            log.error("Failed to find parsed trace time file.")
            raise ProfilerFileNotFoundException('parsed step trace time file')
        with open(file_path, 'r') as handle:
            csv_reader = csv.reader(handle)
            self.__column__ = next(csv_reader)
            self._data = list(csv_reader)
        self._size = len(self._data) - 1
        self._display_col_names = self._col_names[:]
        self._load_point_info()