def _get_minddata_pipeline_info(self):
    """Get the number of thread cores in minddata pipeline operator.

    Returns:
        list[dict], one dict per pipeline operator with keys
        ``op_id`` and ``num_workers``.

    Raises:
        ProfilerFileNotFoundException: If the pipeline file does not exist.
        ProfilerRawFileException: If the pipeline file is not valid JSON.
    """
    file_name = self._minddata_pipeline_display_filename.format(
        self._device_id)
    file_path = os.path.join(self._profiling_dir, file_name)
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid minddata_pipeline_info file path.")
    if not os.path.exists(file_path):
        log.error('Did not find the minddata_pipeline file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the minddata_pipeline file:{}'.format(
                file_path))

    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            minddata_pipeline_info = json.load(file)
        except json.JSONDecodeError as err:
            log.exception(err)
            # Chain the decode error so the original traceback is kept.
            raise ProfilerRawFileException(
                "Fail to parse minddata pipeline file") from err

    minddata_pipeline_op_info = []
    # Fix: "op_info" may be absent; iterating None raised TypeError before.
    for item in minddata_pipeline_info.get("op_info") or []:
        minddata_pipeline_op_info.append({
            "op_id": item.get("op_id"),
            "num_workers": item.get("num_workers"),
        })
    return minddata_pipeline_op_info
def _load_point_info(self):
    """Load point info."""
    default_path = os.path.join(
        self._profiling_dir, 'step_trace_point_info.json')
    default_path = validate_and_normalize_path(
        default_path, raise_key="Invalid step_trace_point_info file path.")
    # Prefer the per-device file step_trace_point_info_{device_id}.json
    # when it exists; otherwise fall back to the shared file.
    device_path = os.path.join(
        self._profiling_dir,
        f'step_trace_point_info_{self._device_id}.json')
    device_path = validate_and_normalize_path(
        device_path, raise_key="Invalid step_trace_point_info file path.")
    chosen_path = device_path if os.path.isfile(device_path) else default_path

    if not os.path.isfile(chosen_path):
        return
    with open(chosen_path, 'r', encoding='utf-8') as file:
        try:
            self._point_info = json.load(file)
        except (json.JSONDecodeError, TypeError) as err:
            log.exception(err)
            raise ProfilerRawFileException('Fail to parse point info file.')
def _get_host_device_rank_relation(self): """Get host_ip device_id rank_id relation.""" rank_table_file_path = self._get_rank_table_file_path() if not os.path.exists(rank_table_file_path): log.error('Did not find rank table file under %s', self._cluster_profiler_dir) raise ProfilerFileNotFoundException( msg='Did not find rank table file') with open(rank_table_file_path, 'r', encoding='utf-8') as file: try: relation_info = json.load(file) except json.JSONDecodeError as err: log.exception(err) host_device_rank_relation = list() servers_info = relation_info.get("server_list") for server_info in servers_info: server_id = server_info.get("server_id") devices_info = server_info.get("device") for device_info in devices_info: device_id = device_info.get("device_id") rank_id = device_info.get("rank_id") host_device_rank_relation.append( [server_id, device_id, rank_id]) host_ips_mapping_info = self._get_host_ips_mapping_info() for item in host_device_rank_relation: # host_ip_index:0,host_mapping_id_index:1 target_info = [i for i in host_ips_mapping_info if item[0] == i[0]] # target_info is like:[[host_ip, host_mapping_ip]] item[0] = target_info[0][1] return host_device_rank_relation
def _load_point_info(self): """Load point info.""" file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json') if os.path.isfile(file_path): with open(file_path, 'r', encoding='utf-8') as file: try: self._point_info = json.load(file) except (json.JSONDecodeError, TypeError) as err: log.exception(err) raise ProfilerRawFileException('Fail to parse point info file.')
def parse_and_save(self):
    """Parse step trace files and save the result."""
    try:
        self._parse(self._get_step_trace_files())
        self._save()
    except IOError as err:
        log.exception(err)
        raise ProfilerIOException()
    else:
        log.info("Finish to save intermediate result for step trace file.")
def __init__(self, profiling_dir, device_id):
    """Initialize the analyser state and eagerly load its data.

    Args:
        profiling_dir (str): Directory containing the profiling files.
        device_id: Device ID used to locate the per-device data file.

    Raises:
        ProfilerIOException: If loading the profiling data fails with an
            I/O error.
    """
    self._profiling_dir = self._normalize_profiling_dir(profiling_dir)
    self._device_id = device_id
    self._data = []  # raw rows, populated by _load()
    self._result = None  # NOTE(review): presumably caches query results — confirm
    self._display_col_names = None  # column names used for display
    self._size = 0  # number of loaded rows
    self._none_filter_condition_key = []  # keys exempt from filter conditions
    try:
        # Load immediately so a broken data file fails fast at construction.
        self._load()
    except IOError as err:
        logger.exception(err)
        raise ProfilerIOException()
def _load(self):
    """Load cpu_utilization info."""
    file_path = os.path.join(
        self._profiling_dir,
        self._cpu_utilization_display_filename.format(self._device_id))
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid cpu_utilization_info file path.")
    if not os.path.exists(file_path):
        log.error('Did not find the cpu utilization file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the cpu utilization file.')

    with open(file_path, 'r', encoding='utf-8') as src_file:
        try:
            self._data = json.load(src_file)
        except json.JSONDecodeError as err:
            log.exception(err)
            raise ProfilerRawFileException(
                "Fail to parse cpu_utilization info file")
def minddata_cpu_utilization_proposal(self):
    """Get the proposals of minddata cpu utilization"""
    file_path = os.path.join(
        self.profiling_path,
        "minddata_cpu_utilization_{}.json".format(self.device_id))
    # Forward compatibility, it is reasonable that the file does not exist.
    if not os.path.exists(file_path):
        return

    proposal = OrderedDict()
    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_cpu_utilization', self.profiling_path, self.device_id)
    try:
        # Active utilization is the complement of idle; it tops out at 100%.
        active_avg = 100 - analyser.get_idle_utilization_avg()
        # Only propose when average active utilization exceeds 80%.
        if active_avg > 80:
            proposal["minddata_cpu_utilization"] = [active_avg]
            self.__proposal_dict.update(proposal)
    except (ProfilerRawFileException, ProfilerFileNotFoundException) as err:
        log.exception(err)
def parse(self):
    """
    Parse the minddata pipeline files.

    Raises:
        ProfilerRawFileException: If fails to parse the raw file of
            minddata pipeline or the file is empty.
    """
    # Fix: open with an explicit encoding, consistent with the other
    # file readers in this module.
    with open(self._pipeline_path, 'r', encoding='utf-8') as file:
        try:
            pipeline_info = json.load(file)
        except (json.JSONDecodeError, TypeError) as err:
            logger.exception(err)
            raise ProfilerRawFileException(
                'Fail to parse minddata pipeline file.') from err
    if not pipeline_info:
        logger.warning('The minddata pipeline file is empty.')
        raise ProfilerRawFileException(
            'The minddata pipeline file is empty.')

    self._parse_and_save(pipeline_info)
def _convert_field_type(self, row): """ Convert the field type to the specific type. Args: row (list): One row data from parsed data. Returns: list, the converted data. """ try: return [ row[0], row[1], row[2], row[3], int(row[4]), self._format_float_data(float(row[5])), self._format_float_data(float(row[6])), self._format_float_data(float(row[7])), row[8] ] except IndexError as err: log.exception(err) raise ProfilerRawFileException( 'failed to get HOST CPU operator detail data.')
def _convert_field_type(row): """ Convert the field type to the specific type. Args: row (list): One row data from parsed data. Returns: list, the converted data. """ try: return [ row[0], int(row[1]), int(row[2]), float(row[3]), float(row[4]), float(row[5]) * 100 ] except IndexError as err: log.exception(err) raise ProfilerRawFileException( 'failed to get HOST CPU operator type data.')
def analyse(self):
    """
    Collect and analyse performance data, called after training or during training.

    Examples:
        >>> from mindinsight.profiler import Profiler
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        >>>                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
        >>> model = Model(train_network)
        >>> dataset = get_dataset()
        >>> model.train(2, dataset)
        >>> profiler.analyse()
    """
    # Best-effort release of the communication resources before analysing;
    # missing mindspore is only logged, not fatal.
    try:
        from mindspore.communication.management import release
        release()
    except ImportError:
        logger.error("Profiling: fail to import release from mindspore.")
    logger.info("begin profiler analyse")
    job_id = self._get_profiling_job_id()
    if not job_id:
        msg = ("Fail to get profiling job, please check whether job dir was generated under path %s" \
               % PROFILING_LOG_BASE_PATH)
        raise RuntimeError(msg)
    logger.info("Profiling: job id is %s ", job_id)
    source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
    # parse hwts.log.data.45.dev file, and get task profiling data
    hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
    hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
    hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
    result = hwtslog_parser.execute()
    if not result:
        # Later stages depend on the hwts output file, so stop here.
        logger.error("Profiling: fail to parse hwts log file.")
        return
    # parse Framework file, and get the relation of op and tasks
    framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
    framework_parser.parse()
    op_task_dict = framework_parser.to_task_id_full_op_name_dict()
    if not op_task_dict:
        logger.error("Profiling: fail to parse framework files.")
        return
    # get op compute time from hwts data and framework data, write output_op_compute_time.txt
    opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
    opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
    optime_parser = OPComputeTimeParser(hwts_output_filename, opcompute_output_filename, op_task_dict)
    optime_parser.execute()
    # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
    output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
    output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
    # AICPU data is optional: a missing source file is logged and skipped.
    try:
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        aicpu_data_parser.execute()
    except FileNotFoundError as err:
        logger.exception(err)
    # analyse op compute time info; analyser errors are logged, not raised
    try:
        self._analyser_op_info()
    except MindInsightException as err:
        logger.error(err.message)