def get_timeline_summary(self):
    """
    Get timeline summary information for UI display.

    Returns:
        json, the content of timeline summary information.
    """
    summary_filename = self._timeline_summary_filename.format(self._device_id)
    file_path = os.path.join(self._profiling_dir, summary_filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline summary path.'
    )

    timeline_summary = {}
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                timeline_summary = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when reading timeline summary file: %s', err)
            raise ProfilerIOException
    else:
        logger.info('No timeline summary file. Please check the output path.')

    return timeline_summary
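# A minimal usage sketch (an assumption, following the AnalyserFactory pattern
# used in get_timeline_detail below; 'profiling_dir' and 'device_id' are
# placeholders, not values from this file):
#
# >>> analyser = AnalyserFactory.instance().get_analyser('timeline', profiling_dir, device_id)
# >>> summary = analyser.get_timeline_summary()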
def analyze(self, options=None):
    """
    Get the proposal from the proposer.

    Args:
        options (dict): Options for proposer analysis.

            - step_trace: Optional parameters for step trace. The dictionary key
              ``iter_interval`` is used to get the analyser options for the
              iteration interval time.

    Returns:
        dict, the proposal from the proposer instance. The dictionary key is a
        language internationalization label, and the value is used to format
        the corresponding internationalized string.

    Examples:
        >>> proposer_type = 'step_trace'
        >>> proposer = ProposerFactory.instance().get_proposer(proposer_type, self.profiling_dir, self.device_id)
        >>> result = proposer.analyze(options)
    """
    logger.info("The StepTraceProposer is running")
    options = get_options(options)
    logger.debug("The StepTraceProposer 'options' is %s", str(options))
    step_trace_condition = options.get("step_trace", {})
    # Get the proposals of the iteration interval.
    self._iter_interval_analyze(step_trace_condition)
    return self.__proposal_dict
def get_timeline_detail():
    """
    Get timeline detail.

    Returns:
        Response, the detail information of timeline.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    if not os.path.exists(profiler_dir):
        raise ProfilerDirNotFoundException(msg=summary_dir)

    device_id = request.args.get("device_id", default='0')
    _ = to_int(device_id, 'device_id')
    device_type = request.args.get("device_type", default='ascend')
    if device_type not in ['gpu', 'ascend']:
        logger.info("Invalid device_type, device_type should be gpu or ascend.")
        raise ParamValueError("Invalid device_type.")

    analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir, device_id)
    timeline = analyser.get_display_timeline(device_type)

    return jsonify(timeline)
def get_display_timeline(self, device_type):
    """
    Get timeline data for UI display.

    Args:
        device_type (str): The device type, should be 'ascend' or 'gpu'.

    Returns:
        json, the content of timeline data.
    """
    if device_type == "ascend":
        display_filename = self._ascend_display_filename.format(self._device_id)
    elif device_type == "gpu":
        display_filename = self._gpu_display_filename.format(self._device_id)
    else:
        logger.info('device_type should be ascend or gpu. Please check the device type.')
        raise ParamValueError("Invalid device_type.")
    file_path = os.path.join(self._profiling_dir, display_filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline json path.')

    timeline = []
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                timeline = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when reading timeline display file: %s', err)
            raise ProfilerIOException
    else:
        logger.info('No timeline file. Please check the output path.')

    return timeline
def get_display_timeline(self):
    """
    Get timeline data for UI display.

    Returns:
        json, the content of timeline data.
    """
    # Search the timeline json file under the profiling dir.
    display_filename = self._display_filename.format(self._device_id)
    # Check if there is a timeline json file for display.
    file_path = os.path.join(self._profiling_dir, display_filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline json path.'
    )

    timeline = []
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                timeline = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            # json.JSONDecodeError is caught as well, so a corrupted file
            # surfaces as ProfilerIOException instead of an unhandled error.
            logger.error('Error occurred when reading timeline display file: %s', err)
            raise ProfilerIOException
    else:
        logger.info('No timeline file. Please check the output path.')

    return timeline
def _analyse_timeline(self):
    """Analyse and parse timeline info."""
    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', self._output_path, self._dev_id
    )
    min_cycle_counter = timeline_analyser.get_min_cycle_counter()

    # Get framework info.
    aicoredetail_analyser = AnalyserFactory.instance().get_analyser(
        'aicore_detail', self._output_path, self._dev_id
    )
    framework_info = aicoredetail_analyser.query()

    # Get AllReduce info.
    step_trace_analyser = AnalyserFactory.instance().get_analyser(
        'step_trace', self._output_path, self._dev_id
    )
    all_reduce_info = step_trace_analyser.query_for_all_reduce(min_cycle_counter)

    # Get timeline info.
    logger.info('Start writing timeline info...')
    logger.info('Note: it could take a few minutes if you are training '
                'with a complex network or for more than 10 steps.')
    # Add AllReduce and framework info into the timeline.
    timeline_analyser.init_timeline(all_reduce_info, framework_info)
    timeline_analyser.write_timeline()
    timeline_analyser.write_timeline_summary()
def init_timeline(self, all_reduce_info, framework_info):
    """
    Init timeline metadata, adding all collected info.

    Args:
        all_reduce_info (list[list]): The metadata of the AllReduce operator.
        framework_info (dict): The framework metadata.
    """
    logger.info('Initiating timeline...')
    timeline_list = self._load_timeline_data()
    self._timeline_summary['op_exe_times'] = len(timeline_list)

    # Add AllReduce info to the timeline temp list and sort by start time.
    if all_reduce_info:
        logger.debug('AllReduce info found. Start adding info into timeline...')
        timeline_list.extend(all_reduce_info)
        timeline_list.sort(key=lambda x: float(x[2]))

    # Init a dict for counting the number of streams.
    stream_count_dict = {}
    for timeline in timeline_list:
        self._parse_timeline_data(timeline)
        # Update the collection of streams.
        if len(timeline) == 4:
            self._update_num_of_streams(timeline, stream_count_dict)

    # Get framework metadata.
    framework_obj_list = framework_info.get('object')
    # The length of the list is the number of operators.
    self._timeline_summary['num_of_ops'] = len(framework_obj_list)
    self._add_framework_info(framework_obj_list)
    logger.info('Finished adding info into timeline...')

    # Update timeline summary info.
    self._timeline_summary['num_of_streams'] = len(stream_count_dict.keys())
def get_min_cycle_counter(self):
    """
    Get the minimum cycle counter.

    Returns:
        float, the minimum value of the cycle counter.
    """
    file_path = os.path.join(
        self._profiling_dir,
        self._min_cycle_counter_file_path.format(self._device_id)
    )
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid min cycle counter file path.'
    )

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                min_cycle_counter = f_obj.read()
                min_cycle_counter = float(min_cycle_counter) \
                    if min_cycle_counter != 'inf' else 0
        except (IOError, OSError) as err:
            logger.error('Error occurred when reading minimum cycle counter: %s', err)
            raise ProfilerIOException
    else:
        min_cycle_counter = 0
        logger.info("No min cycle counter recorded.")

    return min_cycle_counter
def _get_reduce_time_in_order(self, row_info_dict):
    """
    Get reduce time in order.

    Args:
        row_info_dict (dict): Step trace information.

    Returns:
        dict, sorted reduce information, formatted like:
        {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}
    """
    reduce_info = {}
    reduce_fields = [
        field_name for field_name in self.__column__
        if field_name.startswith('stream_') and not field_name.endswith('point')
    ]
    for reduce_field in reduce_fields:
        reduce_start = row_info_dict.get(reduce_field + '_start_point', 0)
        reduce_end = row_info_dict.get(reduce_field + '_end_point', 0)
        reduce_duration = row_info_dict.get(reduce_field, 0)
        if not (reduce_start and reduce_end and reduce_duration):
            log.info("Reduce event missing value.")
            continue
        # Field names follow the pattern 'stream_<id>...', so the stream id is
        # the second '_'-separated token.
        cur_stream_id = reduce_field.split('_', 2)[1]
        cur_stream = reduce_info.get(cur_stream_id)
        if not cur_stream:
            cur_stream = []
            reduce_info[cur_stream_id] = cur_stream
        cur_stream.append(
            (reduce_start, reduce_end, reduce_duration, reduce_field))

    # Sort each stream's events by their end point.
    for _, reduce_events in reduce_info.items():
        reduce_events.sort(key=lambda elem: elem[1])
    return reduce_info
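# Illustration (assumed column layout, hypothetical values): a column named
# 'stream_1' has companion keys 'stream_1_start_point' and 'stream_1_end_point':
row_info_dict = {'stream_1_start_point': 120, 'stream_1_end_point': 150, 'stream_1': 30}
# The method above would group this row as {'1': [(120, 150, 30, 'stream_1')]},
# with each stream's tuples sorted by their end point.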
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
             optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
    # Get device_id and device_target. Initialize dev_id first so that the
    # fallback below works even when the import fails.
    dev_id = ""
    device_target = ""
    try:
        import mindspore.context as context
        dev_id = str(context.get_context("device_id"))
        device_target = context.get_context("device_target")
    except ImportError:
        logger.error("Profiling: fail to import context from mindspore.")
    except ValueError as err:
        logger.error("Profiling: fail to get context, %s", err)

    if not dev_id:
        dev_id = os.getenv('DEVICE_ID')
    if not dev_id:
        dev_id = "0"
        logger.error("Fail to get DEVICE_ID, use 0 instead.")

    if device_target and device_target != "Davinci" and device_target != "Ascend":
        msg = "Profiling: unsupported backend: %s" % device_target
        raise RuntimeError(msg)

    self._dev_id = dev_id
    self._container_path = os.path.join(self._base_profiling_container_path, dev_id)
    data_path = os.path.join(self._container_path, "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    self._output_path = validate_and_normalize_path(
        output_path, 'Profiler output path (' + output_path + ')')
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path)

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
    # Use the context interface to enable profiling (for MindSpore versions after 2020.5.21).
    try:
        import mindspore.context as context
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
    except ImportError:
        logger.error("Profiling: fail to import context from mindspore.")
    except ValueError as err:
        logger.error("Profiling: fail to set context, %s", err)

    os.environ['AICPU_PROFILING_MODE'] = 'true'
    os.environ['PROFILING_DIR'] = str(self._container_path)
    self._subgraph = check_subgraph(subgraph)
    self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
    self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
    self._detail = check_bool(is_detail, 'is_detail')
    self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
    self._profiling_job_id = job_id

    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
def get_proposal(self, options=None):
    """
    Get composed proposals.

    Args:
        options (dict): Options for the composed proposal.

            - compose_proposal_result: Execution results of the proposers that
              have already run.
            - step_trace: Optional parameters for step trace. The dictionary key
              ``iter_interval`` is used to get the analyser options for the
              iteration interval time.

    Returns:
        dict, the proposals from multiple different proposers.

    Examples:
        >>> type_list = ['common', 'step_trace']
        >>> condition = {"filter_condition": {'mode': "proc", "proc_name": "iteration_interval"}}
        >>> options = {'step_trace': {"iter_interval": condition}}
        >>> cp = ComposeProposal(self.profiling_dir, self.device_id, type_list)
        >>> result_proposal = cp.get_proposal(options=options)
    """
    logger.info("The ComposeProposal is running")
    options = get_options(options)
    logger.debug("The 'options' is %s", str(options))
    # The flag of whether to write the category label.
    type_label_flag = options.get("type_label_flag", True)
    compose_proposal_result = OrderedDict()
    logger.debug("The 'compose_proposer_type_list' is %s", str(self.compose_proposer_type_list))
    for proposer_type in self.compose_proposer_type_list:
        proposer = ProposerFactory.instance().get_proposer(
            proposer_type, self.profiling_path, self.device_id)
        if proposer is None:
            continue
        # Write the accumulated proposals to the options so that later
        # proposers can read them.
        options["compose_proposal_result"] = compose_proposal_result
        result = proposer.analyze(options)
        if result and type_label_flag:
            # Insert the category label.
            proposer_type_label = proposer_type + "-type_label"
            # Get the name of the category label; the default is the same as the proposer type.
            type_label_name = options.get(proposer_type_label, proposer_type)
            # Add the postfix to the category label name.
            type_proposal_label = type_label_name + self.type_label_postfix
            compose_proposal_result[type_proposal_label] = None
            # Merge the results into the proposals dictionary.
            compose_proposal_result.update(result)
        elif result and not type_label_flag:
            # Merge the results into the proposals dictionary.
            compose_proposal_result.update(result)

    logger.debug("The 'compose_proposal_result' is %s", str(compose_proposal_result))
    return compose_proposal_result
def _save(self):
    """Save the parsed step trace result to a read-only CSV file."""
    log.info("Start to save step trace file.")
    if not self._header:
        return
    with open(self._output_path, 'w') as file_handle:
        csv_writer = csv.writer(file_handle)
        csv_writer.writerow(self._header)
        for row_data in self._result:
            csv_writer.writerow(row_data)
    os.chmod(self._output_path, stat.S_IREAD)
def parse_and_save(self):
    """Parse step trace files and save the result."""
    try:
        source_files = self._get_step_trace_files()
        self._parse(source_files)
        self._save()
    except IOError as err:
        log.exception(err)
        raise ProfilerIOException()
    else:
        log.info("Finished saving the intermediate result for the step trace file.")
def execute(self):
    """Execute the parser, get result data, and write it to the output file."""
    if not os.path.exists(self._source_file_name):
        logger.info("Did not find the aicpu profiling source file.")
        return

    with open(self._source_file_name, 'rb') as ai_cpu_data:
        # Null bytes separate records; replace them with a sentinel and split.
        ai_cpu_str = str(ai_cpu_data.read().replace(b'\n\x00', b' ___ ')
                         .replace(b'\x00', b' ___ '))[2:-1]
        ai_cpu_lines = ai_cpu_str.split(" ___ ")

    node_list = list()
    ai_cpu_total_time_summary = 0
    # Node serial number.
    serial_number = 1
    for i in range(len(ai_cpu_lines) - 1):
        node_line = ai_cpu_lines[i]
        thread_line = ai_cpu_lines[i + 1]
        if "Node" in node_line and "Thread" in thread_line:
            # Get the node data from node_line.
            node_name = node_line.split(',')[0].split(':')[-1]
            run_v2_start = node_line.split(',')[1].split(':')[-1]
            compute_start = node_line.split(',')[2].split(':')[-1]
            mercy_start = node_line.split(',')[3].split(':')[-1]
            mercy_end = node_line.split(',')[4].split(':')[-1]
            run_v2_end = node_line.split(',')[5].split(':')[-1]
            # Get total_time and dispatch_time from thread_line.
            total_time = thread_line.split(',')[-1].split('=')[-1].split()[0]
            dispatch_time = thread_line.split(',')[-2].split('=')[-1].split()[0]

            node_data = [serial_number, node_name, total_time, dispatch_time,
                         run_v2_start, compute_start, mercy_start, mercy_end, run_v2_end]
            node_list.append(node_data)
            # Accumulate the total time.
            ai_cpu_total_time_summary += int(total_time)
            # Increase the node serial number.
            serial_number += 1
        elif "Node" in node_line and "Thread" not in thread_line:
            node_name = node_line.split(',')[0].split(':')[-1]
            logger.warning("The node:%s cannot find thread data", node_name)

    node_list.append(["AI CPU Total Time(us):", ai_cpu_total_time_summary])

    if node_list:
        fwrite_format(self._output_filename, data_source=_dst_file_title,
                      is_print=True, is_start=True)
        fwrite_format(self._output_filename,
                      data_source=tabulate(node_list, _dst_file_column_title, tablefmt='simple'),
                      is_start=True, is_print=True)
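# Hypothetical illustration of the split chains above (the real AI CPU record
# layout is not shown in this file; the field names and values here are invented):
node_line = "Node:Cast-op1, RunV2Start:1001, ComputeStart:1002, MercyStart:1003, MercyEnd:1004, RunV2End:1005"
thread_line = "Thread args, dispatch_time=2 us, total_time=10 us"
print(node_line.split(',')[0].split(':')[-1])                # 'Cast-op1'
print(thread_line.split(',')[-1].split('=')[-1].split()[0])  # '10'  (total_time)
print(thread_line.split(',')[-2].split('=')[-1].split()[0])  # '2'   (dispatch_time)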
def analyze(self, options=None):
    """
    Get the proposal from the proposer.

    Args:
        options (dict): Options for proposer analysis. Default: None.

    Returns:
        dict, the proposal from the proposer instance.
    """
    logger.info('The CommonProposer is running')
    options = get_options(options)
    return self.__proposal_dict
def _parse(self, source_files):
    """Parse the source step trace files."""
    log.info("Start to parse step trace file.")
    event_info = {}
    for source_file in source_files:
        with open(source_file, 'rb') as handler:
            content = handler.read()
            for step_trace in self._get_next_step_trace(content, event_info):
                if self._skip_first_step:
                    self._skip_first_step = False
                    continue
                self._record_trace_event(step_trace)
    self._record_average_info()
    log.info("Finished parsing step trace file.")
def _get_profiling_job_id(self):
    """
    Get the profiling job id, which was generated by the ada service.

    Returns:
        str, the profiling job id.
    """
    if self._profiling_job_id:
        return self._profiling_job_id

    job_id = ""
    cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'"
    r = os.popen(cmd)
    profiling_job_dirs = r.readlines()
    r.close()
    for item in profiling_job_dirs:
        path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip())
        log_file = get_file_names(path, "host_start.log")
        if not log_file:
            logger.error("Profiling: job path %s, host_start.log not exist.", path)
            continue

        log_file = os.path.join(path, log_file[0])
        item_dict = self._parse_host_start_log(log_file)

        if not item_dict:
            logger.error("Profiling: job path %s, fail to get job start info.", path)
            continue
        # Directories are listed newest first; stop once the job predates training.
        if self._start_time > int(item_dict["start_time"]):
            logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
                        path, item_dict["start_time"], self._start_time)
            break

        if self._dev_id != item_dict["device_id"]:
            logger.info("Profiling: job path %s, dev id %s, training device id %s.",
                        path, item_dict["device_id"], self._dev_id)
            continue

        job_id = item.strip()
        break

    if not job_id:
        msg = "Fail to get profiling job, please check whether the job dir was generated"
        raise RuntimeError(msg)

    return job_id
def _record_average_info(self):
    """Calculate average info."""
    result_size = len(self._result)
    # Calculate the average of each column over all rows except the first.
    average_data = [0] * len(self._header)
    if result_size >= 2:
        for row_info in self._result[1:]:
            average_data = [
                Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
            ]
        average_data = [
            round(item / (result_size - 1)) for item in average_data
        ]
        # Replace the step_num column in average_data with a placeholder,
        # since averaging step numbers is meaningless.
        step_num_index = self._header.index('step_num')
        average_data[step_num_index] = '-'
    self._result.append(average_data)
    log.info("Finished adding average info for step trace.")
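# A standalone worked example (hypothetical numbers) of the averaging above:
# the first row is excluded, the remaining rows are summed with Decimal for
# exact arithmetic, and each column is divided by (result_size - 1).
from decimal import Decimal

rows = [[1, 100, 10], [2, 200, 20], [3, 300, 40]]  # columns: step_num, total, interval
average_data = [0] * 3
for row in rows[1:]:
    average_data = [Decimal(i) + Decimal(j) for i, j in zip(row, average_data)]
average_data = [round(item / (len(rows) - 1)) for item in average_data]
average_data[0] = '-'  # averaging step numbers is meaningless
print(average_data)    # ['-', 250, 30]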
def query(self, condition=None):
    """
    Query data according to the search condition.

    Args:
        condition (dict): The search condition, which only contains the
            `filter_condition` parameter. Default: None.

    Returns:
        dict, the result after being filtered, sorted and grouped.
    """
    if condition is None:
        condition = {}
    filter_condition = condition.get('filter_condition', {})
    log.info("Receive query request. %s", filter_condition)
    self._validate_filter_condition(filter_condition)
    self._result = {'size': self._size}
    self._filter(filter_condition)
    return self._result
def get_display_timeline(self, device_type, scope_name_num):
    """
    Get timeline data for UI display.

    Args:
        device_type (str): The device type, should be 'ascend' or 'gpu'.
        scope_name_num (int): The number of scope name levels to keep.

    Returns:
        json, the content of timeline data.
    """
    if device_type == "ascend":
        display_filename = self._ascend_display_filename.format(self._device_id)
    elif device_type == "gpu":
        display_filename = self._gpu_display_filename.format(self._device_id)
    else:
        logger.info('device_type should be ascend or gpu. Please check the device type.')
        raise ParamValueError("Invalid device_type.")
    file_path = os.path.join(self._profiling_dir, display_filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline json path.')

    timeline = []
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                timeline = json.load(f_obj)
                # Drop scope-name events whose level is beyond the requested
                # number of levels, then filter out the None placeholders.
                for idx, time_item in enumerate(timeline):
                    if time_item["tid"] == 100001 and \
                            time_item["ph"] != "M" and \
                            int(time_item["scope_level"]) >= int(scope_name_num):
                        timeline[idx] = None
                timeline = list(filter(lambda x: x, timeline))
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when reading timeline display file: %s', err)
            raise ProfilerIOException()
    else:
        logger.info('No timeline file. Please check the output path.')

    return timeline
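# A runnable sketch (hypothetical events) of the scope-level filtering above:
timeline = [
    {"tid": 100001, "ph": "X", "scope_level": "2", "name": "Default"},
    {"tid": 100001, "ph": "X", "scope_level": "0", "name": "Network"},
    {"tid": 11, "ph": "X", "name": "Conv2D"},
]
scope_name_num = 2
for idx, item in enumerate(timeline):
    if item["tid"] == 100001 and item["ph"] != "M" and int(item["scope_level"]) >= scope_name_num:
        timeline[idx] = None
timeline = list(filter(lambda x: x, timeline))
print([t["name"] for t in timeline])  # ['Network', 'Conv2D']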
def _sort_reduce_by_time(self, row_info_dict):
    """
    Sort reduce info by time.

    Args:
        row_info_dict (dict): Step trace information.

    Returns:
        list, the all reduce info extracted from the row, with times in ms.
        Each item is formatted like:
        [reduce_field, stream_id, reduce_start, reduce_duration, reduce_pid]
    """
    factor = 1e5  # convert the time unit from 10 ns ticks to ms
    reduce_pid = 10000
    reduce_info = []
    reduce_fields = [
        field_name for field_name in self.__column__
        if field_name.startswith('stream_') and not field_name.endswith('point')
    ]
    for reduce_field in reduce_fields:
        reduce_start = row_info_dict.get(reduce_field + '_start_point')
        reduce_start = reduce_start / factor if reduce_start else 0
        reduce_duration = row_info_dict.get(reduce_field)
        reduce_duration = reduce_duration / factor if reduce_duration else 0
        if not (reduce_start and reduce_duration):
            log.info("Reduce event missing value.")
            continue
        cur_stream_id = reduce_field.split('_', 2)[1]
        reduce_meta = [
            reduce_field, int(cur_stream_id), reduce_start,
            reduce_duration, reduce_pid
        ]
        reduce_info.append(reduce_meta)

    return reduce_info
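# Arithmetic behind the 1e5 factor above: the raw counters tick in units of
# 10 ns, and 1 ms = 1e6 ns = 1e5 ticks, so dividing by 1e5 yields milliseconds.
raw_ticks = 250_000      # hypothetical start point, in 10 ns units
print(raw_ticks / 1e5)   # 2.5 (ms)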
def _analyse_step_trace(self, source_path, framework_parser):
    """
    Analyse step trace data and save the result.

    Args:
        source_path (str): The directory that contains the step trace original data.
        framework_parser (FrameworkParser): The framework parser instance.
    """
    logger.info("Begin to parse step trace.")
    # Construct the output paths.
    step_trace_intermediate_file_path = os.path.join(
        self._output_path,
        f'step_trace_raw_{self._dev_id}_detail_time.csv'
    )
    point_info_file_path = os.path.join(
        self._output_path,
        'step_trace_point_info.json'
    )
    # Whether to skip the first step.
    skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
    point_info = framework_parser.point_info
    # Parse the step trace files and save the result to disk.
    parser = StepTraceParser(input_dir=source_path,
                             output_file_path=step_trace_intermediate_file_path,
                             job_id=self._job_id_env,
                             skip_first_step=skip_first_step_flag)
    parser.parse_and_save()
    point_info = parser.record_point_info(point_info, point_info_file_path)
    # Print the parser result.
    parser.show()
    logger.info("Finished saving the intermediate result: %s", step_trace_intermediate_file_path)
    logger.info("The point info is: %s", point_info)
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
             optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
    # Get device_id and device_target.
    self._get_devid_and_devtarget()
    self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
    data_path = os.path.join(self._container_path, "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path, exist_ok=True)
    self._output_path = validate_and_normalize_path(
        output_path, 'Profiler output path (' + output_path + ')')
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
    os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
    os.environ['DEVICE_ID'] = self._dev_id
    # Use the context interface to enable profiling (for MindSpore versions after 2020.5.21).
    try:
        import mindspore.context as context
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
    except ImportError:
        logger.error("Profiling: fail to import context from mindspore.")
    except ValueError:
        logger.error("Profiling: fail to set context enable_profiling")

    os.environ['AICPU_PROFILING_MODE'] = 'true'
    os.environ['PROFILING_DIR'] = str(self._container_path)
    self._subgraph = check_subgraph(subgraph)
    self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
    self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
    self._detail = check_bool(is_detail, 'is_detail')
    self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
    self._profiling_job_id = job_id
    # The job id env will be added through user input later.
    self._job_id_env = 0
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
def add_framework_info(self, framework_info):
    """
    Add framework info into timeline metadata.

    Args:
        framework_info (dict): The framework metadata.
    """
    logger.info('Adding framework info...')
    framework_obj_list = framework_info.get('object')
    self._timeline_summary['num_of_ops'] = len(framework_obj_list)
    for framework_obj in framework_obj_list:
        op_name = framework_obj[0]
        op_type = framework_obj[1]
        op_full_name = framework_obj[4]
        op_info = framework_obj[5]
        # Match each framework op with its timeline event by full name, then
        # enrich the event with the op type and detail info.
        for timeline_obj in self._timeline_meta:
            if op_full_name == timeline_obj.get('name'):
                timeline_obj['name'] = op_name
                timeline_obj['args'] = {
                    'type': op_type,
                    'fullname': op_full_name
                }
                timeline_obj['args'].update(op_info)
def analyse(self):
    """
    Collect and analyse performance data; called after training or during training.

    Examples:
        >>> from mindinsight.profiler import Profiler
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        >>>                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
        >>> model = Model(train_network)
        >>> dataset = get_dataset()
        >>> model.train(2, dataset)
        >>> profiler.analyse()
    """
    try:
        from mindspore.communication.management import release
        release()
    except ImportError:
        logger.error("Profiling: fail to import release from mindspore.")

    logger.info("begin profiler analyse")

    job_id = self._get_profiling_job_id()
    if not job_id:
        msg = ("Fail to get profiling job, please check whether the job dir was generated under path %s"
               % PROFILING_LOG_BASE_PATH)
        raise RuntimeError(msg)
    logger.info("Profiling: job id is %s ", job_id)

    source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
    # Parse the hwts.log.data.45.dev file to get task profiling data.
    hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
    hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
    hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
    result = hwtslog_parser.execute()
    if not result:
        logger.error("Profiling: fail to parse hwts log file.")
        return

    # Parse the framework file to get the relation between ops and tasks.
    framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
    framework_parser.parse()
    op_task_dict = framework_parser.to_task_id_full_op_name_dict()
    if not op_task_dict:
        logger.error("Profiling: fail to parse framework files.")
        return

    # Get op compute time from the hwts and framework data, and write
    # output_op_compute_time.txt.
    opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
    opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
    optime_parser = OPComputeTimeParser(hwts_output_filename, opcompute_output_filename, op_task_dict)
    optime_parser.execute()

    # Parse the DATA_PREPROCESS.dev.AICPU file and write output_data_preprocess_aicpu_x.txt.
    output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
    output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
    try:
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        aicpu_data_parser.execute()
    except FileNotFoundError as err:
        logger.exception(err)

    # Analyse the op compute time info.
    try:
        self._analyser_op_info()
    except MindInsightException as err:
        logger.error(err.message)
def analyse(self):
    """
    Collect and analyse performance data; called after training or during training.

    Examples:
        >>> from mindinsight.profiler import Profiler
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        >>>                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
        >>> model = Model(train_network)
        >>> dataset = get_dataset()
        >>> model.train(2, dataset)
        >>> profiler.analyse()
    """
    try:
        from mindspore.communication.management import release
        release()
    except ImportError:
        logger.error("Profiling: fail to import release from mindspore.")

    job_id = self._get_profiling_job_id()
    logger.info("Profiling: job id is %s ", job_id)

    source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
    # Parse the hwts.log.data.45.dev file to get task profiling data.
    hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
    hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
    hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
    result = hwtslog_parser.execute()
    if not result:
        logger.error("Profiling: fail to parse hwts log file.")
        return

    # Parse the framework file to get the relation between ops and tasks.
    framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
    framework_parser.parse()
    op_task_dict = framework_parser.to_task_id_full_op_name_dict()
    if not op_task_dict:
        logger.error("Profiling: fail to parse framework files.")
        return

    # Get op compute time from the hwts and framework data, and write
    # output_op_compute_time.txt.
    opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
    opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
    optime_parser = OPComputeTimeParser(
        hwts_output_filename, opcompute_output_filename,
        op_task_dict, self._output_path, self._dev_id
    )
    optime_parser.execute()

    # Parse the DATA_PREPROCESS.dev.AICPU file and write output_data_preprocess_aicpu_x.txt.
    output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
    output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
    aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
    aicpu_data_parser.execute()

    # Parse minddata AICPU profiling data.
    MinddataParser.execute(source_path, self._output_path, self._dev_id)

    # Parse minddata pipeline operator and queue data.
    try:
        pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
        pipeline_parser.parse()
    except MindInsightException as err:
        logger.warning(err.message)

    # Analyse the op compute time info.
    try:
        self._analyser_op_info()
    except MindInsightException as err:
        logger.warning(err.message)

    # Analyse the step trace info.
    try:
        self._analyse_step_trace(source_path, framework_parser)
    except MindInsightException as err:
        logger.warning(err.message)

    # Analyse the timeline info.
    try:
        self._analyse_timeline()
    except (ProfilerIOException, ProfilerFileNotFoundException, ValidationError) as err:
        logger.warning('Fail to write timeline data: %s', err)
def query(self, condition=None):
    """
    Query data according to the search condition.

    Args:
        condition (dict): The search condition, which only contains the
            `filter_condition` parameter. Default: None.

    Returns:
        dict, the result after being filtered, sorted and grouped.
    """
    if condition is None:
        condition = {}
    filter_condition = condition.get('filter_condition', {})
    log.info("Receive query request. %s", filter_condition)
    self._filter(filter_condition)

    result = dict()
    result["sampling_interval"] = self._data.get("sampling_interval")
    result["step_info"] = self._steps_info
    result["step_total_num"] = self._step_total_num
    result["cpu_processor_num"] = self._data.get("cpu_processor_num")

    # Device average CPU utilization.
    result["device_info"] = dict()
    for key in self._data.get("device_info").keys():
        arr = self._data.get("device_info")[key]
        avg_value = round(sum(arr) / len(arr)) if arr else 0
        result["device_info"][key] = {
            "metrics": arr,
            "avg_value": avg_value
        }

    # Process average CPU utilization.
    result["process_info"] = dict()
    for key in self._data.get("process_info").keys():
        arr = self._data.get("process_info")[key]
        avg_value = round(sum(arr) / len(arr)) if arr else 0
        result["process_info"][key] = {
            "metrics": arr,
            "avg_value": avg_value
        }

    # Op average CPU utilization.
    minddata_pipeline_op_info = self._get_minddata_pipeline_info()
    result["op_info"] = dict()
    result["op_info"]["op_list"] = []
    result["op_info"]["total_op_avg_value"] = dict()
    result["op_info"]["total_op_avg_value"]["user_utilization"] = 0
    result["op_info"]["total_op_avg_value"]["sys_utilization"] = 0

    for item in self._data.get("op_info"):
        # Filter out non-minddata-pipeline operators.
        if str(item.get("op_id")) == "-1":
            continue
        op_info_dict = dict()
        op_info_dict["metrics"] = dict()
        for key in item.get("metrics").keys():
            arr = item.get("metrics")[key]
            avg_value = round(sum(arr) / len(arr)) if arr else 0
            op_info_dict["metrics"][key] = {
                "metrics": arr,
                "avg_value": avg_value
            }
            result["op_info"]["total_op_avg_value"][key] += avg_value
        op_info_dict["op_id"] = item.get("op_id")
        op_info = [
            i for i in minddata_pipeline_op_info
            if i.get("op_id") == item.get("op_id")
        ]
        # op_info is like [{"num_workers": int, "op_id": int}].
        op_info_dict["num_workers"] = op_info[0].get("num_workers")
        result["op_info"]["op_list"].append(op_info_dict)
    return result
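# Shape sketch of the dict assembled above (all values hypothetical):
#
# {'sampling_interval': 1000,
#  'step_info': [...], 'step_total_num': 10, 'cpu_processor_num': 8,
#  'device_info': {'user_utilization': {'metrics': [10, 20], 'avg_value': 15}, ...},
#  'process_info': {...},
#  'op_info': {'op_list': [{'op_id': 0, 'num_workers': 4, 'metrics': {...}}],
#              'total_op_avg_value': {'user_utilization': 15, 'sys_utilization': 3}}}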
def write_timeline(self):
    """Write the timeline data to a file, within the size limitation."""
    logger.info('Writing timeline file...')
    self.write_timeline_to_json_by_limitation()
    logger.info('Finished file writing!')
def execute(self):
    """
    Execute the parser, get result data, and write it to the output file.

    Returns:
        bool, whether the hwts log was analysed successfully.
    """
    content_format = ['QIIIIIIIIIIII', 'QIIQIIIIIIII', 'IIIIQIIIIIIII']
    log_type = ['Start of task', 'End of task', 'Start of block', 'End of block', 'Block PMU']

    result_data = ""

    with open(self._source_flie_name, 'rb') as hwts_data:
        while True:
            # Each hwts log record is 64 bytes.
            line = hwts_data.read(64)
            if line:
                if not line.strip():
                    continue
            else:
                break
            byte_first_four = struct.unpack('BBHHH', line[0:8])
            # Convert the first byte to a bit string; the low three bits are the log type.
            byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)
            ms_type = byte_first[-3:]
            is_warn_res0_ov = byte_first[4]
            cnt = int(byte_first[0:4], 2)
            core_id = byte_first_four[1]
            blk_id, task_id = byte_first_four[3], byte_first_four[4]
            if ms_type in ['000', '001', '010']:  # log types 0, 1, 2
                result = struct.unpack(content_format[0], line[8:])
                syscnt = result[0]
                stream_id = result[1]
            elif ms_type == '011':  # log type 3
                result = struct.unpack(content_format[1], line[8:])
                syscnt = result[0]
                stream_id = result[1]
            elif ms_type == '100':  # log type 4
                result = struct.unpack(content_format[2], line[8:])
                stream_id = result[2]
                if is_warn_res0_ov == '0':
                    syscnt = result[4]
                else:
                    syscnt = None
            else:
                logger.info("Profiling: invalid hwts log record type %s", ms_type)
                continue

            if int(task_id) < 25000:
                task_id = str(stream_id) + "_" + str(task_id)
            result_data += ("%-14s %-4s %-8s %-9s %-8s %-15s %s\n"
                            % (log_type[int(ms_type, 2)], cnt, core_id,
                               blk_id, task_id, syscnt, stream_id))

    fwrite_format(self._output_filename, data_source=self._dst_file_title, is_start=True)
    fwrite_format(self._output_filename, data_source=self._dst_file_column_title)
    fwrite_format(self._output_filename, data_source=result_data)

    return True
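# A self-contained sketch of the 8-byte header decode used above. The packed
# values are hypothetical; only the fields the parser actually reads are shown
# (low three bits of the first byte = log type, last two 16-bit fields =
# block id and task id).
import struct

header = struct.pack('BBHHH', 0b00010001, 2, 0, 7, 42)
byte_first_four = struct.unpack('BBHHH', header)
byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)
print(byte_first[-3:])                         # '001' -> 'End of task'
print(byte_first_four[3], byte_first_four[4])  # 7 42 -> block id, task id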