def _get_minddata_pipeline_info(self):
    """Get the number of thread cores in minddata pipeline operator."""
    file_name = self._minddata_pipeline_display_filename.format(self._device_id)
    file_path = os.path.join(self._profiling_dir, file_name)
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid minddata_pipeline_info file path.")
    if not os.path.exists(file_path):
        log.error('Did not find the minddata_pipeline file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the minddata_pipeline file: {}'.format(file_path))

    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            minddata_pipeline_info = json.load(file)
        except json.JSONDecodeError as err:
            log.exception(err)
            raise ProfilerRawFileException("Fail to parse minddata pipeline file.")

    minddata_pipeline_op_info = []
    for item in minddata_pipeline_info.get("op_info"):
        op_info_dict = dict()
        op_info_dict["op_id"] = item.get("op_id")
        op_info_dict["num_workers"] = item.get("num_workers")
        minddata_pipeline_op_info.append(op_info_dict)
    return minddata_pipeline_op_info

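# A minimal sketch of the pipeline JSON that _get_minddata_pipeline_info() expects,
# assuming only the two fields it actually reads ("op_id", "num_workers") are present;
# the real file produced by the profiler carries more fields per operator.
_SAMPLE_PIPELINE_INFO = {
    "op_info": [
        {"op_id": 0, "num_workers": 4},  # e.g. a source dataset op with 4 worker threads
        {"op_id": 1, "num_workers": 1},  # e.g. a downstream map/batch op
    ]
}
# With this input the method would return:
# [{"op_id": 0, "num_workers": 4}, {"op_id": 1, "num_workers": 1}]
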
def _get_op_task_id_map(self):
    """
    Read hwts data file, get the task time info.

    Returns:
        list, all hwts task time info.
    """
    op_map_result = []
    hwts_list = []
    if not os.path.exists(self._hwts_output_file):
        logger.error('The hwts output file does not exist.')
        raise ProfilerFileNotFoundException('hwts output file')

    with open(self._hwts_output_file, 'r') as data_file:
        lines = data_file.readlines()
        for line in lines:
            if line.startswith("Start of task") or line.startswith("End of task"):
                line_split = line.split()
                container = HWTSContainer(line_split)
                hwts_list.append(container)

    # Map hwts records to operator names by task id.
    for hwts in hwts_list:
        if hwts.task_id in self._op_task_info.keys():
            hwts.op_name = self._op_task_info[hwts.task_id]
            op_map_result.append(hwts)

    return op_map_result

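# A hedged sketch of the hwts output lines the loop above keeps: only lines that
# begin with "Start of task" / "End of task" are whitespace-split and wrapped in
# HWTSContainer. The trailing fields below are illustrative guesses at the layout,
# not the documented format.
_SAMPLE_HWTS_LINES = [
    "Start of task  task_id:25  stream_id:3  syscnt:123456789",
    "End of task    task_id:25  stream_id:3  syscnt:123459001",
    "Some other event that is ignored",
]
_KEPT = [line.split() for line in _SAMPLE_HWTS_LINES
         if line.startswith(("Start of task", "End of task"))]  # ignored line is dropped
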
def _get_minddata_queue_step_time_info(self):
    """Get the sampling time information at the steps of the host queue."""
    minddata_queue_step_time_info = []
    minddata_analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
    file_path = minddata_analyser.get_device_queue_file_path()
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid device_queue file path")
    if not os.path.exists(file_path):
        log.error('Did not find the device queue file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the device queue file.')

    with open(file_path) as data_file:
        for line in data_file.readlines():
            op_info = line.split()
            # op_info is a list like: ['1', '64', '8', '2', '85406783'].
            # The first element is '0' or '1':
            # '0' means the time information is recorded,
            # '1' means the queue information is recorded.
            # '1': queue info, '64': queue capacity, '8': step_num,
            # '2': queue size, '85406783': sampling time.
            if op_info and op_info[0] == "1":
                minddata_queue_step_time_info.append([op_info[2], op_info[4]])
    return minddata_queue_step_time_info

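# A small illustration of how one device-queue line is reduced to
# [step_num, sampling_time]; the sample line is hypothetical but follows the
# layout documented in the comments above.
_SAMPLE_QUEUE_LINE = "1 64 8 2 85406783"
_OP_INFO = _SAMPLE_QUEUE_LINE.split()
if _OP_INFO and _OP_INFO[0] == "1":
    # Keeps only step_num ('8') and sampling time ('85406783').
    _STEP_ENTRY = [_OP_INFO[2], _OP_INFO[4]]  # -> ['8', '85406783']
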
def load_timeline_data(self):
    """Load timeline data from file."""
    file_path = os.path.join(
        self._profiling_dir,
        self._output_timeline_data_file_path.format(self._device_id))
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline txt file path.')
    if not os.path.exists(file_path):
        logger.error("Failed to find parsed timeline file.")
        raise ProfilerFileNotFoundException('parsed timeline file')

    stream_count_dict = {}
    try:
        with open(file_path, 'r') as f_obj:
            for line in f_obj:
                if not line.startswith('op_name'):
                    line_list = line.strip('\n').split(',')
                    self._parse_timeline_data(line_list)
                    self._update_num_of_streams(line_list, stream_count_dict)
    except (IOError, OSError) as err:
        logger.error('Error occurred when reading timeline intermediate file: %s', err)
        raise ProfilerIOException

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] = len(stream_count_dict.keys())

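# A hedged sketch of the parsed timeline file layout assumed above: a CSV whose
# header line starts with 'op_name' and whose remaining columns (guessed here as
# stream_id, start_time, duration) are consumed by _parse_timeline_data() and
# _update_num_of_streams(); the actual column set depends on the parser version.
_SAMPLE_TIMELINE_TXT = (
    "op_name,stream_id,start_time,duration\n"
    "Default/Conv2D-op1,5,100.0,12.5\n"
    "Default/ReLU-op2,5,113.0,3.0\n"
)
# The header row is skipped by the startswith('op_name') check; every other row is
# split on ',' before being handed to the per-line handlers.
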
def _load_timeline_data(self):
    """Load timeline data from file."""
    file_path = os.path.join(
        self._profiling_dir,
        self._output_timeline_data_file_path.format(self._device_id))
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline txt file path.')
    if not os.path.exists(file_path):
        logger.error("Failed to find parsed timeline file.")
        raise ProfilerFileNotFoundException('parsed timeline file')

    timeline_list = []
    try:
        with open(file_path, 'r') as f_obj:
            for line in f_obj:
                if not line.startswith('op_name'):
                    line_list = line.strip('\n').split(',')
                    timeline_list.append(line_list)
    except (IOError, OSError) as err:
        logger.error('Error occurred when reading timeline intermediate file: %s', err)
        raise ProfilerIOException

    return timeline_list

def _get_pipeline_path(self, source_dir):
    """
    Get the minddata pipeline file path.

    Args:
        source_dir (str): The minddata pipeline source dir.

    Returns:
        str, the minddata pipeline file path.
    """
    pipeline_path = os.path.join(
        source_dir, self._raw_pipeline_file_name.format(self._device_id))
    try:
        pipeline_path = validate_and_normalize_path(pipeline_path, 'profiler')
    except ValidationError:
        logger.warning('Minddata pipeline file is invalid.')
        raise ProfilerPathErrorException('Minddata pipeline file is invalid.')
    if not os.path.isfile(pipeline_path):
        logger.warning('The minddata pipeline file <%s> not found.', pipeline_path)
        raise ProfilerFileNotFoundException(pipeline_path)

    return pipeline_path

def _get_host_device_rank_relation(self):
    """Get host_ip device_id rank_id relation."""
    rank_table_file_path = self._get_rank_table_file_path()
    if not os.path.exists(rank_table_file_path):
        log.error('Did not find rank table file under %s', self._cluster_profiler_dir)
        raise ProfilerFileNotFoundException(msg='Did not find rank table file')
    with open(rank_table_file_path, 'r', encoding='utf-8') as file:
        try:
            relation_info = json.load(file)
        except json.JSONDecodeError as err:
            log.exception(err)
            raise ProfilerRawFileException('Fail to parse rank table file.')

    host_device_rank_relation = list()
    servers_info = relation_info.get("server_list")
    for server_info in servers_info:
        server_id = server_info.get("server_id")
        devices_info = server_info.get("device")
        for device_info in devices_info:
            device_id = device_info.get("device_id")
            rank_id = device_info.get("rank_id")
            host_device_rank_relation.append([server_id, device_id, rank_id])

    host_ips_mapping_info = self._get_host_ips_mapping_info()
    for item in host_device_rank_relation:
        # Replace the host ip (index 0) with its mapping id (index 1).
        # target_info is like: [[host_ip, host_mapping_ip]]
        target_info = [i for i in host_ips_mapping_info if item[0] == i[0]]
        item[0] = target_info[0][1]

    return host_device_rank_relation

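# A minimal sketch of the rank table JSON fields used by _get_host_device_rank_relation();
# only "server_list", "server_id", "device", "device_id" and "rank_id" are read, and any
# other fields in a real rank table are ignored. Values below are illustrative only.
_SAMPLE_RANK_TABLE = {
    "server_list": [
        {
            "server_id": "10.0.0.1",
            "device": [
                {"device_id": "0", "rank_id": "0"},
                {"device_id": "1", "rank_id": "1"},
            ],
        }
    ]
}
# After the host-ip mapping step, each entry looks like
# [host_mapping_id, device_id, rank_id], e.g. ['host_0', '0', '0'] (mapped id made up).
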
def _get_communication_info(self, host_ip, device_id, step_num):
    """Get communication info."""
    file_name = 'hccl_raw_{}.csv'.format(device_id)
    communication_file_path = os.path.join(
        self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
    communication_file_path = validate_and_normalize_path(
        communication_file_path, raise_key="Invalid communication file path.")
    if not os.path.exists(communication_file_path):
        log.error('Did not find the file: %s', communication_file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the file: {}'.format(communication_file_path))

    communication_info = list()
    step_num = str(step_num)
    with open(communication_file_path, 'r') as src_file:
        csv_reader = csv.reader(src_file)
        # When the step_num value is 0, it means the average value.
        # The last line of hccl_raw_{}.csv records the average value,
        # and the first element of that line is '-'.
        step_num = '-' if step_num == '0' else step_num
        for row in csv_reader:
            if row[0] == step_num:
                communication_info = row
                break

    # Convert strings to floating point numbers and dictionaries.
    if communication_info:
        communication_info[1] = float(communication_info[1])
        communication_info[2] = float(communication_info[2])
        communication_info[3] = json.loads(communication_info[3])
        communication_info[4] = json.loads(communication_info[4])
    return communication_info

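# A hedged example of an hccl_raw_{device_id}.csv row as consumed above: column 0 is
# the step number ('-' on the average line), columns 1 and 2 are numeric costs, and
# columns 3 and 4 are JSON-encoded detail dictionaries. The concrete values and
# dictionary keys below are illustrative only; this reuses the module-level json import.
_SAMPLE_HCCL_ROW = ['1', '1.25', '0.40', '{"0-1": 0.8}', '{"0-1": 0.2}']
_SAMPLE_HCCL_ROW[1] = float(_SAMPLE_HCCL_ROW[1])       # 1.25
_SAMPLE_HCCL_ROW[2] = float(_SAMPLE_HCCL_ROW[2])       # 0.4
_SAMPLE_HCCL_ROW[3] = json.loads(_SAMPLE_HCCL_ROW[3])  # {'0-1': 0.8}
_SAMPLE_HCCL_ROW[4] = json.loads(_SAMPLE_HCCL_ROW[4])  # {'0-1': 0.2}
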
def _get_step_trace_info(self, host_ip, device_id, step_num):
    """Get step trace info."""
    file_name = 'step_trace_raw_{}_detail_time.csv'.format(device_id)
    step_trace_file_path = os.path.join(
        self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
    step_trace_file_path = validate_and_normalize_path(
        step_trace_file_path, raise_key="Invalid step trace file path.")
    if not os.path.exists(step_trace_file_path):
        log.error('Did not find the file: %s', step_trace_file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the file: {}'.format(step_trace_file_path))

    step_trace_info = list()
    step_num = str(step_num)
    with open(step_trace_file_path, 'r') as src_file:
        lines = src_file.readlines()
        # When the step_num value is 0, it means the average value.
        # The last line of step_trace_raw_{}_detail_time.csv records the average value.
        if step_num == '0':
            step_trace_info = lines[-1].strip('\n').split(',')
        else:
            for line in lines:
                line = line.strip('\n').split(',')
                if line[0] == step_num:
                    step_trace_info = line

    # step_trace_info[6]: iteration_interval time
    # step_trace_info[7]: fp_and_bp time
    # step_trace_info[8]: tail time
    # Divided by 1e5, the unit becomes milliseconds.
    iteration_interval = float(step_trace_info[6]) / 1e5
    fp_and_bp = float(step_trace_info[7]) / 1e5
    tail = float(step_trace_info[8]) / 1e5
    step_trace_info = [iteration_interval, fp_and_bp, tail]
    return step_trace_info

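# A worked example of the unit conversion above: dividing a raw column value by 1e5
# yields milliseconds, which implies the raw step trace columns are recorded in
# 10 ns ticks (an assumption inferred from the divisor, not stated in the file itself).
_RAW_ITERATION_INTERVAL = '250000'                              # sample value for column 6
_ITERATION_INTERVAL_MS = float(_RAW_ITERATION_INTERVAL) / 1e5   # -> 2.5 ms
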
def _load(self):
    """Load data according to the parsed step trace time file."""
    file_path = query_latest_trace_time_file(self._profiling_dir, self._device_id)
    if not file_path:
        log.error("Failed to find parsed trace time file.")
        raise ProfilerFileNotFoundException('parsed step trace time file')
    with open(file_path, 'r') as handle:
        csv_reader = csv.reader(handle)
        self.__column__ = next(csv_reader)
        self._data = list(csv_reader)
    self._size = len(self._data) - 1
    self._display_col_names = self._col_names[:]
    self._load_point_info()

def _search_file(self, profiling_id, device_id):
    """
    Search all framework files in raw profiling path.

    Args:
        profiling_id (str): The profiling ID.
        device_id (str): The device ID.

    Raises:
        ProfilerFileNotFoundException: If the framework files are not found.
    """
    self._search_file_from_job_path(device_id)
    self._search_file_from_data_path(profiling_id, device_id)
    if self._backend_type is None:
        raise ProfilerFileNotFoundException('Framework')
    self._framework_path['graph'].sort()
    self._framework_path['task'].sort()

def _load(self):
    """Load cpu_utilization info."""
    file_name = self._cpu_utilization_display_filename.format(self._device_id)
    file_path = os.path.join(self._profiling_dir, file_name)
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid cpu_utilization_info file path.")
    if not os.path.exists(file_path):
        log.error('Did not find the cpu utilization file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the cpu utilization file.')
    with open(file_path, 'r', encoding='utf-8') as src_file:
        try:
            self._data = json.load(src_file)
        except json.JSONDecodeError as err:
            log.exception(err)
            raise ProfilerRawFileException(
                "Fail to parse cpu_utilization info file")

def _search_file(self, profiling_id, device_id):
    """
    Search all framework files in raw profiling path.

    Args:
        profiling_id (str): The profiling ID.
        device_id (str): The device ID.

    Raises:
        ProfilerFileNotFoundException: If the framework files are not found.
    """
    # Search in the JOB dir first; if nothing is found, search its sub directories.
    self._search_file_from_job_path(device_id, search_in_sub_path=False)
    if self._backend_type is None:
        self._search_file_from_job_path(device_id, search_in_sub_path=True)
    self._search_file_from_data_path(profiling_id, device_id)

    if self._backend_type is None:
        raise ProfilerFileNotFoundException('Framework')
    self._framework_path['graph'].sort()
    self._framework_path['task'].sort()

def _get_minddata_queue_step_time_info(self):
    """Get the sampling time information at the steps of the host queue."""
    minddata_queue_step_time_info = []
    minddata_analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
    file_path = minddata_analyser.get_device_queue_file_path()
    file_path = validate_and_normalize_path(
        file_path, raise_key="Invalid device_queue file path")
    if not os.path.exists(file_path):
        log.error('Did not find the device queue file: %s', file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the device queue file: {}'.format(file_path))

    with open(file_path) as data_file:
        for line in data_file.readlines():
            op_info = line.split()
            # op_info[0] == "1": queue info, op_info[1]: Connector capacity,
            # op_info[2]: step_num, op_info[3]: Connector size, op_info[4]: sampling time
            if op_info and op_info[0] == "1":
                minddata_queue_step_time_info.append([op_info[2], op_info[4]])
    return minddata_queue_step_time_info

def _get_file_content(self, device_type, file_type):
    """
    Get file content for different types of memory usage files.

    Args:
        device_type (str): Device type, e.g., GPU, Ascend.
        file_type (str): Memory usage file type, e.g., summary, details.

    Returns:
        dict, file content corresponding to file_type.
    """
    file_path = self._get_file_path(device_type, file_type)
    if not os.path.exists(file_path):
        logger.error('Invalid file path. Please check the output path: %s', file_path)
        raise ProfilerFileNotFoundException(msg='Invalid memory file path.')
    try:
        with open(file_path, 'r') as f_obj:
            file_content = json.load(f_obj)
    except (IOError, OSError, json.JSONDecodeError) as err:
        logger.error('Error occurred when reading memory file: %s', err)
        raise ProfilerIOException()

    return file_content