def flush(self):
    """
    Force every pending summary event out to the event file on disk.

    If the record has already been closed an error is logged and nothing is
    flushed. If no event writer is attached the call is a no-op.

    Examples:
        >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
        >>>     summary_record.flush()
    """
    if self._closed:
        logger.error("The record writer is closed and can not flush.")
        return
    writer = self._event_writer
    if writer:
        writer.flush()
def _aicore_trace_data_load(self):
    """
    Read the latest parsed step trace time file into memory.

    The first CSV row is stored as ``self.__column__`` and the remaining rows
    as ``self._aicore_trace_data``. ``self._size`` is set to the number of
    remaining rows minus one, the displayed column names are reset to a copy
    of ``self._col_names`` and ``self._load_point_info()`` is invoked.

    Raises:
        ProfilerFileNotFoundException: If no parsed trace time file is found.
    """
    trace_file = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
    if not trace_file:
        logger.error("Failed to find parsed trace time file.")
        raise ProfilerFileNotFoundException('parsed step trace time file')

    with open(trace_file, 'r') as trace_handle:
        rows = csv.reader(trace_handle)
        # The header row is consumed first; everything after it is data.
        self.__column__ = next(rows)
        self._aicore_trace_data = [row for row in rows]

    self._size = len(self._aicore_trace_data) - 1
    self._display_col_names = self._col_names[:]
    self._load_point_info()
def record(self, step, train_network=None):
    """
    Record the summary data cached for one training step.

    On the first successful call with a network attached, the graph proto is
    written to the event file before the summary data is dispatched.

    Args:
        step (int): Represents training step number.
        train_network (Cell): The network that called the callback. It is used
            as a fallback source of the graph proto.

    Examples:
        >>> summary_record = SummaryRecord(log_dir="/opt/log", queue_max_size=50, flush_time=6,
        >>>                                file_prefix="xxx_", file_suffix="_yyy")
        >>> summary_record.record(step=2)

    Returns:
        bool, whether the record process is successful or not.

    Raises:
        ValueError: If `step` is not an int (bool is rejected as well).
    """
    logger.info("SummaryRecord step is %r.", step)

    if self._closed:
        logger.error("The record writer is closed.")
        return False

    # bool is a subclass of int, so it has to be excluded explicitly.
    step_is_int = isinstance(step, int) and not isinstance(step, bool)
    if not step_is_int:
        raise ValueError("`step` should be int")

    # Remember which train step the summary belongs to.
    self.step = step

    if self.network is not None and self.has_graph is False:
        proto = self.network.get_func_graph_proto()
        if proto is None and train_network is not None:
            proto = train_network.get_func_graph_proto()
        if proto is not None:
            serialized_graph = package_graph_event(proto).SerializeToString()
            self.event_writer.write_event_to_file(serialized_graph)
            self.event_writer.flush()
            self.has_graph = True
        else:
            logger.error("Failed to get proto for graph")

    cached = _summary_tensor_cache.get("SummaryRecord")
    if cached is None:
        logger.error("The step(%r) does not have record data.", self.step)
        return False

    # An oversized record is only reported; it is still dispatched below.
    if self.queue_max_size > 0 and len(cached) > self.queue_max_size:
        logger.error(
            "The size of data record is %r, which is greater than queue_max_size %r.",
            len(cached), self.queue_max_size)

    # Drop the cache entry before handing the data to the scheduler.
    del _summary_tensor_cache["SummaryRecord"]
    self.worker_scheduler.dispatch(self.step, cached)

    # Count the event and let the writer decide whether a flush is due.
    self.event_writer.count_event()
    self.event_writer.flush_cycle()

    logger.debug(
        "Send the summary data to scheduler for saving, step = %d", self.step)
    return True
def _compile_akg_task_ascend(*json_strs): """ compile func called in single process Parameters: json_strs: list. List contains multiple kernel infos, suitable for json compile api. """ akg_compiler = os.path.join(os.path.split( os.path.realpath(__file__))[0], "compiler.py") for json_str in json_strs: try: subprocess.run([sys.executable, akg_compiler, json_str], text=True, check=True) except BaseException as e: logger.error(e, "Failed, args: {}!".format(json_str))
def parse(self):
    """
    Parse the source of ``self.fn`` into an AST.

    The source is dedented before parsing and the number of columns removed
    from the first line is stored in ``self.col_offset``.

    Returns:
        The AST tree of the function or method, or None when ``self.fn`` is
        neither a function nor a method.
    """
    logger.debug("fn = %r", self.fn)

    if not isinstance(self.fn, (types.FunctionType, types.MethodType)):
        logger.error("Fn type is invalid")
        return None

    raw_source = inspect.getsource(self.fn)
    src = dedent(raw_source)
    raw_first_line = raw_source.split('\n')[0]
    dedented_first_line = src.split('\n')[0]
    # Width of the indentation that dedent() stripped from the first line.
    self.col_offset = len(raw_first_line) - len(dedented_first_line)
    logger.debug("get source = %s", src)
    return asttokens.ASTTokens(src, parse=True).tree
def to_tensor(self):
    """
    Get the tensor format data of this Initializer.

    An array of ``self.shape`` is allocated and filled by calling the
    initializer on it. When a seed has been set it is applied to numpy's
    global random state first, and it is cleared after the array is filled.

    Returns:
        Tensor, the initialized data with dtype ``self.dtype``.

    Raises:
        ValueError: If an array of ``self.shape`` cannot be created.
    """
    try:
        buffer = np.ndarray(self.shape)
    except ValueError:
        reason = "Error shape={}".format(self.shape)
        logger.error(reason)
        raise ValueError(reason)

    if self._seed is not None:
        np.random.seed(self.seed)
    self.__call__(buffer)
    # The seed is single-use: reset it once the data has been generated.
    self._seed = None
    return Tensor(buffer, dtype=self.dtype)
def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'):
    """
    Package the image summary.

    Args:
        tag (str): Summary tag describe.
        np_value: Image data; it must have 4 dimensions and its second
            dimension must be 1 or 3.
        summary_image: The summary image object whose height, width,
            colorspace and encoded_image fields are filled in.
        input_format (str): Data sort order index. Default: 'NCHW'.

    Returns:
        bool, True if the image summary was filled, False if the value was
        rejected.
    """
    logger.debug(f"Set({tag}) the image summary value")

    looks_like_image = np_value.ndim == 4 and np_value.shape[1] in (1, 3)
    if not looks_like_image:
        logger.error(
            f"The value is not Image, tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}"
        )
        return False

    if np_value.ndim != len(input_format):
        logger.error(
            f"The tensor with dim({np_value.ndim}) can't convert the format({input_format}) because dim not same"
        )
        return False

    # Bring the data into the layout expected by the image encoder.
    tensor = _convert_image_format(np_value, input_format)

    # Do not assume the caller passes values in [0, 255]: uint8 data is kept
    # as is, while other data that lies entirely in [0, 1] is scaled up.
    if tensor.dtype != np.uint8 and np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255
    else:
        scale_factor = 1
    scaled = tensor.astype(np.float32) * scale_factor
    tensor = scaled.astype(np.uint8)

    # Encode the image and copy its description into the summary.
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return True
def _set_fusion_strategy_by_size(dataSizeList, group="hccl_world_group"): """ A function set gradient segment strategy according to the data size percentage list. Note: In the back propagation, the fusion of the allreduce operators with a fusion attribute equals 1, will be performed according to dataSizeList, to achieve the effect of parallel between calculation and communication. Args: dataSizeList (list): The data size percentage list of the gradient. group (str): The hccl communication group. Raises: TypeError: If group is not a python str. TypeError: If dataSizeList is not a python list. TypeError: If type of dataSizeList item is not int or float. ValueError: If group name length is out of range. ValueError: If dataSizeList length is 0. ValueError: If dataSizeList item is less than 0. RuntimeError: If allreduce split failed. """ try: lib_ctype = _load_lib() except RuntimeError: logger.error('Load HCCL lib failed') if isinstance(group, (str)): group_len = len(group) if group_len > _MAX_GROUP_NAME_LEN or group_len == 0: raise ValueError('Group name is out of range {_MAX_GROUP_NAME_LEN}') else: raise TypeError('Group must be a python str') if isinstance(dataSizeList, (list)): len_data_size = len(dataSizeList) if len_data_size == 0: raise ValueError('DataSizeList length is 0') else: raise TypeError('DataSizeList must be a python list') for dataSize in dataSizeList: if not isinstance(dataSize, (int, float)): raise TypeError('DataSize in dataSizeList is invalid') c_array_sizeList = _c_array(ctypes.c_float, dataSizeList) c_size_num = ctypes.c_uint(len(dataSizeList)) c_group = _c_str(group) ret = lib_ctype.hcom_set_split_strategy_by_size(c_group, c_size_num, c_array_sizeList) if ret != 0: raise RuntimeError('Allreduce split error')
def flush(self):
    """
    Force every pending summary event out to the event file on disk.

    When the record has already been closed an error is logged and nothing
    is flushed.

    Examples:
        >>> summary_record = SummaryRecord(log_dir="/opt/log", queue_max_size=50, flush_time=6,
        >>>                                file_prefix="xxx_", file_suffix="_yyy")
        >>> summary_record.flush()
    """
    if not self._closed:
        self.event_writer.flush()
        return
    logger.error("The record writer is closed and can not flush.")
def _write(self, plugin, data): """Write the data in the subprocess.""" for writer in self._writers[:]: try: writer.write(plugin, data) except RuntimeError as exc: logger.error(str(exc)) self._writers.remove(writer) writer.close() if self._raise_exception: raise except RuntimeWarning as exc: logger.warning(str(exc)) self._writers.remove(writer) writer.close()
def write_to_db(self):
    """
    Create index field in table for reading data.

    Delegates to the underlying index generator and checks its status.

    Returns:
        MSRStatus, the status reported by the generator (SUCCESS).

    Raises:
        MRMGenerateIndexError: If failed to write to database.
    """
    status = self._generator.write_to_db()
    if status != ms.MSRStatus.SUCCESS:
        logger.error("Failed to write to database.")
        raise MRMGenerateIndexError
    return status
def get_category_fields(self):
    """
    Get candidate category fields.

    Returns:
        list[str], by which data could be grouped.

    Raises:
        MRMFetchCandidateFieldsError: If failed to get candidate category fields.
    """
    status, candidate_fields = self._segment.get_category_fields()
    if status != SUCCESS:
        logger.error("Failed to get candidate category fields.")
        raise MRMFetchCandidateFieldsError
    return candidate_fields
def build(self):
    """
    Build index generator.

    Delegates to the underlying index generator and checks its status.

    Returns:
        MSRStatus, the status reported by the generator (SUCCESS).

    Raises:
        MRMGenerateIndexError: If failed to build index generator.
    """
    status = self._generator.build()
    if status != ms.MSRStatus.SUCCESS:
        logger.error("Failed to build index generator.")
        raise MRMGenerateIndexError
    return status
def estimate_ops(json_str: str):
    """
    Call costmodel to estimate ops.

    Args:
        json_str (str): JSON text holding a "graph_desc" list of graph
            descriptions.

    Returns:
        tuple, (block_assign, gain, fusion_type, type_info) taken from the
        estimation result, or None if `json_str` is not valid JSON.
    """
    try:
        parsed = json.loads(json_str)
        graphs = [model.load_composite(desc).graph for desc in parsed["graph_desc"]]
        estimation = model.parallel_estimate(graphs)
        return (estimation.block_assign,
                estimation.gain,
                estimation.fusion_type,
                estimation.type_info)
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
def commit(self):
    """
    Flush data to disk.

    Delegates to the underlying writer and checks its status.

    Returns:
        Class MSRStatus, the status reported by the writer (SUCCESS).

    Raises:
        MRMCommitError: If failed to flush data to disk.
    """
    status = self._writer.commit()
    if status != ms.MSRStatus.SUCCESS:
        logger.error("Failed to commit.")
        raise MRMCommitError
    return status
def launch(self):
    """
    Launch the worker threads to load data.

    The underlying reader is launched with ``False`` as its only argument.

    Returns:
        MSRStatus, the status reported by the reader (SUCCESS).

    Raises:
        MRMLaunchError: If failed to launch worker threads.
    """
    status = self._reader.launch(False)
    if status != ms.MSRStatus.SUCCESS:
        logger.error("Failed to launch worker threads.")
        raise MRMLaunchError
    return status
def finish(self):
    """
    Stop the worker threads.

    Delegates to the underlying reader and checks its status.

    Returns:
        MSRStatus, the status reported by the reader (SUCCESS).

    Raises:
        MRMFinishError: If failed to finish worker threads.
    """
    status = self._reader.finish()
    if status != ms.MSRStatus.SUCCESS:
        logger.error("Failed to finish worker threads.")
        raise MRMFinishError
    return status
def read_category_info(self):
    """
    Get the group info by the current category field.

    Returns:
        str, description of group information.

    Raises:
        MRMReadCategoryInfoError: If failed to read category information.
    """
    status, group_description = self._segment.read_category_info()
    if status != SUCCESS:
        logger.error("Failed to read category information.")
        raise MRMReadCategoryInfoError
    return group_description
def record(self, step, train_network=None, plugin_filter=None):
    """
    Record the summary.

    The graph proto is written once (while ``self.has_graph`` is false), then
    the pooled summary data for `step` is handed to the event writer,
    optionally restricted to the plugins accepted by `plugin_filter`.

    Args:
        step (int): Represents training step number.
        train_network (Cell): The network to call the callback. It is used as
            a fallback source of the graph proto.
        plugin_filter (Optional[Callable[[str], bool]]): The filter function,
            which is used to filter out plugins from being written by
            returning False.

    Returns:
        bool, whether the record process is successful or not. False is only
        returned when the record has already been closed.

    Raises:
        ValueError: If `step` is not an int (bool is rejected as well).

    Examples:
        >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
        >>>     summary_record.record(step=2)
    """
    logger.debug("SummaryRecord step is %r.", step)
    if self._closed:
        logger.error("The record writer is closed.")
        return False
    # bool is a subclass of int, so it has to be rejected explicitly.
    if not isinstance(step, int) or isinstance(step, bool):
        raise ValueError("`step` should be int")
    # Write the graph the first time a proto can be obtained, preferring the
    # network held by this record over the one passed by the caller.
    if self.network is not None and not self.has_graph:
        graph_proto = self.network.get_func_graph_proto()
        if graph_proto is None and train_network is not None:
            graph_proto = train_network.get_func_graph_proto()
        if graph_proto is None:
            logger.error("Failed to get proto for graph")
        else:
            self._event_writer.write({'graph': [{'step': step, 'value': graph_proto}]})
            self.has_graph = True
            # Nothing else is cached for this step: the graph alone was recorded.
            if not _summary_tensor_cache:
                return True

    if self._mode == 'train':
        self._add_summary_tensor_data()

    if not plugin_filter:
        self._event_writer.write(self._consume_data_pool(step))
    else:
        # Keep only the plugins for which the filter returns a truthy value.
        filtered = {}
        for plugin, datalist in self._consume_data_pool(step).items():
            if plugin_filter(plugin):
                filtered[plugin] = datalist
        self._event_writer.write(filtered)
    return True
def _run_hoc(self, summary, sample_id, sample_input, prob):
    """
    Run HOC search for a sample image, and then save the result to summary.

    Args:
        summary (SummaryRecord): The summary object to store the data.
        sample_id (int): The sample ID.
        sample_input (Union[Tensor, np.ndarray]): Sample image tensor. A
            3-dimensional input gets a leading axis of size 1 added.
        prob (Union[Tensor, np.ndarray]): List of sample's classification
            prediction output, HOC will run for labels with prediction output
            strictly larger than the HOC searcher's threshold.
    """
    if isinstance(sample_input, ms.Tensor):
        sample_input = sample_input.asnumpy()
    if len(sample_input.shape) == 3:
        sample_input = np.expand_dims(sample_input, axis=0)

    explain = Explain()
    explain.sample_id = sample_id
    str_mask = hoc.auto_str_mask(sample_input)
    compiled_mask = None
    found_any = False

    for label_idx, label_prob in enumerate(prob):
        if not label_prob > self._hoc_searcher.threshold:
            continue

        # The mask is compiled lazily, only once a label passes the threshold.
        if compiled_mask is None:
            compiled_mask = hoc.compile_mask(str_mask, sample_input)

        try:
            edit_tree, layer_outputs = self._hoc_searcher.search(
                sample_input, label_idx, compiled_mask)
        except hoc.NoValidResultError as ex:
            log.error(
                f"HOC cannot find result for sample:{sample_id} error:{ex}"
            )
            continue

        found_any = True
        record = explain.hoc.add()
        record.label = label_idx
        record.mask = str_mask

        # One layer entry per tree layer, each with its output and boxes.
        for layer in range(edit_tree.max_layer + 1):
            layer_steps = edit_tree.get_layer_or_leaf_steps(layer)
            layer_output = layer_outputs[layer]
            layer_record = record.layer.add()
            layer_record.prob = layer_output
            for step in layer_steps:
                layer_record.box.extend(list(step.box))

    if not found_any:
        return
    summary.add_value("explainer", "hoc", explain)
    summary.record(1)
    self._manifest['hierarchical_occlusion'] = True
def to_tensor(self, slice_index=None, shape=None):
    """
    Get the tensor format data of this MetaTensor.

    An array of the requested shape is allocated and filled by calling
    ``self.init`` on it. When `slice_index` is given and no global seed is
    configured, numpy is seeded with `slice_index` while the data is generated.

    Args:
        slice_index (int): Slice index of a parameter's slices.
            It is used when initialize a slice of a parameter, it guarantees that devices
            using the same slice can generate the same tensor.
        shape (list[int]): Shape of the slice, it is used when initialize a slice of the parameter.
            Defaults to ``self.shape`` when None.

    Returns:
        Tensor, the initialized data with dtype ``self.dtype``.

    Raises:
        TypeError: If ``self.init`` is None.
        ValueError: If an array of the requested shape cannot be created.
    """
    if self.init is None:
        raise TypeError(
            "to_dense must be set MetaTensor.init, init can't be None")
    if shape is None:
        shape = self.shape
    try:
        arr = np.ndarray(shape, dtype=mstype.dtype_to_nptype(self.dtype))
    except ValueError:
        msg = "Error shape={}".format(shape)
        logger.error(msg)
        raise ValueError(msg)

    class seed_context:
        '''Context manager that sets the seed on entry and restores it on exit.'''

        def __init__(self, init):
            self.init = init
            from .seed import get_seed
            global_seed = get_seed()
            # Only the first word of numpy's current state key is remembered.
            self._np_seed = np.random.get_state()[1][0]
            # slice_index comes from the enclosing call; the seed is only
            # overridden when a slice is requested and no global seed is set.
            self.need_set_seed = ((slice_index is not None) and (global_seed is None))

        def __enter__(self):
            if self.need_set_seed:
                # Remember the initializer's seed so __exit__ can put it back.
                self.seed = self.init.seed
                np.random.seed(slice_index)
                self.init.seed = slice_index

        def __exit__(self, ptype, value, trace):
            if self.need_set_seed:
                np.random.seed(self._np_seed)
                self.init.seed = self.seed

    with seed_context(self.init):
        self.init(arr)
    return Tensor(arr, dtype=self.dtype)
def check_input_data(*data, data_class):
    """
    Input data check.

    Lists and tuples are checked recursively, element by element. Every other
    item must be an instance of `data_class` and must report a non-zero
    ``size()``.

    Args:
        data: The items to validate.
        data_class: The type (or tuple of types) every leaf item must have.

    Raises:
        ValueError: If a leaf item is not an instance of `data_class`, or if
            its ``size()`` is 0.
    """
    for item in data:
        if isinstance(item, (list, tuple)):
            for element in item:
                check_input_data(element, data_class=data_class)
            continue

        if not isinstance(item, data_class):
            raise ValueError(
                f'Please provide as model inputs either a single'
                f' or a list of {data_class.__name__},'
                f' but got part data type is {str(type(item))}.')

        if item.size() == 0:
            reason = "Please provide non-empty data."
            logger.error(reason)
            raise ValueError(reason)
def flush(self):
    """
    Force every pending summary event out to the event file on disk.

    If the record has already been closed an error is logged and nothing is
    flushed. If no event writer is attached the call is a no-op.

    Examples:
        >>> from mindspore.train.summary import SummaryRecord
        >>> if __name__ == '__main__':
        ...     with SummaryRecord(log_dir="./summary_dir", file_prefix="xx_", file_suffix="_yy") as summary_record:
        ...         summary_record.flush()
    """
    if not self._closed:
        if self._event_writer:
            self._event_writer.flush()
        return
    logger.error("The record writer is closed and can not flush.")
def write_timeline_summary(self):
    """
    Write timeline summary to json.

    The target path is built from the profiling directory and the summary
    file name formatted with the device id, then validated and normalized.
    After writing, the file permission is set to owner read/write.

    Raises:
        ProfilerIOException: If the file cannot be written or its permission
            cannot be changed.
    """
    summary_name = self._timeline_summary_filename.format(self._device_id)
    summary_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir, summary_name))

    owner_read_write = stat.S_IREAD | stat.S_IWRITE
    try:
        with open(summary_path, 'w') as json_file:
            json.dump(self._timeline_summary, json_file)
        os.chmod(summary_path, owner_read_write)
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline summary file: %s', err)
        raise ProfilerIOException
def process_check(cycle_time, cmd, wait_time=5):
    """
    Poll a shell command until it prints nothing to stdout.

    Before each attempt the function sleeps `wait_time` seconds, then runs
    `cmd` through the shell. Empty stdout is treated as success; any output
    means the process is still running.

    Args:
        cycle_time (int): Maximum number of attempts.
        cmd: The shell command to run; it is converted to a string.
        wait_time: Seconds to sleep before each attempt. Default: 5.

    Returns:
        bool, True once the command prints nothing, False if every attempt
        produced output.
    """
    for attempt in range(cycle_time):
        time.sleep(wait_time)
        proc = subprocess.Popen(
            args="{}".format(cmd),
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        output, _ = proc.communicate()
        if output:
            logger.warning("process is running, please wait {}".format(attempt))
            continue
        logger.info("process execute success.")
        return True

    logger.error("process execute execute timeout.")
    return False
def _get_next(self): """ Returns the next record in the dataset as dictionary Returns: Dict, the next record in the dataset. """ try: return {k: self._transform_tensor(t) for k, t in self._iterator.GetNextAsMap().items()} except RuntimeError as err: ## maybe "Out of memory" / "MemoryError" error err_info = str(err) if err_info.find("Out of memory") >= 0 or err_info.find("MemoryError") >= 0: logger.error("Memory error occurred, process will exit.") os.kill(os.getpid(), signal.SIGKILL) raise err
def validate_ui_proc(proc_name):
    """
    Validate proc name in restful request.

    Args:
        proc_name (str): The proc name to query. Acceptable value is in
            [`iteration_interval`, `fp_and_bp`, `tail`].

    Raises:
        ProfilerParamValueErrorException: If the proc_name is invalid.
    """
    accept_names = ['iteration_interval', 'fp_and_bp', 'tail']
    if proc_name in accept_names:
        return

    log.error("Invalid proc_name. The proc_name for restful api is in %s", accept_names)
    raise ProfilerParamValueErrorException(
        f'proc_name should be in {accept_names}.')
def split_with_json(json_str: str):
    """
    Call costmodel to split GraphKernel.

    Args:
        json_str (str): JSON text describing the composite graph.

    Returns:
        str, JSON text with the keys "multi_graph", "graph_desc" and
        "graph_mode", or None if `json_str` is not valid JSON.
    """
    try:
        composite = model.load_composite(json.loads(json_str))
        sub_graphs, graph_mode = model.split(composite.graph)
        return json.dumps({
            "multi_graph": len(sub_graphs) > 1,
            "graph_desc": [composite.dump(sub_graph) for sub_graph in sub_graphs],
            "graph_mode": graph_mode
        })
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
def _get_profiling_job_id(self):
    """Get profiling job id, which was generated by ada service.

    Scans ``self._output_path`` for entries whose name starts with 'JOB' and
    selects the first one whose host_start.log file name ends with this
    process's device id.

    Returns:
        str, profiling job id.

    Raises:
        RuntimeError: If no matching job directory is found.
    """

    job_id = ""

    for item in os.listdir(self._output_path):
        if item.startswith('JOB'):
            path = os.path.join(self._output_path, item)

            log_file = get_file_names(path, "host_start.log")
            if not log_file:
                # This job dir has no start log; try the next entry.
                logger.error("Profiling: job path %s, host_start.log not exist.", path)
                continue

            # The device id is the last dot-separated part of the log file name.
            training_device_id = log_file[0].split('.')[-1]
            if self._dev_id == training_device_id:
                log_file = os.path.join(path, log_file[0])
                job_start_time = self._parse_host_start_log(log_file)
                if not job_start_time:
                    # Stop scanning: job_id stays empty and the error below is raised.
                    logger.error("Profiling: job path %s, fail to get job start info.", path)
                    break
                job_id = item
                # A job that started before training is only reported, not rejected.
                if self._start_time > int(job_start_time):
                    logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
                                path, job_start_time, self._start_time)
                break
            else:
                logger.info("Profiling: job path %s, dev id %s, training device id %s.",
                            path, training_device_id, self._dev_id)

    if not job_id:
        msg = "Fail to get profiling job, please check whether job dir was generated, " \
              "or may be the device id from job dir dismatch the device_id in current process."
        raise RuntimeError(msg)

    return job_id
def _get_profiling_job_id(self):
    """Get profiling job id, which was generated by ada service.

    Scans ``self._output_path`` for entries whose name starts with 'JOB' and
    takes the first one whose host_start.log can be found and parsed.

    Returns:
        str, profiling job id.

    Raises:
        RuntimeError: If no job directory with a parsable start log is found.
    """

    job_id = ""

    for item in os.listdir(self._output_path):
        if item.startswith('JOB'):
            path = os.path.join(self._output_path, item)

            log_file = get_file_names(path, "host_start.log")
            if not log_file:
                # Stop scanning: job_id stays empty and the error below is raised.
                logger.error("Profiling: job path %s, host_start.log not exist.", path)
                break

            log_file = os.path.join(path, log_file[0])
            item_dict = self._parse_host_start_log(log_file)

            if not item_dict:
                logger.error("Profiling: job path %s, fail to get job start info.", path)
                break

            job_id = item
            # NOTE(review): a device id mismatch or an earlier job start time is
            # only logged here; the job is still selected — confirm this is intended.
            if self._dev_id != item_dict["device_id"]:
                logger.info("Profiling: job path %s, dev id %s, training device id %s.",
                            path, item_dict["device_id"], self._dev_id)

            if self._start_time > int(item_dict["start_time"]):
                logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
                            path, item_dict["start_time"], self._start_time)
            break

    if not job_id:
        msg = "Fail to get profiling job, please check whether job dir was generated"
        raise RuntimeError(msg)

    return job_id