Example #1
    def flush(self):
        """
        Flush the event file to disk.

        Call it to make sure that all pending events have been written to disk.

        Examples:
            >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
            ...     summary_record.flush()
        """
        if self._closed:
            logger.error("The record writer is closed and can not flush.")
        elif self._event_writer:
            self._event_writer.flush()
Example #2
 def _aicore_trace_data_load(self):
     """Load data according to the parsed AICORE operator types file."""
     file_path = query_latest_trace_time_file(self._profiling_dir,
                                              int(self._device_id))
     if not file_path:
         logger.error("Failed to find parsed trace time file.")
         raise ProfilerFileNotFoundException('parsed step trace time file')
     with open(file_path, 'r') as handle:
         csv_reader = csv.reader(handle)
         self.__column__ = next(csv_reader)
         self._aicore_trace_data = list(csv_reader)
     self._size = len(self._aicore_trace_data) - 1
     self._display_col_names = self._col_names[:]
     self._load_point_info()
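A minimal standalone sketch of the same CSV-loading pattern, assuming a file whose first row is a header (the real code resolves the path via query_latest_trace_time_file; the file name below is hypothetical):

import csv

with open("step_trace_raw.csv", "r") as handle:
    csv_reader = csv.reader(handle)
    col_names = next(csv_reader)  # first row: column names
    rows = list(csv_reader)       # remaining rows: trace records
print(col_names, len(rows))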
Example #3
    def record(self, step, train_network=None):
        """
        Record the summary.

        Args:
            step (int): Represents training step number.
            train_network (Cell): The network that called the callback.

        Returns:
            bool, whether the record process is successful or not.

        Examples:
            >>> summary_record = SummaryRecord(log_dir="/opt/log", queue_max_size=50, flush_time=6,
            ...                                file_prefix="xxx_", file_suffix="_yyy")
            >>> summary_record.record(step=2)
        """
        logger.info("SummaryRecord step is %r.", step)
        if self._closed:
            logger.error("The record writer is closed.")
            return False
        if not isinstance(step, int) or isinstance(step, bool):
            raise ValueError("`step` should be int")
        # Set the current summary of train step
        self.step = step

        if self.network is not None and self.has_graph is False:
            graph_proto = self.network.get_func_graph_proto()
            if graph_proto is None and train_network is not None:
                graph_proto = train_network.get_func_graph_proto()
            if graph_proto is None:
                logger.error("Failed to get proto for graph")
            else:
                self.event_writer.write_event_to_file(
                    package_graph_event(graph_proto).SerializeToString())
                self.event_writer.flush()
                self.has_graph = True

        data = _summary_tensor_cache.get("SummaryRecord")
        if data is None:
            logger.error("The step(%r) does not have record data.", self.step)
            return False
        if self.queue_max_size > 0 and len(data) > self.queue_max_size:
            logger.error(
                "The size of data record is %r, which is greater than queue_max_size %r.",
                len(data), self.queue_max_size)

        # clean the data of cache
        del _summary_tensor_cache["SummaryRecord"]

        # process the data
        self.worker_scheduler.dispatch(self.step, data)

        # count & flush
        self.event_writer.count_event()
        self.event_writer.flush_cycle()

        logger.debug(
            "Send the summary data to scheduler for saving, step = %d",
            self.step)
        return True
Example #4
def _compile_akg_task_ascend(*json_strs):
    """
    Compile function called in a single process.

    Args:
        json_strs (list): List containing multiple kernel infos, suitable for the json compile api.
    """
    akg_compiler = os.path.join(os.path.split(
        os.path.realpath(__file__))[0], "compiler.py")
    for json_str in json_strs:
        try:
            subprocess.run([sys.executable, akg_compiler, json_str], text=True, check=True)
        except BaseException as e:
            logger.error("Failed to compile, args: %s! Error: %s", json_str, e)
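A hedged sketch of the same per-item subprocess pattern, using an inline child script in place of the real compiler.py; check=True turns a non-zero exit code into CalledProcessError:

import subprocess
import sys

for json_str in ['{"op": "a"}', '{"op": "b"}']:  # hypothetical payloads
    try:
        # Echo the argument from a child interpreter, as compiler.py would receive it.
        subprocess.run([sys.executable, "-c", "import sys; print(sys.argv[1])", json_str],
                       text=True, check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Failed, args: {json_str}! ({exc})")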
Example #5
 def parse(self):
     """Parse the function or method."""
     logger.debug("fn = %r", self.fn)
     tree = None
     if isinstance(self.fn, (types.FunctionType, types.MethodType)):
         original_src = inspect.getsource(self.fn)
         src = dedent(original_src)
         self.col_offset = \
             len(original_src.split('\n')[0]) - len(src.split('\n')[0])
         logger.debug("get source = %s", src)
         tree = asttokens.ASTTokens(src, parse=True).tree
     else:
         logger.error("Fn type is invalid")
     return tree
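A small self-contained sketch of the column-offset computation above: dedent strips the common indentation, and the offset is the number of characters removed from the first source line.

import inspect
from textwrap import dedent

def outer():
    def inner():  # nested, so its source is indented
        return 1
    return inner

original_src = inspect.getsource(outer())
src = dedent(original_src)
col_offset = len(original_src.split('\n')[0]) - len(src.split('\n')[0])
print(col_offset)  # 4: one indentation level was removed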
Example #6
 def to_tensor(self):
     """Get the tensor format data of this Initializer."""
     arr = None
     try:
         arr = np.ndarray(self.shape)
     except ValueError:
         msg = "Error shape={}".format(self.shape)
         logger.error(msg)
         raise ValueError(msg)
     if self._seed is not None:
         np.random.seed(self.seed)
     self.__call__(arr)
     self._seed = None
     return Tensor(arr, dtype=self.dtype)
Example #7
def _fill_image_summary(tag: str,
                        np_value,
                        summary_image,
                        input_format='NCHW'):
    """
    Package the image summary.

    Args:
        tag (str): Summary tag describing the image.
        np_value (np.ndarray): The image data, a 4-D array.
        summary_image (Summary.Image): The image summary message to fill.
        input_format (str): Data format of np_value. Default: 'NCHW'.

    Returns:
        bool, whether the image summary was filled successfully.
    """
    logger.debug(f"Set({tag}) the image summary value")
    if np_value.ndim != 4 or np_value.shape[1] not in (1, 3):
        logger.error(
            f"The value is not Image, tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}"
        )
        return False

    if np_value.ndim != len(input_format):
        logger.error(
            f"The tensor with dim({np_value.ndim}) can't convert the format({input_format}) because dim not same"
        )
        return False

    # convert the tensor format
    tensor = _convert_image_format(np_value, input_format)

    # convert the tensor dtype
    # Do not assume that user passes in values in [0, 255], use data type to detect
    scale_factor = 1
    if tensor.dtype == np.uint8:
        scale_factor = 1
    elif np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255
    tensor = tensor.astype(np.float32)
    tensor = (tensor * scale_factor).astype(np.uint8)

    # create the image summary
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return True
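The dtype-based scaling rule above can be isolated into a small sketch (assumption: non-uint8 data outside [0, 1] is only cast, not rescaled, exactly as in the original):

import numpy as np

def scale_to_uint8(tensor):
    scale_factor = 1
    if tensor.dtype == np.uint8:
        scale_factor = 1  # already byte-valued, keep as-is
    elif np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255  # normalized floats: stretch to [0, 255]
    tensor = tensor.astype(np.float32)
    return (tensor * scale_factor).astype(np.uint8)

print(scale_to_uint8(np.array([0.0, 0.5, 1.0])))          # [  0 127 255]
print(scale_to_uint8(np.array([0, 128, 255], np.uint8)))  # unchanged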
Example #8
def _set_fusion_strategy_by_size(dataSizeList, group="hccl_world_group"):
    """
    Set the gradient segment strategy according to the data size percentage list.

    Note:
        In back propagation, the fusion of the allreduce operators whose
        fusion attribute equals 1 will be performed according to dataSizeList,
        to overlap calculation with communication.

    Args:
        dataSizeList (list): The data size percentage list of the gradient.
        group (str): The hccl communication group.

    Raises:
        TypeError: If group is not a python str.
        TypeError: If dataSizeList is not a python list.
        TypeError: If type of dataSizeList item is not int or float.
        ValueError: If group name length is out of range.
        ValueError: If dataSizeList length is 0.
        ValueError: If dataSizeList item is less than 0.
        RuntimeError: If allreduce split failed.
    """
    try:
        lib_ctype = _load_lib()
    except RuntimeError:
        logger.error('Load HCCL lib failed')
        raise
    if isinstance(group, (str)):
        group_len = len(group)
        if group_len > _MAX_GROUP_NAME_LEN or group_len == 0:
            raise ValueError(f'Group name is out of range: max length is {_MAX_GROUP_NAME_LEN}')
    else:
        raise TypeError('Group must be a python str')
    if isinstance(dataSizeList, (list)):
        len_data_size = len(dataSizeList)
        if len_data_size == 0:
            raise ValueError('DataSizeList length is 0')
    else:
        raise TypeError('DataSizeList must be a python list')
    for dataSize in dataSizeList:
        if not isinstance(dataSize, (int, float)):
            raise TypeError('DataSize in dataSizeList is invalid')
        if dataSize < 0:
            raise ValueError('DataSize in dataSizeList is less than 0')

    c_array_sizeList = _c_array(ctypes.c_float, dataSizeList)
    c_size_num = ctypes.c_uint(len(dataSizeList))
    c_group = _c_str(group)
    ret = lib_ctype.hcom_set_split_strategy_by_size(c_group, c_size_num, c_array_sizeList)
    if ret != 0:
        raise RuntimeError('Allreduce split error')
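A hedged usage sketch; _set_fusion_strategy_by_size is private and the percentages below are illustrative only:

# Split allreduce fusion into two segments covering 30% and 70% of the
# gradient data, within the default HCCL world group.
_set_fusion_strategy_by_size([30.0, 70.0], group="hccl_world_group")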
Example #9
    def flush(self):
        """
        Flush the event file to disk.

        Call it to make sure that all pending events have been written to disk.

        Examples:
            >>> summary_record = SummaryRecord(log_dir="/opt/log", queue_max_size=50, flush_time=6,
            ...                                file_prefix="xxx_", file_suffix="_yyy")
            >>> summary_record.flush()
        """
        if self._closed:
            logger.error("The record writer is closed and can not flush.")
        else:
            self.event_writer.flush()
Example #10
 def _write(self, plugin, data):
     """Write the data in the subprocess."""
     for writer in self._writers[:]:
         try:
             writer.write(plugin, data)
         except RuntimeError as exc:
             logger.error(str(exc))
             self._writers.remove(writer)
             writer.close()
             if self._raise_exception:
                 raise
         except RuntimeWarning as exc:
             logger.warning(str(exc))
             self._writers.remove(writer)
             writer.close()
Example #11
    def write_to_db(self):
        """
        Create index field in table for reading data.

        Returns:
            MSRStatus, SUCCESS or FAILED.

        Raises:
            MRMGenerateIndexError: If failed to write to database.
        """
        ret = self._generator.write_to_db()
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to write to database.")
            raise MRMGenerateIndexError
        return ret
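This wrapper and the similar ones in examples #12 through #18 share one pattern: call into the native layer, then turn a FAILED status into a logged, typed exception. A self-contained sketch of that pattern with hypothetical names:

import logging

logger = logging.getLogger(__name__)
SUCCESS = 0  # assumption: the native bindings report 0 on success

class MRMGenerateIndexError(Exception):
    """Raised when the native call reports FAILED."""

def checked_call(native_func, error_msg, error_cls):
    ret = native_func()
    if ret != SUCCESS:
        logger.error(error_msg)
        raise error_cls(error_msg)
    return ret

checked_call(lambda: SUCCESS, "Failed to write to database.", MRMGenerateIndexError)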
Example #12
    def get_category_fields(self):
        """
        Get candidate category fields.

        Returns:
            list[str], by which data could be grouped.

        Raises:
            MRMFetchCandidateFieldsError: If failed to get candidate category fields.
        """
        ret, fields = self._segment.get_category_fields()
        if ret != SUCCESS:
            logger.error("Failed to get candidate category fields.")
            raise MRMFetchCandidateFieldsError
        return fields
Example #13
    def build(self):
        """
        Build index generator.

        Returns:
            MSRStatus, SUCCESS or FAILED.

        Raises:
            MRMGenerateIndexError: If failed to build index generator.
        """
        ret = self._generator.build()
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to build index generator.")
            raise MRMGenerateIndexError
        return ret
Example #14
def estimate_ops(json_str: str):
    """Call costmodel to estimate ops."""
    try:
        json_obj = json.loads(json_str)
        graph_descs = json_obj["graph_desc"]
        graphs = []
        for gd in graph_descs:
            graphs.append(model.load_composite(gd).graph)
        estimation = model.parallel_estimate(graphs)
        res = (estimation.block_assign, estimation.gain,
               estimation.fusion_type, estimation.type_info)
        return res
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
Example #15
    def commit(self):
        """
        Flush data to disk.

        Returns:
            MSRStatus, SUCCESS or FAILED.

        Raises:
            MRMCommitError: If failed to flush data to disk.
        """
        ret = self._writer.commit()
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to commit.")
            raise MRMCommitError
        return ret
Example #16
    def launch(self):
        """
        Launch the worker threads to load data.

        Returns:
            MSRStatus, SUCCESS or FAILED.

        Raises:
            MRMLaunchError: If failed to launch worker threads.
        """
        ret = self._reader.launch(False)
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to launch worker threads.")
            raise MRMLaunchError
        return ret
Example #17
    def finish(self):
        """
        Stop the worker threads.

        Returns:
            MSRStatus, SUCCESS or FAILED.

        Raises:
            MRMFinishError: If failed to finish worker threads.
        """
        ret = self._reader.finish()
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to finish worker threads.")
            raise MRMFinishError
        return ret
Example #18
    def read_category_info(self):
        """
        Get the group info by the current category field.

        Returns:
            str, description of the group information.

        Raises:
            MRMReadCategoryInfoError: If failed to read category information.
        """
        ret, category_info = self._segment.read_category_info()
        if ret != SUCCESS:
            logger.error("Failed to read category information.")
            raise MRMReadCategoryInfoError
        return category_info
Example #19
    def record(self, step, train_network=None, plugin_filter=None):
        """
        Record the summary.

        Args:
            step (int): Represents training step number.
            train_network (Cell): The network to call the callback.
            plugin_filter (Optional[Callable[[str], bool]]): The filter function, \
                which is used to filter out plugins from being written by returning False.

        Returns:
            bool, whether the record process is successful or not.

        Examples:
            >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
            ...     summary_record.record(step=2)
        """
        logger.debug("SummaryRecord step is %r.", step)
        if self._closed:
            logger.error("The record writer is closed.")
            return False
        if not isinstance(step, int) or isinstance(step, bool):
            raise ValueError("`step` should be int")
        # Set the current summary of train step
        if self.network is not None and not self.has_graph:
            graph_proto = self.network.get_func_graph_proto()
            if graph_proto is None and train_network is not None:
                graph_proto = train_network.get_func_graph_proto()
            if graph_proto is None:
                logger.error("Failed to get proto for graph")
            else:
                self._event_writer.write({'graph': [{'step': step, 'value': graph_proto}]})
                self.has_graph = True
                if not _summary_tensor_cache:
                    return True

        if self._mode == 'train':
            self._add_summary_tensor_data()

        if not plugin_filter:
            self._event_writer.write(self._consume_data_pool(step))
        else:
            filtered = {}
            for plugin, datalist in self._consume_data_pool(step).items():
                if plugin_filter(plugin):
                    filtered[plugin] = datalist
            self._event_writer.write(filtered)
        return True
Example #20
    def _run_hoc(self, summary, sample_id, sample_input, prob):
        """
        Run HOC search for a sample image, and then save the result to summary.

        Args:
            summary (SummaryRecord): The summary object to store the data.
            sample_id (int): The sample ID.
            sample_input (Union[Tensor, np.ndarray]): Sample image tensor in CHW or NCHW (N=1).
            prob (Union[Tensor, np.ndarray]): The sample's classification prediction output; HOC will run for
                labels with prediction output strictly larger than the HOC searcher's threshold (0.5 by default).
        """
        if isinstance(sample_input, ms.Tensor):
            sample_input = sample_input.asnumpy()
        if len(sample_input.shape) == 3:
            sample_input = np.expand_dims(sample_input, axis=0)
        has_rec = False
        explain = Explain()
        explain.sample_id = sample_id
        str_mask = hoc.auto_str_mask(sample_input)
        compiled_mask = None
        for label_idx, label_prob in enumerate(prob):
            if label_prob > self._hoc_searcher.threshold:
                if compiled_mask is None:
                    compiled_mask = hoc.compile_mask(str_mask, sample_input)
                try:
                    edit_tree, layer_outputs = self._hoc_searcher.search(
                        sample_input, label_idx, compiled_mask)
                except hoc.NoValidResultError as ex:
                    log.error(
                        f"HOC cannot find result for sample:{sample_id} error:{ex}"
                    )
                    continue
                has_rec = True
                hoc_rec = explain.hoc.add()
                hoc_rec.label = label_idx
                hoc_rec.mask = str_mask
                layer_count = edit_tree.max_layer + 1
                for layer in range(layer_count):
                    steps = edit_tree.get_layer_or_leaf_steps(layer)
                    layer_output = layer_outputs[layer]
                    hoc_layer = hoc_rec.layer.add()
                    hoc_layer.prob = layer_output
                    for step in steps:
                        hoc_layer.box.extend(list(step.box))
        if has_rec:
            summary.add_value("explainer", "hoc", explain)
            summary.record(1)
            self._manifest['hierarchical_occlusion'] = True
Example #21
    def to_tensor(self, slice_index=None, shape=None):
        """
        Get the tensor format data of this MetaTensor.

        Args:
            slice_index (int): Slice index of a parameter's slices.
                It is used when initializing a slice of a parameter; it guarantees that devices
                using the same slice can generate the same tensor.
            shape (list[int]): Shape of the slice; it is used when initializing a slice of the parameter.
        """
        if self.init is None:
            raise TypeError(
                "to_tensor requires MetaTensor.init to be set; init can't be None")

        if shape is None:
            shape = self.shape

        try:
            arr = np.ndarray(shape, dtype=mstype.dtype_to_nptype(self.dtype))
        except ValueError:
            msg = "Error shape={}".format(shape)
            logger.error(msg)
            raise ValueError(msg)

        class seed_context:
            '''set and restore seed'''
            def __init__(self, init):
                self.init = init
                from .seed import get_seed
                global_seed = get_seed()
                self._np_seed = np.random.get_state()[1][0]
                self.need_set_seed = ((slice_index is not None)
                                      and (global_seed is None))

            def __enter__(self):
                if self.need_set_seed:
                    self.seed = self.init.seed
                    np.random.seed(slice_index)
                    self.init.seed = slice_index

            def __exit__(self, ptype, value, trace):
                if self.need_set_seed:
                    np.random.seed(self._np_seed)
                    self.init.seed = self.seed

        with seed_context(self.init):
            self.init(arr)
        return Tensor(arr, dtype=self.dtype)
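The seed_context pattern above can be shown standalone: temporarily seed NumPy so every device initializing the same slice draws identical values, then restore the previous state. (This sketch restores the full RNG state via get_state/set_state, a slightly stronger guarantee than the re-seeding the original uses.)

import numpy as np

class temp_seed:
    """Set a temporary NumPy seed and restore the previous RNG state on exit."""
    def __init__(self, seed):
        self.seed = seed

    def __enter__(self):
        self._state = np.random.get_state()
        np.random.seed(self.seed)

    def __exit__(self, ptype, value, trace):
        np.random.set_state(self._state)

with temp_seed(42):
    a = np.random.rand(3)
with temp_seed(42):
    b = np.random.rand(3)
assert (a == b).all()  # same slice index -> same generated tensor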
Example #22
def check_input_data(*data, data_class):
    """Input data check."""
    for item in data:
        if isinstance(item, (list, tuple)):
            for v in item:
                check_input_data(v, data_class=data_class)
        else:
            if not isinstance(item, data_class):
                raise ValueError(f'Please provide as model inputs'
                                 f' either a single'
                                 f' or a list of {data_class.__name__},'
                                 f' but got an element of type {str(type(item))}.')
            if item.size() == 0:
                msg = "Please provide non-empty data."
                logger.error(msg)
                raise ValueError(msg)
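A usage sketch for check_input_data; FakeTensor is a hypothetical stand-in exposing the size() method the emptiness check relies on:

class FakeTensor:
    def __init__(self, n):
        self._n = n
    def size(self):
        return self._n

check_input_data(FakeTensor(4), [FakeTensor(2)], data_class=FakeTensor)  # passes
try:
    check_input_data(FakeTensor(0), data_class=FakeTensor)  # empty tensor
except ValueError as exc:
    print(exc)  # "Please provide non-empty data."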
Example #23
    def flush(self):
        """
        Flush the event file to disk.

        Call it to make sure that all pending events have been written to disk.

        Examples:
            >>> from mindspore.train.summary import SummaryRecord
            >>> if __name__ == '__main__':
            ...     with SummaryRecord(log_dir="./summary_dir", file_prefix="xx_", file_suffix="_yy") as summary_record:
            ...         summary_record.flush()
        """
        if self._closed:
            logger.error("The record writer is closed and can not flush.")
        elif self._event_writer:
            self._event_writer.flush()
Example #24
    def write_timeline_summary(self):
        """Write timeline summary to json."""
        timeline_summary_file_path = os.path.join(
            self._profiling_dir,
            self._timeline_summary_filename.format(self._device_id)
        )

        timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)

        try:
            with open(timeline_summary_file_path, 'w') as json_file:
                json.dump(self._timeline_summary, json_file)
            os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing timeline summary file: %s', err)
            raise ProfilerIOException
Example #25
def process_check(cycle_time, cmd, wait_time=5):
    """Poll `cmd` up to `cycle_time` times, `wait_time` seconds apart; return True once its stdout is empty."""
    for i in range(cycle_time):
        time.sleep(wait_time)
        sub = subprocess.Popen(args="{}".format(cmd),
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
        stdout_data, _ = sub.communicate()
        if not stdout_data:
            logger.info("process execute success.")
            return True
        logger.warning("process is running, please wait {}".format(i))
    logger.error("process execute execute timeout.")
    return False
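A hedged usage sketch; the pgrep command below is illustrative, and empty pgrep output means no matching process remains:

# Poll up to 3 times, 2 seconds apart; empty stdout means the process exited.
finished = process_check(cycle_time=3, cmd="pgrep -f my_training_job", wait_time=2)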
Example #26
    def _get_next(self):
        """
        Return the next record in the dataset as a dictionary.

        Returns:
            Dict, the next record in the dataset.
        """
        try:
            return {k: self._transform_tensor(t) for k, t in self._iterator.GetNextAsMap().items()}
        except RuntimeError as err:
            ## maybe "Out of memory" / "MemoryError" error
            err_info = str(err)
            if err_info.find("Out of memory") >= 0 or err_info.find("MemoryError") >= 0:
                logger.error("Memory error occurred, process will exit.")
                os.kill(os.getpid(), signal.SIGKILL)
            raise err
Example #27
def validate_ui_proc(proc_name):
    """
    Validate proc name in restful request.

    Args:
        proc_name (str): The proc name to query. Acceptable values are
            `iteration_interval`, `fp_and_bp` and `tail`.

    Raises:
        ProfilerParamValueErrorException: If the proc_name is invalid.
    """
    accept_names = ['iteration_interval', 'fp_and_bp', 'tail']
    if proc_name not in accept_names:
        log.error("Invalid proc_name. The proc_name for restful api is in %s",
                  accept_names)
        raise ProfilerParamValueErrorException(
            f'proc_name should be in {accept_names}.')
Example #28
def split_with_json(json_str: str):
    """Call costmodel to split GraphKernel"""
    try:
        graph_desc = json.loads(json_str)
        comp = model.load_composite(graph_desc)
        graph_split, graph_mode = model.split(comp.graph)
        is_multi_graph = len(graph_split) > 1
        graph_list = list(map(comp.dump, graph_split))
        result = {
            "multi_graph": is_multi_graph,
            "graph_desc": graph_list,
            "graph_mode": graph_mode
        }
        return json.dumps(result)
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
Example #29
    def _get_profiling_job_id(self):
        """Get profiling job id, which was generated by ada service.

        Returns:
            str, profiling job id.
        """

        job_id = ""

        for item in os.listdir(self._output_path):
            if item.startswith('JOB'):
                path = os.path.join(self._output_path, item)

                log_file = get_file_names(path, "host_start.log")
                if not log_file:
                    logger.error(
                        "Profiling: job path %s, host_start.log not exist.",
                        path)
                    continue

                training_device_id = log_file[0].split('.')[-1]
                if self._dev_id == training_device_id:
                    log_file = os.path.join(path, log_file[0])
                    job_start_time = self._parse_host_start_log(log_file)
                    if not job_start_time:
                        logger.error(
                            "Profiling: job path %s, fail to get job start info.",
                            path)
                        break
                    job_id = item
                    if self._start_time > int(job_start_time):
                        logger.info(
                            "Profiling: job path %s, start_time %s, training start_time %d.",
                            path, job_start_time, self._start_time)
                    break
                else:
                    logger.info(
                        "Profiling: job path %s, dev id %s, training device id %s.",
                        path, training_device_id, self._dev_id)

        if not job_id:
            msg = "Fail to get profiling job, please check whether job dir was generated, " \
                  "or may be the device id from job dir dismatch the device_id in current process."
            raise RuntimeError(msg)

        return job_id
Example #30
    def _get_profiling_job_id(self):
        """Get profiling job id, which was generated by ada service.

        Returns:
            str, profiling job id.
        """

        job_id = ""
        for item in os.listdir(self._output_path):
            if item.startswith('JOB'):
                path = os.path.join(self._output_path, item)

                log_file = get_file_names(path, "host_start.log")
                if not log_file:
                    logger.error(
                        "Profiling: job path %s, host_start.log not exist.",
                        path)
                    break

                log_file = os.path.join(path, log_file[0])
                item_dict = self._parse_host_start_log(log_file)

                if not item_dict:
                    logger.error(
                        "Profiling: job path %s, fail to get job start info.",
                        path)
                    break

                job_id = item

                if self._dev_id != item_dict["device_id"]:
                    logger.info(
                        "Profiling: job path %s, dev id %s, training device id %s.",
                        path, item_dict["device_id"], self._dev_id)

                if self._start_time > int(item_dict["start_time"]):
                    logger.info(
                        "Profiling: job path %s, start_time %s, training start_time %d.",
                        path, item_dict["start_time"], self._start_time)
                break

        if not job_id:
            msg = "Fail to get profiling job, please check whether job dir was generated"
            raise RuntimeError(msg)

        return job_id