Example #1
def load_checkpoint(ckpt_file_name, net=None, strict_load=False, filter_prefix=None):
    """
    Loads checkpoint info from a specified file.

    Args:
        ckpt_file_name (str): Checkpoint file name.
        net (Cell): Cell network. Default: None.
        strict_load (bool): Whether to load the parameters into the net strictly. If False, parameters in
                            param_dict whose names match the net's parameter names by suffix will also be
                            loaded. Default: False.
        filter_prefix (Union[str, list[str], tuple[str]]): Parameters starting with the filter_prefix
            will not be loaded. Default: None.

    Returns:
        Dict, key is parameter name, value is a Parameter.

    Raises:
        ValueError: Checkpoint file is incorrect.

    Examples:
        >>> ckpt_file_name = "./checkpoint/LeNet5-1_32.ckpt"
        >>> param_dict = load_checkpoint(ckpt_file_name, filter_prefix="conv1")
    """
    if not isinstance(ckpt_file_name, str):
        raise ValueError("The ckpt_file_name must be string.")

    if not os.path.exists(ckpt_file_name):
        raise ValueError("The checkpoint file is not exist.")

    if ckpt_file_name[-5:] != ".ckpt":
        raise ValueError("Please input the correct checkpoint file name.")

    if os.path.getsize(ckpt_file_name) == 0:
        raise ValueError("The checkpoint file may be empty, please make sure enter the correct file name.")

    if filter_prefix is not None:
        if not isinstance(filter_prefix, (str, list, tuple)):
            raise TypeError(f"The type of filter_prefix must be str, list[str] or tuple[str] "
                            f"when filter_prefix is not None, but got {str(type(filter_prefix))}.")
        if isinstance(filter_prefix, str):
            filter_prefix = (filter_prefix,)
        if not filter_prefix:
            raise ValueError("The filter_prefix can't be empty when filter_prefix is list or tuple.")
        for index, prefix in enumerate(filter_prefix):
            if not isinstance(prefix, str):
                raise TypeError(f"The type of filter_prefix must be str, list[str] or tuple[str], "
                                f"but got {str(type(prefix))} at index {index}.")

    logger.info("Execute the process of loading checkpoint files.")
    checkpoint_list = Checkpoint()

    try:
        with open(ckpt_file_name, "rb") as f:
            pb_content = f.read()
        checkpoint_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error("Failed to read the checkpoint file `%s`, please check the correct of the file.", ckpt_file_name)
        raise ValueError(e.__str__())

    parameter_dict = {}
    try:
        param_data_list = []
        for element_id, element in enumerate(checkpoint_list.value):
            if filter_prefix is not None and _check_param_prefix(filter_prefix, element.tag):
                continue
            data = element.tensor.tensor_content
            data_type = element.tensor.tensor_type
            np_type = tensor_to_np_type[data_type]
            ms_type = tensor_to_ms_type[data_type]
            element_data = np.frombuffer(data, np_type)
            param_data_list.append(element_data)
            if (element_id == len(checkpoint_list.value) - 1) or \
                    (element.tag != checkpoint_list.value[element_id + 1].tag):
                param_data = np.concatenate((param_data_list), axis=0)
                param_data_list.clear()
                dims = element.tensor.dims

                if dims == [0]:
                    if 'Float' in data_type:
                        param_data = float(param_data[0])
                    elif 'Int' in data_type:
                        param_data = int(param_data[0])
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                elif dims == [1]:
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                else:
                    param_dim = []
                    for dim in dims:
                        param_dim.append(dim)
                    param_value = param_data.reshape(param_dim)
                    parameter_dict[element.tag] = Parameter(Tensor(param_value, ms_type), name=element.tag)

        logger.info("Loading checkpoint files process is finished.")

    except BaseException as e:
        logger.error("Failed to load the checkpoint file `%s`.", ckpt_file_name)
        raise RuntimeError(e.__str__())

    if not parameter_dict:
        raise ValueError(f"The loaded parameter dict is empty after filtering, please check filter_prefix.")

    if net is not None:
        load_param_into_net(net, parameter_dict, strict_load)

    return parameter_dict
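
# A minimal usage sketch, assuming a LeNet5 network object and the checkpoint
# path from the docstring example are available; "conv1" parameters are filtered
# out and the remaining parameters are loaded into the net non-strictly.
# net = LeNet5()
# param_dict = load_checkpoint("./checkpoint/LeNet5-1_32.ckpt", net=net,
#                              strict_load=False, filter_prefix="conv1")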
Example #2
    def _ascend_analyse(self):
        """Collect and analyse ascend performance data"""
        release()

        job_id = self._get_profiling_job_id()
        logger.info("Profiling: job id is %s ", job_id)

        source_path = os.path.join(self._output_path, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path,
                                            hwts_output_filename)
        source_path = validate_and_normalize_path(source_path)
        hwts_output_filename = validate_and_normalize_path(
            hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        hwtslog_parser.execute()

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id,
                                           self._output_path)
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return

        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path,
                                                 opcompute_output_filename)
        opcompute_output_filename = validate_and_normalize_path(
            opcompute_output_filename)
        optime_parser = OPComputeTimeParser(hwts_output_filename,
                                            opcompute_output_filename,
                                            op_task_dict, self._output_path,
                                            self._dev_id)
        optime_parser.execute()

        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(
            self._output_path, output_data_preprocess_aicpu)
        output_data_preprocess_aicpu = validate_and_normalize_path(
            output_data_preprocess_aicpu)
        aicpu_data_parser = DataPreProcessParser(source_path,
                                                 output_data_preprocess_aicpu)
        aicpu_data_parser.execute()

        # Parsing minddata AICPU profiling
        MinddataParser.execute(source_path, self._output_path, self._dev_id)

        # parse minddata pipeline operator and queue
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path,
                                                     self._dev_id,
                                                     self._output_path)
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse op compute time info
        try:
            self._analyser_op_info()
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse step trace info
        points = None
        try:
            points = self._analyse_step_trace(source_path, framework_parser)
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse timeline info
        try:
            self._analyse_timeline(aicpu_data_parser, optime_parser,
                                   source_path)
        except (ProfilerIOException, ProfilerFileNotFoundException,
                RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)

        # analyse memory usage info
        try:
            self._analyse_memory_usage(points)
        except (ProfilerIOException, ProfilerFileNotFoundException,
                ProfilerRawFileException) as err:
            logger.warning(err.message)

        os.environ['PROFILING_MODE'] = str("false")
        context.set_context(enable_profiling=False)
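
    # A hedged usage sketch: this private helper is normally driven through the
    # public Profiler API, roughly as in the analyse() docstring shown in a
    # later example:
    #     profiler = Profiler()
    #     model.train(...)
    #     profiler.analyse()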
Example #3
def test_cpp_uniform_augment(plot=False, num_ops=2):
    """
    Test UniformAugment
    """
    logger.info("Test CPP UniformAugment")

    # Original Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transforms_original = [C.Decode(), C.Resize(size=[224, 224]), F.ToTensor()]

    ds_original = data_set.map(operations=transforms_original,
                               input_columns="image")

    ds_original = ds_original.batch(512)

    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_original = np.append(images_original,
                                        np.transpose(image.asnumpy(),
                                                     (0, 2, 3, 1)),
                                        axis=0)

    # UniformAugment Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
    transforms_ua = [
        C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]),
        C.RandomHorizontalFlip(),
        C.RandomVerticalFlip(),
        C.RandomColorAdjust(),
        C.RandomRotation(degrees=45)
    ]

    uni_aug = C.UniformAugment(transforms=transforms_ua, num_ops=num_ops)

    transforms_all = [
        C.Decode(),
        C.Resize(size=[224, 224]), uni_aug,
        F.ToTensor()
    ]

    ds_ua = data_set.map(operations=transforms_all,
                         input_columns="image",
                         num_parallel_workers=1)

    ds_ua = ds_ua.batch(512)

    for idx, (image, _) in enumerate(ds_ua):
        if idx == 0:
            images_ua = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_ua = np.append(images_ua,
                                  np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                  axis=0)
    if plot:
        visualize_list(images_original, images_ua)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_ua[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))
Example #4
def load_checkpoint(ckpt_file_name, net=None):
    """
    Loads checkpoint info from a specified file.

    Args:
        ckpt_file_name (str): Checkpoint file name.
        net (Cell): Cell network. Default: None

    Returns:
        Dict, key is parameter name, value is a Parameter.

    Raises:
        ValueError: Checkpoint file is incorrect.
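
    Examples:
        >>> # A minimal usage sketch; the checkpoint path below is illustrative only.
        >>> param_dict = load_checkpoint("./checkpoint/LeNet5-1_32.ckpt")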
    """
    if not isinstance(ckpt_file_name, str):
        raise ValueError("The ckpt_file_name must be string.")

    if not os.path.exists(ckpt_file_name):
        raise ValueError("The checkpoint file is not exist.")

    if ckpt_file_name[-5:] != ".ckpt":
        raise ValueError("Please input the correct checkpoint file name.")

    if os.path.getsize(ckpt_file_name) == 0:
        raise ValueError(
            "The checkpoint file may be empty, please make sure enter the correct file name."
        )

    logger.info("Execute load checkpoint process.")
    checkpoint_list = Checkpoint()

    try:
        with open(ckpt_file_name, "rb") as f:
            pb_content = f.read()
        checkpoint_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error(
            "Failed to read the checkpoint file `%s`, please check the correctness of the file.",
            ckpt_file_name)
        raise ValueError(e.__str__())

    parameter_dict = {}
    try:
        element_id = 0
        param_data_list = []
        for element in checkpoint_list.value:
            data = element.tensor.tensor_content
            data_type = element.tensor.tensor_type
            np_type = tensor_to_np_type[data_type]
            ms_type = tensor_to_ms_type[data_type]
            element_data = np.frombuffer(data, np_type)
            param_data_list.append(element_data)
            if (element_id == len(checkpoint_list.value) - 1) or \
                    (element.tag != checkpoint_list.value[element_id + 1].tag):
                param_data = np.concatenate((param_data_list), axis=0)
                param_data_list.clear()
                dims = element.tensor.dims

                if dims == [0]:
                    if 'Float' in data_type:
                        param_data = float(param_data[0])
                    elif 'Int' in data_type:
                        param_data = int(param_data[0])
                    parameter_dict[element.tag] = Parameter(Tensor(
                        param_data, ms_type),
                                                            name=element.tag)
                elif dims == [1]:
                    parameter_dict[element.tag] = Parameter(Tensor(
                        param_data, ms_type),
                                                            name=element.tag)
                else:
                    param_dim = []
                    for dim in dims:
                        param_dim.append(dim)
                    param_value = param_data.reshape(param_dim)
                    parameter_dict[element.tag] = Parameter(Tensor(
                        param_value, ms_type),
                                                            name=element.tag)

            element_id += 1

        logger.info("Load checkpoint process finish.")

    except BaseException as e:
        logger.error("Failed to load the checkpoint file `%s`.",
                     ckpt_file_name)
        raise RuntimeError(e.__str__())

    if net is not None:
        load_param_into_net(net, parameter_dict)

    return parameter_dict
Example #5
def parse_print(print_file_name):
    """
    Loads Print data from a specified file.

    Args:
        print_file_name (str): The file name of the saved print data.

    Returns:
        List, element of list is Tensor.

    Raises:
        ValueError: The print file may be empty, please make sure you entered the correct file name.
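
    Examples:
        >>> # A minimal usage sketch; the file below is assumed to have been
        >>> # produced by the Print operator with print_file_path set.
        >>> tensor_list = parse_print("./print_output")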
    """
    print_file_path = os.path.realpath(print_file_name)

    if os.path.getsize(print_file_path) == 0:
        raise ValueError(
            "The print file may be empty, please make sure enter the correct file name."
        )

    logger.info("Execute load print process.")
    print_list = Print()

    try:
        with open(print_file_path, "rb") as f:
            pb_content = f.read()
        print_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error(
            "Failed to read the print file %s, please check the correct of the file.",
            print_file_name)
        raise ValueError(e.__str__())

    tensor_list = []

    try:
        for print_ in print_list.value:
            # String type
            if print_.HasField("desc"):
                tensor_list.append(print_.desc)
            elif print_.HasField("tensor"):
                dims = print_.tensor.dims
                data_type = print_.tensor.tensor_type
                data = print_.tensor.tensor_content
                np_type = tensor_to_np_type[data_type]
                param_data = np.frombuffer(data, np_type)
                ms_type = tensor_to_ms_type[data_type]
                param_dim = []
                for dim in dims:
                    param_dim.append(dim)
                if param_dim:
                    param_value = param_data.reshape(param_dim)
                    tensor_list.append(Tensor(param_value, ms_type))
                # Scalar type
                else:
                    data_type_ = data_type.lower()
                    if 'float' in data_type_:
                        param_data = float(param_data[0])
                    elif 'int' in data_type_:
                        param_data = int(param_data[0])
                    elif 'bool' in data_type_:
                        param_data = bool(param_data[0])
                    tensor_list.append(Tensor(param_data, ms_type))

    except BaseException as e:
        logger.error("Failed to load the print file %s.", print_list)
        raise RuntimeError(e.__str__())

    return tensor_list
Example #6
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse_init()
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    is_auto_enable_graph_kernel = _auto_enable_graph_kernel(
        args_opt.device_target, args_opt.enable_graph_kernel)
    _set_graph_kernel_context(args_opt.device_target,
                              args_opt.enable_graph_kernel,
                              is_auto_enable_graph_kernel)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
            get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    _check_compute_type(args_opt, is_auto_enable_graph_kernel)

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size(
    ) // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count,
                               train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size(
        ) // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        accumulation_steps = args_opt.accumulation_steps
        enable_global_norm = cfg.enable_global_norm
        if accumulation_steps <= 1:
            if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU':
                net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
            else:
                net_with_grads = BertTrainOneStepWithLossScaleCell(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
        else:
            allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true"
            net_with_accumulation = (
                BertTrainAccumulationAllReducePostWithLossScaleCell
                if allreduce_post else
                BertTrainAccumulationAllReduceEachWithLossScaleCell)
            net_with_grads = net_with_accumulation(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps,
                enable_global_norm=enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(
        model,
        network=net_with_grads,
        optimizer=optimizer,
        frequency=cfg.Thor.frequency)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example #7
def test_net():
    x = np.random.randn(2, 5, 8).astype(np.float32)
    mask = np.random.randn(16).astype(np.uint8)
    keep_prob = 1

    ddm = Net()
    output = ddm(Tensor(x), Tensor(mask), Tensor(keep_prob))
    logger.info("***********x*********")
    logger.info(x)
    logger.info("***********mask*********")
    logger.info(mask)
    logger.info("***********keep_prob*********")
    logger.info(keep_prob)

    logger.info("***********output y*********")
    logger.info(output.asnumpy())
    schema = ds.Schema()
    schema.add_column(
        'image', de_type=mstype.uint8,
        shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples
    ds1 = ds.RandomDataset(schema=schema,
                           num_samples=10,
                           num_parallel_workers=1)

    # cache size allows for about 4 images since each image just a bit less than 1MB, after that we will have to spill
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        #logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: ", num_iter)
    assert (num_iter == 40)


if __name__ == '__main__':
    test_randomdataset_basic1()
    test_randomdataset_basic2()
    logger.info('test_randomdataset_basic Ended.\n')
Example #9
    def write_timeline(self, size_limit=SIZE_LIMIT_DEFAULT):
        """Write the parsed timeline data to a json file."""
        # Write timeline to file.
        logger.info('Writing timeline file...')
        self.write_timeline_to_json_by_limitation(size_limit)
        logger.info('Finished file writing!')
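
    # A hedged usage sketch: write_timeline is expected to be called on a timeline
    # generator instance after the profiling files have been parsed, e.g.
    #     generator.write_timeline(size_limit=SIZE_LIMIT_DEFAULT)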
Example #10
def visualize_with_bounding_boxes(orig, aug, annot_name="bbox", plot_rows=3):
    """
    Take a list of un-augmented and a list of augmented images with "bbox" bounding boxes,
    and plot them to verify that the BBox augmentation works correctly
    :param orig: list of original images and bboxes (without aug)
    :param aug: list of augmented images and bboxes
    :param annot_name: the dict key for bboxes in data, e.g "bbox" (COCO) / "bbox" (VOC)
    :param plot_rows: number of rows on plot (rows = samples on one plot)
    :return: None
    """
    def add_bounding_boxes(ax, bboxes):
        for bbox in bboxes:
            rect = patches.Rectangle((bbox[0], bbox[1]),
                                     bbox[2] * 0.997,
                                     bbox[3] * 0.997,
                                     linewidth=1.80,
                                     edgecolor='r',
                                     facecolor='none')
            # Add the patch to the Axes
            # Params to Rectangle slightly modified to prevent drawing overflow
            ax.add_patch(rect)

    # Quick check to confirm correct input parameters
    if not isinstance(orig, list) or not isinstance(aug, list):
        return
    if len(orig) != len(aug) or not orig:
        return

    batch_size = int(len(orig) /
                     plot_rows)  # creates batches of images to plot together
    split_point = batch_size * plot_rows

    orig, aug = np.array(orig), np.array(aug)

    if len(orig) > plot_rows:
        # Create batches of required size and add remainder to last batch
        orig = np.split(orig[:split_point], batch_size) + (
            [orig[split_point:]] if (split_point < orig.shape[0]) else []
        )  # check to avoid empty arrays being added
        aug = np.split(aug[:split_point],
                       batch_size) + ([aug[split_point:]] if
                                      (split_point < aug.shape[0]) else [])
    else:
        orig = [orig]
        aug = [aug]

    for ix, allData in enumerate(zip(orig, aug)):
        base_ix = ix * plot_rows  # current batch starting index
        curPlot = len(allData[0])

        fig, axs = plt.subplots(curPlot, 2)
        fig.tight_layout(pad=1.5)

        for x, (dataA, dataB) in enumerate(zip(allData[0], allData[1])):
            cur_ix = base_ix + x
            # select plotting axes based on number of image rows on plot - else case when 1 row
            (axA, axB) = (axs[x, 0], axs[x, 1]) if (curPlot > 1) else (axs[0],
                                                                       axs[1])

            axA.imshow(dataA["image"])
            add_bounding_boxes(axA, dataA[annot_name])
            axA.title.set_text("Original" + str(cur_ix + 1))

            axB.imshow(dataB["image"])
            add_bounding_boxes(axB, dataB[annot_name])
            axB.title.set_text("Augmented" + str(cur_ix + 1))

            logger.info("Original **\n{} : {}".format(str(cur_ix + 1),
                                                      dataA[annot_name]))
            logger.info("Augmented **\n{} : {}\n".format(
                str(cur_ix + 1), dataB[annot_name]))

        plt.show()
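
# A hedged usage sketch (orig_samples and aug_samples are assumed to be lists of
# {"image": ndarray, "bbox": ndarray} dicts collected from a dataset iterator):
# visualize_with_bounding_boxes(orig_samples, aug_samples, annot_name="bbox", plot_rows=3)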
Example #11
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=4, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="false", help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="100", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
                                                                                "default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, "
                                                                    "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(max_call_depth=3000)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
        context.reset_auto_parallel_context()
        _set_bert_all_reduce_split()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)

    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily, running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
    if args_opt.enable_save_ckpt == "true" and rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                           scale_update_cell=update_cell)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads, frequency=cfg.Thor.frequency)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example #12
def test_greater_2d_scalar0():
    a = np.random.randint(-5, 5, [8, 32]).astype(np.int32)
    b = np.random.randint(-5, 5, [8, 32]).astype(np.int32)
    out_me = me_greater(Tensor(a), Tensor(b))
    logger.info("Check me result:")
    logger.info(out_me)
Example #13
SCHEMA_DIR_2 = "../data/dataset/testTFBert5Rows2/datasetSchema.json"


def test_rename():
    data1 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns=["input_ids", "segment_ids"],
                         output_columns=["masks", "seg_ids"])

    data = ds.zip((data1, data2))
    data = data.repeat(3)

    num_iter = 0

    for i, item in enumerate(data.create_dict_iterator()):
        logger.info("item[mask] is {}".format(item["masks"]))
        assert item["masks"].all() == item["input_ids"].all()
        logger.info("item[seg_ids] is {}".format(item["seg_ids"]))
        assert item["segment_ids"].all() == item["seg_ids"].all()
        # need to consume the data in the buffer
        num_iter += 1
    logger.info("Number of data in data: {}".format(num_iter))
    assert num_iter == 15


if __name__ == '__main__':
    logger.info('===========test Rename Repeat===========')
    test_rename()
    logger.info('\n')
Example #14
def util_test_random_color_adjust_op(brightness=(1, 1),
                                     contrast=(1, 1),
                                     saturation=(1, 1),
                                     hue=(0, 0),
                                     plot=False):
    """
    Util function that tests RandomColorAdjust for a specific argument
    """

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR,
                               SCHEMA_DIR,
                               columns_list=["image"],
                               shuffle=False)
    decode_op = c_vision.Decode()

    random_adjust_op = c_vision.RandomColorAdjust(brightness=brightness,
                                                  contrast=contrast,
                                                  saturation=saturation,
                                                  hue=hue)

    ctrans = [
        decode_op,
        random_adjust_op,
    ]

    data1 = data1.map(operations=ctrans, input_columns=["image"])

    # Second dataset
    transforms = [
        py_vision.Decode(),
        py_vision.RandomColorAdjust(brightness=brightness,
                                    contrast=contrast,
                                    saturation=saturation,
                                    hue=hue),
        py_vision.ToTensor()
    ]
    transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)
    data2 = ds.TFRecordDataset(DATA_DIR,
                               SCHEMA_DIR,
                               columns_list=["image"],
                               shuffle=False)
    data2 = data2.map(operations=transform, input_columns=["image"])

    num_iter = 0
    for item1, item2 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        num_iter += 1
        c_image = item1["image"]
        py_image = (item2["image"].transpose(1, 2, 0) * 255).astype(np.uint8)

        logger.info("shape of c_image: {}".format(c_image.shape))
        logger.info("shape of py_image: {}".format(py_image.shape))

        logger.info("dtype of c_image: {}".format(c_image.dtype))
        logger.info("dtype of py_image: {}".format(py_image.dtype))

        mse = diff_mse(c_image, py_image)
        logger.info("mse is {}".format(mse))

        logger.info("random_rotation_op_{}, mse: {}".format(num_iter + 1, mse))
        assert mse < 0.01

        if plot:
            visualize_image(c_image, py_image, mse)
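
# A minimal invocation sketch, exercising only the brightness argument:
# util_test_random_color_adjust_op(brightness=(0.5, 0.5), plot=False)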
Example #15
            pass
        assert False
    except TypeError:
        pass

    try:
        data2 = data1.apply(dataset_fn)
        _ = data1.apply(dataset_fn)
        for _, _ in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
            pass
        assert False
    except ValueError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))


if __name__ == '__main__':
    logger.info("Running test_apply.py test_apply_generator_case() function")
    test_apply_generator_case()

    logger.info("Running test_apply.py test_apply_imagefolder_case() function")
    test_apply_imagefolder_case()

    logger.info("Running test_apply.py test_apply_flow_case(id) function")
    test_apply_flow_case_0()
    test_apply_flow_case_1()
    test_apply_flow_case_2()
    test_apply_flow_case_3()

    logger.info("Running test_apply.py test_apply_exception_case() function")
    test_apply_exception_case()
Example #16
    def init_timeline(self, all_reduce_info, framework_info, aicpu_info,
                      min_cycle_counter, source_path):
        """
        Init timeline metadata, adding all collected info.

        Args:
            all_reduce_info (list[list]): The metadata of AllReduce operator.
            framework_info (dict): The framework metadata.
            aicpu_info (dict): The metadata of AI CPU operator.
            min_cycle_counter (float): The minimum cycle counter of the timeline.
            source_path (str): The directory of the source profiling data.
        """
        if min_cycle_counter == float('inf'):
            min_cycle_counter = 0

        logger.info('Initiating timeline...')
        timeline_list = self._load_timeline_data()
        cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir,
                                                      self._device_id)
        cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
        if cpu_timeline_list:
            self._clock_synchronize_to_host(timeline_list, source_path)
            timeline_list.extend(cpu_timeline_list)
        timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] = len(timeline_list)

        # Add AllReduce info to timeline temp list and sort by start time.
        if all_reduce_info:
            logger.debug(
                'AllReduce info found. Start adding info into timeline...')
            timeline_list.extend(all_reduce_info)
            timeline_list.sort(key=lambda x: float(x[2]))

        # Add AI CPU data into timeline temp list and sort by start time.
        aicpu_data = aicpu_info.get('info')
        if aicpu_data:
            timeline_list.extend(aicpu_data)
            timeline_list.sort(key=lambda x: float(x[2]))
            self._timeline_summary['op_exe_times'] += aicpu_info.get(
                'op_exe_times', 0)
            self._timeline_summary['num_of_streams'] += aicpu_info.get(
                'num_of_streams', 0)
            self._timeline_summary['num_of_ops'] += aicpu_info.get(
                'num_of_ops', 0)
            self._timeline_summary['total_time'] += aicpu_info.get(
                'total_time', 0)

        # Init a dict for counting the num of streams.
        stream_count_dict = {}
        for timeline in timeline_list:
            self._parse_timeline_data(timeline, min_cycle_counter)
            # Updating the collection of streams.
            if len(timeline) == 4:
                self._update_num_of_streams(timeline, stream_count_dict)

        # Get framework metadata.
        framework_obj_list = framework_info.get('object')
        # The length of list is the number of operators.
        self._timeline_summary['num_of_ops'] += len(framework_obj_list)
        self._add_framework_info(framework_obj_list)
        logger.info('Finished adding info into timeline...')

        # Update timeline summary info
        self._timeline_summary['num_of_streams'] += len(
            stream_count_dict.keys())
Example #17
    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training.

        Examples:
            >>> import os
            >>> from mindspore.profiler import Profiler
            >>> from mindspore import context
            >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
            ...                     device_id=int(os.environ["DEVICE_ID"]))
            >>> profiler = Profiler()
            >>> model = Model()
            >>> model.train()
            >>> profiler.analyse()
        """
        if self._device_target and self._device_target == "GPU":
            self._gpu_profiler.stop()
            self._generate_timeline()

            # parse minddata pipeline operator and queue for GPU
            try:
                pipeline_parser = MinddataPipelineParser(
                    self._output_path, self._dev_id, self._output_path)
                pipeline_parser.parse()
            except ProfilerException as err:
                logger.warning(err.message)

            os.environ['PROFILING_MODE'] = str("false")

        elif self._device_target and self._device_target == "Ascend":
            release()

            job_id = self._get_profiling_job_id()
            logger.info("Profiling: job id is %s ", job_id)

            source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
            # parse hwts.log.data.45.dev file, and get task profiling data
            hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
            hwts_output_filename = os.path.join(self._output_path,
                                                hwts_output_filename)
            source_path = validate_and_normalize_path(source_path)
            hwts_output_filename = validate_and_normalize_path(
                hwts_output_filename)
            hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
            _ = hwtslog_parser.execute()

            # parse Framework file, and get the relation of op and tasks
            framework_parser = FrameworkParser(job_id, self._dev_id,
                                               self._output_path)
            framework_parser.parse()
            op_task_dict = framework_parser.to_task_id_full_op_name_dict()
            if not op_task_dict:
                logger.error("Profiling: fail to parse framework files.")
                return

            # get op compute time from hwts data and framework data, write output_op_compute_time.txt
            opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
            opcompute_output_filename = os.path.join(
                self._output_path, opcompute_output_filename)
            opcompute_output_filename = validate_and_normalize_path(
                opcompute_output_filename)
            optime_parser = OPComputeTimeParser(hwts_output_filename,
                                                opcompute_output_filename,
                                                op_task_dict,
                                                self._output_path,
                                                self._dev_id)
            optime_parser.execute()

            # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
            output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
            output_data_preprocess_aicpu = os.path.join(
                self._output_path, output_data_preprocess_aicpu)
            output_data_preprocess_aicpu = validate_and_normalize_path(
                output_data_preprocess_aicpu)
            aicpu_data_parser = DataPreProcessParser(
                source_path, output_data_preprocess_aicpu)
            aicpu_data_parser.execute()

            # Parsing minddata AICPU profiling
            MinddataParser.execute(source_path, self._output_path,
                                   self._dev_id)

            # parse minddata pipeline operator and queue
            try:
                pipeline_parser = MinddataPipelineParser(
                    self._output_path, self._dev_id, self._output_path)
                pipeline_parser.parse()
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse op compute time info
            try:
                self._analyser_op_info()
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse step trace info
            try:
                self._analyse_step_trace(source_path, framework_parser)
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse timeline info
            try:
                self._analyse_timeline(aicpu_data_parser, optime_parser)
            except (ProfilerIOException, ProfilerFileNotFoundException,
                    RuntimeError) as err:
                logger.warning('Fail to write timeline data: %s', err)

            os.environ['PROFILING_MODE'] = str("false")
            context.set_context(enable_profiling=False)
Example #18
def test_schema_simple():
    logger.info("test_schema_simple")
    ds.Schema(SCHEMA_FILE)
Example #19
def save_checkpoint(save_obj,
                    ckpt_file_name,
                    integrated_save=True,
                    async_save=False):
    """
    Saves checkpoint info to a specified file.

    Args:
        save_obj (nn.Cell or list): The cell object or parameter list (each element is a dictionary,
                                    like {"name": param_name, "data": param_data}).
        ckpt_file_name (str): Checkpoint file name. If the file name already exists, it will be overwritten.
        integrated_save (bool): Whether to perform integrated save in the automatic model parallel scenario.
                                Default: True.
        async_save (bool): Whether to asynchronously save the checkpoint to a file. Default: False.

    Raises:
        TypeError: If the parameter save_obj is not nn.Cell or list type.
        RuntimeError: Failed to save the Checkpoint file.
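
    Examples:
        >>> # A minimal usage sketch; net is assumed to be an initialized nn.Cell.
        >>> save_checkpoint(net, "./lenet.ckpt")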
    """

    if not isinstance(save_obj, nn.Cell) and not isinstance(save_obj, list):
        raise TypeError(
            "The parameter save_obj should be nn.Cell or list, but got {}".
            format(type(save_obj)))

    logger.info("Execute save checkpoint process.")

    if isinstance(save_obj, nn.Cell):
        save_obj.init_parameters_data()
        param_dict = {}
        for _, param in save_obj.parameters_and_names():
            param_dict[param.name] = param
        param_list = []
        for (key, value) in param_dict.items():
            each_param = {"name": key}
            if isinstance(value.data, Tensor):
                param_data = value.data
            else:
                param_data = Tensor(value.data)

            # in the automatic model parallel scenario, some parameters were split across all the devices,
            # which should be combined before saving
            if integrated_save and key in save_obj.parameter_layout_dict:
                param_data = _get_merged_param_data(save_obj, key, param_data)

            each_param["data"] = param_data
            param_list.append(each_param)
        save_obj = param_list

    data_list = {}
    with _ckpt_mutex:
        for param in save_obj:
            key = param["name"]
            data_list[key] = []
            if isinstance(param["data"], Parameter):
                param["data"].init_data()
            dims = []
            if param['data'].shape == ():
                dims.append(0)
            else:
                for dim in param['data'].shape:
                    dims.append(dim)
            data_list[key].append(dims)
            tensor_type = str(param["data"].dtype)
            data_list[key].append(tensor_type)
            data = param["data"].asnumpy().reshape(-1)
            data_list[key].append(data)

    if async_save:
        thr = Thread(target=_exec_save,
                     args=(ckpt_file_name, data_list),
                     name="asyn_save_ckpt")
        thr.start()
    else:
        _exec_save(ckpt_file_name, data_list)

    logger.info("Save checkpoint process finish.")
Example #20
def test_numpy_slice_empty_output_shape():
    logger.info("running test_numpy_slice_empty_output_shape")
    dataset = de.NumpySlicesDataset([[[1, 2], [3, 4]]], column_names=["col1"])
    dataset = dataset.batch(batch_size=3, drop_remainder=True)
    assert dataset.output_shapes() == []
Example #21
def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial():
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    cv_schema_json = {
        "id": {
            "type": "int32"
        },
        "image_0": {
            "type": "bytes"
        },
        "image_2": {
            "type": "bytes"
        },
        "image_3": {
            "type": "bytes"
        },
        "image_4": {
            "type": "bytes"
        },
        "input_mask": {
            "type": "int32",
            "shape": [-1]
        },
        "segments": {
            "type": "float32",
            "shape": [2, 3]
        }
    }
    writer.add_schema(cv_schema_json, "two_images_schema")
    with open("../data/mindrecord/testImageNetData/images/image_00010.jpg",
              "rb") as file_reader:
        img_data = file_reader.read()
    ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
    ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
    data = []
    for i in range(5):
        item = {
            "id": i,
            "image_0": img_data,
            "image_2": img_data,
            "image_3": img_data,
            "image_4": img_data,
            "input_mask": ndarray_1,
            "segments": ndarray_2
        }
        data.append(item)
    writer.write_raw_data(data)
    writer.commit()
    assert os.path.exists(CV_FILE_NAME)
    assert os.path.exists(CV_FILE_NAME + ".db")

    # tutorial for minddataset.
    columns_list = [
        "id", "image_0", "image_2", "image_3", "image_4", "input_mask",
        "segments"
    ]
    num_readers = 1
    data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
    assert data_set.get_dataset_size() == 5
    num_iter = 0
    for item in data_set.create_dict_iterator():
        assert len(item) == 7
        logger.info("item: {}".format(item))
        assert item["image_0"].dtype == np.uint8
        assert (item["image_0"] == item["image_2"]).all()
        assert (item["image_3"] == item["image_4"]).all()
        assert (item["image_0"] == item["image_4"]).all()
        assert item["image_2"].dtype == np.uint8
        assert item["image_3"].dtype == np.uint8
        assert item["image_4"].dtype == np.uint8
        assert item["id"].dtype == np.int32
        assert item["input_mask"].shape == (5, )
        assert item["input_mask"].dtype == np.int32
        assert item["segments"].shape == (2, 3)
        assert item["segments"].dtype == np.float32
        num_iter += 1
    assert num_iter == 5

    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
Example #22
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument(
        "--accumulation_steps",
        type=int,
        default="1",
        help=
        "Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps",
                        type=int,
                        default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily, running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            bert_net_cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = (args_opt.epoch_size * ds.get_dataset_size()
                        // args_opt.data_sink_steps)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    if cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=cfg.Lamb.warmup_steps,
            decay_steps=args_opt.train_steps,
            power=cfg.Lamb.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.Lamb.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.Lamb.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.Lamb.weight_decay
        }, {
            'params': other_params
        }, {
            'order_params': params
        }]
        optimizer = Lamb(group_params,
                         learning_rate=lr_schedule,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(net_with_loss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=cfg.AdamWeightDecay.warmup_steps,
            decay_steps=args_opt.train_steps,
            power=cfg.AdamWeightDecay.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }, {
            'order_params': params
        }]

        optimizer = AdamWeightDecay(group_params,
                                    learning_rate=lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)
    else:
        raise ValueError(
            "Optimizer {} is not supported, only [Lamb, Momentum, "
            "AdamWeightDecay] are supported.".format(cfg.optimizer))
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)

        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
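
The script above treats its boolean flags as the literal strings "true"/"false" and compares them with ==. Below is a minimal sketch of an alternative, a str2bool argparse type; the helper name and accepted spellings are illustrative, not part of the original script, and it assumes argparse is already imported.

def str2bool(value):
    """Convert a 'true'/'false' style command-line string into a real bool."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "t", "yes", "1"):
        return True
    if value.lower() in ("false", "f", "no", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got {!r}.".format(value))

# Illustrative usage:
# parser.add_argument("--do_shuffle", type=str2bool, default=True,
#                     help="Enable shuffle for dataset, default is True.")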
Example #23
def export(net, *inputs, file_name, file_format='AIR'):
    """
    Exports the MindSpore prediction model to a file in the specified format.

    Args:
        net (Cell): MindSpore network.
        inputs (Tensor): Inputs of the `net`.
        file_name (str): File name of model to export.
        file_format (str): MindSpore currently supports 'AIR', 'ONNX' and 'MINDIR' format for exported model.

            - AIR: Ascend Intermediate Representation. An intermediate representation format of Ascend models.
              Recommended suffix for output file is '.air'.
            - ONNX: Open Neural Network eXchange. An open format built to represent machine learning models.
              Recommended suffix for output file is '.onnx'.
            - MINDIR: MindSpore Native Intermediate Representation for Anf. An intermediate representation format
              for MindSpore models.
              Recommended suffix for output file is '.mindir'.
    """
    logger.info("exporting model file:%s format:%s.", file_name, file_format)
    check_input_data(*inputs, data_class=Tensor)

    if file_format == 'GEIR':
        logger.warning(
            "Format 'GEIR' is deprecated and will be removed in a future release, use 'AIR' instead."
        )
        file_format = 'AIR'

    supported_formats = ['AIR', 'ONNX', 'MINDIR']
    if file_format not in supported_formats:
        raise ValueError(
            f'Illegal file format {file_format}, it must be one of {supported_formats}'
        )
    # switch network mode to infer when it is training
    is_training = net.training
    if is_training:
        net.set_train(mode=False)
    # export model
    net.init_parameters_data()
    if file_format == 'AIR':
        phase_name = 'export.air'
        graph_id, _ = _executor.compile(net, *inputs, phase=phase_name)
        _executor.export(file_name, graph_id)
    elif file_format == 'ONNX':  # file_format is 'ONNX'
        phase_name = 'export.onnx'
        graph_id, _ = _executor.compile(net,
                                        *inputs,
                                        phase=phase_name,
                                        do_convert=False)
        onnx_stream = _executor._get_func_graph_proto(graph_id)
        with open(file_name, 'wb') as f:
            os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR)
            f.write(onnx_stream)
    elif file_format == 'MINDIR':  # file_format is 'MINDIR'
        phase_name = 'export.mindir'
        graph_id, _ = _executor.compile(net,
                                        *inputs,
                                        phase=phase_name,
                                        do_convert=False)
        mindir_stream = _executor._get_func_graph_proto(graph_id, 'mind_ir')
        with open(file_name, 'wb') as f:
            os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR)
            f.write(mindir_stream)
    # restore network training mode
    if is_training:
        net.set_train(mode=True)
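
A minimal usage sketch for the export function above. The LeNet5 network and the 1x1x32x32 input shape are illustrative assumptions, not taken from the original example.

import numpy as np
from mindspore import Tensor

net = LeNet5()  # hypothetical trained network, for illustration only
dummy_input = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32))
export(net, dummy_input, file_name="lenet5.mindir", file_format="MINDIR")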
Example #24
def test_shuffle_exception_07():
    """
    Test shuffle exception: buffer_size wrong type, boolean value True
    """
    logger.info("test_shuffle_exception_07")

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR)
    ds.config.set_seed(1)
    try:
        data1 = data1.shuffle(buffer_size=True)
        sum([1 for _ in data1])

    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "buffer_size" in str(e)


if __name__ == '__main__':
    test_shuffle_01()
    test_shuffle_02()
    test_shuffle_03()
    test_shuffle_04()
    test_shuffle_05()
    test_shuffle_06()
    test_shuffle_exception_01()
    test_shuffle_exception_02()
    test_shuffle_exception_03()
    test_shuffle_exception_05()
    test_shuffle_exception_06()
    test_shuffle_exception_07()
    logger.info('\n')
Example #25
    def __init__(self, **kwargs):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        self._get_output_path(kwargs)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

        if self._device_target:
            CPUProfiler = c_expression.CPUProfiler
            self._cpu_profiler = CPUProfiler.get_instance()
            self._cpu_profiler.init(self._output_path)
            self._cpu_profiler.step_profiling_enable(True)
        if self._device_target and self._device_target == "GPU":
            GPUProfiler = c_expression.GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)
            if GlobalComm.WORLD_COMM_GROUP == "nccl_world_group":
                self._dev_id = str(get_rank())
            os.environ['DEVICE_ID'] = self._dev_id

            if kwargs:
                logger.warning("Params not be supported yet on GPU.")
        elif self._device_target and self._device_target == "Ascend":
            optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
            if not isinstance(optypes_not_deal, str):
                raise TypeError("The parameter optypes_not_deal must be str.")
            job_dir = kwargs.pop("ascend_job_id", "")
            if job_dir:
                job_dir = validate_and_normalize_path(job_dir)
                if not os.path.exists(job_dir):
                    msg = f"Invalid ascend_job_id: {job_dir}, Please pass the absolute path of the JOB dir"
                    logger.error(msg)
                    raise ValueError(msg)
                self._output_path, _ = os.path.split(job_dir)
            if kwargs:
                logger.warning("There are invalid params which don't work.")

            os.environ['DEVICE_ID'] = self._dev_id
            fp_point = os.environ.get("PROFILING_FP_START", "")
            bp_point = os.environ.get("PROFILING_BP_END", "")

            profiling_options = {
                "output": self._output_path,
                "fp_point": fp_point,
                "bp_point": bp_point,
                "training_trace": "on",
                "task_trace": "on",
                "aic_metrics": "PipeUtilization",
                "aicpu": "on"
            }

            profiling_options = json.dumps(profiling_options)
            # Option strings longer than 2048 characters are ignored, which causes profiling option parsing errors
            if len(profiling_options) > 2048:
                msg = "The parameter length exceeds the limit (2048), please input valid parameters."
                logger.error(msg)
                raise ValueError(msg)
            # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
            context.set_context(enable_profiling=True,
                                profiling_options=profiling_options)
            base_profiling_container_path = os.path.join(
                self._output_path, "container")
            container_path = os.path.join(base_profiling_container_path,
                                          self._dev_id)
            data_path = os.path.join(container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)

            self._filt_optype_names = optypes_not_deal.split(
                ",") if optypes_not_deal else []
            # add job id env through user input later
            self._job_id_env = 0
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d",
                        self._start_time)
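
A minimal usage sketch of the profiler whose constructor is shown above, assuming the output_path keyword is consumed by _get_output_path and that an analyse() method finalizes the results; the path and the training call are placeholders.

from mindspore.profiler import Profiler

profiler = Profiler(output_path="./profiler_data")  # runs the __init__ shown above
# ... build the model and dataset, then train, e.g.:
# model.train(epoch_size, dataset, callbacks=callbacks)
profiler.analyse()  # parse the collected data and write the profiling results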
Example #26
def test_pad_op():
    """
    Test Pad op
    """
    logger.info("test_random_color_jitter_op")

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR,
                               SCHEMA_DIR,
                               columns_list=["image"],
                               shuffle=False)
    decode_op = c_vision.Decode()

    pad_op = c_vision.Pad((100, 100, 100, 100))
    ctrans = [
        decode_op,
        pad_op,
    ]

    data1 = data1.map(operations=ctrans, input_columns=["image"])

    # Second dataset
    transforms = [
        py_vision.Decode(),
        py_vision.Pad(100),
        py_vision.ToTensor(),
    ]
    transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)
    data2 = ds.TFRecordDataset(DATA_DIR,
                               SCHEMA_DIR,
                               columns_list=["image"],
                               shuffle=False)
    data2 = data2.map(operations=transform, input_columns=["image"])

    for item1, item2 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        c_image = item1["image"]
        py_image = (item2["image"].transpose(1, 2, 0) * 255).astype(np.uint8)

        logger.info("shape of c_image: {}".format(c_image.shape))
        logger.info("shape of py_image: {}".format(py_image.shape))

        logger.info("dtype of c_image: {}".format(c_image.dtype))
        logger.info("dtype of py_image: {}".format(py_image.dtype))

        mse = diff_mse(c_image, py_image)
        logger.info("mse is {}".format(mse))
        assert mse < 0.01
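
The assertion above relies on a diff_mse helper from the test utilities. Below is a minimal sketch of such a helper, under the assumption that it returns the per-pixel mean squared error scaled to a percentage; the exact implementation in the original utilities may differ.

def diff_mse(in1, in2):
    """Mean squared error between two images of equal shape, scaled by 100."""
    mse = (np.square(in1.astype(float) / 255 - in2.astype(float) / 255)).mean()
    return mse * 100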
Example #27
    def run(self):
        """
        Executes transformation from imagenet to MindRecord.

        Returns:
            SUCCESS or FAILED, whether imagenet is successfully transformed to MindRecord.
        """
        t0_total = time.time()

        imagenet_schema_json = {
            "label": {
                "type": "int32"
            },
            "image": {
                "type": "bytes"
            },
            "file_name": {
                "type": "string"
            }
        }

        logger.info("transformed MindRecord schema is: {}".format(
            imagenet_schema_json))

        # set the header size
        self.writer.set_header_size(1 << 24)

        # set the page size
        self.writer.set_page_size(1 << 26)

        # create the schema
        self.writer.add_schema(imagenet_schema_json, "imagenet_schema")

        # add the index
        self.writer.add_index(["label", "file_name"])

        imagenet_iter = self._get_imagenet_as_dict()
        batch_size = 256
        transform_count = 0
        while True:
            data_list = []
            try:
                for _ in range(batch_size):
                    data_list.append(imagenet_iter.__next__())
                    transform_count += 1
                self.writer.write_raw_data(data_list)
                logger.info("transformed {} record...".format(transform_count))
            except StopIteration:
                if data_list:
                    self.writer.write_raw_data(data_list)
                    logger.info(
                        "transformed {} records...".format(transform_count))
                break

        ret = self.writer.commit()

        t1_total = time.time()
        logger.info("--------------------------------------------")
        logger.info("END. Total time: {}".format(t1_total - t0_total))
        logger.info("--------------------------------------------")

        return ret
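
The run() method above assumes self.writer is a MindRecord FileWriter. A minimal sketch of how such a writer could be constructed and prepared is shown below; the output file name and shard number are illustrative.

from mindspore.mindrecord import FileWriter

writer = FileWriter(file_name="imagenet.mindrecord", shard_num=4)
writer.set_header_size(1 << 24)   # 16 MB header
writer.set_page_size(1 << 26)     # 64 MB page
writer.add_schema({"label": {"type": "int32"},
                   "image": {"type": "bytes"},
                   "file_name": {"type": "string"}}, "imagenet_schema")
writer.add_index(["label", "file_name"])
# writer.write_raw_data(list_of_dicts); writer.commit()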
Example #28
    def write_timeline(self):
        """Write the parsed timeline data to a JSON file."""
        # Write timeline to file.
        logger.info('Writing timeline file...')
        self.write_timeline_to_json_by_limitation()
        logger.info('Finished file writing!')
Example #29
def test_uniform_augment(plot=False, num_ops=2):
    """
    Test UniformAugment
    """
    logger.info("Test UniformAugment")

    # Original Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transforms_original = mindspore.dataset.transforms.py_transforms.Compose(
        [F.Decode(), F.Resize((224, 224)),
         F.ToTensor()])

    ds_original = data_set.map(operations=transforms_original,
                               input_columns="image")

    ds_original = ds_original.batch(512)

    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_original = np.append(images_original,
                                        np.transpose(image.asnumpy(),
                                                     (0, 2, 3, 1)),
                                        axis=0)

    # UniformAugment Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transform_list = [
        F.RandomRotation(45),
        F.RandomColor(),
        F.RandomSharpness(),
        F.Invert(),
        F.AutoContrast(),
        F.Equalize()
    ]

    transforms_ua = \
        mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
                                                            F.Resize((224, 224)),
                                                            F.UniformAugment(transforms=transform_list,
                                                                             num_ops=num_ops),
                                                            F.ToTensor()])

    ds_ua = data_set.map(operations=transforms_ua, input_columns="image")

    ds_ua = ds_ua.batch(512)

    for idx, (image, _) in enumerate(ds_ua):
        if idx == 0:
            images_ua = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_ua = np.append(images_ua,
                                  np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                  axis=0)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_ua[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))

    if plot:
        visualize_list(images_original, images_ua)
Example #30
def test_invalid_input(test_name, size, interpolation, error, error_msg):
    logger.info("Test Resize with bad input: {0}".format(test_name))
    with pytest.raises(error) as error_info:
        vision.Resize(size, interpolation)
    assert error_msg in str(error_info.value)