示例#1
0
def test_sit_auto_mix_precision_train_o3():
    input_data = np.random.randn(32, 3, 224, 224).astype(np.float64)
    label_data = np.random.randn(32, 10).astype(np.float32)
    # graph mode
    context.set_context(mode=context.GRAPH_MODE)
    net = Net(3, 10)
    opt = nn.Momentum(params=net.trainable_params(), learning_rate=0.001, momentum=0.0009, weight_decay=0.001,
                      loss_scale=0.0001)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    train_network = amp.build_train_network(net, opt, loss, level="O3",
                                            loss_scale_manager=FixedLossScaleManager(drop_overflow_update=False))
    out = train_network(Tensor(input_data), Tensor(label_data))

    # pynative mode
    context.set_context(mode=context.PYNATIVE_MODE)
    net_pynative = Net(3, 10)
    opt_pynative = nn.Momentum(params=net_pynative.trainable_params(), learning_rate=0.001, momentum=0.0009,
                               weight_decay=0.001,
                               loss_scale=0.0001)
    loss_pynative = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    train_network_pynative = amp.build_train_network(net_pynative, opt_pynative, loss_pynative, level="O3",
                                                     loss_scale_manager=FixedLossScaleManager(
                                                         drop_overflow_update=False))
    out_pynative = train_network_pynative(Tensor(input_data), Tensor(label_data))
    assert np.allclose(out.asnumpy(), out_pynative.asnumpy(), 0.001, 0.001)
示例#2
0
def test_amp_o0_loss():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    net = NetNoLoss(16, 16)
    loss = nn.MSELoss()
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    train_network = amp.build_train_network(net, optimizer, loss)
    output = train_network(inputs, label)
示例#3
0
def test_amp_resnet50_loss():
    inputs = Tensor(np.ones([2, 3, 224, 224]).astype(np.float32))
    label = Tensor(np.zeros([2, 10]).astype(np.float32))
    net = resnet50()
    loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    train_network = amp.build_train_network(net, optimizer, loss, level="O2")
    train_network(inputs, label)
示例#4
0
def test_amp_o2():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    net = Net(16, 16)

    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    train_network = amp.build_train_network(net, optimizer, level="O2")
    _ = train_network(inputs, label)
示例#5
0
def test_heterogeneous_excutor():
    input_data = np.random.randn(32, 3, 224, 224).astype(np.float64)
    label_data = np.random.randn(32, 10).astype(np.float32)
    # graph mode
    context.set_context(mode=context.GRAPH_MODE)
    net = Net(3, 10)
    opt = nn.Momentum(params=net.trainable_params(),
                      learning_rate=0.001,
                      momentum=0.0009,
                      weight_decay=0.001,
                      loss_scale=0.0001)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    train_network = amp.build_train_network(
        net,
        opt,
        loss,
        level="O3",
        loss_scale_manager=FixedLossScaleManager(drop_overflow_update=False))
    out = train_network(Tensor(input_data), Tensor(label_data))

    # heterogeneous_excutor
    net_heter = Net(3, 10)
    net_heter.relu.relu.add_prim_attr("primitive_target", "CPU")
    net_heter.conv.conv2d.add_prim_attr("primitive_target", "CPU")

    opt_heter = nn.Momentum(params=net_heter.trainable_params(),
                            learning_rate=0.001,
                            momentum=0.0009,
                            weight_decay=0.001,
                            loss_scale=0.0001)
    loss_heter = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    train_network_heter = amp.build_train_network(
        net_heter,
        opt_heter,
        loss_heter,
        level="O3",
        loss_scale_manager=FixedLossScaleManager(drop_overflow_update=False))
    out_heter = train_network_heter(Tensor(input_data), Tensor(label_data))
    assert np.allclose(out.asnumpy(), out_heter.asnumpy(), 0.001, 0.001)
示例#6
0
def test_trainTensor_amp(num_classes=10, epoch=18, batch_size=16):
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", enable_mem_reuse=False,
                        enable_dynamic_memory=False)
    net = resnet50(num_classes)
    lr = 0.1
    momentum = 0.9
    optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum)
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    train_network = amp.build_train_network(net, optimizer, criterion, level="O2")
    train_network.set_train()
    losses = []
    for i in range(0, epoch):
        data = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss = train_network(data, label)
        losses.append(loss)
    assert(losses[-1][0].asnumpy() < 1)
    assert(losses[-1][1].asnumpy() == False)
    assert(losses[-1][2].asnumpy() > 1)
示例#7
0
def test_trainTensor_amp(num_classes=10, epoch=18, batch_size=16):
    net = resnet50(num_classes)
    lr = 0.1
    momentum = 0.9
    optimizer = Momentum(filter(lambda x: x.requires_grad,
                                net.get_parameters()), lr, momentum)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = amp.build_train_network(
        net, optimizer, criterion, level="O2")
    train_network.set_train()
    losses = []
    for i in range(0, epoch):
        data = Tensor(np.ones([batch_size, 3, 224, 224]
                              ).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss = train_network(data, label)
        losses.append(loss)
    assert (losses[-1][0].asnumpy() < 1)
    assert not losses[-1][1].asnumpy()
    assert (losses[-1][2].asnumpy() > 1)
def test_ir_fusion_inplace_bn_conv_conv():
    clear_files()
    context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
    input_np = np.random.uniform(0.0, 255.0, size=[4, 4, 4,
                                                   4]).astype(np.float32)
    label = np.ones([4, 4, 4, 4]).astype(np.float32)
    net = InplaceNet()
    loss = SoftmaxCrossEntropyWithLogits(sparse=False)
    opt = Momentum(learning_rate=0.01,
                   momentum=0.9,
                   params=filter(lambda x: x.requires_grad,
                                 net.get_parameters()))
    net = amp.build_train_network(net,
                                  opt,
                                  loss,
                                  level="O2",
                                  keep_batchnorm_fp32=False)
    net.set_train()
    net(Tensor(input_np), Tensor(label))
    find_accum = find_files("hwopt*cudnn_inplace*ir",
                            "inplace_algo: accumulation")
    find_cover = find_files("hwopt*cudnn_inplace*ir", "inplace_algo: cover")
    assert find_accum == '1'
    assert find_cover == '1'
示例#9
0
    lr = get_lr(args)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value,
                                           drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=opt,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = 10
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
def train():
    """Train function."""
    args = parse_args()
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False,
                        device_id=devid)
    loss_meter = AverageMeter('loss')

    network = YOLOV4CspDarkNet53(is_training=True)
    # default is kaiming-normal
    default_recursive_init(network)

    if args.pretrained_backbone:
        pretrained_backbone_slice = args.pretrained_backbone.split('/')
        backbone_ckpt_file = pretrained_backbone_slice[
            len(pretrained_backbone_slice) - 1]
        local_backbone_ckpt_path = '/cache/' + backbone_ckpt_file
        # download backbone checkpoint
        mox.file.copy_parallel(src_url=args.pretrained_backbone,
                               dst_url=local_backbone_ckpt_path)
        args.pretrained_backbone = local_backbone_ckpt_path
    load_yolov4_params(args, network)

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV4CspDarkNet53()

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [convert_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    # data download
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'
    print('Download data.')
    mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_path)

    ds, data_size = create_yolo_dataset(
        image_dir=os.path.join(local_data_path, 'images'),
        anno_path=os.path.join(local_data_path, 'annotation.json'),
        is_training=True,
        batch_size=args.per_batch_size,
        max_epoch=args.max_epoch,
        device_num=args.group_size,
        rank=args.rank,
        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch * 10

    lr = get_lr(args)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value,
                                           drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=opt,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)
        network.set_train()

    # checkpoint save
    ckpt_max_num = 10
    ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                   keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=local_ckpt_path,
                              prefix='yolov4')
    cb_params = _InternalCallbackParam()
    cb_params.train_network = network
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)

    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        images = Tensor.from_numpy(images)

        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])

        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2,
                       input_shape)
        loss_meter.update(loss.asnumpy())

        # ckpt progress
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.per_batch_size * (
                i - old_progress) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1

    args.logger.info('==========end training===============')

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args.train_url)
示例#11
0
def train():
    """Train function."""
    args = parse_args()
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=True, device_id=devid)
    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    context.reset_auto_parallel_context()
    parallel_mode = ParallelMode.STAND_ALONE
    degree = 1
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        degree = get_group_size()
    context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)
    load_yolov3_params(args, network)

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True,
                                        batch_size=args.per_batch_size, max_epoch=args.max_epoch,
                                        device_num=args.group_size, rank=args.rank, config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    lr = get_lr(args)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value, drop_overflow_update=False)
        network = amp.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale,
                                          level="O2", keep_batchnorm_fp32=True)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True)

    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))

        images = Tensor.from_numpy(images)

        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])

        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1,
                       batch_gt_box2, input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.per_batch_size * (i - old_progress) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            if i == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')
示例#12
0
def train():
    """Train function."""
    args = parse_args()

    devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=True, device_id=devid)

    # init distributed
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()

    # select for master rank save ckpt or all rank save, compatiable for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        degree = get_group_size()
    else:
        parallel_mode = ParallelMode.STAND_ALONE
        degree = 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode, mirror_mean=True, device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)

    if args.pretrained_backbone:
        network = load_backbone(network, args.pretrained_backbone, args)
        args.logger.info('load pre-trained backbone {} into network'.format(args.pretrained_backbone))
    else:
        args.logger.info('Not load pre-trained backbone, please be careful')

    if args.resume_yolov3:
        param_dict = load_checkpoint(args.resume_yolov3)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
                args.logger.info('in resume {}'.format(key))
            else:
                param_dict_new[key] = values
                args.logger.info('in resume {}'.format(key))

        args.logger.info('resume finished')
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.resume_yolov3))

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True,
                                        batch_size=args.per_batch_size, max_epoch=args.max_epoch,
                                        device_num=args.group_size, rank=args.rank, config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_V2':
        lr = warmup_cosine_annealing_lr_V2(args.lr,
                                           args.steps_per_epoch,
                                           args.warmup_epochs,
                                           args.max_epoch,
                                           args.T_max,
                                           args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_sample':
        lr = warmup_cosine_annealing_lr_sample(args.lr,
                                               args.steps_per_epoch,
                                               args.warmup_epochs,
                                               args.max_epoch,
                                               args.T_max,
                                               args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)
    enable_amp = False
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        enable_amp = True
    if enable_amp:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value, drop_overflow_update=False)
        network = amp.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale,
                                          level="O2", keep_batchnorm_fp32=True)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator()

    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))

        images = Tensor(images)

        batch_y_true_0 = Tensor(data['bbox1'])
        batch_y_true_1 = Tensor(data['bbox2'])
        batch_y_true_2 = Tensor(data['bbox3'])
        batch_gt_box0 = Tensor(data['gt_box1'])
        batch_gt_box1 = Tensor(data['gt_box2'])
        batch_gt_box2 = Tensor(data['gt_box3'])

        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1,
                       batch_gt_box2, input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.per_batch_size * (i - old_progress) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            if i == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')